# Customer Sentiment Extraction: An Opinion Mining Pipeline

### Download Libraries

In [1]:
# For feature extraction and spell checking
try:
    from textblob import TextBlob, Word
except ImportError:
    print("The 'textblob' package is not installed. Installing it now...")
    !pip install textblob
    from textblob import TextBlob, Word
    
# For printing results in table format
try:
    from prettytable import PrettyTable
except ImportError:
    print("The 'prettytable' package is not installed. Installing it now...")
    !pip install prettytable
    from prettytable import PrettyTable

# For association rule mining
try:
    from apyori import apriori
except ImportError:
    print("The 'apyori' package is not installed. Installing it now...")
    !pip install apyori
    from apyori import apriori
    
# For reviews preprocessing
try:
    import nltk
except ImportError:
    print("The 'apyori' package is not installed. Installing it now...")
    !pip install nltk
    from prettytable import PrettyTable

### Import Libraries

In [2]:
import itertools
import os
import re
import string
import time

from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\maq\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Step 1: Data Analysis

### Get Reviews Files

In [3]:
class ReviewsReader:
    def __init__(self, folder_name):
        """
        Initialize a ReviewsReader instance.

        Parameters:
        - folder_name (str): The name of the folder containing reviews files.
        """
        self._folder_name = folder_name
        self._folder_path = folder_name

        self._reviews_files = {}
        self._product_names = set()

    def get_reviews_files(self):
        """
        Extract information (product and file path) about reviews files from the specified folder, excluding readme files.

        Returns:
        - dict: Dictionary containing the product name as keys and file path as values.
        """
        files = [file for file in os.listdir(self._folder_path) if file.endswith('.txt') and file.lower() != 'readme.txt']

        for file_name in files:
            # Product reviews file path
            reviews_file = os.path.join(self._folder_path, file_name)

            # Product name
            product_name = os.path.splitext(file_name)[0]
            self._product_names.add(product_name)
            
            self._reviews_files[product_name] = reviews_file

        return self._reviews_files

    def get_products(self):
        """
        Get a set of product names.

        Returns:
        - set: Set containing product names.
        """
        return self._product_names 

### Read Reviews Files

In [4]:
reviews_reader = ReviewsReader('Data')
reviews_files  = reviews_reader.get_reviews_files()

print('Products count:', len(reviews_files))
print('Products:', reviews_reader.get_products())

print('\n')

Products count: 3
Products: {'Speaker', 'Router', 'Computer'}




In [5]:
class Reviews:
    def __init__(self, product, reviews):
        """
        Represents a collection of reviews for a specific product.
        """
        self.product = product
        self.reviews = reviews

        # All extracted features
        self.extracted_features = []

        # Top extracted features
        self.top_extracted_features = []
        
        # Infrequent features
        self.infrequent_features = []
        
        # The sentiments (postive and negative) per feature
        self.feature_sentiments = {}

    def __str__(self):
        return f'Reviews(product="{self.product}", reviews={self.reviews})'
    
    def preprocess(self, spell_check=False, remove_stopwords=False, stemming=False, lemmatization=False):
        """
        Preprocesses the parsed reviews.
        """
        for review in self.reviews:
            review.preprocess(spell_check, remove_stopwords, stemming, lemmatization)

        return
    
    def get_infrequent_features(self):
        """
        Get all infrequent features from the reviews.
        """
        for review in self.reviews:
            for review_line in review.review_lines:
                # Check sentences which have no top extracted features
                if all(feature not in self.top_extracted_features for feature in review_line.extracted_features):
                                        
                    for feature in review_line.extracted_features:
                        opinions = review_line._get_adjacent_adjectives(feature)

                        if opinions and feature not in self.infrequent_features:
                            self.infrequent_features.append(feature)

        self.infrequent_features = list(set(self.infrequent_features))

        return

    def get_extracted_features(self):
        """
        Get all extracted features from the reviews.
        """
        for review in self.reviews:
            for review_line in review.review_lines:
                self.extracted_features.append(review_line.extracted_features)

        return

    def extract_top_features(self, min_support=0.005, min_confidence=0.2, min_lift=3, min_length=3, max_length=3):    
        """
        Extracts top features using the Apriori algorithm based on association rule mining.

        Parameters:
        - min_support (float): Minimum support threshold for association rules.
        - min_confidence (float): Minimum confidence threshold for association rules.
        - min_lift (float): Minimum lift threshold for association rules.
        - min_length (int): Minimum length of itemsets to consider.
        """

        # Apply association rule mining to get top extracted features
        association_rules = apriori(self.extracted_features,
                                    min_support=min_support,
                                    min_confidence=min_confidence,
                                    min_lift=min_lift,
                                    min_length=min_length,
                                    max_length=max_length)
        
        # Get top features
        for association_rule in list(association_rules):
            self.top_extracted_features.extend(association_rule.items)

        # Remove duplications
        self.top_extracted_features = list(set(self.top_extracted_features))

    def prune_top_features(self):
        """
        Apply redundancy and compactness pruning to filter out incorrect features
        """
        
        # Apply compactness pruning on feature phrases
        compact_features = self.compactness_pruning()
        
        # Apply redundancy pruning on one word features
        meaningful_features = self.redundancy_pruning(compact_features) 

        # Top features after pruning
        self.top_extracted_features = meaningful_features + compact_features

    def compactness_pruning(self):
        """
        Apply compactness pruning on feature phrases to identify and retain compact features.

        Returns:
        - list: A list of compact features.
        """
        compact_features = []

        for top_feature in self.top_extracted_features:
            # Exclude top one word features
            if len(top_feature.split()) < 2:
                continue

            # Check if the phrase feature is compact in the reviews
            if self._is_compact_feature(top_feature):
                compact_features.append(top_feature)

        return compact_features

    def _is_compact_feature(self, feature_phrase):
        """
        Check if a feature is compact in at least 2 sentences in the reviews.

        Args:
        - feature_phrase (str): Top feature phrase to be checked for compactness.

        Returns:
        - bool: True if the feature is compact in at least 2 sentences, False otherwise.
        """
        compact_count = 0

        for review in self.reviews:
            for review_line in review.review_lines:
                # If phrase features is not exists in the sentence, skip it
                if feature_phrase not in review_line.extracted_features:
                    continue

                # Find positions for each word in the feature_phrase                
                positions = {feature: [i for i, (word, pos) in enumerate(review_line.words) if word == feature]
                    for feature in feature_phrase.split()}
                
                compact_count += self._count_compact_occurrences(positions)

                if compact_count >= 2:
                    return True

        return False
    
    @staticmethod
    def _count_compact_occurrences(positions, min_p_support=3):
        """
        Count compact occurrences for a given set of word positions.

        Args:
        - positions (dict): Positions of each word in the feature phrases.
        - min_p_support (int): Minimum support threshold for feature phrases.

        Returns:
        - int: Number of compact occurrences.
        """
        compact_occurrences = 0
        for combination in itertools.combinations(positions.keys(), 2):
            for pos_combination in itertools.product(positions[combination[0]], positions[combination[1]]):
                # Calculate distances and increment compact_occurrences for each valid occurrence
                compact_occurrences = sum(distance <= min_p_support for distance in (abs(item[0] - item[1]) for item in [pos_combination]))

        return compact_occurrences

    def redundancy_pruning(self, compact_features, min_p_support=3):
        """
        Apply redundancy pruning on one-word features to filter out less significant features.

        Parameters:
        - compact_features (list): A list of compact features obtained through compactness pruning.
        - min_p_support (int): Minimum support threshold for one-word features.

        Returns:
        - list: A list of meaningful one-word features.
        """
        meaningful_features = []

        for top_feature in self.top_extracted_features:
            
            # Exclude top phrase features
            if len(top_feature.split()) > 1:
                continue
                
            superset_features = [feature for feature in compact_features if set([top_feature]).issubset(set(feature.split(' ')))]
            
            # Check if the top_feature is a not subset of any compact feature, then it will be meaningful features
            if not superset_features:
                meaningful_features.append(top_feature)
                continue

            p_support = 0

            for review in self.reviews:
                for review_line in review.review_lines:
                    # Check if the top_feature is mentioned alone in the sentence while superset_features not
                    if top_feature in review_line.extracted_features and superset_features not in review_line.extracted_features:
                        p_support += 1

            # Check the p-support condition
            if p_support >= min_p_support:
                meaningful_features.append(top_feature)

        return meaningful_features

    def _get_labelled_features(self):
        """
        Get all human-labelled features from the reviews.

        Returns:
        - list: A list containing all human-labelled features.
        """
        labelled_features = []

        for review in self.reviews:
            for review_line in review.review_lines:
                labelled_features.extend([feature[0] for feature in review_line.features])

        return labelled_features

    def evaluate_features_extraction(self):
        """
        Evaluate the top extracted features after pruning based on human-labelled features.

        Returns:
        - dict: Dictionary containing evaluation metrics (FP, FN, TP, TN).
        """
        true_positives  = 0
        false_positives = 0
        false_negatives = 0

        # Get human-labelled features as reference set
        labelled_set = set(self._get_labelled_features())
    
        # Get extracted_features
        if (self.top_extracted_features):
            extracted_set = set(self.top_extracted_features)
        else:
            extracted_set = set([item for sublist in self.extracted_features for item in sublist])

        # Calculate TP, FP, FN, TN
        true_positives  = len(labelled_set.intersection(extracted_set))

        false_positives = len(extracted_set.difference(labelled_set))
        false_negatives = len(labelled_set.difference(extracted_set))

        # Calculate precision, and recall
        precision = true_positives / max(1, (true_positives + false_positives))
        recall    = true_positives / max(1, (true_positives + false_negatives))

        return {
                'product': self.product,
                'precision': round(precision, 2),
                'recall':    round(recall, 2)
            }

    def analyze_sentiments(self):
        """
        Determine the sentiment (positive or negative) for each review inside the reviews.
        """
        for review in self.reviews:
            review.analyze_sentiments()
            
            for review_line in review.review_lines:
                for feature in self.top_extracted_features:
                    
                    if feature in review_line.extracted_features:
                        # Update the feature_sentiments based on the sentiment counts of the current feature
                        if feature not in self.feature_sentiments:
                            self.feature_sentiments[feature] = {'positive': 0, 'negative': 0}

                        sentiment = 'positive' if review_line.sentiment == '+' else 'negative'

                        # Update sentiment counts for the specified feature
                        self.feature_sentiments[feature][sentiment] += 1

    def evaluate_sentiments_analysis(self):
        """
        Evaluate the extracted sentiment analysis performance for each feature based on human-labelled sentiments
        on a per-line basis.

        Returns:
        - dict: Dictionary containing evaluation metrics for sentiments on a per-line basis.
        """  
        true_positives  = 0
        false_positives = 0
        false_negatives = 0

        for review in self.reviews:
            for review_line in review.review_lines:
                for feature, rank in review_line.features:
                    # Extract the sign from the rank to determine the sentiment
                    sentiment = rank[0]
                    
                    if not sentiment:
                        continue

                    # Get sentiments extracted from the current review line
                    if feature in self.top_extracted_features:
                        if sentiment == review_line.sentiment:
                            true_positives  += 1
                        else:
                            false_positives += 1
                    else:
                        false_negatives += 1   

        # Calculate precision and recall for all review lines
        precision = true_positives / max(1, (true_positives + false_positives))
        recall    = true_positives / max(1, (true_positives + false_negatives))

        return {
                'product': self.product,
                'precision': round(precision, 2),
                'recall':    round(recall, 2)
            }



class Review:
    def __init__(self, title, review_lines):
        """
        Represents a review, consisting of a title and a list of ReviewLine instances.
        """

        self.review_title = title.strip()
        self.review_lines = review_lines
        
        
    def __str__(self):
        review_lines_str = ', '.join(map(str, self.review_lines))
        return f'Review(title="{self.review_title}", review_lines=[{review_lines_str}])'
    
    def preprocess(self, spell_check=False, remove_stopwords=False, stemming=False, lemmatization=False):
        """
        Preprocesses the parsed review.

        Returns:
        - list: List of preprocessed ReviewLine instances.
        """
        for review_line in self.review_lines:
            review_line.preprocess(spell_check, remove_stopwords, stemming, lemmatization)

        return

    def analyze_sentiments(self):
        """
        Determine the sentiment (positive or negative) for each review line inside the reviews.

        """
        for review_line in self.review_lines:
            review_line.analyze_sentiments()



class ReviewLine:
    def __init__(self, sentence, features, notes):
        """
        Represents a line within a review, containing sentence, human labelled features and notes.
        """
        self.sentence = sentence
        self.features = features
        self.notes    = notes
        
        self.NOUN_TAGS  = ['NN','NNS','NNP','NNPS']
        
        self.ADVERB_ADJECTIVE_TAGS = ['JJ', 'JJR', 'JJS']
        
        self.words = [] # The words and their POS in sentence

        self.extracted_features  = [] # The nouns in sentence
        
        self.sentiment = ''
        
    def __str__(self):
        return f"ReviewLine(features={self.features},\
                            notes={self.notes}, sentence='{self.sentence}', nouns='{self.extracted_features}')"

    def preprocess(self, spell_check=False, remove_stopwords=False, stemming=False, lemmatization=False):
        """
        Preprocesses review lines by performing the following steps:
        1. Converting the sentence to lowercase
        2. Optionally corrects spelling using TextBlob.
        3. Tokenizes the sentence using TextBlob.
        4. Optionally removes stopwords.
        5. Optionally performs stemming.
        6. Optionally performs lemmatization.
        7. Extracts noun phrases from the preprocessed sentence.
        """

        # 1. Converting the sentence to lowercase
        sentence = self.sentence.lower()
        sentence = re.sub(r'[^A-Za-z0-9\-\s\.\']+', '', sentence)

        sentence = TextBlob(sentence)

        # 2. Spell checker if requested
        if spell_check:
            sentence = sentence.correct()

        # 3. Tokenize the sentence using TextBlob
        self.words = [(word, pos) for word, pos in sentence.tags if len(word) > 1]
        
        # 4. Remove stopwords if requested
        if remove_stopwords:
            stop_words = set(stopwords.words('english'))
            self.words = [(word, pos) for word, pos in self.words if word not in stop_words]

        # 5. Stemming if requested
        if stemming:   
            stemmer = SnowballStemmer('english')
            self.words = [(stemmer.stem(word), pos) for word, pos in self.words]

        # 6. Lemmatization if requested
        if lemmatization:
            lemmatizer = WordNetLemmatizer()
            self.words = [(lemmatizer.lemmatize(word), pos) for word, pos in self.words]
            
            # Lemmatize noun phrases separately
            sentence.noun_phrases = [Word(noun_phrase).lemmatize() for noun_phrase in sentence.noun_phrases]


        # 7. Extract noun phrases based on specific tags
        nouns = [word for word, pos in self.words if pos in self.NOUN_TAGS]

        # Add noun phrases from TextBlob
        self.extracted_features = sentence.noun_phrases + nouns

        return

    def analyze_sentiments(self):
        """
        Determine the sentiment (positive or negative) for each review line.

        """
        sid = SentimentIntensityAnalyzer()

        sentiment_score = sid.polarity_scores(self.sentence)['compound']

        # Update sentiment counts based on individual opinion sentiments
        if sentiment_score > 0:
            sentiment = '+'
        elif sentiment_score < 0:
            sentiment = '-'
        else:
            sentiment = ''

        self.sentiment = sentiment

    def _get_adjacent_adjectives(self, feature):
        """
        Retrieve the adjectives that precede and follow the given noun in the sentence.

        Parameters:
        - feature (str): The target feature for which to find the adjacent adjectives.

        Returns:
        - list: A list containing the adjectives that precede and follow the noun.
        """

        adjacent_adjectives = []

        noun_words = tuple(feature.split())

        # Preprocess the sentence to identify occurrences of multi-word nouns
        noun_positions = [i for i, (word, pos) in enumerate(self.words) if tuple(word for word, _ in self.words[i:i + len(noun_words)]) == noun_words]

        for noun_position in noun_positions:
            preceding_index = noun_position - 1 if noun_position > 0 else None
            following_index = noun_position + len(noun_words) if noun_position < len(self.words) - len(noun_words) else None

            if preceding_index is not None:
                if self.words[preceding_index][1] in self.ADVERB_ADJECTIVE_TAGS:
                    adjacent_adjectives.append(self.words[preceding_index][0])

            if following_index is not None:
                if self.words[following_index][1] in self.ADVERB_ADJECTIVE_TAGS:
                    adjacent_adjectives.append(self.words[following_index][0])

        return adjacent_adjectives



class ReviewsParser:
    def __init__(self, reviews_file):
        """
        Parses reviews from a file and extracts relevant information.
        """
        self.reviews_file = reviews_file
        
        self._NOTES  = ['u', 'p', 's', 'cc', 'cs', 'v', 'a']
        
        self.reviews = []

    def _read_file(self):
        """
        Reads the content of the reviews file and returns the lines as a list.
        """
        file_lines = []

        try:
            with open(self.reviews_file, 'r', encoding='utf-8', errors='ignore') as f:
                file_lines = f.readlines()
        except UnicodeDecodeError as e:
            print(f"Error decoding file {self.reviews_file}: {e}")
        
        return file_lines
    
    def _create_review(self, title, review_lines):
        """
        Creates a Review instance with the given title and list of ReviewLine instances.
        """
        return Review(title, review_lines)
    
    def _create_review_line(self, sentence, features, notes):
        """
        Creates a ReviewLine instance with the given sentence, notes, and features.
        """
        return ReviewLine(sentence, features, notes)

    def parse(self):
        """
        Parses reviews from the file and returns a list of Review instances.

        Returns:
        - list: List of Review instances.
        """
        file_lines = self._read_file()

        review_title = ''
        review_lines = []
        
        for line in file_lines:
            sentence = ''
            features = []
            notes    = []

            if line.startswith("*"): # Skip Comments
                continue

            if line.startswith("[t]"):  # New review
                # Create object for the previous review - if exists -
                if review_lines:
                    review = Review(review_title, review_lines)
                    self.reviews.append(review)

                review_title = line[3:]
                review_lines = []
            elif line.startswith("##"):  # Review sentence
                sentence = line[2:]
            elif not line.startswith("##") and "##" in line:  # Review sentence with feature and notes
                line_parts = line.split("##")
                line_part1 = line_parts[0].strip() # Has the features
                line_part2 = ''.join(line_parts[1:]) # Has the sentence
                
                sentence = line_part2.strip()

                # Get the list of features using regular expression
                features_list = re.findall(r'[\w-]+(?:\[[^\]]*\])?(?:\s+[\w-]+(?:\[[^\]]*\])?)*', line_part1)
            
                for feature in features_list:
                        
                    # To handle nested features
                    feature_info = re.findall(r'([\w\s-]+)\[([+\-]?\d+)\]', feature)

                    # Add the feature and its rank
                    for f_name, f_rank in feature_info:
                        
                        lemmatizer = WordNetLemmatizer()
                        f_name = f_name.lower()
                        f_name = ' '.join([lemmatizer.lemmatize(word) for word in f_name.split(' ')])

                        features.append((f_name, f_rank))
                    
                    # If feature has notes, add it to the review line
                    if feature in self._NOTES:
                        notes.append(feature)

            # Create ReviewLine instance if there is sentence
            if sentence:
                review_line = self._create_review_line(sentence, features, notes)
                review_lines.append(review_line)

        # Create Review instance if there are review lines
        if review_lines:
            review = self._create_review(review_title, review_lines)
            self.reviews.append(review)

        return
    
from prettytable import PrettyTable

def show_evaluation_results(evaluation_results):
    """
    Display the evaluation results with an average row for precision and recall.

    Parameters:
    - evaluation_results (list): List containing evaluation results for each product.
    """
    table = PrettyTable()
    table.field_names = ["Product", "Precision", "Recall"]

    # Initialize variables to calculate averages
    total_precision = 0
    total_recall    = 0

    products_count  = len(evaluation_results)

    for evaluation_result in evaluation_results:
        precision = evaluation_result['precision']
        recall    = evaluation_result['recall']
        
        # Add to total for averages
        total_precision += precision
        total_recall    += recall

        row_data = [
            evaluation_result['product'],
            "{:.2f}".format(precision),
            "{:.2f}".format(recall)
        ]

        table.add_row(row_data)
    
    table.add_row(["", "-----------", "--------"])  # Empty row for separation

    # Calculate and add average precision and recall
    avg_precision = total_precision / products_count if products_count else 0
    avg_recall    = total_recall / products_count if products_count else 0

    table.add_row(["Average", "{:.2f}".format(avg_precision), "{:.2f}".format(avg_recall)])

    # Set alignment for each column
    for column in table.field_names:
        alignment = "r" if column != "Product" else "l"
        table.align[column] = alignment

    print(table)

In [6]:
parsed_reviews = []
        
reviews_reader = ReviewsReader('Data')
reviews_files  = reviews_reader.get_reviews_files()

for product_name, file_path in reviews_files.items():

    print('Parsing the reviews of', product_name, '...')

    start_time = time.time()

    reviews_parser = ReviewsParser(file_path)
    reviews_parser.parse()

    parsed_reviews.append(Reviews(product_name, reviews_parser.reviews))

    end_time = time.time()

    print("Parsed {} reviews in {:.2f} seconds.\n".format(len(reviews_parser.reviews), end_time - start_time))

Parsing the reviews of Computer ...
Parsed 1 reviews in 1.31 seconds.

Parsing the reviews of Router ...
Parsed 1 reviews in 0.01 seconds.

Parsing the reviews of Speaker ...
Parsed 1 reviews in 0.01 seconds.



## Step 2: Data Preprocessing

In [7]:
processed_reviews = []

for product_reviews in parsed_reviews:

    product = product_reviews.product
    reviews = product_reviews.reviews

    print('Processing the reviews of', product, '...')

    start_time = time.time()

    product_reviews.preprocess(spell_check=True, remove_stopwords=True, stemming=False, lemmatization=True)

    processed_reviews.append(Reviews(product, reviews))

    end_time = time.time()

    print("Preprocessed {} reviews in {:.2f} seconds.\n".format(len(reviews), end_time - start_time))

Processing the reviews of Computer ...
Preprocessed 1 reviews in 49.79 seconds.

Processing the reviews of Router ...
Preprocessed 1 reviews in 111.01 seconds.

Processing the reviews of Speaker ...
Preprocessed 1 reviews in 58.57 seconds.



## Step 3: Product Features Extraction

In [8]:
extracted_reviews = []

for product_reviews in processed_reviews:

    product = product_reviews.product
    reviews = product_reviews.reviews

    print('Extracting the features of', product, '...')

    start_time = time.time()

    product_reviews.get_extracted_features()
    
    extracted_reviews.append(product_reviews)
    
    end_time = time.time()
        
    print("Extracted {} features in {:.2f} seconds.\n".
              format(len(product_reviews.extracted_features), end_time - start_time))

Extracting the features of Computer ...
Extracted 531 features in 0.00 seconds.

Extracting the features of Router ...
Extracted 879 features in 0.00 seconds.

Extracting the features of Speaker ...
Extracted 689 features in 0.00 seconds.



### Product Features Extraction Evaluation

In [9]:
evaluation_results = [product_reviews.evaluate_features_extraction() for product_reviews in extracted_reviews]

show_evaluation_results(evaluation_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.06 |     0.46 |
| Router   |        0.04 |     0.49 |
| Speaker  |        0.06 |     0.53 |
|          | ----------- | -------- |
| Average  |        0.05 |     0.49 |
+----------+-------------+----------+


### Product Features Extraction (Association Mining Rule)

In [10]:
association_mined_reviews = []

for product_reviews in extracted_reviews:

    product = product_reviews.product
    reviews = product_reviews.reviews

    print('Extracting the features of', product, '...')

    start_time = time.time()

    product_reviews.extract_top_features()
    
    association_mined_reviews.append(product_reviews)
    
    end_time = time.time()
        
    print("Extracted top {} out of {} features in {:.2f} seconds.\n".
              format(len(product_reviews.top_extracted_features), len(product_reviews.extracted_features), end_time - start_time))

Extracting the features of Computer ...
Extracted top 33 out of 531 features in 0.03 seconds.

Extracting the features of Router ...
Extracted top 39 out of 879 features in 0.05 seconds.

Extracting the features of Speaker ...
Extracted top 26 out of 689 features in 0.03 seconds.



### Product Features Extraction Evaluation (Association Mining Rule)

In [11]:
evaluation_results = [product_reviews.evaluate_features_extraction() for product_reviews in association_mined_reviews]

show_evaluation_results(evaluation_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.36 |     0.08 |
| Router   |        0.21 |     0.06 |
| Speaker  |        0.31 |     0.05 |
|          | ----------- | -------- |
| Average  |        0.29 |     0.06 |
+----------+-------------+----------+


### Product Features Extraction (Pruning)

In [12]:
pruned_reviews = []

for product_reviews in processed_reviews:

    product = product_reviews.product
    reviews = product_reviews.reviews
    
    print('Extracting the features of', product, '...')

    start_time = time.time()
    
    product_reviews.prune_top_features()
    
    pruned_reviews.append(product_reviews)

    end_time = time.time()
    
    print("Remained top {} features after pruning in {:.2f} seconds.\n".
              format(len(product_reviews.top_extracted_features), end_time - start_time))


Extracting the features of Computer ...
Remained top 33 features after pruning in 0.00 seconds.

Extracting the features of Router ...
Remained top 38 features after pruning in 0.00 seconds.

Extracting the features of Speaker ...
Remained top 26 features after pruning in 0.00 seconds.



### Product Features Extraction Evaluation (Pruning)

In [13]:
evaluation_results = [product_reviews.evaluate_features_extraction() for product_reviews in pruned_reviews]

show_evaluation_results(evaluation_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.36 |     0.08 |
| Router   |        0.21 |     0.06 |
| Speaker  |        0.31 |     0.05 |
|          | ----------- | -------- |
| Average  |        0.29 |     0.06 |
+----------+-------------+----------+


### Product Features Extraction (Infrequent Features)

In [14]:
final_reviews = []

for product_reviews in pruned_reviews:

    product = product_reviews.product
    reviews = product_reviews.reviews
    
    print('Extracting the infrequent features of', product, '...')

    start_time = time.time()
    
    product_reviews.get_infrequent_features()
    
    product_reviews.top_extracted_features += product_reviews.infrequent_features
    
    final_reviews.append(product_reviews)

    end_time = time.time()
    
    print("Adding {} infrequent features in {:.2f} seconds.\n".
              format(len(product_reviews.infrequent_features), end_time - start_time))

Extracting the infrequent features of Computer ...
Adding 162 infrequent features in 0.01 seconds.

Extracting the infrequent features of Router ...
Adding 179 infrequent features in 0.01 seconds.

Extracting the infrequent features of Speaker ...
Adding 201 infrequent features in 0.01 seconds.



### Product Features Extraction Evaluation (With Infrequenct Features)

In [15]:
evaluation_results = [product_reviews.evaluate_features_extraction() for product_reviews in final_reviews]

show_evaluation_results(evaluation_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.19 |     0.24 |
| Router   |        0.13 |     0.22 |
| Speaker  |        0.20 |     0.27 |
|          | ----------- | -------- |
| Average  |        0.17 |     0.24 |
+----------+-------------+----------+


## Step 4: Sentiment Analysis

In [16]:
for product_reviews in final_reviews:

    product = product_reviews.product
    reviews = product_reviews.reviews
   
    product_reviews.analyze_sentiments()
    
    print(f"Product: {product}")
    for feature, sentiments in product_reviews.feature_sentiments.items():
        positive_sentiments = sentiments['positive']
        negative_sentiments = sentiments['negative']
        print(f"  Feature: {feature}")
        print(f"    Positive: {positive_sentiments}")
        print(f"    Negative: {negative_sentiments}")

Product: Computer
  Feature: monitor
    Positive: 45
    Negative: 59
  Feature: inch
    Positive: 2
    Negative: 2
  Feature: purchase
    Positive: 8
    Negative: 3
  Feature: time
    Positive: 10
    Negative: 18
  Feature: item
    Positive: 5
    Negative: 4
  Feature: picture
    Positive: 7
    Negative: 4
  Feature: quality
    Positive: 9
    Negative: 4
  Feature: screen
    Positive: 15
    Negative: 10
  Feature: box
    Positive: 3
    Negative: 1
  Feature: day
    Positive: 1
    Negative: 8
  Feature: display
    Positive: 11
    Negative: 6
  Feature: brand
    Positive: 5
    Negative: 3
  Feature: gun
    Positive: 1
    Negative: 1
  Feature: color
    Positive: 16
    Negative: 2
  Feature: ebook
    Positive: 8
    Negative: 7
  Feature: -rib-
    Positive: 10
    Negative: 18
  Feature: computer
    Positive: 11
    Negative: 21
  Feature: -erb-
    Positive: 9
    Negative: 18
  Feature: well
    Positive: 3
    Negative: 2
  Feature: mind
    Positive: 4
 

Product: Router
  Feature: customer
    Positive: 10
    Negative: 6
  Feature: service
    Positive: 7
    Negative: 9
  Feature: lawton
    Positive: 13
    Negative: 16
  Feature: internet
    Positive: 15
    Negative: 11
  Feature: step
    Positive: 5
    Negative: 3
  Feature: problem
    Positive: 9
    Negative: 38
  Feature: product
    Positive: 21
    Negative: 20
  Feature: cable
    Positive: 4
    Negative: 5
  Feature: tireless
    Positive: 15
    Negative: 21
  Feature: frustration
    Positive: 1
    Negative: 4
  Feature: month
    Positive: 7
    Negative: 12
  Feature: route
    Positive: 5
    Negative: 5
  Feature: replacement
    Positive: 2
    Negative: 10
  Feature: nether
    Positive: 6
    Negative: 14
  Feature: knowledge
    Positive: 2
    Negative: 1
  Feature: average
    Positive: 1
    Negative: 0
  Feature: technical knowledge
    Positive: 1
    Negative: 0
  Feature: speed
    Positive: 10
    Negative: 12
  Feature: connection
    Positive: 13


Product: Speaker
  Feature: speaker
    Positive: 105
    Negative: 71
  Feature: outlet
    Positive: 3
    Negative: 1
  Feature: battery
    Positive: 8
    Negative: 3
  Feature: something
    Positive: 3
    Negative: 8
  Feature: power
    Positive: 4
    Negative: 5
  Feature: -rib-
    Positive: 27
    Negative: 14
  Feature: review
    Positive: 6
    Negative: 9
  Feature: sound
    Positive: 66
    Negative: 23
  Feature: quality
    Positive: 32
    Negative: 10
  Feature: sound quality
    Positive: 8
    Negative: 6
  Feature: bass
    Positive: 18
    Negative: 9
  Feature: -erb-
    Positive: 19
    Negative: 14
  Feature: feature
    Positive: 4
    Negative: 0
  Feature: line
    Positive: 7
    Negative: 3
  Feature: downside
    Positive: 0
    Negative: 1
  Feature: deal
    Positive: 1
    Negative: 1
  Feature: thing
    Positive: 8
    Negative: 14
  Feature: dock
    Positive: 2
    Negative: 5
  Feature: case
    Positive: 1
    Negative: 2
  Feature: music
  

## Step 5: Evaluation

### Sentiment Analysis Evaluation

In [17]:
evaluation_results = [product_reviews.evaluate_sentiments_analysis() for product_reviews in final_reviews]

show_evaluation_results(evaluation_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.76 |     0.43 |
| Router   |        0.66 |     0.24 |
| Speaker  |        0.79 |     0.58 |
|          | ----------- | -------- |
| Average  |        0.74 |     0.42 |
+----------+-------------+----------+


## Alternative Approach: TF-IDF and Naive Bayes for Sentiment Analysis

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score

# Initialize lists to store evaluation results
evaluation_results  = []
naive_bayes_results = []

# Loop through each product's reviews
for product_reviews in final_reviews:
    product = product_reviews.product
    data = []

    # Preparing data with sentence, feature, and sentiment
    for review in product_reviews.reviews:
        for review_line in review.review_lines:
            if review_line.features:
                for feature, rank in review_line.features:
                    sentiment = 'positive' if int(rank) > 0 else 'negative'
                    data.append({'sentence': ' '.join(word_tag[0] for word_tag in review_line.words), 
                                 'feature': feature, 
                                 'sentiment': sentiment})

    df = pd.DataFrame(data)
    
    # Setting up TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.03, stop_words='english', ngram_range=(1,3))
    X = tfidf_vectorizer.fit_transform(df['sentence'])
    y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

    # Identifying top TF-IDF features and labelled features
    top_tfidf_features = set(tfidf_vectorizer.get_feature_names_out())
    labelled_features  = set(df['feature'].unique())

    # Calculating precision and recall for TF-IDF features
    common_features = top_tfidf_features.intersection(labelled_features)
    false_positives = top_tfidf_features.difference(labelled_features)
    false_negatives = labelled_features.difference(top_tfidf_features)
    tfidf_precision = len(common_features) / (len(common_features) + len(false_positives)) if common_features else 0
    tfidf_recall    = len(common_features) / (len(common_features) + len(false_negatives)) if common_features else 0

    # Storing TF-IDF feature comparison results
    evaluation_results.append({
        'product': product,
        'precision': tfidf_precision,
        'recall': tfidf_recall
    })
    
    # Splitting the data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Grid Search for optimal alpha in Naive Bayes
    alpha_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(MultinomialNB(), param_grid=alpha_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Best classifier after grid search
    best_clf = grid_search.best_estimator_

    # Making predictions with the best classifier
    y_pred = best_clf.predict(X_test)

    # Calculating precision and recall for Naive Bayes classifier
    precision = precision_score(y_test, y_pred)
    recall    = recall_score(y_test, y_pred)

    # Storing results for Naive Bayes classifier
    naive_bayes_results.append({
        'product': product,
        'precision': precision,
        'recall': recall,
    })

### TF-IDF Features Extraction Evaluation

In [19]:
show_evaluation_results(evaluation_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.41 |     0.13 |
| Router   |        0.47 |     0.15 |
| Speaker  |        0.47 |     0.13 |
|          | ----------- | -------- |
| Average  |        0.45 |     0.14 |
+----------+-------------+----------+


### Naive Bayes Classifier for Sentiment Analysis Evaluation

In [20]:
show_evaluation_results(naive_bayes_results)

+----------+-------------+----------+
| Product  |   Precision |   Recall |
+----------+-------------+----------+
| Computer |        0.72 |     1.00 |
| Router   |        0.71 |     0.81 |
| Speaker  |        0.89 |     1.00 |
|          | ----------- | -------- |
| Average  |        0.77 |     0.94 |
+----------+-------------+----------+
