# Text Summarization of Amazon Reviews Data

In [214]:
import numpy as np 
import pandas as pd
import json
import time
import re
import seaborn as sns
import random
import html

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

import textacy

# Text cleaning
import spacy 
nlp = spacy.load('en_core_web_sm')

# Cosine Similarity 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupShuffleSplit

# Text summarization 
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import textdistance
from sklearn.cluster import KMeans
from sklearn import metrics

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

## Load saved reviews and metadata csv files

In [215]:
merge_df = pd.read_csv('../data/merge_df.csv', low_memory = False, index_col = False)
merge_df = merge_df.drop(columns = {'Unnamed: 0'})
merge_df.columns

Index(['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'year',
       'main_category', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'bought_together', 'subtitle', 'author'],
      dtype='object')

In [216]:
# Remove null values in 'text' column 
merge_df = merge_df.dropna(subset=['text'])

In [217]:
print(f"Unique store count: {merge_df['parent_asin'].nunique()}")
print('-----------')
merge_df['store'].value_counts().head(15)

Unique store count: 245290
-----------


store
Coleman                   11886
Fitbit                     6787
CamelBak                   4897
Alvada                     4339
Franklin Sports            4142
BalanceFrom                3563
Amazon Basics              3516
WILSON                     3178
WinCraft                   3094
CAP Barbell                3026
Outdoorsman Lab            2971
FOCO                       2910
Schwinn                    2877
Contigo                    2867
Sunny Health & Fitness     2794
Name: count, dtype: int64

In [218]:
merge_df['store'].value_counts().head(35)

store
Coleman                   11886
Fitbit                     6787
CamelBak                   4897
Alvada                     4339
Franklin Sports            4142
BalanceFrom                3563
Amazon Basics              3516
WILSON                     3178
WinCraft                   3094
CAP Barbell                3026
Outdoorsman Lab            2971
FOCO                       2910
Schwinn                    2877
Contigo                    2867
Sunny Health & Fitness     2794
SHIMANO                    2727
adidas                     2611
Gaiam                      2563
Rico Industries            2452
Nalgene                    2407
Speedo                     2337
Razor                      2304
Sportneer                  2146
Yes4All                    2103
Northwest                  2095
BELL                       2075
New Era                    1968
OPAMOO                     1905
Siskiyou Sports            1872
Intex                      1835
KastKing                   1824
Un

## Filtering the dataframe to a specific brand or product

In [219]:
# Assigning a specific store name
store = 'Contigo'

filtered_df = merge_df.loc[merge_df['store'] == store].copy()
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.columns

Index(['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'year',
       'main_category', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'bought_together', 'subtitle', 'author'],
      dtype='object')

In [220]:
def generate_brand_report(df, store):
    """
    Generates a report for a specified store, including counts of unique parent and product IDs, 
    and statistics on reviews and ratings.
    """
    # Filter the DataFrame by the specified store
    filtered_df = df[df['store'] == store]
    print(f"Store: {store}")
    print('-----')

    # Count number of reviews
    complete_reviews_count = filtered_df['rating_number'].unique().sum()
    print(f"Number of reviews in complete Amazon dataset: {complete_reviews_count}")
    print(f"Number of reviews in df: {len(filtered_df)}")  # Some customers only include a title and rating to their reviews
    
    # Calculate the number of unique parent_asin values
    parent_asin_count = filtered_df['parent_asin'].nunique()
    print(f"Count of unique parent product IDs: {parent_asin_count}")
    
    # Count the number of unique asin values
    asin_count = filtered_df['asin'].nunique()
    print(f"Count of unique product ID's: {asin_count}")
    
    # Calculate the number of reviews and average review rating for each asin
    asin_reviews_ratings = filtered_df.groupby('parent_asin').agg({'average_rating': 'mean', 'rating_number': 'mean'}).reset_index()

    # Create a composite score for each product
    asin_reviews_ratings['composite_score'] = asin_reviews_ratings['average_rating'] * asin_reviews_ratings['rating_number']

    
    # Find the top and bottom ten products with the best and worst rating
    top_ten_best = asin_reviews_ratings.nlargest(10, 'composite_score')
    bottom_ten_worst = asin_reviews_ratings.nsmallest(10, 'composite_score')
    print('-----')
    print(f"Top ten products: \n {top_ten_best}")
    print('-----')
    print(f"Bottom ten products: \n {bottom_ten_worst}")

# Apply function
generate_brand_report(filtered_df, store)

Store: Contigo
-----
Number of reviews in complete Amazon dataset: 192942
Number of reviews in df: 2867
Count of unique parent product IDs: 175
Count of unique product ID's: 293
-----
Top ten products: 
     parent_asin  average_rating  rating_number  composite_score
158  B08XXQSHH6             4.8        30463.0         146222.4
169  B0BTHXPZWZ             4.8        12267.0          58881.6
173  B0C4VWCZRV             4.5        12678.0          57051.0
165  B0BT9JQ4LJ             4.7        10820.0          50854.0
121  B07GBFYHNM             4.6         9365.0          43079.0
164  B0BT9G6ZJC             4.7         7715.0          36260.5
142  B07PDHRYSB             4.7         7354.0          34563.8
162  B09R9X84HF             4.7         6619.0          31109.3
65   B00YYBBYN8             4.6         5090.0          23414.0
155  B08PWGXDFM             4.8         4754.0          22819.2
-----
Bottom ten products: 
     parent_asin  average_rating  rating_number  composite_score

In [221]:
def generate_random_examples(df, column, interest):
    """
    Generate three examples of random records with the specified interest. 
    
    Parameters: 
    df: DataFrame where the search will occur. 
    column: Column name to search within. 
    interest: Value to search for in the specified column. 
    """
    # Print average rating for parent_asin 
    interest_average_rating = df[df[column] == interest]['average_rating'].unique()
    print(f'Average rating: {interest_average_rating[0]}\n')
    print(f'Categories: {df[df[column] == interest]["categories"].iloc[0]}\n')
    print(f'Details: {df[df[column] == interest]["details"].iloc[0]}\n')
    
    # Filter the Datadf[df[column] == interest]Frame based on the interest
    filtered_df = df[df[column] == interest]

    # Set number of examples 
    num_of_ex = 3 
    
    # Check if there are enough records to sample
    if len(filtered_df) < num_of_ex:
        print(f"Not enough records found for interest '{interest}'. Found {len(filtered_df)} records.")
        return
    
    # Randomly select three records
    random_examples = filtered_df.sample(n=num_of_ex)
    
    for i, row in random_examples.iterrows():
        print('-----')
        print(f'Rating: {row["rating"]}\n')
        print(f'Title: {row["title"]}\n')
        print(f'Text: {row["text"]}\n')


In [222]:
# Apply random example generator for best products 
df = filtered_df
column = 'parent_asin'
interest = 'B08XXQSHH6'
generate_random_examples(df, column, interest)

Average rating: 4.8

Categories: ['Sports & Outdoors', 'Sports & Outdoor Recreation Accessories', 'Sports Water Bottles']

Details: {'Brand': 'Contigo', 'Capacity': '1.5 Pounds', 'Color': 'Silver and Blue', 'Recommended Uses For Product': 'Home', 'Age Range (Description)': 'Adult', 'Product Dimensions': '2.75 x 2.75 x 11 inches', 'Model Name': 'Auto Seal Chill', 'Item Weight': '8.6 ounces', 'Theme': 'Sport', 'Material': 'Aluminum', 'Number of Items': '1', 'Included Components': 'Bottle^Lid', 'Product Care Instructions': 'Hand Wash Only', 'Cap Type': 'Loop Cap', 'Manufacturer': 'Contigo', 'Item model number': '2001714', 'Best Sellers Rank': {'Sports & Outdoors': 30233, 'Water Bottles': 801}, 'Is Discontinued By Manufacturer': 'No', 'Date First Available': 'December 24, 2016'}

-----
Rating: 5

Title: Great water bottle

Text: This turned out to be much better than I expected.  What I really like is that it is easy to drink out of but doesnt leak at all when I set it down on its side.<br

## Preprocess text column

### 1. Remove the noise from the text data.

In [223]:
def regex_clean(text):
    # convert html escapes like &amp; to characters.
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # Remove all punctuation except periods and apostrophes
    text = re.sub(r"[^\w\s'.]", '', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

filtered_df['text'] = filtered_df['text'].apply(regex_clean)
filtered_df[['rating', 'text']].sample(2)

Unnamed: 0,rating,text
1326,5,Will get another.
137,5,Doesn't leak which was the reason I bought it.


### 2. Lemmatize and tokenize the text. 

In [224]:
text = "[[VIDEOID:c87e962bc893a948856b0f1b285ce6cc]] I wanted to love this bc I previously bought a matching turquoise teapot, but the loose lid (defective or design flaw? Idk) on the cups is driving me batty. I’m disabled so my gait is not great to begin with & the lid just bangs non-stop while I walk from my kitchen to wherever I’m going with my tea. It’s incredibly annoying.  I had hoped it was just a one-off so I purchased it in another color & sadly it has the same exact problem.  They could fix the problem by adding a rubber gasket or flange to the lid imo & I even thought of doing so myself until I accidentally knocked the cup over due to a design flaw that has a small base on the cup.  I like the lid bc I run a fan continuously & I live with 2 service dogs so I like to keep my drinks covered beyond just the brew times so I really hope they update this cup bc it does keep the tea warm & the size is perfect for a 2 cup brew.<br /><br />I wish they would fix the obvious design flaw of the base that is too small for a cup that tall & wide bc it is also very easy to accidentally knock over as I mentioned above.  I did it several times before I yet boxed them both back up.<br /><br />Overall I like the product as it seems as it was made with quality in mind, but missed the mark in the design phase.  The colors are pretty & the cups I purchased keep the tea warm, but the lids are either defective or a design flaw.  I really hope the company fixes these issue bc the colors match teapots I already own. Lol.  I intend to see if the company has another design that works better for me in the meantime."
print(text)

[[VIDEOID:c87e962bc893a948856b0f1b285ce6cc]] I wanted to love this bc I previously bought a matching turquoise teapot, but the loose lid (defective or design flaw? Idk) on the cups is driving me batty. I’m disabled so my gait is not great to begin with & the lid just bangs non-stop while I walk from my kitchen to wherever I’m going with my tea. It’s incredibly annoying.  I had hoped it was just a one-off so I purchased it in another color & sadly it has the same exact problem.  They could fix the problem by adding a rubber gasket or flange to the lid imo & I even thought of doing so myself until I accidentally knocked the cup over due to a design flaw that has a small base on the cup.  I like the lid bc I run a fan continuously & I live with 2 service dogs so I like to keep my drinks covered beyond just the brew times so I really hope they update this cup bc it does keep the tea warm & the size is perfect for a 2 cup brew.<br /><br />I wish they would fix the obvious design flaw of the

In [225]:
def extract_lemmas_with_pos(text, pos_to_keep=('ADJ', 'NOUN')):
    """Extract lemmas while keeping nouns and adjectives based on their position."""
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if token.pos_ in pos_to_keep]
    all_lemmas = " ".join(lemmas)
    return all_lemmas
    
# Extract lemmas with nouns and adjectives
extract_lemmas_with_pos(text)

'matching turquoise teapot loose lid defective design flaw cup batty disabled gait great lid non - kitchen tea annoying off color same exact problem problem rubber gasket flange lid cup design flaw small base cup lid fan service dog drink brew time cup tea warm size perfect cup obvious design flaw base small cup tall wide easy several time product quality mind mark design phase color pretty cup tea warm lid defective design flaw company issue color teapot lol company design meantime'

In [226]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
include_stopwords = {'would', 'I'}
stop_words |= include_stopwords
print('Original stopwords count:', len(stopwords))

def clean_data(doc):
    """
    Cleans and preprocesses a text document using spaCy.
    
    This function takes a text document as input, converts it to lowercase, 
    lemmatizes the words, removes non-alphabetic characters, and filters out stopwords. 
    The resulting cleaned text is returned as a single string. 
    """
    doc = doc.lower()
    doc = nlp(doc)
    # Lemmatize words 
    lemmas = [token.lemma_ for token in doc]
    # Removing non-alphabetic characters and stopwords
    tokens = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in stopwords]
    cleaned_doc = " ".join(tokens)
    
    return cleaned_doc
    
clean_data(text)

Original stopwords count: 301


'videoid want love bc previously buy matching turquoise teapot loose lid defective design flaw idk cup drive batty disable gait not great begin lid bang non stop walk kitchen tea incredibly annoying hope off purchase color sadly same exact problem fix problem add rubber gasket flange lid imo think accidentally knock cup due design flaw small base cup like lid bc run fan continuously live service dog like drink cover brew time hope update cup bc tea warm size perfect cup wish fix obvious design flaw base small cup tall wide bc very easy accidentally knock mention several time box like product quality mind miss mark design phase color pretty cup purchase tea warm lid defective design flaw hope company fix issue bc color match teapot own lol intend company design work well meantime'

In [227]:
# Clean the text
filtered_df['text'] = filtered_df['text'].apply(clean_data)
# Extract lemmas 
filtered_df['lemmas'] = filtered_df['text'].apply(extract_lemmas_with_pos)
filtered_df[['rating', 'text', 'lemmas']].sample(2)

Unnamed: 0,rating,text,lemmas
2077,5,contigo good not leak replace hot cold beverag...,leak hot cold beverage holder brand
413,5,love easy bottle fill water buy mom specifical...,easy bottle fill water mom mother day mom coup...


### Using TF-IDF 

Use TF-IDF vectorizer to transform the text into vectors based on the frequency of words in the text.

In [228]:
# Combine preprocessed sentences back into the original review structure 
combined_sentences = nlp_application['preprocessed_sentences'].apply(lambda sentences: [' '.join(sentence) for sentence in sentences])
nlp_application['combined_preprocessed'] = combined_sentences.apply(lambda x: ' '.join(x))

NameError: name 'nlp_application' is not defined

In [None]:
tfidfvect = TfidfVectorizer()
tfidf_text = tfidfvect.fit_transform(nlp_application['combined_preprocessed'])

In [None]:
tfidf_text

In [None]:
combined_sentences

In [None]:
# Parameter to specify number of summary sentences 
num_summary_sentence = 10

# Sum the TF-IDF values for each sentence
sent_sum = tfidf_text.sum(axis = 1)
important_sent = np.argsort(sent_sum, axis = 0)[::-1]

# Print three most import sentences in the order they appear in the article 
print("Most Important Sentences Based on TF-IDF:")
for i in range(0, len(nlp_application['combined_preprocessed'])):
    if i in important_sent[:num_summary_sentence]:
        print(nlp_application['combined_preprocessed'])

In [None]:
def summarize_with_tfidf(doc, num_summary_sentence = num_summary_sentence):
    """
    Apply the TF-IDF vectorization and then aggregate the value to a sentence level.
    Generate a score for each sentence as a sum of the TF-IDF values for each word in that sentence. 
    A sentence with a high score contains many important words as compared to other sentences in the column.
    """

    # Tokenize the text into sentences 
    sentences = []
    for text in doc:
        sentences.extend(sent_tokenize(text))

    # Compute TF-IDF for the sentences 
    tfidf = TfidfVectorizer()
    tfidf_text = tfidf.fit_transform(sentences)
    
    # Sort the sentences in descending order by the sum of TF-IDF values 
    sent_sum = tfidf_text.sum(axis = 1)
    important_sent = np.argsort(sent_sum, axis = 0)[::-1].flatten()
    
    # Collect the most important sentences 
    summary_sentences = [sentences[i] for i in important_sent[:num_summary_sentence]]

    return summary_sentences

In [None]:
num_summary_sentence = 10
summarize_with_tfidf(filtered_df['text'])

### Using LSA 
Latent semantic analysis (LSA) assumes that words that are close in meaning will occur in the same documents. Use package sumy to provide multiple summarization methods. 

In [None]:
LANGUAGE = 'english'
stemmer = Stemmer(LANGUAGE)

parser = PlaintextParser.from_string(filtered_df['text'], Tokenizer(LANGUAGE))
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

In [None]:
summary_sentence = tfidf_summary = 

## Extractive Summarization with Machine Learning

### 1. Create target labels 
The target label defines whether a particular post should be included in the summary. 

In [229]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2)
train_split, test_split = next(gss.split(filtered_df, groups=filtered_df['parent_asin']))
train_df = filtered_df.iloc[train_split]
test_df = filtered_df.iloc[test_split]
print('Number of threads for Training: ', train_df['parent_asin'].nunique())
print('Number of threads for Testing: ', test_df['parent_asin'].nunique())

Number of threads for Training:  140
Number of threads for Testing:  35


### Vectorize the sentences

In [230]:
# Convert text data into numerical form using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(filtered_df['text'])


# Convert the sparse matrix to a dense format 

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

### Sentence Ranking

In [None]:
def rank_sentences(filtered_df['text'], X):
    sentence_scores = tfidf_matrix.sum(axis=1).A1
    ranked_sentences = [sentences[i] for i in sentence_scores.argsort()[::-1]]
    return ranked_sentences

filtered_df['ranked_sentences'] = filtered_df.apply(lambda x: rank_sentences(x['sentences'], x['tfidf_matrix']), axis=1)

### K-means clustering
Use either methods to determine the optimal cluster: 
1. Elbow method:
    - Involves plotting the within-cluster sum of squares (WCSS) against the number of clusters.
    - The elbow point (where the WCSS starts to level off) shows the optimal number of clusters. Adding more clusters beyond that point doesn't significantly reduce WCSS.
2. Silhouette method:
    - Evaluates how well each point lies within its cluster.
    - The optimal number of clusters is the one with the highest average silhouette score. 

In [None]:
# Using the elbow method to find the optimal number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
# Plotting the elbow graph
plt.figure(figsize=(5,3))
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
km = KMeans(random_state=42)
visualizer = KElbowVisualizer(km, k=(1,11))
 
visualizer.fit(X_dense) 
visualizer.show()  

In [None]:
km = KMeans(random_state=42)
visualizer = SilhouetteVisualizer(km, k=(1,11))
 
visualizer.fit(X_dense) 
visualizer.show()  

In [None]:
k = 8  # Choose based on the elbow plot
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X_dense)

In [None]:
filtered_df['cluster'] = y_kmeans
print(filtered_df['cluster'].value_counts())

In [None]:
for i in range(k):
    print(f"Cluster {i}")
    print(filtered_df[filtered_df['cluster'] == i]['text'].head(5))
    print("\n")

### 