# Text Summarization of Amazon Reviews Data

In [155]:
import numpy as np 
import pandas as pd
import json
import time
import re
import seaborn as sns
import random
import html

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

import textacy

# Text cleaning
import spacy 
nlp = spacy.load('en_core_web_sm')

# Cosine Similarity 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupShuffleSplit

# Text summarization 
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from sumy.summarizers.lex_rank import LexRankSummarizer 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
import textdistance
from sklearn.cluster import KMeans
from sklearn import metrics

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.models.keyedvectors as word2vec
import gc
import string 
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load saved reviews and metadata csv files

In [3]:
merge_df = pd.read_csv('../data/merge_df.csv', low_memory = False, index_col = False)
merge_df = merge_df.drop(columns = {'Unnamed: 0'})
merge_df.columns

Index(['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'year',
       'main_category', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'bought_together', 'subtitle', 'author'],
      dtype='object')

In [4]:
# Remove null values in 'text' column 
merge_df = merge_df.dropna(subset=['text'])

## Preprocess text column

1. Remove the noise from the text data.
2. Lemmatize and tokenize the text. 

In [10]:
text = "I love my Autoseal Bottles. I do agree that they are &#34;sippy cups for grown ups&#34; but it prevents many MANY liquid spills. I drink a lot of water every day and this will keep my water cold for 8 hours easily - that includes putting ice cubes in the bottle at the start of the day - has even kept my cold water with ice cubes cold inside a car on an 80 degree Farenheit Los Angeles day."
print(text)

I love my Autoseal Bottles. I do agree that they are &#34;sippy cups for grown ups&#34; but it prevents many MANY liquid spills. I drink a lot of water every day and this will keep my water cold for 8 hours easily - that includes putting ice cubes in the bottle at the start of the day - has even kept my cold water with ice cubes cold inside a car on an 80 degree Farenheit Los Angeles day.


### 1. Remove noise with regular expression. 

In [123]:
def regex_clean(text):
    # convert html escapes like &amp; to characters.
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # Remove all punctuation except periods and apostrophes
    text = re.sub(r"[^\w\s'.]", '', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

regex_clean(text)

'I love my Autoseal Bottles. I do agree that they are sippy cups for grown ups but it prevents many MANY liquid spills. I drink a lot of water every day and this will keep my water cold for 8 hours easily that includes putting ice cubes in the bottle at the start of the day has even kept my cold water with ice cubes cold inside a car on an 80 degree Farenheit Los Angeles day.'

In [124]:
# Clean the text
merge_df['text'] = merge_df['text'].apply(regex_clean)
merge_df[['rating', 'text']].sample(2)

Unnamed: 0,rating,text
357738,5,LOVE this product We keep it in the car for pi...
814286,5,My wife and I are HUGE Rangers fan this tree t...


### 2. Lemmatize and tokenize the text

In [None]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
include_stopwords = {'would', 'I'}
stopwords |= include_stopwords
print('Original stopwords count:', len(stopwords))

def clean_data(text):
    """
    Cleans and preprocesses a text document using spaCy.
    
    This function takes a text document as input, converts it to lowercase, 
    lemmatizes the words, removes non-alphabetic characters, and filters out stopwords. 
    The resulting cleaned text is returned as a single string. 
    """
    #  convert to lowercase
    text = text.lower()
    # process the text with spaCy
    doc = nlp(text)
    # Lemmatize words 
    lemmas = [token.lemma_ for token in doc]
    tokens = [lemma for lemma in lemmas if lemma not in stopwords]
    # # Removing non-alphabetic characters and stopwords
    # tokens = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in stopwords]
    cleaned_text = " ".join(tokens)
    
    return cleaned_text
    
clean_data(text)

## Feature Engingeering
1. Identify positive and negative reviews based on rating.
- Ratings of <= 4 are positive. Ratings of 3 => are negative. 

In [57]:
merge_df['positive_rating'] = 0

# Classify records with rating higher than or equal to 4, positive (1)
merge_df.loc[merge_df['rating'] >= 4, 'positive_rating'] = 1

# Classify records with rating less than and equal to 3, negative (0)
merge_df.loc[merge_df['rating'] < 4 , 'positive_rating'] = 0

In [61]:
merge_df[['rating', 'text', 'positive_rating']].sample(2)

Unnamed: 0,rating,text,positive_rating
1830,5,I originally purchased a 3 pack of these and l...,1
653,2,Triangle screws Really Hubby undid the top thi...,0


## Filtering the dataframe to a specific brand or product

In [5]:
print(f"Unique store count: {merge_df['parent_asin'].nunique()}")
print('-----------')
merge_df['store'].value_counts().head(25)

Unique store count: 245290
-----------


store
Coleman                   11886
Fitbit                     6787
CamelBak                   4897
Alvada                     4339
Franklin Sports            4142
BalanceFrom                3563
Amazon Basics              3516
WILSON                     3178
WinCraft                   3094
CAP Barbell                3026
Outdoorsman Lab            2971
FOCO                       2910
Schwinn                    2877
Contigo                    2867
Sunny Health & Fitness     2794
SHIMANO                    2727
adidas                     2611
Gaiam                      2563
Rico Industries            2452
Nalgene                    2407
Speedo                     2337
Razor                      2304
Sportneer                  2146
Yes4All                    2103
Northwest                  2095
Name: count, dtype: int64

In [125]:
# Assigning a specific store name
store = 'Contigo'

filtered_df = merge_df.loc[merge_df['store'] == store].copy()
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.columns

Index(['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'year',
       'main_category', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'bought_together', 'subtitle', 'author'],
      dtype='object')

In [7]:
def generate_brand_report(df, store):
    """
    Generates a report for a specified store, including counts of unique parent and product IDs, 
    and statistics on reviews and ratings.
    """
    # Filter the DataFrame by the specified store
    filtered_df = df[df['store'] == store]
    print(f"Store: {store}")
    print('-----')

    # Count number of reviews
    complete_reviews_count = filtered_df['rating_number'].unique().sum()
    print(f"Number of reviews in complete Amazon dataset: {complete_reviews_count}")
    print(f"Number of reviews in df: {len(filtered_df)}")  # Some customers only include a title and rating to their reviews
    
    # Calculate the number of unique parent_asin values
    parent_asin_count = filtered_df['parent_asin'].nunique()
    print(f"Count of unique parent product IDs: {parent_asin_count}")
    
    # Count the number of unique asin values
    asin_count = filtered_df['asin'].nunique()
    print(f"Count of unique product ID's: {asin_count}")
    
    # Calculate the number of reviews and average review rating for each asin
    asin_reviews_ratings = filtered_df.groupby('parent_asin').agg({'average_rating': 'mean', 'rating_number': 'mean'}).reset_index()

    # Create a composite score for each product
    asin_reviews_ratings['composite_score'] = asin_reviews_ratings['average_rating'] * asin_reviews_ratings['rating_number']

    
    # Find the top and bottom ten products with the best and worst rating
    top_ten_best = asin_reviews_ratings.nlargest(10, 'composite_score')
    bottom_ten_worst = asin_reviews_ratings.nsmallest(10, 'composite_score')
    print('-----')
    print(f"Top ten products: \n {top_ten_best}")
    print('-----')
    print(f"Bottom ten products: \n {bottom_ten_worst}")

# Apply function
generate_brand_report(filtered_df, store)

Store: Contigo
-----
Number of reviews in complete Amazon dataset: 192942
Number of reviews in df: 2867
Count of unique parent product IDs: 175
Count of unique product ID's: 293
-----
Top ten products: 
     parent_asin  average_rating  rating_number  composite_score
158  B08XXQSHH6             4.8        30463.0         146222.4
169  B0BTHXPZWZ             4.8        12267.0          58881.6
173  B0C4VWCZRV             4.5        12678.0          57051.0
165  B0BT9JQ4LJ             4.7        10820.0          50854.0
121  B07GBFYHNM             4.6         9365.0          43079.0
164  B0BT9G6ZJC             4.7         7715.0          36260.5
142  B07PDHRYSB             4.7         7354.0          34563.8
162  B09R9X84HF             4.7         6619.0          31109.3
65   B00YYBBYN8             4.6         5090.0          23414.0
155  B08PWGXDFM             4.8         4754.0          22819.2
-----
Bottom ten products: 
     parent_asin  average_rating  rating_number  composite_score

In [8]:
def generate_random_examples(df, column, interest):
    """
    Generate three examples of random records with the specified interest. 
    
    Parameters: 
    df: DataFrame where the search will occur. 
    column: Column name to search within. 
    interest: Value to search for in the specified column. 
    """
    # Print average rating for parent_asin 
    interest_average_rating = df[df[column] == interest]['average_rating'].unique()
    print(f'Average rating: {interest_average_rating[0]}\n')
    print(f'Categories: {df[df[column] == interest]["categories"].iloc[0]}\n')
    print(f'Details: {df[df[column] == interest]["details"].iloc[0]}\n')
    
    # Filter the Datadf[df[column] == interest]Frame based on the interest
    filtered_df = df[df[column] == interest]

    # Set number of examples 
    num_of_ex = 3 
    
    # Check if there are enough records to sample
    if len(filtered_df) < num_of_ex:
        print(f"Not enough records found for interest '{interest}'. Found {len(filtered_df)} records.")
        return
    
    # Randomly select three records
    random_examples = filtered_df.sample(n=num_of_ex)
    
    for i, row in random_examples.iterrows():
        print('-----')
        print(f'Rating: {row["rating"]}\n')
        print(f'Title: {row["title"]}\n')
        print(f'Text: {row["text"]}\n')


In [9]:
# Apply random example generator for best products 
df = filtered_df
column = 'parent_asin'
interest = 'B08XXQSHH6'
generate_random_examples(df, column, interest)

Average rating: 4.8

Categories: ['Sports & Outdoors', 'Sports & Outdoor Recreation Accessories', 'Sports Water Bottles']

Details: {'Brand': 'Contigo', 'Capacity': '1.5 Pounds', 'Color': 'Silver and Blue', 'Recommended Uses For Product': 'Home', 'Age Range (Description)': 'Adult', 'Product Dimensions': '2.75 x 2.75 x 11 inches', 'Model Name': 'Auto Seal Chill', 'Item Weight': '8.6 ounces', 'Theme': 'Sport', 'Material': 'Aluminum', 'Number of Items': '1', 'Included Components': 'Bottle^Lid', 'Product Care Instructions': 'Hand Wash Only', 'Cap Type': 'Loop Cap', 'Manufacturer': 'Contigo', 'Item model number': '2001714', 'Best Sellers Rank': {'Sports & Outdoors': 30233, 'Water Bottles': 801}, 'Is Discontinued By Manufacturer': 'No', 'Date First Available': 'December 24, 2016'}

-----
Rating: 5

Title: Holds water

Text: Great

-----
Rating: 5

Title: Great water bottle

Text: This keeps water cold most of the day.  Keeps ice very well.  I like the vacuum seal much like there coffee mugs.

### Filtering to a specific product

In [126]:
product = 'B08XXQSHH6'

filtered_df = filtered_df.loc[filtered_df['parent_asin'] == product]
filtered_df.reset_index(drop=True, inplace=True)
len(filtered_df)

249

## Summarization Techniques

### Using TF-IDF 

Use TF-IDF vectorizer to transform the text into vectors based on the frequency of words in the text.

In [None]:
# Combine preprocessed sentences back into the original review structure 
combined_sentences = nlp_application['preprocessed_sentences'].apply(lambda sentences: [' '.join(sentence) for sentence in sentences])
nlp_application['combined_preprocessed'] = combined_sentences.apply(lambda x: ' '.join(x))

In [None]:
tfidfvect = TfidfVectorizer()
tfidf_text = tfidfvect.fit_transform(nlp_application['combined_preprocessed'])

In [None]:
tfidf_text

In [None]:
combined_sentences

In [None]:
# Parameter to specify number of summary sentences 
num_summary_sentence = 10

# Sum the TF-IDF values for each sentence
sent_sum = tfidf_text.sum(axis = 1)
important_sent = np.argsort(sent_sum, axis = 0)[::-1]

# Print three most import sentences in the order they appear in the article 
print("Most Important Sentences Based on TF-IDF:")
for i in range(0, len(nlp_application['combined_preprocessed'])):
    if i in important_sent[:num_summary_sentence]:
        print(nlp_application['combined_preprocessed'])

In [None]:
def summarize_with_tfidf(doc, num_summary_sentence = num_summary_sentence):
    """
    Apply the TF-IDF vectorization and then aggregate the value to a sentence level.
    Generate a score for each sentence as a sum of the TF-IDF values for each word in that sentence. 
    A sentence with a high score contains many important words as compared to other sentences in the column.
    """

    # Tokenize the text into sentences 
    sentences = []
    for text in doc:
        sentences.extend(sent_tokenize(text))

    # Compute TF-IDF for the sentences 
    tfidf = TfidfVectorizer()
    tfidf_text = tfidf.fit_transform(sentences)
    
    # Sort the sentences in descending order by the sum of TF-IDF values 
    sent_sum = tfidf_text.sum(axis = 1)
    important_sent = np.argsort(sent_sum, axis = 0)[::-1].flatten()
    
    # Collect the most important sentences 
    summary_sentences = [sentences[i] for i in important_sent[:num_summary_sentence]]

    return summary_sentences

In [None]:
num_summary_sentence = 10
summarize_with_tfidf(filtered_df['text'])

## Summarizing with Sumy package

In [127]:
# Create a list of reviews 
reviews = filtered_df['text'].tolist()
# Combine the list of reviews into one string
combined_text = ' '.join(reviews)

### LSA algorithm

Latent semantic analysis (LSA) assumes that words that are close in meaning will occur in the same documents. Use package sumy to provide multiple summarization methods. 

In [153]:
LANGUAGE = 'english'

def lsa_summarize(reviews, sentence_count=5):
    """
    Summarizes the given string of reviews by extracting important sentences.
    """
    parser = PlaintextParser.from_string(reviews, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LsaSummarizer(stemmer)
    
    # Generate the summary
    summary = summarizer(parser.document, sentence_count)
    
    return ' '.join(str(sentence) for sentence in summary)

lsa_summarize(combined_text)

"For me the most important feature of any travel bottlemug is a leak proof lid and on that front Contigo has nailed it again. We love these and have several Keep cool but leaks Handy way to get extra water into my body while I'm driving outside gardening at gkids sporting events. Since it seals itself automatically I dont have to worry about those colored sports drinks pouring out or leaking all over my car after the game. My only gripe is that the autoseal button had a minor cosmetic blemish so make sure you check your bottle upon receipt. It is perfectly designed and we love them I actually dumped most of our other water bottles out this week because these are so much better."

### Lex Rank

A graphical based text summarizer

In [151]:
def lexrank_summarize(reviews, sentence_count=5):
    """
    Summarizes the given string of reviews by extracting important sentences.
    """
    parser = PlaintextParser.from_string(reviews, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    
    # Generate the summary
    summary = summarizer(parser.document, sentence_count)
    
    return ' '.join(str(sentence) for sentence in summary)

lexrank_summarize(combined_text)

"I like that they have a cap over the piece that you drink from but even when that is open it doesn't leak. I used to have a plastic contigo with the rubber spout you sucked from so I wasn't sure if I would like having to push a button and turn it upside down but its a great water bottle keeps my water cold for longer periods of time I can take it on the beach and it stays cold for a couple hours. Just an FYI in case you are interested the little leaflet that was inside the bottle when I got it which I actually didn't realize was in there and ended up washing along with the inside of the bottle before the first use....oops states the following paraphrased It is NOT for use with hot contents Do not use with carbonated or pulpy beverages Do not microwave duh...it's metal or freeze interesting If you wash the lid by hand you're supposed to soak it in hotsoapy water for 10 minutes Hand wash the body it should not be put in the dishwasher and you might want to get a bottle brush if you don'

### Luhn Summarizer

It scores sentences based on frequency of the most important words. 

In [156]:
def luhn_summarize(reviews, sentence_count=5):
    """
    Summarizes the given string of reviews by extracting important sentences.
    """
    parser = PlaintextParser.from_string(reviews, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LuhnSummarizer(stemmer)
    
    # Generate the summary
    summary = summarizer(parser.document, sentence_count)
    
    return ' '.join(str(sentence) for sentence in summary)

luhn_summarize(combined_text)

"I used to have a plastic contigo with the rubber spout you sucked from so I wasn't sure if I would like having to push a button and turn it upside down but its a great water bottle keeps my water cold for longer periods of time I can take it on the beach and it stays cold for a couple hours. Would definitely recommend to a friend and I have been singing its praises to all my family who are also hooked in Contigo products christmas gift looks great Great bottle Wonderful for the summer and winter so water doesn't freeze but stays a nice and cool temperature and for my husband he likes to keep the water room temperature no matter the weather outside so this works perfectly for him as well. Just an FYI in case you are interested the little leaflet that was inside the bottle when I got it which I actually didn't realize was in there and ended up washing along with the inside of the bottle before the first use....oops states the following paraphrased It is NOT for use with hot contents Do 

### Text Rank

In [159]:
def textrank_summarize(reviews, sentence_count=5):
    """
    Summarizes the given string of reviews by extracting important sentences.
    """
    parser = PlaintextParser.from_string(reviews, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = TextRankSummarizer(stemmer)
    
    # Generate the summary
    summary = summarizer(parser.document, sentence_count)
    
    return ' '.join(str(sentence) for sentence in summary)

textrank_summarize(combined_text)

"I used to have a plastic contigo with the rubber spout you sucked from so I wasn't sure if I would like having to push a button and turn it upside down but its a great water bottle keeps my water cold for longer periods of time I can take it on the beach and it stays cold for a couple hours. Would definitely recommend to a friend and I have been singing its praises to all my family who are also hooked in Contigo products christmas gift looks great Great bottle Wonderful for the summer and winter so water doesn't freeze but stays a nice and cool temperature and for my husband he likes to keep the water room temperature no matter the weather outside so this works perfectly for him as well. Just an FYI in case you are interested the little leaflet that was inside the bottle when I got it which I actually didn't realize was in there and ended up washing along with the inside of the bottle before the first use....oops states the following paraphrased It is NOT for use with hot contents Do 

## Extractive Summarization with Machine Learning

### 1. Create target labels 
The target label defines whether a particular post should be included in the summary. 

In [None]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2)
train_split, test_split = next(gss.split(filtered_df, groups=filtered_df['parent_asin']))
train_df = filtered_df.iloc[train_split]
test_df = filtered_df.iloc[test_split]
print('Number of threads for Training: ', train_df['parent_asin'].nunique())
print('Number of threads for Testing: ', test_df['parent_asin'].nunique())

### Vectorize the sentences

In [None]:
# Convert text data into numerical form using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(filtered_df['text'])

# Convert the sparse matrix to a dense format 

### Sentence Ranking

In [None]:
def rank_sentences(filtered_df['text'], X):
    sentence_scores = tfidf_matrix.sum(axis=1).A1
    ranked_sentences = [sentences[i] for i in sentence_scores.argsort()[::-1]]
    return ranked_sentences

filtered_df['ranked_sentences'] = filtered_df.apply(lambda x: rank_sentences(x['sentences'], x['tfidf_matrix']), axis=1)

### K-means clustering
Use either methods to determine the optimal cluster: 
1. Elbow method:
    - Involves plotting the within-cluster sum of squares (WCSS) against the number of clusters.
    - The elbow point (where the WCSS starts to level off) shows the optimal number of clusters. Adding more clusters beyond that point doesn't significantly reduce WCSS.
2. Silhouette method:
    - Evaluates how well each point lies within its cluster.
    - The optimal number of clusters is the one with the highest average silhouette score. 

In [None]:
# Using the elbow method to find the optimal number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
# Plotting the elbow graph
plt.figure(figsize=(5,3))
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
km = KMeans(random_state=42)
visualizer = KElbowVisualizer(km, k=(1,11))
 
visualizer.fit(X_dense) 
visualizer.show()  

In [None]:
km = KMeans(random_state=42)
visualizer = SilhouetteVisualizer(km, k=(1,11))
 
visualizer.fit(X_dense) 
visualizer.show()  

In [None]:
k = 8  # Choose based on the elbow plot
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X_dense)

In [None]:
filtered_df['cluster'] = y_kmeans
print(filtered_df['cluster'].value_counts())

In [None]:
for i in range(k):
    print(f"Cluster {i}")
    print(filtered_df[filtered_df['cluster'] == i]['text'].head(5))
    print("\n")

## Extractive Summarization with Pre-trained Sentence Embeddings

In [36]:
def split_sentences(reviews):
    """
    Splits the reviews into individual sentences.
    """
    n_reviews = len(reviews)
    for i in range(n_reviews):
        review = reviews[i]
        #print(email)
        sentences = sent_tokenize(review)
        #print(sentences)
        for j in reversed(range(len(sentences))):
            sent = sentences[j]
            sentences[j] = sent.strip()
            if sent == '':
                sentences.pop(j)
        reviews[i] = sentences

In [37]:
rev_list = list(filtered_df['text'])
split_sentences(rev_list)
# Adding split reviews in the data frame
filtered_df['sent_tokens'] = rev_list
# Calculating lenght of sentences in each review
filtered_df['length_of_rv'] = filtered_df['sent_tokens'].map(lambda x: len(x))

In [53]:
filtered_df['length_of_rv'].value_counts()

length_of_rv
1     854
2     575
3     444
4     308
5     199
6     132
7      96
8      57
9      44
10     31
11     20
13     16
15     15
12     14
17     10
14      9
16      8
18      6
26      3
28      3
0       3
19      3
20      3
21      3
24      2
30      2
27      2
22      1
34      1
23      1
39      1
36      1
Name: count, dtype: int64

In [55]:
filtered_df[['rating', 'text', 'sent_tokens', 'length_of_rv']].sample(4)

Unnamed: 0,rating,text,sent_tokens,length_of_rv
2640,5,Really doesnt spill unless you flip it upside ...,[Really doesnt spill unless you flip it upside...,2
866,4,Not bad just different than what were used to....,[Not bad just different than what were used to...,4
1401,5,Great for everyone. Kids can use them and they...,"[Great for everyone., Kids can use them and th...",2
1047,4,Worked well but handle broke off after only ha...,[Worked well but handle broke off after only h...,1


In [56]:
# Making vocabulary with reviews with max vocabs=5000. 
list_sentences_train = filtered_df['text']
max_features = 5000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

In [None]:
def loadEmbeddingMatrix(typeToLoad):
        if(typeToLoad=="glove"):
            EMBEDDING_FILE='glove.twitter.27B.25d.txt/glove.twitter.27B.25d.txt'
            embed_size = 25
        
        elif(typeToLoad=="fasttext"):
            EMBEDDING_FILE='wiki.simple.vec/wiki.simple.vec'
            embed_size = 300

        if(typeToLoad=="glove" or typeToLoad=="fasttext" ):
            embeddings_index = dict()
            #Transfer the embedding weights into a dictionary by iterating through every line of the file.
            f = open(EMBEDDING_FILE, encoding='utf-8')
            for line in f:
                #split up line into an indexed array
                values = line.split()
                #first index is word
                word = values[0]
                #store the rest of the values in the array as a new array
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs #50 dimensions
            f.close()
            print('Loaded %s word vectors.' % len(embeddings_index))
       # else:
        #    embeddings_index = dict()
         #   for word in word2vecDict.wv.vocab:
          #      embeddings_index[word] = word2vecDict.word_vec(word)
           # print('Loaded %s word vectors.' % len(embeddings_index))
            
        gc.collect()
        return embeddings_index #, embedding_matrix

In [59]:
# Function to generate summaries
def generate_summary(review_text):
    # Tokenize the review text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([review_text])
    sequence = tokenizer.texts_to_sequences([review_text])[0]
    
    # Create a matrix of word embeddings for the review
    review_embedding_matrix = np.zeros((len(sequence), embedding_dim))
    for i, word_idx in enumerate(sequence):
        review_embedding_matrix[i] = embedding_matrix_vocab.get(word_idx, np.zeros(embedding_dim))
    
    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(review_embedding_matrix)
    
    # Calculate sentence scores based on similarity
    sentence_scores = similarity_matrix.sum(axis=1)
    
    # Select top sentences for summary (e.g., top 3 sentences)
    top_sentence_indices = sentence_scores.argsort()[-3:][::-1]
    
    # Construct summary from top sentences
    summary = ""
    for idx in top_sentence_indices:
        summary += review_text.split('.')[idx] + ". "
    
    return summary.strip()

# Apply summarization to each review and store the summaries in a new column
filtered_df['summary'] = filtered_df['text'].apply(generate_summary)

NameError: name 'embedding_dim' is not defined