In [63]:
import pandas as pd
import re
import math
import time
import datetime
import numpy as np
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models import Word2Vec
from gensim import models
from gensim import corpora
from gensim import similarities
from openai.embeddings_utils import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

## Unigram model

In [64]:
df = pd.read_excel('amazon_review_processed_full.xlsx')
#df.columns

In [65]:
# Tokenisation (Full Review contains processed text - applied stemming and stopword removal, spelling check)
df['Tokenized Full review'] = df['Full review'].apply(lambda x: word_tokenize(str(x)) if isinstance(x, str) else [])
#df[['Full review', 'Tokenized Full review']].head(5)

### Raw term frequency

In [66]:
# Create a dictionary from the tokenized content, bag of words and reverse index
dictionary = corpora.Dictionary(df['Tokenized Full review'])
corpus = [dictionary.doc2bow(text) for text in df['Tokenized Full review']]
Index = similarities.SparseMatrixSimilarity(corpus, len(dictionary))

In [67]:
def query_raw(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    return qVector

In [68]:
qVector = query_raw('low quality')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.685994
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.685994
3805,HP DeskJet 2755e,2023-04-29,4,Quality?,Print quality not what I would expect from HP.,0.57735
9777,Canon PIXMA TR4720,2023-07-25,3,"Loud and not high quality, but it’s cheap","Beware the quality is very low on this product, but it does the job that I need. It’s not the quietest printer I’ve had. You get what you pay for.",0.547723
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.544331


In [69]:
qVector = query_raw('Paper jam')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
2374,Canon PIXMA TR4720,2022-12-01,2,Printer Jam,"The printer jams each time I print, very annoying. Ink runs out quickly. I barely use the printer due to paper jamming.",0.547723
3944,HP ENVY 6455e,2022-01-22,4,Paper tends to jam,Paper tends to jam,0.5
2470,HP Smart Tank 7301,2023-01-28,1,Jams constantly,The printer consistently jams. It wont print anything. It wont even print a page without jamming.,0.486664
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.4741
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.471405


### TF-IDF

In [70]:
# Create a TFIDF reverse index
TFIDF = models.TfidfModel(corpus)
corpus_TFIDF = [TFIDF[vec] for vec in corpus]
IndexTFIDF = similarities.SparseMatrixSimilarity(corpus_TFIDF, len(dictionary))

In [71]:
def query_tfidf(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    qVectorTFIDF = TFIDF[qVector]
    return qVectorTFIDF

In [72]:
qVectorTFIDF = query_tfidf('low quality')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.751272
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.597942
3187,HP DeskJet 2755e,2023-08-17,5,low price,setup took time,0.558956
1593,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.540725
1627,HP Smart Tank 5101,2023-09-10,5,It's works,This one is worth it it has where I can see the when the ink gets low,0.512525


In [73]:
qVectorTFIDF = query_tfidf('paper jam')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
6611,Epson - ET-4850,2023-08-16,1,Paper Jam ALL THE TIME,"I need a copier that I can feed at least a few papers through at a time but this one doesn't copy even one paper without a paper jam, constantly, over and over again. If you need a copier that will consistently work, this one is not it.",0.454711
6179,Canon Pixma TS6420a,2022-08-10,4,Works fine,"I switched from HP to Canon. This printer prints well and offers affordable subscription plans. It is very easy to set up if you follow the instructions in the box and on the screen. I like the fact that it has wireless connection. The printer prints fast but 'moves'. The quality of the photos is excellent. It prints well on papers as well, but depending on the picture quality or paper quality, it sometimes prints papers as bluish with gray. As a student, I use it mostly for printing research papers.",0.443145
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.424191
1473,Canon PIXMA TR4720,2023-03-05,1,Jams and Doesn't Print Correctly,"This printer is awful. Just after the return window was closed, it stopped working! It tries to suck up too many papers at once and jams every single time I print more than one sheet. I had high hopes for this printer, but it was just a waste of money. I'm super disappointed! I have even tried different brands of paper but it still jams. It prints blurry, but I'm not sure if that's from being a crappy printer or from wrestling jammed paper out of it all the time. Either way, avoid this one!!",0.405851
1554,Epson - Workforce 3820,2023-07-12,3,Pick Roller ineffective,"This printer has a pick roller issue. It will often get papers jam, picking multiple papers at once and it is a bit slower compared to similar printers.",0.397878


## Bigram model

In [74]:
#preprocess data to extract bigrams from original reviews
df['Original_full'] = df['Original title'].str.cat(df['Original review'], sep=' ', na_rep='')
#df['Original_full'].head(5)

In [75]:
# Define a function to preprocess and tokenize the text
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()

        # Remove non-alphanumeric characters
        text = re.sub(r'[^\w\s]', ' ', text)

        # Tokenize the text
        unigrams = word_tokenize(text)
        bigrams_list = list(bigrams(unigrams))

        # Stemming
        stemmer = PorterStemmer()
        stemmed_unigrams = [stemmer.stem(word) for word in unigrams]
        stemmed_bigrams = [tuple(stemmer.stem(word) for word in bigram) for bigram in bigrams_list]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        unigrams_without_stopwords = [word for word in stemmed_unigrams if word not in stop_words]
        bigrams_without_stopwords = [' '.join(bigram) for bigram in stemmed_bigrams if not any(word in stop_words for word in bigram)]

        # Join both unigrams and bigrams
        tokens = unigrams_without_stopwords + bigrams_without_stopwords
    else:
        tokens = [] 

    return tokens

# Apply the modified preprocessing function to review data
df['Processed_bigram'] = df['Original_full'].apply(preprocess_text)

### Raw Term Frequency

In [76]:
dictionary_bi = corpora.Dictionary(df['Processed_bigram'])
corpus_bi = [dictionary_bi.doc2bow(text) for text in df['Processed_bigram']]
Index_bi = similarities.SparseMatrixSimilarity(corpus_bi, len(dictionary_bi))

df['Vector_Raw_Bigram'] = corpus_bi
print(len(dictionary_bi))

97667


In [77]:
def query_raw_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    return qVector_bi

In [78]:
qVector_bi = query_raw_bi('low quality')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.440225
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.414781
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.412393
2076,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.379663
3449,Epson - ET-4850,2023-02-27,5,Color and print quality,Quality is excellent and fast,0.365148


In [81]:
qVector_bi = query_raw_bi('Paper jam')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
4545,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.766131
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.755929
5691,HP OfficeJet Pro 8025e,2021-10-17,2,Poor quality.,"If you print something? Paper jams. Reload the paper the only way it can go? Paper jam.WiFi blinks? Paper jam. You leave the printer alone too long? Paper jam. It’s incredible how hard it must be to make a printer that just prints when it’s told to, because no one can seem to do it. I’ve had this for 3 months and it’s a very high maintenance printer.",0.712396
7614,HP ENVY Inspire 7255e,2022-05-14,2,Can't find paper jam,"I can't set it up with my laptop, keeps saying I have a paper jam.I need a printer manual.",0.707107
5705,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.699896


In [82]:
qVector_bi = query_raw_bi('Customer support')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
6889,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.57735
3330,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.495434
7061,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.485071
8300,HP DeskJet 2755e,2023-08-08,3,Help printer woes,Had to call customer support to set up/ link to my devices,0.480384
738,Canon PIXMA TR4720,2021-10-01,1,"Cheap, flimsy and the worst customer support","Very fragile and flimsy printer. I had a simple question for Customer support and it took me an hour to get through. Finally I found out I had to register the product in order to even talk to someone. When you try to register online they ask you for your product through voice recognition. I said my model 3 times and they don't recognize my model number and hang up on you. I did this for an hour, had my wife and son talk into the phone and apparently my model number TR4720 does not exist, even though I have it in front of me.Bottom line if you need customer support you best figure it out on your own because nobody is there to help you.Lets see how long the paper tray holds up, maybe a month or so? I think a piece of tape to hold it together would be better.I will be returning this printer.",0.412082


### TF-IDF

In [83]:
# Create a TFIDF reverse index
TFIDF_bi = models.TfidfModel(corpus_bi)
corpus_TFIDF_bi = [TFIDF_bi[vec] for vec in corpus_bi]
IndexTFIDF_bi = similarities.SparseMatrixSimilarity(corpus_TFIDF_bi, len(dictionary_bi))

df['Vector_TFIDF_Bigram'] = corpus_TFIDF_bi
print(len(dictionary_bi))

97667


In [84]:
def query_tfidf_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    qVectorTFIDF_bi = TFIDF_bi[qVector_bi]
    return qVectorTFIDF_bi

In [85]:
qVectorTFIDF_bi = query_tfidf_bi('low quality')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
1593,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.392702
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.360529
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.265875
2076,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.262055
3767,Canon PIXMA TR4720,2023-06-07,1,Item had obviously been used. Ink cartridges were dried up. Very disappointing !,"Printer did not produce quality copies. It arrived with original packaging already removed and Ink installed and dry. With new ink, the copies were low quality. Very disappointed in Cannon.",0.238766


In [86]:
qVectorTFIDF_bi = query_tfidf_bi('paper jam')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.688106
3766,Epson - ET-2850,2023-06-13,1,Did not work.,"Printer did not work. It had a paper jam when I was setting up the printer. I cleared the paper jam as directed but the printer would not clear the paper jam error on the printer screen. I called Epson for help, and they said to just send it back.",0.62867
4545,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.552715
5705,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.536228
5398,HP ENVY 6455e,2021-10-05,2,Printer jams …. All the time!,It is difficult to connect to the internet and the paper jams constantly,0.505307


In [87]:
qVectorTFIDF_bi = query_tfidf_bi('customer support')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
6889,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.389979
7061,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.38967
3330,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.362506
6289,Epson - Workforce 4820,2020-10-21,1,Software issues - do not order this printer,"I received my first printer and kept getting an error saying ""insert paper cassette correctly"". I am no novice to printers but I called customer support because no matter what I did it wasn't working. Customer support determined it was a software problem and to replace the unit. I did so and received my new one. Yayyy, I finally have a printer after months of back ordering and running to UPS for my print jobs. Nope, this one had the exact same issue, and I was told the exact same thing by customer support.Apparently this is a new model that was just released and they haven't worked out the bugs yet. DO NOT PURCHASE THIS PRINTER.",0.339225
10119,HP ENVY Inspire 7955e,2023-08-14,1,No customer support,A little over a year old and it’s saying there’s a paper jam but there’s no paper in it. Once your warranty expires HP will not talk to you,0.317486


## Export Data

In [88]:

columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 'Full review',
       'Verified Purchase or not', 'People_find_helpful', 'vine or not',
       'list price', 'rating count', 'overall rating', 'Original title',
       'Original review',  'Brand',
        'Vector_Raw_Bigram', 'Vector_TFIDF_Bigram', 'Processed_bigram', 'Original_full'
      ]
df_final = df[columns]
df_final.to_csv('document_retrieval.csv') 

In [89]:
# Save the processed data using joblib to feed on Streamlit app 
import joblib

processed_data = df_final
joblib.dump(processed_data, 'processed_data.joblib')

['processed_data.joblib']

# Use tensorflow to embed

In [26]:
#Use tensorflow to embed
# df['Review Content new'] = df['Review Content'].apply(lambda x: str(x) if x is not None else '') 

# def clean_text(text):
#     text = re.sub(r'[^a-zA-Z\s]', '', text)
#     text = text.lower()
#     return text

# df['Cleaned Reviews'] = df['Review Content new'].apply(clean_text)

# def remove_stopwords(text):
#     stop_words = set(stopwords.words('english'))
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)

# df['Final Reviews'] = df['Cleaned Reviews'].apply(remove_stopwords)

# model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# def embed(text,model):
#     embeddings = model(text)
#     return [embedding.numpy() for embedding in embeddings]


# df['Embed_sentence'] = embed(df['Final Reviews'], model)

# df['Embed_sentence']

In [27]:
def embed_label(text,model):
    embeddings = model([text])
    return embeddings.numpy()[0]

text = 'Connectivity'
input_embedding_vector = embed_label(text,model)

df['similarity'] = df['Embed_sentence'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

NameError: name 'model' is not defined

In [None]:
pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'similarity',ascending = False)
columns = ['Brand','Review Model','Review rating','Review Content']
df_select = df_new[columns]
df_final = df_select[df_select['Brand'] == 'HP'].head(4).reset_index(drop = True)
df_final

In [None]:
text = 'Print Quality'
input_embedding_vector = embed_label(text,model)

df['similarity'] = df['Embed_sentence'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'similarity',ascending = False)
columns = ['Brand','Review Model','Review rating','Review Content']
df_select = df_new[columns]
df_final = df_select[df_select['Brand'] == 'HP'].head(4).reset_index(drop = True)
df_final

In [None]:
text = 'Paper jam'
input_embedding_vector = embed_label(text,model)

df['similarity'] = df['Embed_sentence'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'similarity',ascending = False)
columns = ['Brand','Review Model','Review rating','Review Content']
df_select = df_new[columns]
df_final = df_select[df_select['Brand'] == 'HP'].head(4).reset_index(drop = True)
df_final

# Other (Draft)

In [None]:
# labels = [
#     'Setup', 
#     'Connectivity', 
#     'Customer Support', 
#     'Print Quality', 
#     'Print Speed', 
#     'Ink supply and Cartridge', 
#     'Printer Hardware Robustness and sturdiness', 
#     'Control Panel', 
#     'Ease of Use', 
#     'Firmware', 
#     'Business Services and Subscription', 
#      'Paper jam',
#      'Control Panel', 
#     'Other'
# ]

# df_labels = pd.DataFrame({'Labels': labels})
# df_labels

In [None]:
# def docs2vecs(docs,dictionary):
#     vec1 = [dictionary.doc2bow(doc) for doc in docs]
#     tfid = gensim.models.TfidfModel(vec1)
#     vec2 = [tfidf[vec] for vec in vec1]
#     return vec2

# vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 2, stop_words = 'english')
# X = vectorizer.fit_transform(dataset.data)

In [None]:
# model = Word2Vec(sentences = label_token, vector_size=100, window=5, min_count=1, sg=0)

In [None]:
# from gensim.models import Word2Vec
# import pandas as pd

# # List of labels
# labels = [
#     'Setup', 
#     'Connectivity', 
#     'Customer Support', 
#     'Print Quality', 
#     'Print Speed', 
#     'Ink supply and Cartridge', 
#     'Printer Hardware Robustness and sturdiness', 
#     'Control Panel', 
#     'Ease of Use', 
#     'Firmware', 
#     'Business Services and Subscription', 
#     'Paper jam',
#     'Control Panel', 
#     'Other'
# ]

# # Create a DataFrame with labels
# df_labels = pd.DataFrame({'Labels': labels})

# # Split the labels into words (tokens)
# label_tokens = [label.split() for label in labels]

# # Train a Word2Vec model on label tokens
# model = Word2Vec(sentences=label_tokens, vector_size=100, window=5, min_count=1, sg=0)

# # Function to get the embedding of a label
# def label_embedding(label, model):
#     return model.wv[label]  # Use indexing to retrieve the embedding

# # Apply the label_embedding function to embed labels
# df_labels['Label Embeddings'] = df_labels['Labels'].apply(lambda x: label_embedding(x, model))

In [None]:
# def search_reviews(df, product_description, pprint=True):
#     product_embedding = get_embedding(
#         product_description,
#         engine=embedding_model
#     )
#     df[f"similarity_{product_description}"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

#     results = df.sort_values(f"similarity_{product_description}", ascending=False)
                       
#     return results



# summary_df = []

# for label in labels:
#     label_results = search_reviews(df_filter, label, pprint=True)
    
#     # Apply a similarity threshold to filter rows
#     threshold = 0.85
#     filtered_results = label_results[label_results[f"similarity_{label}"] > threshold]
    
#     # Add the label column to the filtered results
#     filtered_results['topic'] = label
    
#     # Append the filtered results to the summary_df
#     summary_df.append(filtered_results)
    
#     time.sleep(5)

# # Concatenate the filtered results into the final summary DataFrame
# summary_df = pd.concat(summary_df)


In [None]:
# columns_to_drop = ['embedding', 'embedding_splitted']

# # Use the drop method to remove the specified columns
# df_phrase_filtered = df_phrase.drop(columns=columns_to_drop)

In [None]:
# def find_most_similar_word(review_embedding, word_vectors, word_list):
#     similarities = [cosine_similarity(review_embedding, word_vector) for word_vector in word_vectors]
#     most_similar_index = similarities.index(max(similarities))
#     return word_list[most_similar_index]  # Return the actual word

# # Assuming you have a DataFrame 'df' with 'embedding' column containing review embeddings
# df_phrase['Topic'] = df_phrase['embedding_splitted'].apply(lambda x: find_most_similar_word(x, label_embeddings, labels))


In [None]:
# df_phrase['Topic2'] = df_phrase['embedding'].apply(lambda x: find_most_similar_word(x, label_embeddings, labels))

In [None]:
df_phrase 