In [1]:
import pandas as pd
import re
import math
import time
import datetime
import numpy as np
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models import Word2Vec
from gensim import models
from gensim import corpora
from gensim import similarities
from openai.embeddings_utils import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

## Unigram model

In [2]:
df = pd.read_excel('amazon_review_processed_full.xlsx')
#df.columns

In [3]:
# Tokenisation (Full Review contains processed text - applied stemming and stopword removal, spelling check)
df['Tokenized Full review'] = df['Full review'].apply(lambda x: word_tokenize(str(x)) if isinstance(x, str) else [])
#df[['Full review', 'Tokenized Full review']].head(5)

### Raw term frequency

In [4]:
# Create a dictionary from the tokenized content, bag of words and reverse index
dictionary = corpora.Dictionary(df['Tokenized Full review'])
corpus = [dictionary.doc2bow(text) for text in df['Tokenized Full review']]
Index = similarities.SparseMatrixSimilarity(corpus, len(dictionary))

In [5]:
def query_raw(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    return qVector

In [6]:
qVector = query_raw('low quality')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.685994
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.685994
3805,HP DeskJet 2755e,2023-04-29,4,Quality?,Print quality not what I would expect from HP.,0.57735
9777,Canon PIXMA TR4720,2023-07-25,3,"Loud and not high quality, but it’s cheap","Beware the quality is very low on this product, but it does the job that I need. It’s not the quietest printer I’ve had. You get what you pay for.",0.547723
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.544331


In [7]:
qVector = query_raw('Paper jam')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
2374,Canon PIXMA TR4720,2022-12-01,2,Printer Jam,"The printer jams each time I print, very annoying. Ink runs out quickly. I barely use the printer due to paper jamming.",0.547723
3944,HP ENVY 6455e,2022-01-22,4,Paper tends to jam,Paper tends to jam,0.5
2470,HP Smart Tank 7301,2023-01-28,1,Jams constantly,The printer consistently jams. It wont print anything. It wont even print a page without jamming.,0.486664
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.4741
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.471405


### TF-IDF

In [8]:
# Create a TFIDF reverse index
TFIDF = models.TfidfModel(corpus)
corpus_TFIDF = [TFIDF[vec] for vec in corpus]
IndexTFIDF = similarities.SparseMatrixSimilarity(corpus_TFIDF, len(dictionary))

In [9]:
def query_tfidf(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    qVectorTFIDF = TFIDF[qVector]
    return qVectorTFIDF

In [10]:
qVectorTFIDF = query_tfidf('low quality')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.751272
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.597942
3187,HP DeskJet 2755e,2023-08-17,5,low price,setup took time,0.558956
1593,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.540725
1627,HP Smart Tank 5101,2023-09-10,5,It's works,This one is worth it it has where I can see the when the ink gets low,0.512525


In [11]:
qVectorTFIDF = query_tfidf('paper jam')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
6611,Epson - ET-4850,2023-08-16,1,Paper Jam ALL THE TIME,"I need a copier that I can feed at least a few papers through at a time but this one doesn't copy even one paper without a paper jam, constantly, over and over again. If you need a copier that will consistently work, this one is not it.",0.454711
6179,Canon Pixma TS6420a,2022-08-10,4,Works fine,"I switched from HP to Canon. This printer prints well and offers affordable subscription plans. It is very easy to set up if you follow the instructions in the box and on the screen. I like the fact that it has wireless connection. The printer prints fast but 'moves'. The quality of the photos is excellent. It prints well on papers as well, but depending on the picture quality or paper quality, it sometimes prints papers as bluish with gray. As a student, I use it mostly for printing research papers.",0.443145
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.424191
1473,Canon PIXMA TR4720,2023-03-05,1,Jams and Doesn't Print Correctly,"This printer is awful. Just after the return window was closed, it stopped working! It tries to suck up too many papers at once and jams every single time I print more than one sheet. I had high hopes for this printer, but it was just a waste of money. I'm super disappointed! I have even tried different brands of paper but it still jams. It prints blurry, but I'm not sure if that's from being a crappy printer or from wrestling jammed paper out of it all the time. Either way, avoid this one!!",0.405851
1554,Epson - Workforce 3820,2023-07-12,3,Pick Roller ineffective,"This printer has a pick roller issue. It will often get papers jam, picking multiple papers at once and it is a bit slower compared to similar printers.",0.397878


## Bigram model

In [12]:
#preprocess data to extract bigrams from original reviews
df['Original_full'] = df['Original title'].str.cat(df['Original review'], sep=' ', na_rep='')
#df['Original_full'].head(5)

In [13]:
# Define a function to preprocess and tokenize the text
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()

        # Remove non-alphanumeric characters
        text = re.sub(r'[^\w\s]', ' ', text)

        # Tokenize the text
        unigrams = word_tokenize(text)
        bigrams_list = list(bigrams(unigrams))

        # Stemming
        stemmer = PorterStemmer()
        stemmed_unigrams = [stemmer.stem(word) for word in unigrams]
        stemmed_bigrams = [tuple(stemmer.stem(word) for word in bigram) for bigram in bigrams_list]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        unigrams_without_stopwords = [word for word in stemmed_unigrams if word not in stop_words]
        bigrams_without_stopwords = [' '.join(bigram) for bigram in stemmed_bigrams if not any(word in stop_words for word in bigram)]

        # Join both unigrams and bigrams
        tokens = unigrams_without_stopwords + bigrams_without_stopwords
    else:
        tokens = [] 

    return tokens

# Apply the modified preprocessing function to review data
df['Processed_bigram'] = df['Original_full'].apply(preprocess_text)

### Raw Term Frequency

In [14]:
dictionary_bi = corpora.Dictionary(df['Processed_bigram'])
corpus_bi = [dictionary_bi.doc2bow(text) for text in df['Processed_bigram']]
Index_bi = similarities.SparseMatrixSimilarity(corpus_bi, len(dictionary_bi))

df['Vector_Raw_Bigram'] = corpus_bi
print(len(dictionary_bi))

97667


In [15]:
def query_raw_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    return qVector_bi

In [16]:
qVector_bi = query_raw_bi('low quality')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.440225
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.414781
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.412393
2076,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.379663
3449,Epson - ET-4850,2023-02-27,5,Color and print quality,Quality is excellent and fast,0.365148


In [17]:
qVector_bi = query_raw_bi('Paper jam')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
4545,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.766131
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.755929
5691,HP OfficeJet Pro 8025e,2021-10-17,2,Poor quality.,"If you print something? Paper jams. Reload the paper the only way it can go? Paper jam.WiFi blinks? Paper jam. You leave the printer alone too long? Paper jam. It’s incredible how hard it must be to make a printer that just prints when it’s told to, because no one can seem to do it. I’ve had this for 3 months and it’s a very high maintenance printer.",0.712396
7614,HP ENVY Inspire 7255e,2022-05-14,2,Can't find paper jam,"I can't set it up with my laptop, keeps saying I have a paper jam.I need a printer manual.",0.707107
5705,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.699896


In [18]:
qVector_bi = query_raw_bi('Customer support')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
6889,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.57735
3330,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.495434
7061,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.485071
8300,HP DeskJet 2755e,2023-08-08,3,Help printer woes,Had to call customer support to set up/ link to my devices,0.480384
738,Canon PIXMA TR4720,2021-10-01,1,"Cheap, flimsy and the worst customer support","Very fragile and flimsy printer. I had a simple question for Customer support and it took me an hour to get through. Finally I found out I had to register the product in order to even talk to someone. When you try to register online they ask you for your product through voice recognition. I said my model 3 times and they don't recognize my model number and hang up on you. I did this for an hour, had my wife and son talk into the phone and apparently my model number TR4720 does not exist, even though I have it in front of me.Bottom line if you need customer support you best figure it out on your own because nobody is there to help you.Lets see how long the paper tray holds up, maybe a month or so? I think a piece of tape to hold it together would be better.I will be returning this printer.",0.412082


### TF-IDF

In [19]:
# Create a TFIDF reverse index
TFIDF_bi = models.TfidfModel(corpus_bi)
corpus_TFIDF_bi = [TFIDF_bi[vec] for vec in corpus_bi]
IndexTFIDF_bi = similarities.SparseMatrixSimilarity(corpus_TFIDF_bi, len(dictionary_bi))

df['Vector_TFIDF_Bigram'] = corpus_TFIDF_bi
print(len(dictionary_bi))

97667


In [20]:
def query_tfidf_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    qVectorTFIDF_bi = TFIDF_bi[qVector_bi]
    return qVectorTFIDF_bi

In [21]:
qVectorTFIDF_bi = query_tfidf_bi('low quality')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
1593,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.392702
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.360529
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.265875
2076,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.262055
3767,Canon PIXMA TR4720,2023-06-07,1,Item had obviously been used. Ink cartridges were dried up. Very disappointing !,"Printer did not produce quality copies. It arrived with original packaging already removed and Ink installed and dry. With new ink, the copies were low quality. Very disappointed in Cannon.",0.238766


In [22]:
qVectorTFIDF_bi = query_tfidf_bi('paper jam')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.688106
3766,Epson - ET-2850,2023-06-13,1,Did not work.,"Printer did not work. It had a paper jam when I was setting up the printer. I cleared the paper jam as directed but the printer would not clear the paper jam error on the printer screen. I called Epson for help, and they said to just send it back.",0.62867
4545,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.552715
5705,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.536228
5398,HP ENVY 6455e,2021-10-05,2,Printer jams …. All the time!,It is difficult to connect to the internet and the paper jams constantly,0.505307


In [23]:
qVectorTFIDF_bi = query_tfidf_bi('customer support')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
6889,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.389979
7061,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.38967
3330,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.362506
6289,Epson - Workforce 4820,2020-10-21,1,Software issues - do not order this printer,"I received my first printer and kept getting an error saying ""insert paper cassette correctly"". I am no novice to printers but I called customer support because no matter what I did it wasn't working. Customer support determined it was a software problem and to replace the unit. I did so and received my new one. Yayyy, I finally have a printer after months of back ordering and running to UPS for my print jobs. Nope, this one had the exact same issue, and I was told the exact same thing by customer support.Apparently this is a new model that was just released and they haven't worked out the bugs yet. DO NOT PURCHASE THIS PRINTER.",0.339225
10119,HP ENVY Inspire 7955e,2023-08-14,1,No customer support,A little over a year old and it’s saying there’s a paper jam but there’s no paper in it. Once your warranty expires HP will not talk to you,0.317486


In [24]:
qVectorTFIDF_bi = query_tfidf_bi('bad connection')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
4570,Canon PIXMA TR4720,2022-02-27,2,Master Set Up Didn't,"Master setup would not go past ""Install ink cartridges"", So it was essentially frozen. After reinstalling the color and black inks over and over, it finally continued and finished the installation and printed a test sheet! very frustrating and time consuming. It appears it was a bad connection on one of the ink cartridges.",0.238567
4216,HP OfficeJet Pro 9015e,2021-12-30,4,printer,Not bad printer for price,0.146873
2343,Epson - Workforce 4820,2023-03-26,3,,"The printer seems like a reasonable printer for the price and once it’s set up performs reasonably well. However, when setting up the printer and installing the software for Mac users, it doesn’t always install smoothly and corrupts. The scanner software for installation says to install smart, scan software, but actually that’s the wrong software. It should be scan 2 software, which also corrupts and doesn’t completely install all the time. EPSON technical support is terrible. I can’t say it’s even close to acceptable. There is a big English problem of understanding each other, their phone lines being overseas for north Americans always has a bad connection and will disconnect during your conversation. The technicians don’t know what they’re doing and do not know their products and are constantly putting you on hold to talk to somebody by computer in the USA to try and solve the problems you’re calling about. EPSON really needs to get their act together and bring back English-speaking technical support.",0.139453
807,Epson - Workforce 3820,2023-08-06,1,unable to connect with wifi,very bad,0.113909
6108,Canon Pixma TS6420a,2022-11-25,5,Works great with phone,I use my phone for almost everything so printer is good for that. Setting up was not too bad,0.099082


## Export Data

In [25]:

# 

In [26]:
#df.head(5)

Unnamed: 0,Review Model,Retailer,Review date,Review name,Review rating,Review title,Review Content,Verified Purchase or not,People_find_helpful,vine or not,...,Topic4,Tokenized Full review,Similarity_Raw,Similarity_TFIDF,Original_full,Processed_bigram,Vector_Raw_Bigram,Similarity_Raw_Bigram,Vector_TFIDF_Bigram,Similarity_TFIDF_Bigram
0,Canon Pixma TS6420a,Amazon,2023-08-10,Ernest Birkholz,5,work great,easi instal work great,True,0,False,...,0.192308,"[work, great, easi, instal, work, great]",0.0,0.0,Works great 🖨 was easy to install and works great.,"[work, great, wa, easi, instal, work, great, work great, great wa, wa easi, work great]","[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 2)]",0.0,"[(0, 0.1323376887898121), (1, 0.2607215313182922), (2, 0.7005864390258342), (3, 0.19512399230158903), (4, 0.0855002918129666), (5, 0.28767906435325574), (6, 0.16606343043510147), (7, 0.5176578806693327)]",0.0
1,HP OfficeJet Pro 9015e,Amazon,2022-06-04,mattey,3,spunki mid size,slower speed im use old model 8600 new 9015e sound like break apart insid setup annoyingli complex luckili research watch review thing opt enrol import know detail enrol instant program terrif hope dont ruin overal spunki mid size good enough typic home offic,True,0,False,...,0.211975,"[spunki, mid, size, slower, speed, im, use, old, model, 8600, new, 9015e, sound, like, break, apart, insid, setup, annoyingli, complex, luckili, research, watch, review, thing, opt, enrol, import, know, detail, enrol, instant, program, terrif, hope, dont, ruin, overal, spunki, mid, size, good, enough, typic, home, offic]",0.0,0.0,spunky mid size printer 🔆Slower print speed than what I’m used to (old model..HP-OJ-Pro. 8600) and when it’s (new 9015e) printing it sounds like it’s breaking apart inside. Setup was annoyingly complex luckily I researched the HP+ & watched reviews (not my thing so I opted NOT to enroll - important to know details before enrolling) But the instant ink program - its terrific hope they don’t ruin it🥴 overall it’s a spunky mid size printer good enough for the typical home office🥴☀️🔆,"[spunki, mid, size, printer, slower, print, speed, use, old, model, hp, oj, pro, 8600, new, 9015e, print, sound, like, break, apart, insid, setup, wa, annoyingli, complex, luckili, research, hp, watch, review, thing, opt, enrol, import, know, detail, befor, enrol, instant, ink, program, terrif, hope, ruin, overal, spunki, mid, size, printer, good, enough, typic, home, offic, spunki mid, mid size, size printer, printer slower, slower print, print speed, old model, model hp, hp oj, oj pro, pro 8600, new 9015e, 9015e print, sound like, break apart, apart insid, insid setup, setup wa, wa annoyingli, annoyingli complex, complex luckili, hp watch, watch review, enrol import, know detail, detail befor, befor enrol, instant ink, ink program, terrif hope, spunki mid, mid size, size printer, printer good, good enough, typic home, home offic]","[(4, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 2), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 1), (60, 2), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 2), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1)]",0.0,"[(4, 0.015376909347413105), (8, 0.08749847382917576), (9, 0.08511017195475666), (10, 0.1322798545852455), (11, 0.11287042902051138), (12, 0.14301898796068524), (13, 0.08065302889419217), (14, 0.14301898796068524), (15, 0.040849798410150365), (16, 0.14301898796068524), (17, 0.0667922747186218), (18, 0.1322798545852455), (19, 0.09032323539575443), (20, 0.14301898796068524), (21, 0.07532184573648203), (22, 0.14301898796068524), (23, 0.05026912424706038), (24, 0.15916820404062937), (25, 0.14301898796068524), (26, 0.023232705304028024), (27, 0.08666079444917475), (28, 0.035856285425446575), (29, 0.05704551817319756), (30, 0.05019146346057413), (31, 0.03907333436912139), (32, 0.1322798545852455), (33, 0.14301898796068524), (34, 0.07214164368216013), (35, 0.01599505267401988), (36, 0.06256297832999969), (37, 0.07230219739317155), (38, 0.14301898796068524), (39, 0.051483319280764966), (40, 0.051652646588740116), (41, 0.04340145076527359), (42, 0.14301898796068524), (43, 0.025969340258891506), (44, 0.08404124508087917), (45, 0.1733215888983495), (46, 0.2860379759213705), (47, 0.04789194861689983), (48, 0.10897674058005524), (49, 0.03341761305324666), (50, 0.14301898796068524), (51, 0.044649907973154636), (52, 0.1259978642703702), (53, 0.1259978642703702), (54, 0.03918354252286347), (55, 0.12154072120980575), (56, 0.07787444707704597), (57, 0.05191015879598444), (58, 0.017756148845285626), (59, 0.07400647365405189), (60, 0.009608987735650961), (61, 0.09084848211865877), (62, 0.14301898796068524), (63, 0.05797725947372688), (64, 0.10586769040492591), (65, 0.05078363596557601), (66, 0.06580650307686056), (67, 0.039529990871620645), (68, 0.0913921622696319), (69, 0.033829728408732024), (70, 0.06054802691806103), (71, 0.10469818335492861), (72, 0.21795348116011048), (73, 0.08093219490574984), (74, 0.1259978642703702), (75, 0.07033455378734528), (76, 0.08474560702949162), (77, 0.06054802691806103), (78, 0.2860379759213705), (79, 0.2860379759213705), (80, 0.10106236877119416), (81, 0.14301898796068524), (82, 0.034589017262809225), (83, 0.07932800867789704), (84, 0.1259978642703702), (85, 0.0157163432136981), (86, 0.14301898796068524), (87, 0.0726283840613487), (88, 0.1322798545852455)]",0.0
2,Canon PIXMA MG3620,Amazon,2023-03-15,Maria D,4,,i,True,0,False,...,0.199992,[i],0.0,0.0,𝙲𝚕𝚎𝚊𝚛 𝚙𝚛𝚒𝚗𝚝𝚜 𝙻𝚘𝚟𝚎 𝚒𝚝,"[𝙲𝚕𝚎𝚊𝚛, 𝚙𝚛𝚒𝚗𝚝𝚜, 𝙻𝚘𝚟𝚎, 𝚒𝚝, 𝙲𝚕𝚎𝚊𝚛 𝚙𝚛𝚒𝚗𝚝𝚜, 𝚙𝚛𝚒𝚗𝚝𝚜 𝙻𝚘𝚟𝚎, 𝙻𝚘𝚟𝚎 𝚒𝚝]","[(89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1)]",0.0,"[(89, 0.37796447300922725), (90, 0.37796447300922725), (91, 0.37796447300922725), (92, 0.37796447300922725), (93, 0.37796447300922725), (94, 0.37796447300922725), (95, 0.37796447300922725)]",0.0
3,Epson - ET-3830,Amazon,2022-11-17,Ryan H,5,shag know he talk,yup slam upgrad paid littl front equival throw away type cartridg save hundr year tank system longer tri find absolut lowest possibl usag set alway be macbook iphon ipad easi breezi theyv done great job engin fill process complet mess free,True,12,False,...,0.263412,"[shag, know, he, talk, yup, slam, upgrad, paid, littl, front, equival, throw, away, type, cartridg, save, hundr, year, tank, system, longer, tri, find, absolut, lowest, possibl, usag, set, alway, be, macbook, iphon, ipad, easi, breezi, theyv, done, great, job, engin, fill, process, complet, mess, free]",0.0,0.0,"Shaq knows what he's talking about Yup, this printer is a slam dunk. :)What an upgrade! Paid a little more up front than an equivalent printer with the throw away types of cartridges, but will save hundreds a year with the tank system. No longer miserly with what I print by trying to find the absolute lowest possible ink usage settings and always in b&w. I can print from my macbooks, iphone, ipad easy breezy. They've done a great job engineering the ink filling process and it's completely mess free.","[shaq, know, talk, yup, thi, printer, slam, dunk, upgrad, paid, littl, front, equival, printer, throw, away, type, cartridg, save, hundr, year, tank, system, longer, miserli, print, tri, find, absolut, lowest, possibl, ink, usag, set, alway, b, w, print, macbook, iphon, ipad, easi, breezi, done, great, job, engin, ink, fill, process, complet, mess, free, shaq know, yup thi, thi printer, slam dunk, upgrad paid, equival printer, throw away, away type, save hundr, tank system, longer miserli, absolut lowest, lowest possibl, possibl ink, ink usag, usag set, b w, macbook iphon, iphon ipad, ipad easi, easi breezi, great job, job engin, ink fill, fill process, complet mess, mess free]","[(0, 1), (1, 1), (35, 2), (41, 1), (58, 2), (60, 2), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1), (125, 1), (126, 1), (127, 1), (128, 1), (129, 1), (130, 1), (131, 1), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 1), (139, 1), (140, 1), (141, 1), (142, 1), (143, 1), (144, 1), (145, 1), (146, 1), (147, 1), (148, 1), (149, 1), (150, 1), (151, 1), (152, 1), (153, 1), (154, 1), (155, 1), (156, 1), (157, 1), (158, 1), (159, 1), (160, 1), (161, 1), (162, 1), (163, 1), (164, 1), (165, 1), (166, 1)]",0.0,"[(0, 0.02962319808391107), (1, 0.02918067270786798), (35, 0.03981645679232264), (41, 0.0540195779388243), (58, 0.022100175213698888), (60, 0.01195981822604247), (96, 0.0677629233820636), (97, 0.17800845894892542), (98, 0.05704408353377176), (99, 0.06537918702623174), (100, 0.17800845894892542), (101, 0.08568783033362029), (102, 0.09256584847023591), (103, 0.1568231321549411), (104, 0.03381246627504262), (105, 0.06103135648789218), (106, 0.1469725161679824), (107, 0.06085321466234726), (108, 0.17800845894892542), (109, 0.1568231321549411), (110, 0.10335732010110397), (111, 0.12454264689508822), (112, 0.1646420059354661), (113, 0.07331050261499791), (114, 0.1568231321549411), (115, 0.050949475458660715), (116, 0.0573004709920897), (117, 0.07426404908889923), (118, 0.10786228242244147), (119, 0.10004340864191648), (120, 0.13009022612802254), (121, 0.10837654531143867), (122, 0.08271523248327722), (123, 0.17800845894892542), (124, 0.07085216765365204), (125, 0.12711762827767945), (126, 0.05417572223710592), (127, 0.17800845894892542), (128, 0.051542038478964235), (129, 0.06683415405348365), (130, 0.17800845894892542), (131, 0.12122873543590076), (132, 0.17800845894892542), (133, 0.09501009229797941), (134, 0.17800845894892542), (135, 0.08505552071976727), (136, 0.13790909990854752), (137, 0.1646420059354661), (138, 0.07622679760643356), (139, 0.08490063060609648), (140, 0.17800845894892542), (141, 0.06276220842455056), (142, 0.059986384888524355), (143, 0.1568231321549411), (144, 0.024939497863710018), (145, 0.1336060631545231), (146, 0.17800845894892542), (147, 0.14345667914148183), (148, 0.17800845894892542), (149, 0.07339526633599051), (150, 0.08050525345786026), (151, 0.06067670344846945), (152, 0.12854670532725396), (153, 0.010317103151523042), (154, 0.022921529740345347), (155, 0.08244170001795016), (156, 0.1130744842372745), (157, 0.032896557226432324), (158, 0.07545534611618908), (159, 0.0806284725063511), (160, 0.17800845894892542), (161, 0.08826281171621153), (162, 0.1568231321549411), (163, 0.0841443095328517), (164, 0.039520683218601235), (165, 0.1512755529220068), (166, 0.17800845894892542)]",0.0
4,HP ENVY 6055e,Amazon,2022-04-09,Sam,3,user friendli,would think someth simpl reconnect wifi would easi nope first reset wifi let sit two hour serious differ direct said restart router comput step back wifi direct support super confus also ambigu languag specifi thing someon tech savvi would like confus tri troubl shoot alway option call someon would prefer troubl shoot lengthi process also notic lot page issu three page great three hour troubl shoot reconnect wifi,True,0,False,...,0.140233,"[user, friendli, would, think, someth, simpl, reconnect, wifi, would, easi, nope, first, reset, wifi, let, sit, two, hour, serious, differ, direct, said, restart, router, comput, step, back, wifi, direct, support, super, confus, also, ambigu, languag, specifi, thing, someon, tech, savvi, would, like, confus, tri, troubl, shoot, alway, option, call, someon, would, prefer, troubl, shoot, lengthi, process, also, notic, lot, page, issu, three, page, great, three, hour, troubl, shoot, reconnect, wifi]",0.0,0.0,"Not User Friendly You would think something as simple as reconnecting to the wifi would be easy. Nope. First I had to reset wifi and let the printer sit for two hours. seriously? Then different directions said I have to restart my router, my computer and my printer and do a few other steps just to set it back up with the wifi. The directions for support are super confusing and wordy. They also use ambiguous language and do not specify what things are, so someone who is not very tech savvy would most likely be confused trying to trouble shoot. I do not always have the option to call someone so I would prefer to trouble shoot myself, which is a very lengthy process with this printer. I also noticed that there were a lot pages that had issues with the ink after printing out three pages. This printer is great if you have three hours to trouble shoot reconnecting to wifi.","[user, friendli, would, think, someth, simpl, reconnect, wifi, would, easi, nope, first, reset, wifi, let, printer, sit, two, hour, serious, differ, direct, said, restart, router, comput, printer, step, set, back, wifi, direct, support, super, confus, wordi, also, use, ambigu, languag, specifi, thing, someon, veri, tech, savvi, would, like, confus, tri, troubl, shoot, alway, option, call, someon, would, prefer, troubl, shoot, veri, lengthi, process, thi, printer, also, notic, lot, page, issu, ink, print, three, page, thi, printer, great, three, hour, troubl, shoot, reconnect, wifi, user friendli, would think, think someth, wifi would, easi nope, nope first, reset wifi, printer sit, two hour, hour serious, differ direct, direct said, super confus, also use, use ambigu, ambigu languag, veri tech, ...]","[(0, 1), (1, 1), (35, 1), (43, 1), (58, 1), (60, 4), (82, 1), (85, 1), (98, 1), (141, 1), (144, 1), (153, 2), (154, 2), (157, 1), (167, 2), (168, 1), (169, 1), (170, 1), (171, 1), (172, 1), (173, 1), (174, 1), (175, 1), (176, 2), (177, 1), (178, 1), (179, 1), (180, 2), (181, 1), (182, 1), (183, 1), (184, 1), (185, 2), (186, 1), (187, 1), (188, 1), (189, 1), (190, 1), (191, 1), (192, 1), (193, 1), (194, 1), (195, 1), (196, 1), (197, 1), (198, 2), (199, 1), (200, 1), (201, 1), (202, 2), (203, 1), (204, 1), (205, 1), (206, 1), (207, 1), (208, 1), (209, 1), (210, 1), (211, 3), (212, 1), (213, 1), (214, 1), (215, 2), (216, 1), (217, 1), (218, 1), (219, 1), (220, 1), (221, 1), (222, 1), (223, 1), (224, 1), (225, 1), (226, 2), (227, 1), (228, 1), (229, 3), (230, 3), (231, 1), (232, 1), (233, 1), (234, 1), (235, 1), (236, 2), (237, 1), (238, 1), (239, 4), (240, 1), (241, 1), (242, 4), (243, 1), (244, 1)]",0.041451,"[(0, 0.025195886690473106), (1, 0.024819498590820343), (35, 0.01693286003276504), (43, 0.027491950961893632), (58, 0.009398605595295036), (60, 0.020344746303788432), (82, 0.03661700901637897), (85, 0.016637809533237673), (98, 0.04851860562143047), (141, 0.053382132726840656), (144, 0.02121218514325822), (153, 0.017550337491810557), (154, 0.038991621665840165), (157, 0.027980028558564775), (167, 0.07171209027169226), (168, 0.15140434698908461), (169, 0.09860936547320612), (170, 0.14003556652768112), (171, 0.15140434698908461), (172, 0.035128826684858304), (173, 0.04522164276385069), (174, 0.14003556652768112), (175, 0.03522486358186492), (176, 0.14442374943053374), (177, 0.15140434698908461), (178, 0.048303397693523785), (179, 0.15140434698908461), (180, 0.11681751270093227), (181, 0.15140434698908461), (182, 0.15140434698908461), (183, 0.03815481233615677), (184, 0.06199814120289909), (185, 0.08165821603184205), (186, 0.15140434698908461), (187, 0.03578470233874853), (188, 0.10070604793415601), (189, 0.10311059167278892), (190, 0.14003556652768112), (191, 0.055751658180076), (192, 0.042286195257722634), (193, 0.15140434698908461), (194, 0.08037303074998196), (195, 0.15140434698908461), (196, 0.07082853791855456), (197, 0.05179619452608427), (198, 0.06501003645421644), (199, 0.10592922514347068), (200, 0.0763713868218617), (201, 0.10493487930276499), (202, 0.14891669055676682), (203, 0.06393878278660381), (204, 0.14003556652768112), (205, 0.06604987797536394), (206, 0.06378107307926381), (207, 0.05105725500514124), (208, 0.0735794734491763), (209, 0.15140434698908461), (210, 0.08268695722390015), (211, 0.24047587982833837), (212, 0.15140434698908461), (213, 0.05301169000739727), (214, 0.07035309352222505), (215, 0.12617912963395986), (216, 0.0448463018263053), (217, 0.11064769535602179), (218, 0.0577981324813117), (219, 0.05504559436304957), (220, 0.14003556652768112), (221, 0.03387819309233555), (222, 0.04100432054117226), (223, 0.07688666531959884), (224, 0.04821202317757079), (225, 0.14003556652768112), (226, 0.123437913751963), (227, 0.1069877655208895), (228, 0.11948814520297121), (229, 0.17147082809223144), (230, 0.25034989134737845), (231, 0.04288974184079921), (232, 0.09010027403131189), (233, 0.15140434698908461), (234, 0.0532990860522408), (235, 0.0667871563981495), (236, 0.048817609271331476), (237, 0.15140434698908461), (238, 0.11207482839555949), (239, 0.15248777288224802), (240, 0.13338525627882872), (241, 0.15140434698908461), (242, 0.11048507380263807), (243, 0.12201647581742525), (244, 0.10493487930276499)]",0.0


In [34]:
# Get the list of columns to pivot 
columns_to_pivot = ['Topic0', 'Topic1', 'Topic2', 'Topic3', 'Topic4']

# Create a list of id_vars columns by excluding the columns to pivot
id_vars = [col for col in df.columns if col not in columns_to_pivot]

# Melt the DataFrame without listing all id_vars columns explicitly
melted_df = pd.melt(df, id_vars=id_vars, value_vars=columns_to_pivot, var_name='Topic', value_name='Probability')

print(melted_df)

                 Review Model Retailer Review date          Review name  \
0         Canon Pixma TS6420a   Amazon  2023-08-10      Ernest Birkholz   
1      HP OfficeJet Pro 9015e   Amazon  2022-06-04               mattey   
2          Canon PIXMA MG3620   Amazon  2023-03-15              Maria D   
3             Epson - ET-3830   Amazon  2022-11-17               Ryan H   
4               HP ENVY 6055e   Amazon  2022-04-09                  Sam   
...                       ...      ...         ...                  ...   
51040         Epson - XP-6100   Amazon  2020-10-17     MD ZAKIR HOSSAIN   
51041  Epson - Workforce 4820   Amazon  2022-10-13           mario aste   
51042  Epson - Workforce 3820   Amazon  2023-07-26                  111   
51043         Epson - ET-2800   Amazon  2022-07-12  Margarita Rodriguez   
51044      Canon PIXMA TS3520   Amazon  2021-09-22        Paul Bridgman   

       Review rating            Review title  \
0                  5              work great   
1  

In [38]:
columns = ['Review Model', 'Review date', 'Review name',
        'Review rating', 'Full review',
        'Verified Purchase or not', 'People_find_helpful', 'vine or not',
        'list price', 'rating count', 'overall rating', 'Original title',
        'Original review',  'Brand',
         'Vector_Raw_Bigram', 'Vector_TFIDF_Bigram', 'Processed_bigram', 'Original_full',
          'ID', 'Topic', 'Probability'
       ]
df_final = melted_df[columns]
df_final.to_csv('document_retrieval.csv') 

In [39]:
# Save the processed data using joblib to feed on Streamlit app 
import joblib

processed_data = df_final
joblib.dump(processed_data, 'processed_data.joblib')

['processed_data.joblib']

# Use tensorflow to embed

In [29]:
#Use tensorflow to embed
# df['Review Content new'] = df['Review Content'].apply(lambda x: str(x) if x is not None else '') 

# def clean_text(text):
#     text = re.sub(r'[^a-zA-Z\s]', '', text)
#     text = text.lower()
#     return text

# df['Cleaned Reviews'] = df['Review Content new'].apply(clean_text)

# def remove_stopwords(text):
#     stop_words = set(stopwords.words('english'))
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)

# df['Final Reviews'] = df['Cleaned Reviews'].apply(remove_stopwords)

# model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# def embed(text,model):
#     embeddings = model(text)
#     return [embedding.numpy() for embedding in embeddings]


# df['Embed_sentence'] = embed(df['Final Reviews'], model)

# df['Embed_sentence']

In [30]:
def embed_label(text,model):
    embeddings = model([text])
    return embeddings.numpy()[0]

text = 'Connectivity'
input_embedding_vector = embed_label(text,model)

df['similarity'] = df['Embed_sentence'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

NameError: name 'model' is not defined

In [None]:
pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'similarity',ascending = False)
columns = ['Brand','Review Model','Review rating','Review Content']
df_select = df_new[columns]
df_final = df_select[df_select['Brand'] == 'HP'].head(4).reset_index(drop = True)
df_final

In [None]:
text = 'Print Quality'
input_embedding_vector = embed_label(text,model)

df['similarity'] = df['Embed_sentence'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'similarity',ascending = False)
columns = ['Brand','Review Model','Review rating','Review Content']
df_select = df_new[columns]
df_final = df_select[df_select['Brand'] == 'HP'].head(4).reset_index(drop = True)
df_final

In [None]:
text = 'Paper jam'
input_embedding_vector = embed_label(text,model)

df['similarity'] = df['Embed_sentence'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'similarity',ascending = False)
columns = ['Brand','Review Model','Review rating','Review Content']
df_select = df_new[columns]
df_final = df_select[df_select['Brand'] == 'HP'].head(4).reset_index(drop = True)
df_final

# Other (Draft)

In [None]:
# labels = [
#     'Setup', 
#     'Connectivity', 
#     'Customer Support', 
#     'Print Quality', 
#     'Print Speed', 
#     'Ink supply and Cartridge', 
#     'Printer Hardware Robustness and sturdiness', 
#     'Control Panel', 
#     'Ease of Use', 
#     'Firmware', 
#     'Business Services and Subscription', 
#      'Paper jam',
#      'Control Panel', 
#     'Other'
# ]

# df_labels = pd.DataFrame({'Labels': labels})
# df_labels

In [None]:
# def docs2vecs(docs,dictionary):
#     vec1 = [dictionary.doc2bow(doc) for doc in docs]
#     tfid = gensim.models.TfidfModel(vec1)
#     vec2 = [tfidf[vec] for vec in vec1]
#     return vec2

# vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 2, stop_words = 'english')
# X = vectorizer.fit_transform(dataset.data)

In [None]:
# model = Word2Vec(sentences = label_token, vector_size=100, window=5, min_count=1, sg=0)

In [None]:
# from gensim.models import Word2Vec
# import pandas as pd

# # List of labels
# labels = [
#     'Setup', 
#     'Connectivity', 
#     'Customer Support', 
#     'Print Quality', 
#     'Print Speed', 
#     'Ink supply and Cartridge', 
#     'Printer Hardware Robustness and sturdiness', 
#     'Control Panel', 
#     'Ease of Use', 
#     'Firmware', 
#     'Business Services and Subscription', 
#     'Paper jam',
#     'Control Panel', 
#     'Other'
# ]

# # Create a DataFrame with labels
# df_labels = pd.DataFrame({'Labels': labels})

# # Split the labels into words (tokens)
# label_tokens = [label.split() for label in labels]

# # Train a Word2Vec model on label tokens
# model = Word2Vec(sentences=label_tokens, vector_size=100, window=5, min_count=1, sg=0)

# # Function to get the embedding of a label
# def label_embedding(label, model):
#     return model.wv[label]  # Use indexing to retrieve the embedding

# # Apply the label_embedding function to embed labels
# df_labels['Label Embeddings'] = df_labels['Labels'].apply(lambda x: label_embedding(x, model))

In [None]:
# def search_reviews(df, product_description, pprint=True):
#     product_embedding = get_embedding(
#         product_description,
#         engine=embedding_model
#     )
#     df[f"similarity_{product_description}"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

#     results = df.sort_values(f"similarity_{product_description}", ascending=False)
                       
#     return results



# summary_df = []

# for label in labels:
#     label_results = search_reviews(df_filter, label, pprint=True)
    
#     # Apply a similarity threshold to filter rows
#     threshold = 0.85
#     filtered_results = label_results[label_results[f"similarity_{label}"] > threshold]
    
#     # Add the label column to the filtered results
#     filtered_results['topic'] = label
    
#     # Append the filtered results to the summary_df
#     summary_df.append(filtered_results)
    
#     time.sleep(5)

# # Concatenate the filtered results into the final summary DataFrame
# summary_df = pd.concat(summary_df)


In [None]:
# columns_to_drop = ['embedding', 'embedding_splitted']

# # Use the drop method to remove the specified columns
# df_phrase_filtered = df_phrase.drop(columns=columns_to_drop)

In [None]:
# def find_most_similar_word(review_embedding, word_vectors, word_list):
#     similarities = [cosine_similarity(review_embedding, word_vector) for word_vector in word_vectors]
#     most_similar_index = similarities.index(max(similarities))
#     return word_list[most_similar_index]  # Return the actual word

# # Assuming you have a DataFrame 'df' with 'embedding' column containing review embeddings
# df_phrase['Topic'] = df_phrase['embedding_splitted'].apply(lambda x: find_most_similar_word(x, label_embeddings, labels))


In [None]:
# df_phrase['Topic2'] = df_phrase['embedding'].apply(lambda x: find_most_similar_word(x, label_embeddings, labels))

In [None]:
df_phrase 