In [118]:
import pandas as pd
import re
import math
import time
import datetime
import numpy as np
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models import Word2Vec
from gensim import models
from gensim import corpora
from gensim import similarities
from openai.embeddings_utils import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import tensorflow_hub as hub 
from numpy.linalg import norm

## Unigram model

In [119]:
df = pd.read_excel('amazon_review_processed_full.xlsx')
#df.columns

In [120]:
# Tokenisation (Full Review contains processed text - applied stemming and stopword removal, spelling check)
df['Tokenized Full review'] = df['Full review'].apply(lambda x: word_tokenize(str(x)) if isinstance(x, str) else [])
df[['Full review', 'Tokenized Full review']].head(5)

Unnamed: 0,Full review,Tokenized Full review
0,must epson util io iphon ipad must epson smart panel app iphon ipad annoy frustrat smart app doesnt allow appl note app least work basic appl app eco tank nice reason dont need epson app check everi time whether left sinc happen everi 6 month year print photo okay noth write home,"[must, epson, util, io, iphon, ipad, must, epson, smart, panel, app, iphon, ipad, annoy, frustrat, smart, app, doesnt, allow, appl, note, app, least, work, basic, appl, app, eco, tank, nice, reason, dont, need, epson, app, check, everi, time, whether, left, sinc, happen, everi, 6, month, year, print, photo, okay, noth, write, home]"
1,major load issu worst major issu load keep get jam doesnt load regular new fan still doesnt load worst custom servic well promis send new epson refurbish cours send refurbish go hassl age custom servic tri anoth new promis purchas januari 2023,"[major, load, issu, worst, major, issu, load, keep, get, jam, doesnt, load, regular, new, fan, still, doesnt, load, worst, custom, servic, well, promis, send, new, epson, refurbish, cours, send, refurbish, go, hassl, age, custom, servic, tri, anoth, new, promis, purchas, januari, 2023]"
2,jam time worst ever purchas jam feeder time also jam feeder bottom make cope scan huge hassl purchas top line perform like bottl line purchas 45 epson printer last 20 year love epson product hate piec junk highli recommend,"[jam, time, worst, ever, purchas, jam, feeder, time, also, jam, feeder, bottom, make, cope, scan, huge, hassl, purchas, top, line, perform, like, bottl, line, purchas, 45, epson, printer, last, 20, year, love, epson, product, hate, piec, junk, highli, recommend]"
3,easi instal work well easi instal bright overal decent,"[easi, instal, work, well, easi, instal, bright, overal, decent]"
4,love work great dont lot everyth work expect,"[love, work, great, dont, lot, everyth, work, expect]"


### Raw term frequency

In [121]:
# Create a dictionary from the tokenized content, bag of words and reverse index
dictionary = corpora.Dictionary(df['Tokenized Full review'])
corpus = [dictionary.doc2bow(text) for text in df['Tokenized Full review']]
Index = similarities.SparseMatrixSimilarity(corpus, len(dictionary))

In [122]:
def query_raw(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    return qVector

In [123]:
qVector = query_raw('low quality')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
8821,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.685994
9416,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.685994
6771,HP DeskJet 2755e,2023-04-29,4,Quality?,Print quality not what I would expect from HP.,0.57735
9316,Canon PIXMA TR4720,2023-07-25,3,"Loud and not high quality, but it’s cheap","Beware the quality is very low on this product, but it does the job that I need. It’s not the quietest printer I’ve had. You get what you pay for.",0.547723
191,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.544331


In [17]:
qVector = query_raw('Paper jam')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
2374,Canon PIXMA TR4720,2022-12-01,2,Printer Jam,"The printer jams each time I print, very annoying. Ink runs out quickly. I barely use the printer due to paper jamming.",0.547723
3944,HP ENVY 6455e,2022-01-22,4,Paper tends to jam,Paper tends to jam,0.5
2470,HP Smart Tank 7301,2023-01-28,1,Jams constantly,The printer consistently jams. It wont print anything. It wont even print a page without jamming.,0.486664
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.4741
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.471405


### TF-IDF

In [124]:
# Create a TFIDF reverse index
TFIDF = models.TfidfModel(corpus)
corpus_TFIDF = [TFIDF[vec] for vec in corpus]
IndexTFIDF = similarities.SparseMatrixSimilarity(corpus_TFIDF, len(dictionary))

In [125]:
def query_tfidf(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    qVectorTFIDF = TFIDF[qVector]
    return qVectorTFIDF

In [126]:
qVectorTFIDF = query_tfidf('low quality')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
9416,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.751272
8821,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.597942
6728,HP DeskJet 2755e,2023-08-17,5,low price,setup took time,0.558956
7422,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.540725
1881,HP Smart Tank 5101,2023-09-10,5,It's works,This one is worth it it has where I can see the when the ink gets low,0.512525


In [103]:
qVectorTFIDF = query_tfidf('paper jam')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
6611,Epson - ET-4850,2023-08-16,1,Paper Jam ALL THE TIME,"I need a copier that I can feed at least a few papers through at a time but this one doesn't copy even one paper without a paper jam, constantly, over and over again. If you need a copier that will consistently work, this one is not it.",0.454711
6179,Canon Pixma TS6420a,2022-08-10,4,Works fine,"I switched from HP to Canon. This printer prints well and offers affordable subscription plans. It is very easy to set up if you follow the instructions in the box and on the screen. I like the fact that it has wireless connection. The printer prints fast but 'moves'. The quality of the photos is excellent. It prints well on papers as well, but depending on the picture quality or paper quality, it sometimes prints papers as bluish with gray. As a student, I use it mostly for printing research papers.",0.443145
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.424191
1473,Canon PIXMA TR4720,2023-03-05,1,Jams and Doesn't Print Correctly,"This printer is awful. Just after the return window was closed, it stopped working! It tries to suck up too many papers at once and jams every single time I print more than one sheet. I had high hopes for this printer, but it was just a waste of money. I'm super disappointed! I have even tried different brands of paper but it still jams. It prints blurry, but I'm not sure if that's from being a crappy printer or from wrestling jammed paper out of it all the time. Either way, avoid this one!!",0.405851
1554,Epson - Workforce 3820,2023-07-12,3,Pick Roller ineffective,"This printer has a pick roller issue. It will often get papers jam, picking multiple papers at once and it is a bit slower compared to similar printers.",0.397878


## Bigram model

In [127]:
#preprocess data to extract bigrams from original reviews
#concat tile and review
df['Original_full'] = df['Original title'].str.cat(df['Original review'], sep=' ', na_rep='')
df['Original_full'].head(5)

0    Must use an Epson utility to print from iOS/iPhone/iPad You must use the Epson Smart Panel app to print from an iPhone or iPad which is annoying. What is frustrating is that the "Smart" app doesn't allow you to print Apple Notes. How can the app not at least work with the basic Apple apps? The Eco tank is nice but all the more reason you don't need an Epson app checking every time whether you have any ink left since this should only happen about once every 6 months to a year. The prints of photos are okay but nothing to write home about.
1                                       Major paper loading issues!!!! Worst printer! Has major issue loading paper. Paper keeps getting jammed and doesn't load. I have regular new paper, I fan the paper and it still doesn't load.Worst customer service as well. they promised me they will send me a new Epson printer not a refurbished one.Of course they send me a refurbished one and I now have to go through the hassle again of ages of customer servi

In [128]:
# Define a function to preprocess and tokenize the text
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()

        # Remove non-alphanumeric characters
        text = re.sub(r'[^\w\s]', ' ', text)

        # Tokenize the text
        unigrams = word_tokenize(text)
        bigrams_list = list(bigrams(unigrams))

        # Stemming
        stemmer = PorterStemmer()
        stemmed_unigrams = [stemmer.stem(word) for word in unigrams]
        stemmed_bigrams = [tuple(stemmer.stem(word) for word in bigram) for bigram in bigrams_list]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        unigrams_without_stopwords = [word for word in stemmed_unigrams if word not in stop_words]
        bigrams_without_stopwords = [' '.join(bigram) for bigram in stemmed_bigrams if not any(word in stop_words for word in bigram)]

        # Join both unigrams and bigrams
        tokens = unigrams_without_stopwords + bigrams_without_stopwords
    else:
        tokens = [] 

    return tokens

# Apply the modified preprocessing function to review data
df['Processed_bigram'] = df['Original_full'].apply(preprocess_text)

### Raw Term Frequency

In [129]:
dictionary_bi = corpora.Dictionary(df['Processed_bigram'])
corpus_bi = [dictionary_bi.doc2bow(text) for text in df['Processed_bigram']]
Index_bi = similarities.SparseMatrixSimilarity(corpus_bi, len(dictionary_bi))

df['Vector_Raw_Bigram'] = corpus_bi
print(len(dictionary_bi))

97667


In [130]:
def query_raw_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    return qVector_bi

In [131]:
qVector_bi = query_raw_bi('low quality')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
8821,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.440225
9416,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.414781
191,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.412393
2184,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.379663
107,Epson - ET-4850,2023-02-27,5,Color and print quality,Quality is excellent and fast,0.365148


In [132]:
qVector_bi = query_raw_bi('Paper jam')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
9710,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.766131
1255,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.755929
5437,HP OfficeJet Pro 8025e,2021-10-17,2,Poor quality.,"If you print something? Paper jams. Reload the paper the only way it can go? Paper jam.WiFi blinks? Paper jam. You leave the printer alone too long? Paper jam. It’s incredible how hard it must be to make a printer that just prints when it’s told to, because no one can seem to do it. I’ve had this for 3 months and it’s a very high maintenance printer.",0.712396
4081,HP ENVY Inspire 7255e,2022-05-14,2,Can't find paper jam,"I can't set it up with my laptop, keeps saying I have a paper jam.I need a printer manual.",0.707107
3255,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.699896


In [133]:
qVector_bi = query_raw_bi('Customer support')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
1818,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.57735
9613,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.495434
992,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.485071
7121,HP DeskJet 2755e,2023-08-08,3,Help printer woes,Had to call customer support to set up/ link to my devices,0.480384
8297,Canon PIXMA TR4720,2021-10-01,1,"Cheap, flimsy and the worst customer support","Very fragile and flimsy printer. I had a simple question for Customer support and it took me an hour to get through. Finally I found out I had to register the product in order to even talk to someone. When you try to register online they ask you for your product through voice recognition. I said my model 3 times and they don't recognize my model number and hang up on you. I did this for an hour, had my wife and son talk into the phone and apparently my model number TR4720 does not exist, even though I have it in front of me.Bottom line if you need customer support you best figure it out on your own because nobody is there to help you.Lets see how long the paper tray holds up, maybe a month or so? I think a piece of tape to hold it together would be better.I will be returning this printer.",0.412082


### TF-IDF

In [134]:
# Create a TFIDF reverse index
TFIDF_bi = models.TfidfModel(corpus_bi)
corpus_TFIDF_bi = [TFIDF_bi[vec] for vec in corpus_bi]
IndexTFIDF_bi = similarities.SparseMatrixSimilarity(corpus_TFIDF_bi, len(dictionary_bi))

df['Vector_TFIDF_Bigram'] = corpus_TFIDF_bi
print(len(dictionary_bi))

97667


In [135]:
def query_tfidf_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    qVectorTFIDF_bi = TFIDF_bi[qVector_bi]
    return qVectorTFIDF_bi

In [136]:
qVectorTFIDF_bi = query_tfidf_bi('low quality')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
7422,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.392702
8821,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.360529
191,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.265875
2184,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.262055
8656,Canon PIXMA TR4720,2023-06-07,1,Item had obviously been used. Ink cartridges were dried up. Very disappointing !,"Printer did not produce quality copies. It arrived with original packaging already removed and Ink installed and dry. With new ink, the copies were low quality. Very disappointed in Cannon.",0.238766


In [137]:
qVectorTFIDF_bi = query_tfidf_bi('paper jam')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
1255,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.688106
2321,Epson - ET-2850,2023-06-13,1,Did not work.,"Printer did not work. It had a paper jam when I was setting up the printer. I cleared the paper jam as directed but the printer would not clear the paper jam error on the printer screen. I called Epson for help, and they said to just send it back.",0.62867
9710,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.552715
3255,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.536228
6119,HP ENVY 6455e,2021-10-05,2,Printer jams …. All the time!,It is difficult to connect to the internet and the paper jams constantly,0.505307


In [138]:
qVectorTFIDF_bi = query_tfidf_bi('customer support')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
1818,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.389979
992,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.38967
9613,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.362506
3286,Epson - Workforce 4820,2020-10-21,1,Software issues - do not order this printer,"I received my first printer and kept getting an error saying ""insert paper cassette correctly"". I am no novice to printers but I called customer support because no matter what I did it wasn't working. Customer support determined it was a software problem and to replace the unit. I did so and received my new one. Yayyy, I finally have a printer after months of back ordering and running to UPS for my print jobs. Nope, this one had the exact same issue, and I was told the exact same thing by customer support.Apparently this is a new model that was just released and they haven't worked out the bugs yet. DO NOT PURCHASE THIS PRINTER.",0.339225
3569,HP ENVY Inspire 7955e,2023-08-14,1,No customer support,A little over a year old and it’s saying there’s a paper jam but there’s no paper in it. Once your warranty expires HP will not talk to you,0.317486


In [139]:
qVectorTFIDF_bi = query_tfidf_bi('bad connection')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
8741,Canon PIXMA TR4720,2022-02-27,2,Master Set Up Didn't,"Master setup would not go past ""Install ink cartridges"", So it was essentially frozen. After reinstalling the color and black inks over and over, it finally continued and finished the installation and printed a test sheet! very frustrating and time consuming. It appears it was a bad connection on one of the ink cartridges.",0.238567
1457,HP OfficeJet Pro 9015e,2021-12-30,4,printer,Not bad printer for price,0.146873
3052,Epson - Workforce 4820,2023-03-26,3,,"The printer seems like a reasonable printer for the price and once it’s set up performs reasonably well. However, when setting up the printer and installing the software for Mac users, it doesn’t always install smoothly and corrupts. The scanner software for installation says to install smart, scan software, but actually that’s the wrong software. It should be scan 2 software, which also corrupts and doesn’t completely install all the time. EPSON technical support is terrible. I can’t say it’s even close to acceptable. There is a big English problem of understanding each other, their phone lines being overseas for north Americans always has a bad connection and will disconnect during your conversation. The technicians don’t know what they’re doing and do not know their products and are constantly putting you on hold to talk to somebody by computer in the USA to try and solve the problems you’re calling about. EPSON really needs to get their act together and bring back English-speaking technical support.",0.139453
5743,Epson - Workforce 3820,2023-08-06,1,unable to connect with wifi,very bad,0.113909
7854,Canon Pixma TS6420a,2022-11-25,5,Works great with phone,I use my phone for almost everything so printer is good for that. Setting up was not too bad,0.099082


In [170]:
qVectorTFIDF_bi = query_tfidf_bi('Return because paper jam')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
1255,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.276452
2321,Epson - ET-2850,2023-06-13,1,Did not work.,"Printer did not work. It had a paper jam when I was setting up the printer. I cleared the paper jam as directed but the printer would not clear the paper jam error on the printer screen. I called Epson for help, and they said to just send it back.",0.252573
9631,Canon PIXMA MG3620,2023-05-08,5,Ease of connection,Printer works very well and because paper is fed and delivered at the front it’s great for a slide out. I’ve printed pictures as well as prints and the colors are very nice. Relatively quiet. Prints remotely too,0.227387
9710,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.222057
4364,HP Smart Tank 6001,2022-12-25,2,Kept on getting paper jam had to return,Since day one I kept on getting paper jams and then one day a glitch where there's no paper but it's jammed did everything I could to fix it in the end just returned it,0.215832


## Export Data

In [140]:
# Get the list of columns to pivot 
columns_to_pivot = ['Topic0', 'Topic1', 'Topic2', 'Topic3', 'Topic4']

# Create a list of id_vars columns by excluding the columns to pivot
id_vars = [col for col in df.columns if col not in columns_to_pivot]

# Melt the DataFrame without listing all id_vars columns explicitly
melted_df = pd.melt(df, id_vars=id_vars, value_vars=columns_to_pivot, var_name='Topic', value_name='Probability')

In [142]:
columns = ['Review Model', 'Review date', 'Review name',
        'Review rating', 'Full review',
        'Verified Purchase or not', 'People_find_helpful', 
        'list price',  'Original title',
        'Original review',  'Brand','Processed_bigram', 'Topic', 'Probability', 'compound'
       ]
df_final = melted_df[columns]
df_final.rename(columns={'list price': "List price"}, inplace=True)

topic_mapping = {
    "Topic0": "Printing job",
    "Topic1": "Support and returns",
    "Topic2": "Print and scan quality",
    "Topic3": "Setup and connection",
    "Topic4": "Cartridge replacement"
}

df_final['Topic'] = df_final['Topic'].replace(topic_mapping)
df_final['Review date'] = pd.to_datetime(df_final['Review date']).dt.strftime('%Y/%m/%d')
df_final
df_final.to_csv('document_retrieval.csv') 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.rename(columns={'list price': "List price"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Topic'] = df_final['Topic'].replace(topic_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Review date'] = pd.to_datetime(df_final['Review date']).dt.strftime('%Y/%m/%d')


In [143]:
# Save the processed data using joblib to feed on Streamlit app 
import joblib

processed_data = df_final
joblib.dump(processed_data, 'processed_data.joblib')

['processed_data.joblib']

## Pre-trained Model 

In [149]:
df['Combined_text'] = df['Original review']+ df['Original title']

df['Review Content new'] = df['Combined_text'].apply(lambda x: str(x) if x is not None else '') 

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

df['Cleaned Reviews'] = df['Review Content new'].apply(clean_text)

model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def embed(text,model):
    embeddings = model(text)
    return [embedding.numpy() for embedding in embeddings]


df['Embed_sentence_full'] = embed(df['Cleaned Reviews'], model)
df['Embed_sentence_full'].head(1)

0    [0.040679608, -0.033425793, -0.025045425, 0.06257245, 0.044001512, 0.02029398, -0.0029381867, -0.0033876088, 0.006384075, 0.010058293, -0.04061897, -0.008215251, -0.025722502, 0.050726403, -0.07280706, 0.0724395, 0.028136518, -0.032982737, 0.034164395, -0.015773084, -0.039595075, 0.011161954, 0.039010186, -0.0602996, 0.07263744, 0.07363291, -0.050808378, 0.017686307, -0.022004604, -0.03532791, -0.039124873, 0.022299647, 0.06980569, -0.0036597042, -0.013681514, -0.026004737, 0.06269214, -0.036454033, 0.014913474, 0.059584584, 0.07285111, 0.051486854, 0.027832558, -0.03867116, 0.027102524, 0.067891285, -0.068553306, 0.05438977, 0.03196614, 0.02768853, -0.07045476, 0.028660955, -0.009724305, 0.059131697, -0.07359828, 0.03490811, -0.062494814, -0.07251798, 0.05807416, -0.06920827, -0.010244656, -0.044454586, 0.051043727, -0.021159256, 0.04275987, 0.026477922, 0.026472587, -0.029791782, -0.029914962, -0.030534077, -0.07205031, -0.02380819, 0.05197437, 0.04664372, 0.04994975, -0.0672161

In [150]:
def embed_label(text,model):
    embedding = model([text])
    return embedding.numpy()[0]

def cosine_similarity(embedding_vetor, embedding_label):
    similarity = np.dot(embedding_vetor,embedding_label)/(norm(embedding_vetor)*norm(embedding_label))
    return similarity

In [168]:
text = 'Return because set up problem'
input_embedding_vector = embed_label(text,model)
df['Similarity_score'] = df['Embed_sentence_full'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'Similarity_score',ascending = False)
columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 
       'Original title','Original review',  'Similarity_score' ]
df_select = df_new[columns]
df_final = df_select.head(4).reset_index(drop = True)
df_final

Unnamed: 0,Review Model,Review date,Review name,Review rating,Original title,Original review,Similarity_score
0,Canon PIXMA MG3620,2023-04-23,Leah,2,Too hard to set up,Returned too hard to set up,0.529793
1,HP Smart Tank 6001,2023-06-30,B. Whipple,1,not user friendly,have to return this. seems not compatable with my computer,0.424549
2,Epson - ET-3850,2023-01-05,Kimberly Ann Johnson,1,JUNK BROKEN,Refuse to return within 1 week of rcvg,0.414577
3,Canon PIXMA TR8620a,2023-03-17,mary mckinne,3,still trying to get it setup,have problems getting it to connect,0.413312


In [163]:
text = 'Paper jam'
input_embedding_vector = embed_label(text,model)
df['Similarity_score'] = df['Embed_sentence_full'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'Similarity_score',ascending = False)
columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 'Full review',
       'Original title','Original review',  'Brand' ]
df_select = df_new[columns]
df_final = df_select.head(4).reset_index(drop = True)
df_final

Unnamed: 0,Review Model,Review date,Review name,Review rating,Full review,Original title,Original review,Brand
0,Epson - Workforce 4820,2022-06-19,Sharon,3,jam dont mind jam sever time fill tray machin purchas 2 month ago fill tray 1 2 tray bam jam cours jam back pain recommend,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",Epson
1,Canon Pixma TS6420a,2023-09-10,Randy,1,jam cassett horribl work want simpl direct put cassett slide nope get stuck easili fit tray top nope jam overtim 30 sheet pleas put tray end chuck,"Papaer Jam, Paper cassette","This printer is horrible. it works when it wants to. Simple directions...put paper in under cassette, slide it in, print, Nope it gets stuck and paper does easily fit in. So use the paper tray at the top, nope paper jam everytime, or 30 sheets of paper on it and you get "" please put paper in tray. ended up chucking it",Canon
2,HP OfficeJet Pro 8025e,2021-10-17,Michael,2,poor qualiti someth jam reload way go jam wifi blink jam leav alon long jam incred hard must make print told seem ive 3 month high mainten,Poor quality.,"If you print something? Paper jams. Reload the paper the only way it can go? Paper jam.WiFi blinks? Paper jam. You leave the printer alone too long? Paper jam. It’s incredible how hard it must be to make a printer that just prints when it’s told to, because no one can seem to do it. I’ve had this for 3 months and it’s a very high maintenance printer.",HP
3,HP OfficeJet Pro 9015e,2022-03-22,Peggy Bailey,3,continu jam continu jam work,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",HP


In [164]:
text = 'HP support is bad'
input_embedding_vector = embed_label(text,model)
df['Similarity_score'] = df['Embed_sentence_full'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'Similarity_score',ascending = False)
columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 'Full review',
       'Original title','Original review',  'Brand' ]
df_select = df_new[columns]
df_final = df_select.head(3).reset_index(drop = True)
df_final

Unnamed: 0,Review Model,Review date,Review name,Review rating,Full review,Original title,Original review,Brand
0,HP ENVY 6055e,2023-08-10,,1,pretti light terribl setup nonexist custom support spent hour half tri connect without success hi virtual assist worthless issu resolut hi live chat support unavail hi live phone support unavail would highli recommend look compet brand like canon epson avoid altogeth mood light pretti cool though 0 5 star,Pretty light | Terrible printer setup | Non-existent customer support.,I have spent over an hour and a half trying to get this printer connected without success. HP's virtual assistant is worthless in issue resolution. HP's live chat support is unavailable. HP's live phone support is unavailable. I would highly recommend looking for a competing brand like Canon or Epson to avoid HP altogether.The mood light is pretty cool though.0/5 Stars.,HP
1,HP ENVY Inspire 7255e,2022-07-19,R. R. Stewart,1,printer garbag po,ALL HP PRINTERS ARE GARBAGE!,pos,HP
2,HP OfficeJet Pro 9015e,2023-02-14,Toni*,3,absolut horribl tech support ord 2 year like tri help joke tri 2 year warranti damn frustrat,ABSOLULY HORRIBLE TECH SUPPORT!!,"This is my 3rd HP printer in 2 years! I like this one, but trying to get help from hp is such a joke!!Just try and get the 2 year warranty! So damn frustrated!!!!",HP
