In [47]:
import pandas as pd
import re
import math
import time
import datetime
import numpy as np
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models import Word2Vec
from gensim import models
from gensim import corpora
from gensim import similarities
from openai.embeddings_utils import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import tensorflow_hub as hub 
from numpy.linalg import norm

## Unigram model

In [9]:
df = pd.read_excel('amazon_review_processed_full.xlsx')
#df.columns

In [13]:
# Tokenisation (Full Review contains processed text - applied stemming and stopword removal, spelling check)
df['Tokenized Full review'] = df['Full review'].apply(lambda x: word_tokenize(str(x)) if isinstance(x, str) else [])
df[['Full review', 'Tokenized Full review']].head(5)

Unnamed: 0,Full review,Tokenized Full review
0,work great easi instal work great,"[work, great, easi, instal, work, great]"
1,spunki mid size slower speed im use old model ...,"[spunki, mid, size, slower, speed, im, use, ol..."
2,i,[i]
3,shag know he talk yup slam upgrad paid littl f...,"[shag, know, he, talk, yup, slam, upgrad, paid..."
4,user friendli would think someth simpl reconne...,"[user, friendli, would, think, someth, simpl, ..."


### Raw term frequency

In [14]:
# Create a dictionary from the tokenized content, bag of words and reverse index
dictionary = corpora.Dictionary(df['Tokenized Full review'])
corpus = [dictionary.doc2bow(text) for text in df['Tokenized Full review']]
Index = similarities.SparseMatrixSimilarity(corpus, len(dictionary))

In [15]:
def query_raw(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    return qVector

In [16]:
qVector = query_raw('low quality')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.685994
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.685994
3805,HP DeskJet 2755e,2023-04-29,4,Quality?,Print quality not what I would expect from HP.,0.57735
9777,Canon PIXMA TR4720,2023-07-25,3,"Loud and not high quality, but it’s cheap","Beware the quality is very low on this product, but it does the job that I need. It’s not the quietest printer I’ve had. You get what you pay for.",0.547723
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.544331


In [17]:
qVector = query_raw('Paper jam')
simRaw = Index[qVector]
df['Similarity_Raw'] = simRaw
df_raw = df.sort_values(by = 'Similarity_Raw', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw"]
pd.set_option('display.max_colwidth', None)
df_raw[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw
2374,Canon PIXMA TR4720,2022-12-01,2,Printer Jam,"The printer jams each time I print, very annoying. Ink runs out quickly. I barely use the printer due to paper jamming.",0.547723
3944,HP ENVY 6455e,2022-01-22,4,Paper tends to jam,Paper tends to jam,0.5
2470,HP Smart Tank 7301,2023-01-28,1,Jams constantly,The printer consistently jams. It wont print anything. It wont even print a page without jamming.,0.486664
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.4741
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.471405


### TF-IDF

In [18]:
# Create a TFIDF reverse index
TFIDF = models.TfidfModel(corpus)
corpus_TFIDF = [TFIDF[vec] for vec in corpus]
IndexTFIDF = similarities.SparseMatrixSimilarity(corpus_TFIDF, len(dictionary))

In [19]:
def query_tfidf(text):
    qList = text.split()  
    qLower = [w.lower() for w in qList]
    stemmer = PorterStemmer()
    qStemmed = [stemmer.stem(w) for w in qLower]
    qVector = dictionary.doc2bow(qStemmed)
    qVectorTFIDF = TFIDF[qVector]
    return qVectorTFIDF

In [20]:
qVectorTFIDF = query_tfidf('low quality')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.751272
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.597942
3187,HP DeskJet 2755e,2023-08-17,5,low price,setup took time,0.558956
1593,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.540725
1627,HP Smart Tank 5101,2023-09-10,5,It's works,This one is worth it it has where I can see the when the ink gets low,0.512525


In [21]:
qVectorTFIDF = query_tfidf('paper jam')
simTFIDF = IndexTFIDF[qVectorTFIDF]
df['Similarity_TFIDF'] = simTFIDF
df_tfidf = df.sort_values(by = 'Similarity_TFIDF', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF"]
pd.set_option('display.max_colwidth', None)
df_tfidf[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF
6611,Epson - ET-4850,2023-08-16,1,Paper Jam ALL THE TIME,"I need a copier that I can feed at least a few papers through at a time but this one doesn't copy even one paper without a paper jam, constantly, over and over again. If you need a copier that will consistently work, this one is not it.",0.454711
6179,Canon Pixma TS6420a,2022-08-10,4,Works fine,"I switched from HP to Canon. This printer prints well and offers affordable subscription plans. It is very easy to set up if you follow the instructions in the box and on the screen. I like the fact that it has wireless connection. The printer prints fast but 'moves'. The quality of the photos is excellent. It prints well on papers as well, but depending on the picture quality or paper quality, it sometimes prints papers as bluish with gray. As a student, I use it mostly for printing research papers.",0.443145
8904,HP ENVY Inspire 7955e,2023-08-21,1,paper jams CONSTANTLY,"Every time I try to print more than one page - the pages get jammed (although no paper actually gets stuck - it partially prints and then says jammed). I've tried different papers - thinner, thicker - it doesn't matter. It prints 2 pages and then jams - prints 2 more and jams. It partially prints and says it's jammed and ended up wasting over 30 pieces of paper trying to print an 11 page document.",0.424191
1473,Canon PIXMA TR4720,2023-03-05,1,Jams and Doesn't Print Correctly,"This printer is awful. Just after the return window was closed, it stopped working! It tries to suck up too many papers at once and jams every single time I print more than one sheet. I had high hopes for this printer, but it was just a waste of money. I'm super disappointed! I have even tried different brands of paper but it still jams. It prints blurry, but I'm not sure if that's from being a crappy printer or from wrestling jammed paper out of it all the time. Either way, avoid this one!!",0.405851
1554,Epson - Workforce 3820,2023-07-12,3,Pick Roller ineffective,"This printer has a pick roller issue. It will often get papers jam, picking multiple papers at once and it is a bit slower compared to similar printers.",0.397878


## Bigram model

In [22]:
#preprocess data to extract bigrams from original reviews
#concat tile and review
df['Original_full'] = df['Original title'].str.cat(df['Original review'], sep=' ', na_rep='')
df['Original_full'].head(5)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Works great 🖨 was easy to install and works great.
1                                                                                                                  

In [23]:
# Define a function to preprocess and tokenize the text
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()

        # Remove non-alphanumeric characters
        text = re.sub(r'[^\w\s]', ' ', text)

        # Tokenize the text
        unigrams = word_tokenize(text)
        bigrams_list = list(bigrams(unigrams))

        # Stemming
        stemmer = PorterStemmer()
        stemmed_unigrams = [stemmer.stem(word) for word in unigrams]
        stemmed_bigrams = [tuple(stemmer.stem(word) for word in bigram) for bigram in bigrams_list]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        unigrams_without_stopwords = [word for word in stemmed_unigrams if word not in stop_words]
        bigrams_without_stopwords = [' '.join(bigram) for bigram in stemmed_bigrams if not any(word in stop_words for word in bigram)]

        # Join both unigrams and bigrams
        tokens = unigrams_without_stopwords + bigrams_without_stopwords
    else:
        tokens = [] 

    return tokens

# Apply the modified preprocessing function to review data
df['Processed_bigram'] = df['Original_full'].apply(preprocess_text)

### Raw Term Frequency

In [24]:
dictionary_bi = corpora.Dictionary(df['Processed_bigram'])
corpus_bi = [dictionary_bi.doc2bow(text) for text in df['Processed_bigram']]
Index_bi = similarities.SparseMatrixSimilarity(corpus_bi, len(dictionary_bi))

df['Vector_Raw_Bigram'] = corpus_bi
print(len(dictionary_bi))

97667


In [25]:
def query_raw_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    return qVector_bi

In [26]:
qVector_bi = query_raw_bi('low quality')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.440225
686,Canon PIXMA TS3520,2022-11-13,5,good quality low price,Very pleased with printer. Good quality for a low price.,0.414781
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.412393
2076,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.379663
3449,Epson - ET-4850,2023-02-27,5,Color and print quality,Quality is excellent and fast,0.365148


In [28]:
qVector_bi = query_raw_bi('Paper jam')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
4545,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.766131
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.755929
5691,HP OfficeJet Pro 8025e,2021-10-17,2,Poor quality.,"If you print something? Paper jams. Reload the paper the only way it can go? Paper jam.WiFi blinks? Paper jam. You leave the printer alone too long? Paper jam. It’s incredible how hard it must be to make a printer that just prints when it’s told to, because no one can seem to do it. I’ve had this for 3 months and it’s a very high maintenance printer.",0.712396
7614,HP ENVY Inspire 7255e,2022-05-14,2,Can't find paper jam,"I can't set it up with my laptop, keeps saying I have a paper jam.I need a printer manual.",0.707107
5705,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.699896


In [29]:
qVector_bi = query_raw_bi('Customer support')
SimRaw_bi = Index_bi[qVector_bi]
df['Similarity_Raw_Bigram'] = SimRaw_bi
df_raw_bi = df.sort_values(by = 'Similarity_Raw_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_Raw_Bigram"]
pd.set_option('display.max_colwidth', None)
df_raw_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_Raw_Bigram
6889,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.57735
3330,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.495434
7061,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.485071
8300,HP DeskJet 2755e,2023-08-08,3,Help printer woes,Had to call customer support to set up/ link to my devices,0.480384
738,Canon PIXMA TR4720,2021-10-01,1,"Cheap, flimsy and the worst customer support","Very fragile and flimsy printer. I had a simple question for Customer support and it took me an hour to get through. Finally I found out I had to register the product in order to even talk to someone. When you try to register online they ask you for your product through voice recognition. I said my model 3 times and they don't recognize my model number and hang up on you. I did this for an hour, had my wife and son talk into the phone and apparently my model number TR4720 does not exist, even though I have it in front of me.Bottom line if you need customer support you best figure it out on your own because nobody is there to help you.Lets see how long the paper tray holds up, maybe a month or so? I think a piece of tape to hold it together would be better.I will be returning this printer.",0.412082


### TF-IDF

In [30]:
# Create a TFIDF reverse index
TFIDF_bi = models.TfidfModel(corpus_bi)
corpus_TFIDF_bi = [TFIDF_bi[vec] for vec in corpus_bi]
IndexTFIDF_bi = similarities.SparseMatrixSimilarity(corpus_TFIDF_bi, len(dictionary_bi))

df['Vector_TFIDF_Bigram'] = corpus_TFIDF_bi
print(len(dictionary_bi))

97667


In [31]:
def query_tfidf_bi(query):
    preprocessed_query = preprocess_text(query)
    qVector_bi = dictionary_bi.doc2bow(preprocessed_query)
    qVectorTFIDF_bi = TFIDF_bi[qVector_bi]
    return qVectorTFIDF_bi

In [32]:
qVectorTFIDF_bi = query_tfidf_bi('low quality')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
1593,HP ENVY 6055e,2021-12-03,3,Low Quality,this printer disconnects all the time. I spend a lot of time trying to print. the printer came with toner but my printer is reading as it is low ink?? it's really frustrating trying to use this.,0.392702
5238,Canon PIXMA TR4720,2023-08-16,2,Low quality printing,"It was easy to install, but the printing job sucked. I decided to use it to print PP slides for the first time and the quality was very low.",0.360529
7068,Epson - ET-4850,2023-05-14,1,Very Poor Quality,I have had this printer for several months. It is cheap quality. Paper constantly jams. The interface is baffling and nearly unusable. Now it jams every time I use it. Just a very low quality piece of equipment.,0.265875
2076,Epson - ET-2800,2023-02-28,2,,There’s only normal or high quality. There’s no draft or low quality mode to save ink. This really bothers me. I’ll return to post about this printer yield. At least I got rid of cartridges.,0.262055
3767,Canon PIXMA TR4720,2023-06-07,1,Item had obviously been used. Ink cartridges were dried up. Very disappointing !,"Printer did not produce quality copies. It arrived with original packaging already removed and Ink installed and dry. With new ink, the copies were low quality. Very disappointed in Cannon.",0.238766


In [33]:
qVectorTFIDF_bi = query_tfidf_bi('paper jam')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
400,HP OfficeJet Pro 9015e,2022-03-22,3,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",0.688106
3766,Epson - ET-2850,2023-06-13,1,Did not work.,"Printer did not work. It had a paper jam when I was setting up the printer. I cleared the paper jam as directed but the printer would not clear the paper jam error on the printer screen. I called Epson for help, and they said to just send it back.",0.62867
4545,Canon PIXMA MG3620,2023-06-28,1,Consistent paper jams,"Most of the paper jams I could fix. However, I had a paper jam so bad, it destroyed the rollers so I used the warranty. Then, it happened again. Do yourself a favor, buy a different printer.",0.552715
5705,Epson - Workforce 4820,2022-06-19,3,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",0.536228
5398,HP ENVY 6455e,2021-10-05,2,Printer jams …. All the time!,It is difficult to connect to the internet and the paper jams constantly,0.505307


In [34]:
qVectorTFIDF_bi = query_tfidf_bi('customer support')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
6889,HP OfficeJet Pro 8034e,2023-06-12,1,Worst Customer Support Ever!,I just loaded brand new cartridges and my printing is completely faded. Absolutely horrible getting in touch with customer support to help. Do not buy!,0.389979
7061,HP OfficeJet Pro 9025e,2022-09-22,1,"Won't connect to scan, no customer support",I have had three of these same printers for 5 years. This one will not connect to my wifi network or scan. The customer support number is not a working number. HP tries to charge to answer any questions. I would not recommend!,0.38967
3330,Canon PIXMA MG3620,2023-07-26,3,Powers completely off when not in use,"Returning this. For 1. when not in use it shuts down completely. I don’t know how long it stays idle before shutting down and there’s no way to find out because it’s not in the manual nor can you call customer support without having an account and providing your email and phone number. I also tried to call customer support to get an answer of is a Google Chromebook compatible? It does not appear to be but that is not listed anywhere in the instructions and again, I can’t speak to anyone in customer support unless I create an account. It took me awhile to set it up wirelessly but I succeeded and have printed from my iPhone a few times, but the Chromebook is a no go.",0.362506
6289,Epson - Workforce 4820,2020-10-21,1,Software issues - do not order this printer,"I received my first printer and kept getting an error saying ""insert paper cassette correctly"". I am no novice to printers but I called customer support because no matter what I did it wasn't working. Customer support determined it was a software problem and to replace the unit. I did so and received my new one. Yayyy, I finally have a printer after months of back ordering and running to UPS for my print jobs. Nope, this one had the exact same issue, and I was told the exact same thing by customer support.Apparently this is a new model that was just released and they haven't worked out the bugs yet. DO NOT PURCHASE THIS PRINTER.",0.339225
10119,HP ENVY Inspire 7955e,2023-08-14,1,No customer support,A little over a year old and it’s saying there’s a paper jam but there’s no paper in it. Once your warranty expires HP will not talk to you,0.317486


In [35]:
qVectorTFIDF_bi = query_tfidf_bi('bad connection')
simTFIDF_bi = IndexTFIDF_bi[qVectorTFIDF_bi]
df['Similarity_TFIDF_Bigram'] = simTFIDF_bi
df_tfidf_bi = df.sort_values(by = 'Similarity_TFIDF_Bigram', ascending=False)
selected_columns = ["Review Model", "Review date", "Review rating", "Original title", "Original review", "Similarity_TFIDF_Bigram"]
pd.set_option('display.max_colwidth', None)
df_tfidf_bi[selected_columns].head(5)

Unnamed: 0,Review Model,Review date,Review rating,Original title,Original review,Similarity_TFIDF_Bigram
4570,Canon PIXMA TR4720,2022-02-27,2,Master Set Up Didn't,"Master setup would not go past ""Install ink cartridges"", So it was essentially frozen. After reinstalling the color and black inks over and over, it finally continued and finished the installation and printed a test sheet! very frustrating and time consuming. It appears it was a bad connection on one of the ink cartridges.",0.238567
4216,HP OfficeJet Pro 9015e,2021-12-30,4,printer,Not bad printer for price,0.146873
2343,Epson - Workforce 4820,2023-03-26,3,,"The printer seems like a reasonable printer for the price and once it’s set up performs reasonably well. However, when setting up the printer and installing the software for Mac users, it doesn’t always install smoothly and corrupts. The scanner software for installation says to install smart, scan software, but actually that’s the wrong software. It should be scan 2 software, which also corrupts and doesn’t completely install all the time. EPSON technical support is terrible. I can’t say it’s even close to acceptable. There is a big English problem of understanding each other, their phone lines being overseas for north Americans always has a bad connection and will disconnect during your conversation. The technicians don’t know what they’re doing and do not know their products and are constantly putting you on hold to talk to somebody by computer in the USA to try and solve the problems you’re calling about. EPSON really needs to get their act together and bring back English-speaking technical support.",0.139453
807,Epson - Workforce 3820,2023-08-06,1,unable to connect with wifi,very bad,0.113909
6108,Canon Pixma TS6420a,2022-11-25,5,Works great with phone,I use my phone for almost everything so printer is good for that. Setting up was not too bad,0.099082


## Export Data

In [41]:
# Get the list of columns to pivot 
columns_to_pivot = ['Topic0', 'Topic1', 'Topic2', 'Topic3', 'Topic4']

# Create a list of id_vars columns by excluding the columns to pivot
id_vars = [col for col in df.columns if col not in columns_to_pivot]

# Melt the DataFrame without listing all id_vars columns explicitly
melted_df = pd.melt(df, id_vars=id_vars, value_vars=columns_to_pivot, var_name='Topic', value_name='Probability')

In [60]:
columns = ['Review Model', 'Review date', 'Review name',
        'Review rating', 'Full review',
        'Verified Purchase or not', 'People_find_helpful', 'vine or not',
        'list price', 'rating count', 'overall rating', 'Original title',
        'Original review',  'Brand','Tokenized Full review',
         'Vector_Raw_Bigram', 'Vector_TFIDF_Bigram', 'Processed_bigram', 'Original_full',
          'ID', 'Topic', 'Probability','neg', 'neu', 'pos', 'compound'
       ]
df_final = melted_df[columns]
df_final.to_csv('document_retrieval.csv') 

In [61]:
# Save the processed data using joblib to feed on Streamlit app 
import joblib

processed_data = df_final
joblib.dump(processed_data, 'processed_data.joblib')

['processed_data.joblib']

## LLM Model 

In [48]:
df['Combined_text'] = df['Original review']+ df['Original title']

df['Review Content new'] = df['Combined_text'].apply(lambda x: str(x) if x is not None else '') 

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

df['Cleaned Reviews'] = df['Review Content new'].apply(clean_text)

model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def embed(text,model):
    embeddings = model(text)
    return [embedding.numpy() for embedding in embeddings]


df['Embed_sentence_full'] = embed(df['Cleaned Reviews'], model)
df['Embed_sentence_full'].head(1)

0    [-0.06476055, -0.046573855, -0.027457241, 0.06322332, 0.070216216, -0.025670437, 0.050893664, -0.004295973, 0.03966454, 0.0054599927, -0.049052257, -0.032727037, -0.011888139, 0.025713963, 0.013434531, 0.08975346, 0.011625516, 0.06539028, -0.030184822, 0.05018931, -0.049441848, 0.03448917, -0.059821535, 0.020342916, -0.044016834, 0.0050706957, 0.034620825, 0.0055083036, -0.034311306, -0.016627306, 0.07001131, 0.030078024, -0.012592496, -0.008625138, -0.028716592, 0.07233813, 0.045581482, -0.042595785, 0.021744747, -0.0060381214, 0.0154395485, -0.051222473, 0.016993193, 0.05014672, 0.006472177, -0.07095797, -0.046874862, -0.030451862, -0.009931262, -0.0041217143, 0.007588667, -0.005515727, -0.04068433, -0.030772027, 0.05853296, -0.00013855337, 0.029832633, -0.0034011218, -0.014614559, 0.02438878, 0.053751674, 0.018763307, 0.03760518, 0.058118768, 0.08511807, 0.022973988, 0.05065482, 0.061361168, 0.05189148, 0.014992673, 0.017877309, 0.029545128, 0.046483662, -0.016950184, 0.0049429

In [49]:
def embed_label(text,model):
    embedding = model([text])
    return embedding.numpy()[0]

def cosine_similarity(embedding_vetor, embedding_label):
    similarity = np.dot(embedding_vetor,embedding_label)/(norm(embedding_vetor)*norm(embedding_label))
    return similarity

In [53]:
text = 'Bad Connection'
input_embedding_vector = embed_label(text,model)
df['LLM_similarity_score'] = df['Embed_sentence_full'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'LLM_similarity_score',ascending = False)
columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 'Full review',
       'Original title','Original review',  'Brand' ]
df_select = df_new[columns]
df_final = df_select.head(4).reset_index(drop = True)
df_final

Unnamed: 0,Review Model,Review date,Review name,Review rating,Full review,Original title,Original review,Brand
0,Epson - ET-4850,2022-08-02,Ronel Calvert,3,bad internet connect,Printer,Bad internet conection,Epson
1,HP Smart Tank 6001,2023-07-16,C. Brown,1,return connect problem,Returned,Connection problems,HP
2,Canon PIXMA TR4720,2022-09-29,William G. Gillam,3,network connect weak troubl connect differ devic,Network connectivity is weak,I have trouble connecting from different devices.,Canon
3,HP ENVY 6055e,2022-08-31,Julie K.,2,constantli connect problem jam etc qualiti fine actual work wast much time tri fix variou connect error etc sinc bought year ago current tri fix jam easytous product,"Constantly has connection problems, paper jams, etc.","The quality is fine when it actually works. But I have wasted SO much time trying to fix various connection errors, etc. since I bought it a year ago. Currently trying to fix a paper jam. Not an easy-to-use product.",HP


In [54]:
text = 'Paper jam'
input_embedding_vector = embed_label(text,model)
df['LLM_similarity_score'] = df['Embed_sentence_full'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'LLM_similarity_score',ascending = False)
columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 'Full review',
       'Original title','Original review',  'Brand' ]
df_select = df_new[columns]
df_final = df_select.head(4).reset_index(drop = True)
df_final

Unnamed: 0,Review Model,Review date,Review name,Review rating,Full review,Original title,Original review,Brand
0,Epson - Workforce 4820,2022-06-19,Sharon,3,jam dont mind jam sever time fill tray machin purchas 2 month ago fill tray 1 2 tray bam jam cours jam back pain recommend,Paper jam,"If you don’t mind a paper jam several times after filling the paper tray, then this machine is for you. I purchased 2 months ago. Fill the paper tray, use about 1/2 the paper in the tray, and bam…. A paper jam!Of course the jam is in the back of printer…. What a pain!!!I do not recommend this printer",Epson
1,Canon Pixma TS6420a,2023-09-10,Randy,1,jam cassett horribl work want simpl direct put cassett slide nope get stuck easili fit tray top nope jam overtim 30 sheet pleas put tray end chuck,"Papaer Jam, Paper cassette","This printer is horrible. it works when it wants to. Simple directions...put paper in under cassette, slide it in, print, Nope it gets stuck and paper does easily fit in. So use the paper tray at the top, nope paper jam everytime, or 30 sheets of paper on it and you get "" please put paper in tray. ended up chucking it",Canon
2,HP OfficeJet Pro 8025e,2021-10-17,Michael,2,poor qualiti someth jam reload way go jam wifi blink jam leav alon long jam incred hard must make print told seem ive 3 month high mainten,Poor quality.,"If you print something? Paper jams. Reload the paper the only way it can go? Paper jam.WiFi blinks? Paper jam. You leave the printer alone too long? Paper jam. It’s incredible how hard it must be to make a printer that just prints when it’s told to, because no one can seem to do it. I’ve had this for 3 months and it’s a very high maintenance printer.",HP
3,HP OfficeJet Pro 9015e,2022-03-22,Peggy Bailey,3,continu jam continu jam work,continues to have paper jams,"what can I do, this printer continues to have paper jams and is not working",HP


In [58]:
text = 'Support and returns'
input_embedding_vector = embed_label(text,model)
df['LLM_similarity_score'] = df['Embed_sentence_full'].apply(lambda x: cosine_similarity(input_embedding_vector,x))

pd.set_option('display.max_colwidth', None)
df_new = df.sort_values(by = 'LLM_similarity_score',ascending = False)
columns = ['Review Model', 'Review date', 'Review name',
       'Review rating', 'Full review',
       'Original title','Original review',  'Brand' ]
df_select = df_new[columns]
df_final = df_select.head(4).reset_index(drop = True)
df_final

Unnamed: 0,Review Model,Review date,Review name,Review rating,Full review,Original title,Original review,Brand
0,HP OfficeJet Pro 9025e,2021-05-06,Scotthpdx,1,realli sucki want return unhappi call support sever time joy keep drop network reinstal week scan poor want return return window close yesterday tri reach seller updat call amazon custom servic abl quickli effici arrang return,Really sucky printer - want to return. UNHAPPY,Have called support several times and no joy. Keeps dropping off my network and have to reinstall once a week. Scanning is poor and I want to return but my return window closed yesterday. Will try to reach the selller.Update - Called over to Amazon customer service and they were able to quickly and efficiently arrange a return of the printer.,HP
1,Canon PIXMA TS3520,2022-05-20,william,1,junk thing work quit support non exist go buy,Junk,This thing worked for a while and then quit. Support is non existence. Go buy a hp.,Canon
2,HP OfficeJet Pro 9015e,2022-11-28,kristal e blair,1,piec crap cheap never buy bought month unabl call amazon sent tech support went step 1 5 month later part start fall call amazon back return window sent back support sat phone hour 6 differ video chat ask credit card replac assur polici call amazon back refund miss restock fee happi give promot fee make differ actual show account see end full refund never buy,Piece of crap! Cheap!!,"Will never buy HP again!!!Bought printer. After one month unable to print. Called Amazon, sent me to HP tech support went through all of the steps there. 1.5 months later parts started falling off of the printer. Called Amazon back, now out of return window- sent back to Hp support. Sat on the phone for over and hour 6 different video chats. Now I am being asked for my credit card # to replace the printer as an assurance policy. Called Amazon back again. Now, the refund I get in miss a restocking fee. I am not happy with that so they will give a promotion fee to make up the difference of which does not actually show up on your account, they can only “see it” on their end.#brokenprinter, no a full refund!!! Will never buy HP again!",HP
3,Canon PIXMA TR8620a,2023-03-11,Lori,2,big desk unpack big desk return wanna know could return kohl like item purchas amazon take up close point drop point believ call im lose 5 99 return abl return free kohl,To big for my desk,This printer when unpacked was to big for my desk so I had to return it. I wanna know why I could not do a return through Kohls like most items purchased through Amazon? I had to take it to UPS and the only one closes was a point drop off point as I believe they called it. so I’m losing $5.99 off my return when I should have been able to return it free through Kohls,Canon
