In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mriva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mriva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('../datasets/cleaned/combined_text.csv')

In [3]:
lemmatizer = WordNetLemmatizer()

In [4]:
appliance_df = df.groupby('category').get_group('appliances')

In [5]:
def process_text(text):
    t = re.sub(r'[\W]+', ' ', text)
    t = t.lower()
    t = PorterStemmer().stem(t)
    t = word_tokenize(t)
    t = [w for w in t if not w in stop_words]
    t = " ".join(t)
    return t

In [6]:
stop_words = set(stopwords.words('english'))
data = pd.DataFrame()
for asin, items  in appliance_df.groupby('asin').groups.items():
    test_text = appliance_df.loc[items[0], 'combined_text']
    test_text = process_text(test_text)
    
    train_text = " ".join(appliance_df.loc[items[1:], 'combined_text'])
    train_text = process_text(train_text)
    
    data = data.append(pd.DataFrame({
        'label': [asin], 
        'train': [train_text],
        'test': [test_text]
    }), ignore_index=True)

In [7]:
tfidf = TfidfVectorizer(stop_words = 'english',
                             max_features = 50_000)
corpus = data['train'] + ' ' + data['test']
tfidf = tfidf.fit(corpus)
train_vec = tfidf.transform(data['train'])
test_vec = tfidf.transform(data['test'])

In [8]:
sim_unigram=cosine_similarity(test_vec, train_vec)

In [9]:
def get_similar(x, n_preds = 3):
    return list(data['label'].loc[x.argsort()[(n_preds+1)*-1:-1]])

In [10]:
most_similar_list = [get_similar(x) for x in sim_unigram]

In [11]:
data['pred'] = most_similar_list

In [12]:
data['score'] = [1 if label in sim else 0 for label, sim in zip(data['label'], data['pred'])]

In [13]:
data

Unnamed: 0,label,train,test,pred,score
0,B00004U9JO,power cord included plug disposal unit ttok mi...,badger 5 leaking water bottom near reset botto...,"[B00004U9JP, B00004U9JO, B008M2IPBE]",1
1,B00004U9JP,model number may help insinkerator model badge...,9 year old badger 1 needs replacing badger 1 i...,"[B001B4E0VY, B00004U9JP, B000BQOWES]",1
2,B00004W4UJ,electic stove varying sizes burners mentioned ...,burner plates electric glass cooktops burner p...,"[B004MLTDBU, B005HYW4M6, B00004W4UK]",0
3,B00004W4UK,electic stove varying sizes burners mentioned ...,burner plates electric glass cooktops burner p...,"[B004MLTDBU, B005HYW4M6, B00004W4UK]",1
4,B00005AUHX,many strips come kit need least 12 kit contain...,many test kits box mant times test water sever...,"[B00797TR56, B007K1NT82, B00B9DZLAG]",0
...,...,...,...,...,...
1148,B00J39MKMA,haul away old appliances particular delivery c...,dimensions overall width 27 overall depth 29 3...,"[B0058JRXYE, B008BVX4QC, B006O7BPE8]",0
1149,B00JJZBHK4,hello amis warranty manufacturer repair return...,many pieces clothes put washer videos washer s...,"[B00HZRJVCO, B00CDWTQKI, B005GM942C]",0
1150,B00KJ07SEM,filter made filter assembled iso 9001 quality ...,name manufacturer waterfall show nsf42 nsf53 c...,"[B000GAQFPS, B0006MQCA4, B0073B0YBO]",0
1151,B00KX3M1LA,see two model numbers lyric anyone tell difere...,two lyric thermostats coexist home e multiple ...,"[B005Y29KL4, B009NWOUG6, B00GJ2EZQ8]",0


In [14]:
accuracy = data.score.sum() / data.score.count()
accuracy

0.1647875108412836