In [1]:
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mriva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
ASIN_FOLDER_PATH = '../datasets/asin_text_split/'


stop_words = set(stopwords.words('english'))

def get_asin_text_by_category(i, path=ASIN_FOLDER_PATH):
    return pd.read_csv(f"{path}asin_text_{i}.csv")

def process_text(text):
    t = re.sub(r'[\W]+', ' ', text)
    t = t.lower()
    t = PorterStemmer().stem(t)
    t = word_tokenize(t)
    t = [w for w in t if not w in stop_words]
    t = " ".join(t)
    return t

def get_similar(x, asins, n_preds = 3):
    return list(asins.loc[x.argsort()[(n_preds+1)*-1:-1]])

def suggest_asin(entry, category):
    df = get_asin_text_by_category(category)
    stem_entry = process_text(entry)
    corpus = list(df['text'])
    corpus.append(stem_entry)
    tfidf = TfidfVectorizer(stop_words = None, max_features = 20_000)
    tfidf = tfidf.fit(corpus)
    data_vec = tfidf.transform(df['text'])
    entry_vec = tfidf.transform([stem_entry])
    sim_unigram=cosine_similarity(entry_vec, data_vec)
    return get_similar(sim_unigram[0], df['asin']) 

In [3]:
suggest_asin('i want an iphone', 5)

['B003ELOOZO', 'B00CPK7U60', 'B0082YVBM2']