In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
ABSOLUTE_PATH = "../../datasets/"
df = pd.read_csv(ABSOLUTE_PATH+"category_dataset.csv")
df["keywords_array"] = df["keywords_array"].apply(literal_eval)

In [None]:
#Checking length of array for each product through histogram
nk = []
for keywords in df["keywords_array"]:
    nc = len(keywords)
    nk.append(nc)
plt.hist(nk)
plt.xlabel("No. of characters in keyword array column")
plt.ylabel("No. of products")
plt.show()

In [None]:
col_array = ["keywords_array"]
for col in col_array:
    print("%s: %.3f"% (col, np.mean(df[col].str.len())))

In [None]:
number_of_keywords = []
for keywords in df["keywords_array"]:
    n_keywords = len(keywords.split(','))
    number_of_keywords.append(n_keywords)
    
plt.hist(number_of_keywords)
plt.xlabel("number of words in keyword column")
plt.ylabel("number of products")


In [15]:
keywords = df["keywords_array"].tolist()

In [16]:
dictionary = Dictionary(keywords)
corpus = [dictionary.doc2bow(doc) for doc in keywords] 

In [18]:

tfidf = TfidfModel(corpus) 
sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

In [21]:
def keywords_recommendation(keywords, number_of_hits):
    query_doc_bow = dictionary.doc2bow(keywords) # get a bag of words from the query_doc
    query_doc_tfidf = tfidf[query_doc_bow] #convert the regular bag of words model to a tf-idf model where we have tuples
    # of the movie ID and it's tf-idf value for the movie

    similarity_array = sims[query_doc_tfidf] # get the array of similarity values between our movie and every other movie. 
    #So the length is the number of movies we have. To do this, we pass our list of tf-idf tuples to sims.

    similarity_series = pd.Series(similarity_array.tolist(), index=df.product_name.values) #Convert to a Series
    top_hits = similarity_series.sort_values(ascending=False)[:number_of_hits] #get the top matching results, 
    # i.e. most similar movies

    # Print the top matching movies
    print("Our top %s most similar products for the keywords %s are:" %(number_of_hits, keywords))
    for idx, (product,score) in enumerate(zip(top_hits.index, top_hits)):
        print("%d '%s' with a similarity score of %.3f" %(idx+1, product, score))

In [22]:
keywords_test = ['tic']
keywords_recommendation(keywords_test, 5)

Our top 5 most similar products for the keywords ['tic'] are:
1 'Naaz Tic Tac Toe mini Board Game' with a similarity score of 0.285
2 'Takspin casual collection of clutcher & hair pin (set of 7) Hair Claw' with a similarity score of 0.055
3 'AutoKraftZ Silicone Car Bumper Guard' with a similarity score of 0.000
4 'Aerosoft Margaret Loafers' with a similarity score of 0.000
5 'Utex Slim Fit Fit Baby Boy's Jeans' with a similarity score of 0.000


In [46]:
#Cosine Similarity
def get_vectors(text):
    vectorizer =  CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
    X = vectorizer.fit_transform(text)
    return X

In [47]:
vectors = get_vectors(df.keywords_array.tolist())

In [59]:
def cosine_recommender(keywords,number_of_hits=5):
    cosines = []
    for i in range(vectors.shape[0]):
        vector_list = [keywords,vectors[i]]
        cosines.append(cosine_similarity(vector_list)[0,1])
        
    cosines  = pd.Series(cosines)
    index = cosines.nlargest(number_of_hits+1).index
    matches = df.loc[index]
    for match,score in  zip(matches["product_name"][1:],cosines[index][1:]):
        print(match,score)

In [61]:
cosine_recommender(["shoe"])

ValueError: setting an array element with a sequence.