In [22]:
### Imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import gensim
import scipy.sparse as sparse
# Set seaborn color palette
colors = sns.color_palette("pastel")
# Import for checking th python version
import struct
%matplotlib inline
# Surprise related Imports
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
import time
from collections import Counter

In [5]:
# Loading Orders_Users Dataset
# TODO : Remove ,nrows=100000 to run for full dataset
fields=["order_id","user_id"]
df_orders_users = pd.read_csv("../data/orders.csv",usecols=fields,nrows=100000)
df_orders_users.head()

Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1


In [6]:
# Loading Orders_Products Dataset
# TODO : Remove ,nrows=100000 to run for full dataset
fields=["order_id","product_id"]
convert={'product_id':str}
df_orders_products = pd.read_csv("../data/order_products__prior.csv",usecols=fields,converters=convert,nrows=100000)
df_orders_products.head()


Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [7]:
fields=["product_id","product_name"]
df_products = pd.read_csv("../data/products.csv",usecols=fields).set_index('product_id')
df_products.head()

Unnamed: 0_level_0,product_name
product_id,Unnamed: 1_level_1
1,Chocolate Sandwich Cookies
2,All-Seasons Salt
3,Robust Golden Unsweetened Oolong Tea
4,Smart Ones Classic Favorites Mini Rigatoni Wit...
5,Green Chile Anytime Sauce


In [8]:
# add this '.head(100000)' part to run for partial Dataset;
df_order_products_sample=df_order_products
# .head(100000)


In [9]:
df_order_products_sample.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [10]:
# Saving list of products per oredr ID
df_order_products_list_sample = df_order_products_sample.groupby("order_id").apply(lambda order: order['product_id'].tolist()).reset_index(name="quantity")

In [11]:
df_order_products_list_sample.head()

Unnamed: 0,order_id,quantity
0,2,"[33120, 28985, 9327, 45918, 30035, 17794, 4014..."
1,3,"[33754, 24838, 17704, 21903, 17668, 46667, 174..."
2,4,"[46842, 26434, 39758, 27761, 10054, 21351, 225..."
3,5,"[13176, 15005, 47329, 27966, 23909, 48370, 132..."
4,6,"[40462, 15873, 41897]"


In [12]:
# taking the longest order length
longest = np.max(df_order_products_list_sample["quantity"].apply(len))
print(longest)

65


In [13]:
# Making Orders to sentences of word2Vec
sentences = df_order_products_list_sample["quantity"].values
# print(sentences)

In [14]:
# Model on WordtoVec for Similar Prodcuts
model= gensim.models.Word2Vec(sentences, size=100, window=longest, min_count=2, workers=4)

In [15]:
#vocabulary of all the keys:
vocab = list(model.wv.vocab.keys())

In [20]:
df_users_prodcuts=df_orders_users.merge(df_orders_products,on='order_id')
df_users_prodcuts.head()

Unnamed: 0,order_id,user_id,product_id
0,8382,23,3873
1,8382,23,28199
2,8382,23,42372
3,8382,23,23106
4,8382,23,33819


In [171]:
# Saving list of products per User ID
df_user_products_list_sample = df_users_prodcuts.groupby("user_id").apply(lambda order: order['product_id'].tolist()).reset_index(name="quantity")
df_user_products_list_sample.head()

Unnamed: 0,user_id,quantity
0,23,"[3873, 28199, 42372, 23106, 33819, 3108, 42959..."
1,27,"[6799, 8580, 31231, 1185, 44051, 11123, 33787,..."
2,66,"[31829, 32226, 8143, 20738, 8832]"
3,90,"[32818, 27582, 12302, 1831, 19204, 10180, 1545..."
4,150,"[24852, 17616, 47877, 33313, 651, 16953, 49615..."


In [237]:
# k_freq is k most items a user has purchased and n is the number of 
# similar item we want to find corresponding to a product
k_freq=5;n_similar=1;

#  Find  Most Similar item given the ProductID; Exception added if 
#  the product not in library
def find_similar(index):
    try:
        if index in vocab:
            return(model.most_similar(positive=[vocab[int(index)]], topn=n_similar))[0][0]
        return(index)
    except IndexError:
        return(index)
    
# Given a row  of (user_id,Products_list) it finds the k suggestion for
# him by finding the most similar item to his k frequent puchases 
def tranform_row(row):
    most_frequnt_k_list=Counter(row['quantity']).most_common(k_freq)
    return_list=[find_similar(item[0]) for item in most_frequnt_k_list]
    return return_list


In [238]:
# dataFrame to store the Suggestion Product List
df_user_products_list_suggestion=df_user_products_list_sample

In [239]:
# Finds K best suggestions and stores in the quantity Column
df_user_products_list_suggestion['quantity']=df_user_products_list_sample_counter.apply(lambda row: tranform_row(row),axis=1)


In [240]:
df_user_products_list_sample_counter.head()

Unnamed: 0,user_id,quantity
0,23,"[28199, 23106, 9547, 33819, 42959]"
1,27,"[24852, 13409, 31231, 33754, 11123]"
2,66,"[32226, 23270, 20738, 31829, 43076]"
3,90,"[20985, 27582, 7118, 13733, 18352]"
4,150,"[16953, 36393, 47144, 35518, 33313]"


In [16]:
# Get Top N similar Products to a product
def get_similar_products(product_id,top_n):
    suggestions = model.most_similar(positive=[vocab[product_id]], topn=top_n)
    output = list()
    for suggestion in suggestions:
        output.append((int(suggestion[0]),df_products.loc[int(suggestion[0])]['product_name'],suggestion[1]))
    return output

In [17]:
#Printing k similar items for n random Products:
def print_k_suggestions_for_n_prodcuts(n,k):
    for i in range(n):
        print("Product is : ",df_products.loc[int(vocab[i])]['product_name'])
        print("related products are : ")
        suggestions=get_similar_products(i,k)
        print([suggestion[1] for suggestion in suggestions])

In [18]:
print_k_suggestions_for_n_prodcuts(2,5)

Product is :  Honey Whole Wheat
related products are : 
['Lemongrass Citrus Scent Disinfecting Wipes', 'Organic Strawana Probugs Kefir', 'Organic Air Chilled Whole Chicken', 'Coconut Water', 'Organic Butterhead (Boston, Butter, Bibb) Lettuce']
Product is :  Whole Milk With Vitamin D
related products are : 
['Shredded Parmesan', 'Red Seedless Grapes', 'Strawberry Preserves', 'Heavy Whipping Cream', 'Organic Brown Rice']
