In [46]:
# import the libraries needed for the analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mode,mean,median
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from lightfm import LightFM




In [16]:
# loading the datasets
geo = pd.read_csv("01.geo.csv", encoding='cp1252', sep=";")
custom = pd.read_csv("02.customers.csv", encoding='cp1252', sep=";")
sel = pd.read_csv("03.sellers.csv", encoding='cp1252', sep=";")
ord_status = pd.read_csv("04.order_status.csv", encoding='cp1252', sep=";")
ord_items = pd.read_csv("05.order_items.csv", encoding='cp1252', sep=";")
ord_pay = pd.read_csv("06.order_payments.csv", encoding='cp1252', sep=";")
prod_rev = pd.read_csv("07.product_reviews.csv", encoding='cp1252', sep=";")
prod = pd.read_csv("08.products.csv", encoding='cp1252', sep=";")


In [17]:
# We notice there are 610 NaN values in product_category_name,
# for the purpose of our analysis we substitute them with the category "Others"
prod["product_category_name"] = prod["product_category_name"].fillna("Others")
max_vals = ord_items.groupby("order_id")["order_item_sequence_id"].max().to_dict()
ord_items["max_order"] = ord_items["order_id"].map(max_vals)


In [18]:
# Merging datasets and data cleaning
pr = pd.merge(prod, prod_rev, on='product_id')
custom_df = pd.merge(custom, ord_status, on='customer_id')
custom_df.dropna(subset=['order_id'], inplace=True)
custom_df = custom_df.drop_duplicates(['customer_unique_id'])
ord_items.dropna(subset=['order_id'])

Unnamed: 0,order_id,order_item_sequence_id,product_id,price,shipping_cost,seller_id,max_shipping_seller_date,max_order
0,e5fa5a7210941f7d56d0208e4e071d35,1,f3c2d01a84c947b078e32bbef0718962,595,1556,a425f92c199eb576938df686728acd20,19/09/2017 00:15,1.0
1,bfbd0f9bdef84302105ad712db648a6c,1,5a6b04657a4c5ee34285d1e4619a96b4,4499,283,ecccfa2bb93b34a3bf033cc5d1dcdc69,19/09/2017 23:11,3.0
2,bfbd0f9bdef84302105ad712db648a6c,2,5a6b04657a4c5ee34285d1e4619a96b4,4499,283,ecccfa2bb93b34a3bf033cc5d1dcdc69,19/09/2017 23:11,3.0
3,bfbd0f9bdef84302105ad712db648a6c,3,5a6b04657a4c5ee34285d1e4619a96b4,4499,283,ecccfa2bb93b34a3bf033cc5d1dcdc69,19/09/2017 23:11,3.0
4,cd3b8574c82b42fc8129f6d502690c3e,1,e2a1d45a73dc7f5a7f9236b043431b89,2999,1096,b499c00f28f4b7069ff6550af8c1348a,08/10/2017 10:34,1.0
...,...,...,...,...,...,...,...,...
112650,3b61aab5de69abc1731138bd104a777f,1,6aa063e063f2ab982b471e58afe06d72,99999,2477,610f72e407cdd7caaa2f8167b0163fd8,18/09/2019 21:10,1.0
112651,9c94a4ea2f7876660fa6f1b59b69c8e6,1,282b126b2354516c5f400154398f616d,7599,147,7a241947449cc45dbfda4f9d0798d9d0,03/02/2020 20:23,1.0
112652,13bdf405f961a6deec817d817f5c6624,1,96ea060e41bdecc64e2de00b97068975,6999,1466,7a241947449cc45dbfda4f9d0798d9d0,05/02/2020 03:30,1.0
112653,c2bb89b5c1dd978d507284be78a04cb2,1,87b92e06b320e803d334ac23966c80b1,9999,6144,7a241947449cc45dbfda4f9d0798d9d0,09/04/2020 22:35,2.0


In [19]:
# Cleaning and merging the datasets by dropping duplicates
new = pd.merge(ord_items, pr, on=['order_id', 'product_id'])
final = pd.merge(new, custom_df, on=['order_id'])
final = final.drop(columns=['order_item_sequence_id'])
final.drop_duplicates()

Unnamed: 0,order_id,product_id,price,shipping_cost,seller_id,max_shipping_seller_date,max_order,product_category_name,product_weight_gr,product_length_cm,...,customer_id,customer_unique_id,customer_autonomous_community,customer_city,order_status,ts_order_purchase,ts_order_approved,ts_order_delivered_carrier,ts_order_delivered_customer,ts_order_estimated_delivery
0,e5fa5a7210941f7d56d0208e4e071d35,f3c2d01a84c947b078e32bbef0718962,595,1556,a425f92c199eb576938df686728acd20,19/09/2017 00:15,1.0,cell phones,700.0,25.0,...,683c54fc24d40ee9f8a6fc179fd9856c,4854e9b3feff728c13ee5fc7d1547e92,Comunidad Valenciana,Alicante,canceled,05/09/2017 00:15,07/10/2017 13:17,,,28/10/2017 00:00
1,bfbd0f9bdef84302105ad712db648a6c,5a6b04657a4c5ee34285d1e4619a96b4,4499,283,ecccfa2bb93b34a3bf033cc5d1dcdc69,19/09/2017 23:11,3.0,beauty & personal care,1000.0,16.0,...,86dc2ffce2dfff336de2f386a786e574,830d5b7aaa3b6f1e9ad63703bec97d23,Andalucía,Almería,delivered,15/09/2017 12:16,15/09/2017 12:16,07/11/2017 17:11,09/11/2017 07:47,04/10/2017 00:00
4,cd3b8574c82b42fc8129f6d502690c3e,e2a1d45a73dc7f5a7f9236b043431b89,2999,1096,b499c00f28f4b7069ff6550af8c1348a,08/10/2017 10:34,1.0,handbags & accessories,9000.0,16.0,...,7812fcebfc5e8065d31e1bb5f0017dae,87776adb449c551e74c13fc34f036105,Andalucía,Sevilla,delivered,03/10/2017 22:31,04/10/2017 10:19,08/10/2017 10:34,14/10/2017 16:08,23/11/2017 00:00
5,c3d9e402b6a0fbe2a5f7fc5b41117c38,817e1c2d22418c36386406ccacfa53e8,189,4845,624f4ece8da4aafb77699233d480f8ef,08/10/2017 10:45,1.0,furniture,2750.0,50.0,...,5720a15d022c09d2634c71c80c8d4102,9f302d00dd3e18ed3745778184b4f0fe,Extremadura,Mérida,delivered,04/10/2017 10:16,04/10/2017 10:45,28/10/2017 15:34,08/11/2017 10:41,08/12/2017 00:00
6,c4b41c36dd589e901f6879f25a74ec1d,9e93b2c4cb5eea05e75a481c129b104d,99,872,ce27a3cc3c8cc1ea79d11e561e9bebb6,08/10/2017 13:26,1.0,automotive,300.0,16.0,...,4bb880cac21c7a9e1371ab1ebd601706,3f4f614c632af7fc7508462a7cb55ac2,Comunidad de Madrid,Madrid,delivered,04/10/2017 12:53,04/10/2017 13:26,07/11/2017 16:21,09/11/2017 13:37,24/11/2017 00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106831,59eaa904b3f0dbde2785ac1b27eccd18,61919b39651acb61ec24307ed8b9502d,299,1475,f61c63d13f7cd800549d5acdd390ae72,13/09/2019 14:55,1.0,cell phones,250.0,17.0,...,3e90b5882ce0e665b837de00a2a8625c,7dc9f5b742ea9e6e4cff3ca5b1c8a78b,Andalucía,Granada,delivered,20/08/2019 10:19,20/08/2019 14:55,21/08/2019 12:37,24/08/2019 16:09,18/09/2019 00:00
106832,1afe384f199748cff7a42c9902065560,4c2a4020fcd651812100ebbeac1b2753,59999,2918,610f72e407cdd7caaa2f8167b0163fd8,14/09/2019 02:09,1.0,kitchen & dining,22300.0,45.0,...,df646960391593c3f41cd448d84800c7,24a438c52577c3c30ba86532b4166ff8,Andalucía,Málaga,delivered,21/08/2019 01:45,21/08/2019 02:09,22/08/2019 15:32,23/08/2019 12:38,19/09/2019 00:00
106833,3b61aab5de69abc1731138bd104a777f,6aa063e063f2ab982b471e58afe06d72,99999,2477,610f72e407cdd7caaa2f8167b0163fd8,18/09/2019 21:10,1.0,kitchen & dining,9000.0,25.0,...,8e29c6e9c795eda30cf7f47244720132,101375bf617fd60c9eee42f98d9a73d6,Comunidad de Madrid,Madrid,delivered,25/08/2019 20:59,25/08/2019 21:10,28/08/2019 15:05,30/08/2019 16:47,21/09/2019 00:00
106834,9c94a4ea2f7876660fa6f1b59b69c8e6,282b126b2354516c5f400154398f616d,7599,147,7a241947449cc45dbfda4f9d0798d9d0,03/02/2020 20:23,1.0,home accessories,3700.0,20.0,...,22e922696a7d1ab9a19c6b702fedc387,b030929cf3b8c3370ea8c611f9ccb32e,Región de Murcia,Murcia,shipped,14/03/2018 19:23,14/03/2018 19:23,16/03/2018 14:31,,04/08/2018 00:00


In [20]:
# Compute the number of unique users and products
n_users = final["customer_unique_id"].unique().shape[0]
n_items = final["product_id"].unique().shape[0]
print("Number of users= "+ str(n_users) + "| Number of products= "+ str(n_items))

Number of users= 93914| Number of products= 31757


In [21]:
# Select columns needed for the recommendation systems
cols = ['product_category_name', 
        'customer_unique_id', 
        'review_score', 
        'product_id']
final = final[cols]

In [22]:
# Extract a sample on which to perform the recommendation system
final_sample= final.sample(10000)

In [23]:
# Select users and products from sample
users = final_sample['customer_unique_id']
products = final_sample['product_id']

In [24]:
# Encode categorical values corresponding to the user and product ids
le = LabelEncoder() # create object LabelEncoder
final_sample.iloc[:, 1] = le.fit_transform(final_sample.iloc[:, 1])
le1 = LabelEncoder() # create object LabelEncoder
final_sample.iloc[:, 3] = le1.fit_transform(final_sample.iloc[:, 3])

In [25]:
# Dictionary containing customer unique id and corresponding encoding value
final_sample.set_index(users)['customer_unique_id'].to_dict()


{'74ee08af23620cf2f8d67884cb587262': 4559,
 '2dd33269ef4cd004e2d2536fee365b56': 1780,
 'a3c6251f883470e35b3b0b8338e14446': 6319,
 '097221858a1f2f6d37ca6915ae039780': 398,
 '9a1b0020d3c4e67cbc445d839e6aef43': 5985,
 '9f379a9b2a8b4356b1775c90cd76c66f': 6180,
 'e89757d565be1e7784b3aca4432077c3': 8939,
 '84e980ddc764302628d9be7734c5ced7': 5158,
 'bfe4a08a1ef7fd9541f82d728640ad36': 7396,
 'd9f337eea016e16ef54ccd655e8f25ff': 8398,
 'e635f73415c2f0f580fa7bdff6298ab2': 8862,
 'db63d1938f9fd68035212ae046df6917': 8446,
 '0063817060d662048ed08bccecaf45d5': 12,
 'd4b96caf30925ba6e6ec36d0a449fc52': 8214,
 '3496647e29e6d83f3aa91a822818d94d': 2066,
 '6d3a7899a5416ff9dc48e3fe1e7123e3': 4261,
 'a601294fe5eeb1f05f93b1a11bdd01cd': 6399,
 'f9554a1488503337327a447698972e9b': 9570,
 '9aabbac49ace928eaf4efee23c487e39': 6011,
 '99b2b3d13b782039cdc4230380fa6c07': 5967,
 '79093f1f5356ceced1473035e849be7c': 4725,
 'f3772be93d6e37750531a252f637e94b': 9346,
 '4013fe69a1a2188578ab19e62529acb6': 2537,
 'ba8bced49158

In [26]:
# Dictionary containing product id and corresponding encoding value
final_sample.set_index(products)['product_id'].to_dict()


{'16007704980fa90718364a0dd747133e': 572,
 '53b36df67ebb7c41585e8d54d6772e08': 2123,
 'a62e25e09e05e6faf31d90c6ec1aa3d1': 4270,
 'd017a2151d543a9885604dc62a3d9dcc': 5381,
 'e53e557d5a159f5aa2c5e995dfdf244b': 5910,
 '173e9fe34bfe97f3a5e6dc57fe897b74': 606,
 'a7e03f1bcebad9ca43f8f4dbda9bc1b6': 4308,
 '7d0b7e7fe3e96e047cacafa33a808bd7': 3214,
 'e672fbe634ad07cce9d85f412becb2c0': 5931,
 'f264c1d9b20b5e4a340254d0405e613b': 6231,
 '7c898e0b8ea203dd94ba846627fc34d1': 3199,
 'ef2fb3a493929907466587fecb5611e2': 6157,
 '57fcacc3434a1f2f2b039c1b4e61f5e1': 2247,
 'd1c427060a0f73f6b889a5c7c61f2ac4': 5423,
 '41c24b8ce92d1a2cac62db5edfd088b2': 1687,
 '6c3effec7c8ddba466d4f03f982c7aa3': 2771,
 '43423cdffde7fda63d0414ed38c11a73': 1728,
 '7e0dc102074f8285580c9777f79c90cf': 3237,
 'f41a6dc6692a3c5c21769881b158de3e': 6276,
 'db5efde3ad0cc579b130d71c4b2db522': 5660,
 'cef6b1cb351ebdaec947e31ad360f5db': 5344,
 'dd0286e208604a3b60d3d0653981dbd7': 5700,
 '145028037adecb27b1a58cf9597676cc': 529,
 '944750710134

# Recommendation- First approach

In [28]:
# Creating a matrix containinf customers and products
n_users = final_sample.customer_unique_id.unique().shape[0]
n_items = final_sample.product_id.unique().shape[0]
n_items = final_sample['product_id'].max()
A = np.zeros((n_users,n_items))

print("Original rating matrix : ",A)

Original rating matrix :  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [32]:
# Creation of the sparse matrix needed as input for the Nearest Neighbors algorithm
for i in range(len(A)):
    for j in range(len(A[0])):
        if A[i][j] >= 3:
            A[i][j] = 1
        else:
            A[i][j] = 0
csr_sample = csr_matrix(A)
print(csr_sample)




In [33]:
# Application of the algorithm, it computes the distance among customers according to the cosine similarity
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=3, n_jobs=-1)
knn.fit(csr_sample)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=3, p=2, radius=1.0)

In [34]:
# Printing what the user likes
dataset_sort_des = final_sample.sort_values(['customer_unique_id'], ascending=[True])
filter1 = dataset_sort_des[dataset_sort_des['customer_unique_id'] == 768].product_id
filter1 = filter1.tolist()
filter1 = filter1[:20]
print("Items liked by user: ",filter1)

Items liked by user:  [6325]


In [35]:
# Printing the recommended items to the user
distances1 = []
indices1 = []
for i in filter1:
    distances , indices = knn.kneighbors(csr_sample[i],n_neighbors=3)
    indices = indices.flatten()
    indices = indices[1:]
    indices1.extend(indices)
print("Items to be recommended: ",indices1)

Items to be recommended:  [1, 2]


# Recommendation- Second approach

In [39]:
# creating a rating matrix transposed needed as input in the algorithm
rating_crosstab = final_sample.pivot_table(values='review_score', 
                                           index='customer_unique_id', 
                                           columns='product_category_name', 
                                           fill_value=0)
rating_crosstab.head()
X = rating_crosstab.T

In [40]:
#Applying the Truncated SVD for linear dimensionality reduction
SVD = TruncatedSVD(n_components=12, random_state=5)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

(67, 12)

In [44]:
# Computation of similarity among products based on correlation 
col_idx = rating_crosstab.columns.get_loc("toys games")
corr_specific = corr_mat[col_idx]
pd.DataFrame({'corr_specific':corr_specific, 'Product': rating_crosstab.columns})\
.sort_values('corr_specific', ascending=False)\
.head(10)

Unnamed: 0,corr_specific,Product
59,1.0,toys games
12,0.738978,coffee machines
31,0.729263,home appliances
19,0.685341,fabric
33,0.645504,home emergency kits
66,0.616362,woman's fashion
34,0.589707,home lighting
44,0.56673,medical supplies
26,0.553585,handbags & accessories
43,0.489448,mattresses & pillows


# Recommendation- Third Approach



In [47]:
# Splitting the dataset into train and test
train, test = train_test_split(final_sample,test_size= 0.25, random_state=1)

In [48]:
# Create data to insert into the algorithm
item_dict = {}
df = final_sample[['product_id', 'product_category_name']].sort_values('product_id').reset_index()
for i in range(df.shape[0]):
    item_dict[(df.loc[i,'product_id'])] = df.loc[i,'product_category_name']
# Dummify categorical features
final_sample_transformed = final_sample.drop(columns="customer_unique_id")
final_sample_transformed = pd.get_dummies(final_sample, columns = ['review_score', 'product_category_name'])
final_sample_transformed = final_sample_transformed.sort_values('product_id').reset_index().drop('index', axis=1)
final_sample_transformed.head(5)
# Convert to csr matrix
final_csr = csr_matrix(final_sample_transformed.drop('product_id', axis=1).values)

In [49]:
# Create another a rating matrix using products and reviews
user_book_interaction = pd.pivot_table(final_sample, index='customer_unique_id', columns='product_id', values='review_score')
# Fill missing values with 0
user_book_interaction = user_book_interaction.fillna(0)
user_id = list(user_book_interaction.index)
user_dict = {}
counter = 0 
for i in user_id:
    user_dict[i] = counter
    counter += 1
# Convert to csr matrix
user_book_interaction_csr = csr_matrix(user_book_interaction.values)
user_book_interaction_csr

<9835x6601 sparse matrix of type '<class 'numpy.float64'>'
	with 9874 stored elements in Compressed Sparse Row format>

In [50]:
# LightFM algorithm for recommendation
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)
model = model.fit(user_book_interaction_csr,
                  epochs=100,
                  num_threads=16, verbose=False)

In [51]:
# Use a function to summarize the results of the algorithm
def sample_recommendation_user(model, final_sample, customer_unique_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 5, show = True):
    n_users, n_items = final_sample.shape
    user_x = user_dict[customer_unique_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items), item_features=final_csr))
    scores.index = final_sample.columns
    scores = list(pd.Series(scores.sort_values(ascending = False).index))
    
    known_items = list(pd.Series(final_sample.loc[customer_unique_id,:] \
                                 [final_sample.loc[customer_unique_id,:] > threshold].index).sort_values(ascending = False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print ("User: " + str(customer_unique_id))
        print("Known Likes:")
        
    counter = 1
    for i in known_items:
        print(str(counter) + '- ' + i)
        counter += 1
    print("\n Recommended Items:")
    for i in scores:
        print(str(counter) + '- ' + i)
        counter += 1

In [52]:
# Result of the algorithm for user 768
sample_recommendation_user(model, user_book_interaction,768, user_dict, item_dict)

User: 768
Known Likes:
1- furniture

 Recommended Items:
2- kitchen & dining
3- Others
4- computer accessories
5- beauty & personal care
6- sport outdoors
