In [22]:
# In this notebook you will find the preprocessing of the embeddings 
# and 2 methodologies
# one method using random forest classifier and one method using keras

# Method 1:
#     Using session embeddings try to predict and embedding that will represent an item
#     Compare this embedding to the item embeddings
#     Recommend the top K items

# Method 2:
#    Multilabel classification using keras
#    For a given item embedding predict directly the item
#    Items are represented using label encoder and converted to categorical


In [23]:
import calendar
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
import math
import random

In [24]:
# read data
item_features = pd.read_csv('dressipi_recsys2022/item_features.csv', sep=',')
clusters = pd.read_csv('clustered_features.csv')
purchases = pd.read_csv('train_purchases_cleanned.csv')
sessions = pd.read_csv('train_sessions_cleanned.csv')
cand_items = pd.read_csv('dressipi_recsys2022/candidate_items.csv', sep=',')


train_purchases = pd.read_csv('dressipi_recsys2022/train_purchases.csv', sep=',')
train_sessions = pd.read_csv('dressipi_recsys2022/train_sessions.csv', sep=',')
purchases = train_purchases
sessions = train_sessions

purchases = purchases[purchases.item_id.isin(cand_items.item_id)]
sessions = sessions[sessions.session_id.isin(purchases.session_id)]

In [25]:
purchases.shape

(450153, 3)

In [26]:
# create embeddings for each item
item_embeddings = pd.concat(
            [item_features,pd.get_dummies(item_features['feature_category_id'])],axis=1
            )\
            .drop(['feature_value_id','feature_category_id'],axis=1)\
            .groupby('item_id').sum()\
            .reset_index()
item_embeddings.head()

Unnamed: 0,item_id,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,2,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,1,0
1,3,0,0,1,1,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,1
2,4,0,0,1,1,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,1
3,7,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,1,0
4,8,0,0,1,1,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,1


In [27]:
# create embeddings for each session
session_embeddings = sessions.merge(item_embeddings,on='item_id')\
                                    .drop(['date','item_id'],axis=1)\
                                    .groupby('session_id').sum()\
                                    .reset_index()
session_embeddings.head()

Unnamed: 0,session_id,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,13,0,0,0,1,0,1,1,0,0,...,0,1,0,0,1,1,0,0,1,0
1,31,0,0,8,8,8,0,8,0,0,...,0,8,0,0,8,8,0,0,8,8
2,42,0,0,2,4,2,1,4,1,0,...,0,3,0,0,4,4,1,0,4,4
3,113,0,0,4,6,4,2,6,0,0,...,0,6,0,0,6,6,0,0,6,6
4,115,0,8,0,0,0,0,0,0,0,...,0,0,8,8,8,0,0,0,0,8


In [28]:
# use item embeddings on purchases
purchases_embeddings = purchases.merge(item_embeddings,on='item_id').drop(['date'],axis=1)
purchases_embeddings.head()


Unnamed: 0,session_id,item_id,1,2,3,4,5,6,7,8,...,64,65,66,67,68,69,70,71,72,73
0,13,18626,0,0,1,1,1,0,1,1,...,0,0,0,0,1,1,0,0,1,1
1,20336,18626,0,0,1,1,1,0,1,1,...,0,0,0,0,1,1,0,0,1,1
2,27879,18626,0,0,1,1,1,0,1,1,...,0,0,0,0,1,1,0,0,1,1
3,40121,18626,0,0,1,1,1,0,1,1,...,0,0,0,0,1,1,0,0,1,1
4,41591,18626,0,0,1,1,1,0,1,1,...,0,0,0,0,1,1,0,0,1,1


In [29]:
# split data to random train and test
train_data = session_embeddings.merge(purchases_embeddings,on='session_id')
print(train_data.shape)
train_data.to_csv('test_train_data.csv')


xc = [col for col in train_data.columns if str(col)[-1]=='x']
# xc.append('season')
# xc.append('year')
xy = [col for col in train_data.columns if str(col)[-1]=='y']


train_session_ids = train_data.sample(frac=0.9).session_id.tolist()
test_session_ids = train_data[~train_data.session_id.isin(train_session_ids)]['session_id'].tolist()


x_train = train_data[train_data.session_id.isin(train_session_ids)][xc]
y_train = train_data[train_data.session_id.isin(train_session_ids)][xy]


x_test  = train_data[train_data.session_id.isin(test_session_ids)][xc]
y_test  = train_data[train_data.session_id.isin(test_session_ids)][xy]




(450153, 148)


In [30]:
# Save this dataset to train keras model
# it will not be used in the current file
x = xc.copy()
x.append('item_id')
train_data[x].to_csv('session_embeddings.csv')

In [32]:
x_train

Unnamed: 0,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,...,64_x,65_x,66_x,67_x,68_x,69_x,70_x,71_x,72_x,73_x
0,0,0,0,1,0,1,1,0,0,0,...,0,1,0,0,1,1,0,0,1,0
1,0,0,8,8,8,0,8,0,0,0,...,0,8,0,0,8,8,0,0,8,8
4,0,8,0,0,0,0,0,0,0,0,...,0,0,8,8,8,0,0,0,0,8
6,0,0,0,3,0,0,3,0,0,0,...,0,0,0,0,3,3,0,0,3,3
7,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450148,0,0,1,1,1,0,1,0,0,0,...,0,1,0,0,1,1,0,0,1,1
450149,0,0,0,6,0,0,6,0,0,0,...,0,0,0,0,6,6,0,0,6,6
450150,0,1,8,12,8,0,10,0,0,0,...,1,8,0,1,11,10,0,1,10,11
450151,0,0,1,1,1,0,1,0,0,0,...,0,1,0,0,1,1,0,0,1,1


In [33]:
x_test

Unnamed: 0,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,...,64_x,65_x,66_x,67_x,68_x,69_x,70_x,71_x,72_x,73_x
2,0,0,2,4,2,1,4,1,0,0,...,0,3,0,0,4,4,1,0,4,4
3,0,0,4,6,4,2,6,0,0,0,...,0,6,0,0,6,6,0,0,6,6
5,0,0,0,3,0,0,3,0,0,0,...,0,0,0,0,3,3,0,0,3,3
9,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,1,1
14,0,0,4,4,4,0,4,0,0,0,...,0,4,0,0,4,4,0,0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450101,0,0,1,1,1,0,1,1,0,1,...,0,0,0,0,1,1,0,0,1,1
450102,0,0,5,10,5,5,10,2,0,2,...,0,8,0,0,10,10,0,0,10,8
450108,0,0,0,6,0,0,6,0,0,0,...,0,0,0,0,6,6,0,0,6,6
450126,0,0,5,5,5,0,5,0,0,0,...,0,5,0,0,5,5,0,0,5,5


In [34]:
len(test_session_ids),len(train_session_ids)

(45015, 405138)

# Train Model

In [14]:
rfclf = RandomForestClassifier( max_depth=20,random_state=0,n_estimators=20)
rfclf.fit(x_train, y_train)

# Evaluate train on train set

In [35]:
import warnings
warnings.filterwarnings("ignore")
correct = 0
counter = 0
samples= 1000
for xV,yV in zip(x_train.values,y_train.values):
    prediction = rfclf.predict([xV])
    counter +=1
    if list(prediction[0]) == list(yV):
        correct +=1
    if counter > samples:
        break
        
print("True positive:",correct, "out of:",samples)

#433
#426 with season

True positive: 402 out of: 1000


# Evaluate on test set

In [17]:
import warnings
warnings.filterwarnings("ignore")
correct = 0
counter = 0
samples= 2000
for xV,yV in zip(x_test.values,y_test.values):
    prediction = rfclf.predict([xV])
    counter +=1
    if list(prediction[0]) == list(yV):
        correct +=1
    if counter > samples:
        break
        
print("True positive:",correct, "out of:",samples)
#361 with season

True positive: 728 out of: 2000


In [37]:
rec_from = item_embeddings[item_embeddings.item_id.isin(cand_items.item_id)]
rec_from = item_embeddings[item_embeddings.item_id.isin(purchases.item_id)]



counts = 0
counts_ = 0
samples = 200
test_ssid = train_session_ids[1]
for i in test_session_ids[:samples]:
    # For a given session id use the classifier and predict the embedding of the item he will purchase
    # get the session embedding and predict the item
    test_ssid=i
    test_embeddings = train_data[train_data.session_id==test_ssid][xc]
    t = rfclf.predict(test_embeddings)
    
    # use y_true to evaluate the outcome
    y_true = train_data[train_data.session_id==test_ssid]['item_id'].values[0]
    
    y_true_vector = item_embeddings[item_embeddings.item_id==y_true].iloc[:,1:].values
    
    if list(t[0]) == list(y_true_vector[0]):
        counts_ +=1


In [38]:
counts_

84

# Score (Precision) :

In [39]:
print("True Positive :",counts_)
print("False Positive:", samples - counts)
print("Precision     :",counts_/samples)

True Positive : 84
False Positive: 200
Precision     : 0.42
