In [1]:
# In this notebook you will find the preprocessing of the embeddings 
# and 2 methodologies
# one method using random forest classifier and one method using keras

# Method 1:
#     Using session embeddings try to predict and embedding that will represent an item
#     Compare this embedding to the item embeddings
#     Recommend the top K items

# Method 2:
#    Multilabel classification using keras
#    For a given item embedding predict directly the item
#    Items are represented using label encoder and converted to categorical


In [2]:
import calendar
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
import math
import random

In [3]:
# read data
item_features = pd.read_csv('dressipi_recsys2022/item_features.csv', sep=',')
clusters = pd.read_csv('clustered_features.csv')
purchases = pd.read_csv('train_purchases_cleanned.csv')
sessions = pd.read_csv('train_sessions_cleanned.csv')
cand_items = pd.read_csv('dressipi_recsys2022/candidate_items.csv', sep=',')


# train_purchases = pd.read_csv('dressipi_recsys2022/train_purchases.csv', sep=',')
# train_sessions = pd.read_csv('dressipi_recsys2022/train_sessions.csv', sep=',')
# purchases = train_purchases
# sessions = train_sessions

purchases = purchases[purchases.item_id.isin(cand_items.item_id)]
sessions = sessions[sessions.session_id.isin(purchases.session_id)]

In [4]:

purchases['date'] = pd.to_datetime(purchases['date'])
purchases['year'] = purchases['date'].dt.year
purchases['month'] = purchases['date'].dt.month
sales_per_period = purchases.groupby(['year','month','item_id']).count()\
                                            .reset_index().drop('session_id',axis=1)\
                                            .sort_values(by=['item_id','year','month']).rename(columns={'date':'sales'})


def assign_season(month):
    seasons = {
    
    1:[11,12,1],
    2:[2,3,4],
    3:[5,6,7],
    4:[8,9,10]
    }


    for k,v in seasons.items():
        if month in seasons[k]:
            return k
    
sales_per_period['season'] = sales_per_period['month'].apply(assign_season)

In [5]:
purchases = purchases.merge(sales_per_period[['item_id','season']],on='item_id')

In [6]:
# create embeddings for each item
item_embeddings = pd.concat(
            [item_features,pd.get_dummies(item_features['feature_category_id'])],axis=1
            )\
            .drop(['feature_value_id','feature_category_id'],axis=1)\
            .groupby('item_id').sum()\
            .reset_index()
item_embeddings.head()

Unnamed: 0,item_id,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,2,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,1,0
1,3,0,0,1,1,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,1
2,4,0,0,1,1,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,1
3,7,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,1,0
4,8,0,0,1,1,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,1


In [7]:
# create embeddings for each session
session_embeddings = sessions.merge(item_embeddings,on='item_id')\
                                    .drop(['date','item_id'],axis=1)\
                                    .groupby('session_id').sum()\
                                    .reset_index()
session_embeddings.head()

Unnamed: 0,session_id,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,13,0,0,0,1,0,1,1,0,0,...,0,1,0,0,1,1,0,0,1,0
1,113,0,0,4,6,4,2,6,0,0,...,0,6,0,0,6,6,0,0,6,6
2,115,0,8,0,0,0,0,0,0,0,...,0,0,8,8,8,0,0,0,0,8
3,140,0,0,0,3,0,0,3,0,0,...,0,0,0,0,3,3,0,0,3,3
4,153,0,0,0,3,0,0,3,0,0,...,0,0,0,0,3,3,0,0,3,3


In [8]:
# use item embeddings on purchases
purchases_embeddings = purchases.merge(item_embeddings,on='item_id').drop(['date'],axis=1)
purchases_embeddings.head()


Unnamed: 0,session_id,item_id,year,month,season,1,2,3,4,5,...,64,65,66,67,68,69,70,71,72,73
0,13,18626,2020,3,2,0,0,1,1,1,...,0,0,0,0,1,1,0,0,1,1
1,13,18626,2020,3,2,0,0,1,1,1,...,0,0,0,0,1,1,0,0,1,1
2,13,18626,2020,3,2,0,0,1,1,1,...,0,0,0,0,1,1,0,0,1,1
3,13,18626,2020,3,3,0,0,1,1,1,...,0,0,0,0,1,1,0,0,1,1
4,13,18626,2020,3,3,0,0,1,1,1,...,0,0,0,0,1,1,0,0,1,1


In [9]:
# split data to random train and test
train_data = session_embeddings.merge(purchases_embeddings,on='session_id')
print(train_data.shape)
train_data.to_csv('test_train_data.csv')


xc = [col for col in train_data.columns if str(col)[-1]=='x']
xc.append('season')
# xc.append('year')
xy = [col for col in train_data.columns if str(col)[-1]=='y']


train_session_ids = train_data.sample(frac=0.9).session_id.tolist()
test_session_ids = train_data[~train_data.session_id.isin(train_session_ids)]['session_id'].tolist()


x_train = train_data[train_data.session_id.isin(train_session_ids)][xc]
y_train = train_data[train_data.session_id.isin(train_session_ids)][xy]


x_test  = train_data[train_data.session_id.isin(test_session_ids)][xc]
y_test  = train_data[train_data.session_id.isin(test_session_ids)][xy]




(2761912, 151)


In [10]:
x_test

Unnamed: 0,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,...,65_x,66_x,67_x,68_x,69_x,70_x,71_x,72_x,73_x,season
4006,0,0,8,8,8,0,8,0,0,0,...,8,0,0,8,8,0,0,8,8,3
6059,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,1,1,2
6060,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,1,1,2
6061,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,1,1,3
6077,0,0,6,12,6,5,12,0,0,0,...,12,0,0,12,12,0,0,12,12,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2749136,0,0,0,2,0,0,2,0,0,0,...,2,0,0,2,2,0,0,2,2,3
2750551,0,0,1,1,1,0,1,0,0,0,...,1,0,0,1,1,0,0,1,1,3
2754939,0,0,4,4,4,0,4,0,0,0,...,4,0,0,4,4,0,0,4,4,3
2759241,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,1,0,0,1,1,3


In [11]:
len(test_session_ids),len(train_session_ids)

(528, 2485721)

In [12]:
rfclf = RandomForestClassifier( max_depth=5,random_state=0,n_estimators=2)
rfclf.fit(x_train, y_train)


KeyboardInterrupt



In [None]:
import warnings
warnings.filterwarnings("ignore")
correct = 0
counter = 0
samples= 1000
for xV,yV in zip(x_train.values,y_train.values):
    prediction = rfclf.predict([xV])
    counter +=1
    if list(prediction[0]) == list(yV):
        correct +=1
    if counter > samples:
        break
        
print("True positive:",correct, "out of:",samples)

#433

In [None]:
import warnings
warnings.filterwarnings("ignore")
correct = 0
counter = 0
samples= 1000
for xV,yV in zip(x_test.values,y_test.values):
    prediction = rfclf.predict([xV])
    counter +=1
    if list(prediction[0]) == list(yV):
        correct +=1
    if counter > samples:
        break
        
print("True positive:",correct, "out of:",samples)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity


rec_from = item_embeddings[item_embeddings.item_id.isin(cand_items.item_id)]
rec_from = item_embeddings[item_embeddings.item_id.isin(purchases.item_id)]

class topN_results:
    def __init__(self):
        self.correct = 0
        
        
    def plus_one(self,n_top,results,y_true):
        if y_true in results[:n_top]:
            self.correct +=1


            
top1000 = topN_results()
top500 = topN_results()
top200 = topN_results()
top100 = topN_results()
top50 = topN_results()
top20 = topN_results()
top10 = topN_results()
top5 = topN_results()
top1 = topN_results()
counts = 0
counts_ = 0
samples = 20000
test_ssid = train_session_ids[1]
for i in test_session_ids[:samples]:
    # For a given session id use the classifier and predict the embedding of the item he will purchase
    # Compare the embedding using cosine similarity with all the item embeddings, and rank them based 
    # on the similarity
    # recommend the top k items
    
    # get the session embedding and predict the item
    test_ssid=i
    test_embeddings = train_data[train_data.session_id==test_ssid][xc]
    t = rfclf.predict(test_embeddings)
    
    # use y_true to evaluate the outcome
    y_true = train_data[train_data.session_id==test_ssid]['item_id'].values[0]
    
    y_true_vector = item_embeddings[item_embeddings.item_id==y_true].iloc[:,1:].values
    
    if list(t[0]) == list(y_true_vector[0]):
        counts_ +=1


In [65]:
counts_

7113