  <h1 align="center">E-commerce behaviour predictions </h1> 



#Dataset description

The training data contains full e-commerce session information. The aim is to predict, for each session in the test data and for each event `type`, the `aid` values that occur after the last timestamp `ts` of that test session. In other words, the test data contains sessions truncated by timestamp, and the model should predict what occurs after the point of truncation.

> train.csv - the training data, which contains full session data: 

`session` - the unique session id 

`aid` - the article id (product code) of the associated event 

`ts` - the Unix timestamp of the event 

`type` - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session: 
0.  'clicks', 
1.  'carts', 
2. 'orders' 

> test.csv - the test data, which contains truncated session data
Your task is to predict the next aid clicked after the session truncation, as well as the remaining aids that are added to carts and orders; you may predict up to 20 values for each session type.


> Acknowledgements:
> > Copyright (c) 2022 Otto (GmbH & Co KG), https://www.otto.de/jobs/technology/ueberblick/

#Loading and exploring dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns

from datetime import datetime


import warnings
warnings.filterwarnings('ignore')

import gc

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

#from sklearn.neighbors import NearestNeighbors, KDTree
from sklearn import preprocessing
from sklearn.decomposition import PCA

import tqdm.notebook as tq
from tqdm import tqdm

import joblib

# from google.colab import output
# output.enable_custom_widget_manager()

# #!pip install cuml
# import cuml, cudf #; cuml.__version__
# from cuml.neighbors import NearestNeighbors

!pip install surprise
from surprise import Dataset, Reader, accuracy
from surprise import BaselineOnly, SVD, SVDpp, NormalPredictor, NMF
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SlopeOne, CoClustering
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

from collections import defaultdict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3366457 sha256=5e071fe83d496e82ea0707a4f88b617f9fa45af9c012ebf74e23899626ab0841
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/Na GITa/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Na GITa


In [None]:
# import sys

# local_vars = list(locals().items())
# for var, obj in local_vars:
#     print(var, sys.getsizeof(obj))

In [None]:
train = pd.read_csv('data/onlineshop/train_colab.csv', usecols=[1, 2, 3, 4])
test = pd.read_csv('data/onlineshop/test_colab.csv', usecols=[1, 2, 3, 4])

In [None]:
# train = cudf.read_csv('data/onlineshop/train_colab.csv', usecols=[1, 2, 3, 4])
# test = cudf.read_csv('data/onlineshop/test_colab.csv', usecols=[1, 2, 3, 4])

In [None]:
train.head()

Unnamed: 0,session,aid,ts,type
0,0,1349536,1661634295,0
1,0,165096,1661634321,0
2,0,315914,1661634351,0
3,0,315914,1661634431,1
4,0,1680276,1661634664,0


In [None]:
train.tail()

Unnamed: 0,session,aid,ts,type
12941604,12899776,1737908,1661723987,0
12941605,12899777,384045,1661723976,0
12941606,12899777,384045,1661723986,0
12941607,12899778,561560,1661723983,0
12941608,12899778,32070,1661723994,0


In [None]:
test.head()

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


In [None]:
test.tail()

Unnamed: 0,session,aid,ts,type
6540533,14571577,1141710,1662328774,0
6540534,14571578,519105,1662328775,0
6540535,14571579,739876,1662328775,0
6540536,14571580,202353,1662328781,0
6540537,14571581,1100210,1662328791,0


Replacing `ts` with info about hour and day

In [None]:
#datetime.fromtimestamp(train.ts[1]).strftime('%a')

In [None]:
#datetime.fromtimestamp(train.ts[1]).strftime('%H%M')

In [None]:
train['ts'] = pd.to_datetime(train['ts'], unit='s')
test['ts'] = pd.to_datetime(test['ts'], unit='s')

In [None]:
train['day'] = train['ts'].dt.day_name()
test['day'] = test['ts'].dt.day_name()

In [None]:
train['hour'] = train['ts'].dt.hour
test['hour'] = test['ts'].dt.hour

In [None]:
train_time = train.drop(columns=['ts'])
test_time = test.drop(columns=['ts'])

In [None]:
# NOTE(review): the next section starts with `pd.concat([train, test])` and later reads
# `test.session`, so after these deletions a top-to-bottom run fails with NameError
# unless the frames are re-read first. Confirm the intended execution order.
del train
del test

In [None]:
gc.collect()

#Matrix factorization with SVD from scratch




In [None]:
data = pd.concat([train, test])

In [None]:
data = data.drop(columns=['ts'])
data['type'] = data['type'] + 1 #to make sparse matrix with pivot (NaN replaced by 0)

In [None]:
def SVD(chunk, targets, recommend, n_products=20, k=10):
    """Score unseen products for the test sessions of one chunk with a truncated SVD.

    The chunk is pivoted into a session x product matrix of event types and
    factorised with ``scipy.sparse.linalg.svds``. The low-rank reconstruction of a
    session's row is used as its relevance score per product; products the session
    already interacted with get score 0.

    NOTE(review): this function reuses the name ``SVD`` that the notebook also
    imports from ``surprise``; after this cell runs, a bare ``SVD`` is this function.

    Arguments:
        chunk: part of data with ``session``, ``aid`` and ``type`` columns
        targets: sessions from test dataset in chunk
        recommend: dict ``session -> [[[aid, score], ...]]``; updated in place
        n_products: number of candidate products taken from this chunk per target
        k: number of singular values and vectors to compute; lowered automatically
           when the chunk is too small for ``svds`` to compute that many

    Returns:
        None. For every target found in the chunk, ``recommend[t]`` is replaced by a
        one-element list holding the 20 best ``[aid, score]`` pairs seen so far.
    """
    # Nothing to score: avoid the pivot and the factorisation entirely.
    if len(targets) == 0:
        return

    chunk_df = chunk.pivot_table(index='session', columns='aid', values='type').fillna(0)
    interactions = chunk_df.to_numpy(dtype=float)

    # svds (default solver) needs 1 <= k <= min(n_sessions, n_products) - 1.
    n_factors = min(k, min(interactions.shape) - 1)
    if n_factors < 1:
        return

    u, s, v = svds(csr_matrix(interactions), k=n_factors)

    aids = chunk_df.columns.to_numpy()
    row_of = {session: row for row, session in enumerate(chunk_df.index)}

    for t in targets:
        row = row_of.get(t)
        if row is None:
            # Target session has no events in this chunk; leave its entry untouched.
            continue

        # Reconstruct only this session's row instead of the full dense matrix.
        scores = (u[row] * s) @ v
        # replace used products values with 0
        scores[interactions[row] > 0] = 0

        best = np.argsort(-scores, kind='stable')[:n_products]
        candidates = [[aid, score] for aid, score in zip(aids[best], scores[best])]

        # Flatten what was collected so far, add this chunk's candidates and keep the
        # 20 highest scores (the accumulator size is fixed at 20 here).
        merged = [pair for group in recommend[t] for pair in group] + candidates
        recommend[t] = [sorted(merged, key=lambda x: x[1], reverse=True)[:20]]

def recommend_products(recommend, suffix, recommendations):
  """Turn per-session candidate lists into space-separated label strings.

  Arguments:
    recommend: dict ``session -> [[[aid, score], ...], ...]`` (a list of groups of
      ``[aid, score]`` pairs); an empty list yields an empty label string
    suffix: event-type name appended to the session id, e.g. ``'clicks'``
    recommendations: dict filled in place with ``'<session>_<suffix>' -> labels``

  Returns:
    None. The first 20 aids of every session are written to ``recommendations``.
    Aids are formatted as integers, so float-typed ids (e.g. the ``0.0``
    placeholders or values reloaded from a float array) become ``'0'`` / ``'123'``
    rather than ``'0.0'`` / ``'123.0'``.
  """

  for session, groups in recommend.items():
    pairs = [pair for group in groups for pair in group] #flatten results
    aids = [int(pair[0]) for pair in pairs[:20]]
    recommendations[str(session) + '_' + suffix] = " ".join(str(aid) for aid in aids)

In [None]:
# Event types were shifted by +1 earlier, so 3 / 1 / 2 select orders / clicks / carts.
data_orders, data_clicks, data_carts = (data[data.type == code] for code in (3, 1, 2))

del data
gc.collect()

# Position in this list is what the chunk loop's `d` index refers to.
data_type = [data_orders, data_clicks, data_carts]

recommendations = dict()

#all sessions in test dataset
targets_all = test.session.unique()

del train, test
gc.collect()

0

In [None]:
# with np.load('clicks_values_part1.npz') as data:
#     values = data['arr_0'].tolist()
# with np.load('clicks_keys_part1.npz') as data:
#     keys = data['arr_0'].astype(int)
# recommend = {k: v for k, v in zip(keys, values)}

In [None]:
# del values, keys
# gc.collect()

11

In [None]:
#load recommend and change products range for diff data

# File-name stem for each position in `data_type` ([orders, clicks, carts]).
type_names = ['orders', 'clicks', 'carts']

for d in range(2, len(data_type)): #for carts
  data = data_type[d]
  name = type_names[d]
  # Sort the product ids once per event type instead of once per chunk.
  sorted_aids = np.sort(data.aid.unique()) #sorted: assumption that similar products are numbered similarly
  chunks_products = list(range(0, len(sorted_aids), 1000)) + [len(sorted_aids)]
  chunks_list = list(zip(chunks_products, chunks_products[1:]))
  # Every test session starts with 20 zero-score [aid, score] placeholders.
  recommend = {key: [[[0.0, 0.0]]*20] for key in targets_all}
  #recommend = np.load('clicks_part1.npy');

  for i1, i2 in tq.tqdm(chunks_list):
    chunk = data[data['aid'].isin(sorted_aids[i1:i2])]
    targets = chunk[chunk.session.isin(targets_all)].session.unique() #check which test sessions are in chunk,
                                                                      #if they aren't, SVD shows constant in every column anyway
    if len(targets) > 0:
      SVD(chunk, targets, recommend, k=10)
    del chunk
    gc.collect()

    if i1%150000 == 0: #checkpoint every 150 chunks (chunk starts advance by 1000)
      if d == 0:
        joblib.dump(recommend, 'orders_part1.joblib');
      else:
        # clicks and carts checkpoints differ only in the file-name stem
        recommend_keys = np.array(list(recommend.keys()), dtype=int)
        np.savez_compressed(name + '_keys_part1.npz', recommend_keys, allow_pickle=False);
        recommend_values = np.array(list(recommend.values()), dtype=float)
        np.savez_compressed(name + '_values_part1.npz', recommend_values, allow_pickle=False);

        del recommend_keys, recommend_values
        gc.collect()

  # Dump only this event type's labels: the later cells load each *_recomm file and
  # use its keys as one third of the submission rows, so the files must not mix types.
  type_recommendations = {}
  recommend_products(recommend, name, type_recommendations)
  recommendations.update(type_recommendations)
  joblib.dump(type_recommendations, name + '_recomm.joblib');
      


  0%|          | 0/381 [00:00<?, ?it/s]

**Fixing bugs**

In [None]:
with np.load('clicks_values_part1.npz') as data:
    values = data['arr_0'].astype(int).tolist()
with np.load('clicks_keys_part1.npz') as data:
    keys = data['arr_0'].astype(int)
#recommend = {k: v for k, v in zip(keys, values)}

In [None]:
targets_all = list(range(12899779, 14571582))

In [None]:
recommend = {key: [] for key in targets_all} 

In [None]:
for k, v in zip(keys, values):
  recommend[k] = v

In [None]:
recommendations = {}
recommend_products(recommend, 'clicks', recommendations)

In [None]:
len(recommendations)*3

5015409

In [None]:
joblib.dump(recommendations, 'clicks_recomm.joblib');

In [None]:
# Reload the carts checkpoint (values and session keys are stored in separate files).
with np.load('carts_values_part1.npz') as data:
    values = data['arr_0'].astype(int).tolist()
with np.load('carts_keys_part1.npz') as data:
    keys = data['arr_0'].astype(int)

# Start every test session with an empty list, then overwrite the checkpointed ones.
targets_all = list(range(12899779, 14571582))
recommend = {}
for key in targets_all:
  recommend[key] = []

for k, v in zip(keys, values):
  recommend[k] = v

recommendations = dict()
recommend_products(recommend, 'carts', recommendations)

In [None]:
joblib.dump(recommendations, 'carts_recomm.joblib');

In [None]:
# Named `orders_checkpoint` rather than `ord` so the builtin `ord()` is not shadowed
# for the rest of the kernel session.
orders_checkpoint = joblib.load('orders_part1.joblib')

# Start every test session with an empty list, then overwrite the checkpointed ones.
targets_all = list(range(12899779, 14571582))
recommend = {key: [] for key in targets_all}
recommend.update(orders_checkpoint)

recommendations = {}
recommend_products(recommend, 'orders', recommendations)

In [None]:
joblib.dump(recommendations, 'orders_recomm.joblib');

##Recommendation of most popular products

For sessions that ended up with no recommendations, fall back to the most popular products.

In [None]:
def most_popular(data, k):
  """Return the `k` most frequent `aid` values in `data`, most frequent first."""
  aids, counts = np.unique(data.aid.values, return_counts=True)
  by_count_desc = np.argsort(-counts)
  ranked = aids[by_count_desc]
  return ranked[:k]


def most_popular_fill(recommendations, popular):
   """Replace empty label strings in `recommendations` (in place) with `popular`.

   A label counts as empty when it is '' or the all-zero placeholder string below.
   """
   placeholders = ('', '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')
   for key, labels in recommendations.items():
     if labels in placeholders:
       # only existing keys are reassigned, so iterating the dict here is safe
       recommendations[key] = " ".join(str(i) for i in popular)

In [None]:
orders_rec = joblib.load('orders_recomm.joblib');
most_ordered = most_popular(data_orders, k=20)
most_popular_fill(orders_rec, most_ordered)

In [None]:
clicks_rec = joblib.load('clicks_recomm.joblib');
most_clicked = most_popular(data_clicks, k=20)
most_popular_fill(clicks_rec, most_clicked)

In [None]:
carts_rec = joblib.load('carts_recomm.joblib');
most_carted = most_popular(data_carts, k=20)
most_popular_fill(carts_rec, most_carted)

#Submission file

In [None]:
len(carts_rec)

1671803

In [None]:
len(orders_rec)

1671803

In [None]:
len(clicks_rec)

1671803

Submission layout:

```
session_type,labels
12906577_clicks,135193 129431 119318 ...
12906577_carts,135193 129431 119318 ...
12906577_orders,135193 129431 119318 ...
12906578_clicks, 135193 129431 119318 ...
etc.
```



In [None]:
# Size the frame from the recommendation dicts instead of a hard-coded session count,
# and fail early if the three event types do not cover the same number of sessions.
n_sessions = len(clicks_rec)
assert len(carts_rec) == n_sessions and len(orders_rec) == n_sessions, \
    "clicks/carts/orders recommendation counts differ"
# Object dtype: both columns are filled with strings by the following cells, which
# integer placeholder columns would have to be upcast for.
submission = pd.DataFrame(index=range(3 * n_sessions), columns=['session_type', 'labels'], dtype=object)

In [None]:
submission.iloc[::3, 0] = list(clicks_rec.keys())
submission.iloc[1::3, 0] = list(carts_rec.keys())
submission.iloc[2::3, 0] = list(orders_rec.keys())

In [None]:
submission.iloc[::3, 1] = list(clicks_rec.values())
submission.iloc[1::3, 1] = list(carts_rec.values())
submission.iloc[2::3, 1] = list(orders_rec.values())

In [None]:
submission.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59594 58211 58965 58619 58830 58317 58386 5856...
1,12899779_carts,122983 1460571 1116095 554660 166037 1006198 1...
2,12899779_orders,122983 1445562 1531805 1460571 1534690 332654 ...
3,12899780_clicks,736915 736915 974030 974968 1141175 736999 736...
4,12899780_carts,122983 1460571 1116095 554660 166037 1006198 1...


In [None]:
submission.tail()

Unnamed: 0,session_type,labels
5015404,14571580_carts,122983 1460571 1116095 554660 166037 1006198 1...
5015405,14571580_orders,122983 1445562 1531805 1460571 1534690 332654 ...
5015406,14571581_clicks,1099010 1100142 1099464 1098720 1098934 109941...
5015407,14571581_carts,122983 1460571 1116095 554660 166037 1006198 1...
5015408,14571581_orders,122983 1445562 1531805 1460571 1534690 332654 ...


In [None]:
submission.to_csv("submission_v1.csv", index=False) #14 571 582, 5015409

#Surprise module

In [3]:
# Read CSV columns 1, 2 and 4. The earlier load of columns 1-4 displayed
# session, aid, ts, type, so this presumably skips `ts` (confirm).
train = pd.read_csv('data/onlineshop/train_colab.csv', usecols=[1, 2, 4])
# Shift event types by +1, matching the Reader's rating_scale=(1, 3) used in later cells.
train['type'] = train['type'] + 1
test = pd.read_csv('data/onlineshop/test_colab.csv', usecols=[1, 2, 4])
test['type'] = test['type'] + 1

##Comparison of different algorithms

###SVD algorithm

In [None]:
whole = pd.concat([train, test])

In [None]:
#whole.head()

In [None]:
#cross_validate(BaselineOnly(), data, verbose=True)

In [None]:
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(whole, reader)

In [None]:
del whole
gc.collect()

8

In [None]:
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

In [None]:
# `data` was already deleted right after the train/test split above; deleting it a
# second time raises NameError, so only run the collector here.
gc.collect()

362

In [None]:
# The notebook also defines its own function called `SVD`, which rebinds the name
# imported from surprise; import the surprise class under an alias so this cell
# works regardless of which cells ran before it.
from surprise import SVD as SurpriseSVD

# train
svd = SurpriseSVD()
svd.fit(trainset)

# run the trained model against the testset
test_pred = svd.test(testset)

# get RMSE
accuracy.rmse(test_pred, verbose=True)

OOM error

In [None]:
# The notebook also defines its own function called `SVD`, which rebinds the name
# imported from surprise; import the surprise class under an alias so this cell
# works regardless of which cells ran before it.
from surprise import SVD as SurpriseSVD

# Benchmark surprise's SVD on `train` only, holding out 30% of the rows for RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
svd = SurpriseSVD()
svd.fit(trainset)

# run the trained model against the testset
test_pred = svd.test(testset)

# get RMSE
accuracy.rmse(test_pred, verbose=True)
#7 min, 9 GB

RMSE: 0.9588


0.9588327323402144

In [None]:
pred = svd.predict(12899779, 1142000, verbose=True)

user: 12899779   item: 1142000    r_ui = None   est = 1.00   {'was_impossible': False}


In [None]:
pred = svd.predict(14571579, 1142000, 0, verbose=True)

user: 14571579   item: 1142000    r_ui = 0.00   est = 1.00   {'was_impossible': False}


In [None]:
pred.est

1

In [None]:
del svd, pred, test_pred
gc.collect()

578

###SVD++ algorithm

The SVD++ algorithm, an extension of SVD taking into account implicit ratings.

In [None]:
# Benchmark surprise's SVDpp on `train`, holding out 30% of the rows for RMSE.
# The event `type` (shifted to 1-3 when `train` was loaded) is used as the rating.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
svdpp = SVDpp()
svdpp.fit(trainset)

# run the trained model against the testset
test_pred = svdpp.test(testset)

# get RMSE
accuracy.rmse(test_pred, verbose=True)
#15 min, 9 GB

RMSE: 0.3807


0.38068617429579715

###Baseline algorithm

Algorithm predicting the baseline estimate for given user and item.

In [None]:
# Benchmark surprise's BaselineOnly on `train`, holding out 30% of the rows for RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
bo = BaselineOnly()
bo.fit(trainset)

# run the trained model against the testset
test_pred = bo.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#4 min, 7GB

Estimating biases using als...
RMSE: 0.3778


0.3777565533452866

###Normal Predictor

Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.

In [None]:
# Benchmark surprise's NormalPredictor on `train`, holding out 30% of the rows for RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
npr = NormalPredictor()
npr.fit(trainset)

# run the trained model against the testset
test_pred = npr.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#2 min, 7 GB

RMSE: 0.5041


0.5041041626152478

###Non-negative Matrix Factorization

A collaborative filtering algorithm based on Non-negative Matrix Factorization.

In [None]:
# Benchmark surprise's NMF on `train`, holding out 30% of the rows for RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
nmf = NMF()
nmf.fit(trainset)

# run the trained model against the testset
test_pred = nmf.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#13 min, 7.5 GB

RMSE: 0.4033


0.40327832648475725

###KNN

In [None]:
# Benchmark KNNBasic (k=20, cosine similarity) on the LAST 100,000 rows of `train` only,
# with 30% held out. NOTE(review): because of the subset, this RMSE is not measured on
# the same data as the full-`train` benchmarks in this notebook.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train.iloc[-100000:], reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
knn = KNNBasic(k=20, sim_options = {"name": "cosine"})
knn.fit(trainset)

# run the trained model against the testset
test_pred = knn.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#20 s, 11 GB

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.3990


0.3990224243365047

###  KNN Baseline

A basic collaborative filtering algorithm taking into account a baseline rating.

In [None]:
# Benchmark KNNBaseline (k=20, cosine similarity) on the LAST 100,000 rows of `train`
# only, with 30% held out. NOTE(review): subset run, so the RMSE is not measured on
# the same data as the full-`train` benchmarks in this notebook.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train.iloc[-100000:], reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
knnb = KNNBaseline(k=20, sim_options = {"name": "cosine"})
knnb.fit(trainset)

# run the trained model against the testset
test_pred = knnb.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#16 s, 11 GB

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.3950


0.3949926123132908

###KNN With Means

A basic collaborative filtering algorithm, taking into account the mean ratings of each user.

In [None]:
# Benchmark KNNWithMeans (k=20, cosine similarity) on the LAST 100,000 rows of `train`
# only, with 30% held out. NOTE(review): subset run, so the RMSE is not measured on
# the same data as the full-`train` benchmarks in this notebook.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train.iloc[-100000:], reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
knnm = KNNWithMeans(k=20, sim_options = {"name": "cosine"})
knnm.fit(trainset)

# run the trained model against the testset
test_pred = knnm.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#12 s, 11 GB

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.4015


0.40149389592964463

###KNN With Z-Score

A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

In [None]:
# Benchmark KNNWithZScore (k=20, cosine similarity) on the LAST 100,000 rows of `train`
# only, with 30% held out. NOTE(review): subset run, so the RMSE is not measured on
# the same data as the full-`train` benchmarks in this notebook.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train.iloc[-100000:], reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
knnz = KNNWithZScore(k=20, sim_options = {"name": "cosine"})
knnz.fit(trainset)

# run the trained model against the testset
test_pred = knnz.test(testset)

# get RMSE
accuracy.rmse(test_pred)
#16 s, 11 GB


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.4016


0.40163789547331863

### Slope One algorithm

A simple yet accurate collaborative filtering algorithm.

In [None]:
# Benchmark SlopeOne on the LAST 50,000 rows of `train` only, with 30% held out.
# NOTE(review): subset run (and a smaller subset than the KNN cells), so the RMSE is
# not measured on the same data as the other benchmarks in this notebook.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train.iloc[-50000:], reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
so = SlopeOne()
so.fit(trainset)

# run the trained model against the testset
test_pred = so.test(testset)

# get RMSE
accuracy.rmse(test_pred, verbose=True)
#15 s, 9 GB

RMSE: 0.4103


0.4102791138456171

###Co-clustering algorithm

A collaborative filtering algorithm based on co-clustering.

In [None]:
# Benchmark surprise's CoClustering on `train`, holding out 30% of the rows for RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3, random_state=2023)
del data
gc.collect()

# train
cc = CoClustering()
cc.fit(trainset)

# run the trained model against the testset
test_pred = cc.test(testset)

# get RMSE
accuracy.rmse(test_pred, verbose=True)
#13 min, 6.5 GB

RMSE: 0.4026


0.4025581605846153

##SVD++ 

###Whole trainset

In [None]:
# Fit SVDpp on the last third of `train` (rows from index int(len(train)/1.5) onward)
# concatenated with all of `test`.
# NOTE(review): the section heading says "Whole trainset", but the full-train fit is
# commented out below. Confirm which one is intended.
reader = Reader(rating_scale=(1, 3))
# trainset = Dataset.load_from_df(train[['session', 'aid', 'type']], reader).build_full_trainset()
svdpp = SVDpp()
# svdpp.fit(trainset)
data = pd.concat([train.iloc[int(len(train)/1.5):], test])
# `data` is rebound from the DataFrame to the surprise trainset built from it
data = Dataset.load_from_df(data, reader).build_full_trainset()
svdpp.fit(data)                             

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7feb0587a0d0>

In [None]:
pred = svdpp.predict(12899779, 1142000, verbose=True)

user: 12899779   item: 1142000    r_ui = None   est = 1.02   {'was_impossible': False}


In [None]:
svdpp.predict(12899779, 0).est

1.053739872757839

In [None]:
# testset = [valid_Dataset.df.loc[i].to_list() for i in range(len(valid_Dataset.df))]
# algo.test(testset)[:2] 


First 5 users

In [None]:
chunk = test[test['session'].isin(test.session.unique()[:5])]

In [None]:
testset = Dataset.load_from_df(chunk, reader).build_full_trainset().build_anti_testset()
predictions = svdpp.test(testset)

In [None]:
rec = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
      rec[uid].append((iid, est))

In [None]:
def recommend(predictions, n=20):
    """Pick the n best-rated items per user from a list of predictions.

    Args:
        predictions(list of Prediction objects): 5-field records
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        n(int): How many items to keep for each user.

    Returns:
    A defaultdict mapping each user (raw) id to a list of at most n raw item
    ids, ordered from highest to lowest estimated rating.
    """

    # Group (item, estimate) pairs under their user.
    per_user = defaultdict(list)
    for user_id, item_id, _actual, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank each user's pairs by estimate (descending) and keep only the item ids.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = [item_id for item_id, _ in ranked[:n]]

    return per_user


# for uid, user_ratings in rec.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [None]:
result = recommend(predictions, n=20)

In [None]:
result

defaultdict(list,
            {12899779: [1696036,
              127404,
              1711180,
              834354,
              229748,
              1596098,
              1669402,
              1352725,
              595994,
              1033148,
              562753,
              255297,
              1344773,
              1114789,
              363336,
              1494780,
              973453,
              1072927,
              582732,
              602722],
             12899780: [1696036,
              1711180,
              834354,
              1669402,
              1352725,
              127404,
              595994,
              740494,
              602722,
              198385,
              638410,
              59625,
              829180,
              229748,
              1099390,
              1344773,
              1596098,
              363336,
              406001,
              1033148],
             12899781: [1711180,
              834354,
        

##Hyperparameters tuning

In [4]:
# Grid-search the ALS regularisation terms (reg_u, reg_i) of BaselineOnly on `train`
# with 3-fold cross-validation, scored by RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)

param_grid = {"bsl_options": {"method": ["als"], 
                              "reg_u": [25, 10], 
                              "reg_i": [5, 10, 15]},
                            #  'n_epochs': [10, 15]},
              }

gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
0.37634773913222924
{'bsl_options': {'method': 'als', 'reg_u': 10, 'reg_i': 15}}


In [4]:
# Grid-search the number of ALS epochs of BaselineOnly on `train` (3-fold CV, RMSE)
# with reg_u and reg_i held fixed.
# NOTE(review): reg_i is fixed at 5 here, while the preceding search printed
# reg_i=15 as its best value. Confirm this is intentional.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)

param_grid = {"bsl_options": {"method": ["als"], 
                              "reg_u": [10], 
                              "reg_i": [5],
                              'n_epochs': [5, 15, 25]},
              }

gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
0.3772093910954353
{'bsl_options': {'method': 'als', 'reg_u': 10, 'reg_i': 5, 'n_epochs': 25}}


In [4]:
# Grid-search the SGD baseline options (learning rate, epochs) of BaselineOnly on
# `train` with 3-fold cross-validation, scored by RMSE.
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(train, reader)

param_grid = {"bsl_options": {"method": ["sgd"], 
                              "learning_rate": [0.005, 0.00005], 
                              'n_epochs': [20, 30]},
              }
gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
0.37641505272739234
{'bsl_options': {'method': 'sgd', 'learning_rate': 0.005, 'n_epochs': 20}}


In [None]:
# from surprise import dump
# dump.dump('./dump_file', algo=svdpp)

In [None]:
# check other algorithms
# give weights to type
#DELETE inefficient methods

In [None]:
#https://apple.github.io/turicreate/docs/userguide/recommender/
#https://making.lyst.com/lightfm/docs/home.html#usage

In [None]:
# test_set = train_set.build_anti_testset()
# test_set_batches = util.get_batches(test_set)
# with mp.Manager() as manager:
#     predictions = manager.list()
#     process = [
#                 mp.Process(
#                     target=self._predict,
#                     args = [model, test_set_batch, predictions]
#                 ) for test_set_batch in test_set_batches
#             ]
#     for p in process:
#         p.start()
#     for p in process:
#         p.join()
#     print('all tasks finished')
#     predictions = predictions._getvalue()