In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import datetime

from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, NMF, KNNBasic, KNNBaseline, KNNWithMeans

transactions = pd.read_csv('data/transactions.csv', parse_dates=['t_dat'])
articles = pd.read_csv('data/articles.csv')
customers = pd.read_csv('data/customers.csv')

### Data Preparation

In [2]:
start_train = datetime.date(2019, 9, 22)
start_test = datetime.date(2020, 9, 15)

tran_test = transactions[transactions.t_dat > pd.Timestamp(start_test)]
tran_train = transactions[(transactions.t_dat <= pd.Timestamp(start_test)) ] # & (transactions.t_dat > pd.Timestamp(start_train))

del transactions

print(tran_train.shape, tran_test.shape)

def group_transactions(df):
    df = df.groupby(["customer_id", "article_id"])['price'].sum()
    return df.reset_index()

tran_train, tran_test = map(group_transactions, (tran_train, tran_test))

print(tran_train.shape, tran_test.shape)

def min_max_normalization(df):
    df.price = (df.price - df.price.min()) / (df.price.max() - df.price.min())
    return df

tran_train, tran_test = map(min_max_normalization, (tran_train, tran_test))

reader = Reader(rating_scale=(0, 1))
data_train = Dataset.load_from_df(tran_train[["customer_id", "article_id", "price"]], reader=reader)
data_trainset = data_train.build_full_trainset()

(31548013, 5) (240311, 5)
(27101148, 3) (213728, 3)


### Modelling

In [None]:
def evaluate(model, tran_test):

    result = []
    for idx, row in tqdm(tran_test.iterrows()):
        y = row['price']
        y_pred = model.predict(row['customer_id'], row['article_id']).est
        result.append((y, y_pred))

    result = np.array(result)
    errs = result[:,0] - result[:,1]
    rmse = np.sqrt(np.mean(errs ** 2))
    mae = np.mean(np.abs(errs))

    print(f"Score in test set mae: {mae:.3f}, rmse: {rmse:.3f}")

In [None]:
def recommend(model, item_list: np.ndarray, user: int, top_k: int = 5):
    preds = list()
    for item in item_list:
        pred = model.predict(user, item).est
        preds.append(pred)

    assert len(item_list) == len(preds)

    idxs = np.array(preds).argsort()[::-1][:top_k]

    values = np.array(preds)[idxs]
    keys = item_list[idxs]

    return dict(zip(keys, values))

#### Random and Bias Models

In [5]:
models = [NormalPredictor(), BaselineOnly()]

for model in models:
    
    model_to_fit = model
    print(f"Results for {type(model_to_fit).__name__}")
    model.fit(data_trainset)
    evaluate(model, tran_test)

Results for NormalPredictor


213728it [00:11, 18945.39it/s]


Score in test set mae: 0.022, rmse: 0.034
Results for BaselineOnly
Estimating biases using als...


213728it [00:10, 20172.52it/s]


Score in test set mae: 0.019, rmse: 0.029


#### SVD

In [5]:
model_to_fit = SVD()
print(f"Results for {type(model_to_fit).__name__}")
model_to_fit.fit(data_trainset)
evaluate(model_to_fit, tran_test)

Results for SVD


213728it [00:11, 18687.68it/s]


Score in test set mae: 0.026, rmse: 0.040


#### kNN

In [None]:
sim_options = {'name': 'cosine',
               'user_based': False
               }
model_to_fit = KNNBasic(sim_options=sim_options)
print(f"Results for {type(model_to_fit).__name__}")
model_to_fit.fit(data_trainset)
evaluate(model_to_fit, tran_test)

Results for KNNBasic
Computing the cosine similarity matrix...


MemoryError: Unable to allocate 36.7 GiB for an array with shape (70221, 70221) and data type float64

### Recommendation

In [5]:
model = BaselineOnly()
model.fit(data_trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fa54985d730>

In [None]:
preds = list()
for idx, row in tqdm(tran_test.iterrows()):
    pred = model.predict(row['customer_id'], row['article_id']).est
    preds.append(pred)

pd.Series(preds).value_counts()

213728it [00:09, 21472.56it/s]


0.009625    871
0.000000    104
0.013802     69
0.013417     65
0.008626     61
           ... 
0.011141      1
0.012057      1
0.013454      1
0.013766      1
0.012086      1
Length: 195283, dtype: int64

In [7]:
articles_unseen = set(articles.article_id.unique()) - set(tran_train.article_id.unique()) - set(tran_test.article_id.unique())
customers_unseen = set(customers.customer_id.unique()) - set(tran_train.customer_id.unique()) - set(tran_test.customer_id.unique())

In [None]:
model.predict(list(customers_unseen)[0], list(articles_unseen)[0])

Prediction(uid='397b134637693dd30bf21efd253fef827682602d9a8571b991dddaa8ad0cafd5', iid=638976001, r_ui=None, est=0.00962524045748534, details={'was_impossible': False})

- It seems that model can predict even not seen in train set since predicts with overall mean i think.
- Hopefully knn-based models predict different than mean.

In [None]:
# Predictions with seen articles
user = tran_test.customer_id.unique()[0]
item_list = tran_test.article_id.unique()

recommend(model, item_list, user, 5)

{780031001: 0.14738238172775892,
 780031004: 0.12462908023721772,
 916300002: 0.12141662780409737,
 876342001: 0.1075285683908763,
 839464001: 0.08702136985695666}

In [None]:
# Predictions with unseen articles
user = tran_test.customer_id.unique()[0]
item_list = articles.article_id.unique()

recommend(model, item_list, user, 10)

NameError: name 'model' is not defined

In [8]:
# Predictions with unseen customer
user = list(customers_unseen)[0]
item_list = articles.article_id.unique()

recommend(model, item_list, user, 5)

{780031001: 0.1469779500050143,
 780031004: 0.12422464851447311,
 916300002: 0.12101219608135276,
 876342001: 0.10712413666813168,
 639338001: 0.10582483459278863}

In [9]:
# Predictions with unseen customer
user = list(customers_unseen)[1]
item_list = articles.article_id.unique()

recommend(model, item_list, user, 5)

{780031001: 0.1469779500050143,
 780031004: 0.12422464851447311,
 916300002: 0.12101219608135276,
 876342001: 0.10712413666813168,
 639338001: 0.10582483459278863}