In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import datetime

transactions = pd.read_csv('data/transactions.csv', parse_dates=['t_dat'])
articles = pd.read_csv('data/articles.csv')
customers = pd.read_csv('data/customers.csv')

In [2]:
id2type = dict(zip(articles['article_id'], articles['product_type_no']))

article_list = articles.article_id.unique()
customer_list = customers.customer_id.unique()

In [3]:
start_train = datetime.date(2020, 6, 22)
start_test = datetime.date(2020, 9, 15)

tran_test = transactions[transactions.t_dat > pd.Timestamp(start_test)]
tran_train_1 = transactions[(transactions.t_dat <= pd.Timestamp(start_test))& (transactions.t_dat > pd.Timestamp(start_train))] # 

del transactions

In [39]:
article_list_test = tran_test.article_id.unique()

In [4]:
from random import choice
pairs_0 = list()
customer_id_list = tran_train_1.customer_id.unique()
article_id_list = tran_train_1.article_id.unique()

for _ in tqdm(range(1_000_000)):
    pair = (choice(customer_id_list), choice(article_id_list))
    pairs_0.append(pair)

100%|██████████| 1000000/1000000 [00:01<00:00, 582908.32it/s]


In [6]:
# tran_train_0 = pd.DataFrame(data=pairs_0).rename(columns={0:'customer_id', 1:'article_id'})
# tran_train_0['price'] = 0
# tran_train_1['price'] = 1

In [5]:
tran_test_feat = tran_test.merge(customers, on='customer_id').merge(articles, on='article_id')
tran_train_feat = tran_train_1.merge(customers, on='customer_id').merge(articles, on='article_id')

# tran_train_feat_1 = tran_train_1.merge(customers, on='customer_id').merge(articles, on='article_id')
# tran_train_feat_0 = tran_train_0.merge(customers, on='customer_id').merge(articles, on='article_id')

# tran_train_feat = tran_train_feat_1.drop(['sales_channel_id', 't_dat'], axis=1).append(tran_train_feat_0)
# del tran_train_feat_0, tran_train_feat_1

In [6]:
tran_train_feat.dropna(inplace=True)
tran_test_feat.dropna(inplace=True)

tran_train_feat = tran_train_feat.sample(frac=1.)

In [7]:
tran_train_feat = tran_train_feat.dropna().reset_index(drop=True)
tran_test_feat = tran_test_feat.dropna().reset_index(drop=True)

In [8]:
object_columns = ['club_member_status', 'fashion_news_frequency', # 'sales_channel_id'
'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',# 'product_code',
'perceived_colour_master_id', 'department_no', 'index_group_no', 'section_no', 'garment_group_no']
non_object_columns = ['age']

cols_to_use = non_object_columns + object_columns

y_train = tran_train_feat['price']
X_train = tran_train_feat[cols_to_use]

y_test = tran_test_feat['price']
X_test = tran_test_feat[cols_to_use]

In [9]:
from sklearn.preprocessing import OneHotEncoder
X_train = X_train.astype(object)
X_train['age'] = X_train['age'].astype(int)
X_train['age'] = (X_train['age'] - X_train['age'].min()) / (X_train['age'].max()- X_train['age'].min())

X_test = X_test.astype(object)
X_test['age'] = X_test['age'].astype(int)
X_test['age'] = (X_test['age'] - X_test['age'].min()) / (X_test['age'].max()- X_test['age'].min())

X = X_train.append(X_test)

encoder = OneHotEncoder()
encoder.fit(X[object_columns])

del X

#### Modelling with Price

In [10]:
from sklearn.linear_model import SGDRegressor
model = SGDRegressor()

##### Train

In [17]:
batch_size = 1024
for start_ind in tqdm(range(0, X_train.shape[0], batch_size)):
    end_ind = start_ind + batch_size
    X_train_part = X_train.iloc[start_ind:end_ind]
    y_train_part = y_train.iloc[start_ind:end_ind]

    X_train_part_encoded = encoder.transform(X_train_part[object_columns]).todense()

    train_categorized = np.concatenate((X_train_part[non_object_columns].values, X_train_part_encoded), axis=1)

    model.partial_fit(train_categorized, y_train_part)

100%|██████████| 3625/3625 [00:33<00:00, 109.56it/s]


In [14]:
import pickle
with open('models/sgd_reg_3months.pkl', 'wb') as pfile:
    pickle.dump(model, pfile, protocol=pickle.HIGHEST_PROTOCOL)

##### Test

In [18]:
batch_size = 1024

preds = np.array([])
labels = np.array([])

for start_ind in tqdm(range(0, X_test.shape[0], batch_size)):
    end_ind = start_ind + batch_size
    X_test_part = X_test.iloc[start_ind:end_ind]
    y_test_part = y_test.iloc[start_ind:end_ind]

    X_test_part_part_encoded = encoder.transform(X_test_part[object_columns]).todense()

    test_categorized = np.concatenate((X_test_part[non_object_columns].values, X_test_part_part_encoded), axis=1)
    pred = model.predict(test_categorized)

    label = y_test.iloc[start_ind:end_ind].values

    preds = np.concatenate((preds, pred), axis=0)
    labels = np.concatenate((labels, label), axis=0)


100%|██████████| 233/233 [00:01<00:00, 119.42it/s]


In [19]:
errs = preds - labels
rmse = np.sqrt(np.mean(errs ** 2))
mae = np.mean(np.abs(errs))

print(f"Score in test set mae: {mae:.3f}, rmse: {rmse:.3f}")

Score in test set mae: 0.009, rmse: 0.014


#### Modelling with Purchase History -binary-

In [12]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(n_jobs=-1, loss='log')

##### Train

In [13]:
batch_size = 1024
for start_ind in tqdm(range(0, X_train.shape[0], batch_size)):
    end_ind = start_ind + batch_size
    X_train_part = X_train.iloc[start_ind:end_ind]
    y_train_part = y_train.iloc[start_ind:end_ind]

    X_train_part_encoded = encoder.transform(X_train_part[object_columns]).todense()

    train_categorized = np.concatenate((X_train_part[non_object_columns].values, X_train_part_encoded), axis=1)

    model.partial_fit(train_categorized, y_train_part, classes=[0,1])

 52%|█████▏    | 2401/4590 [00:21<00:19, 113.60it/s]


KeyboardInterrupt: 

In [15]:
import pickle
with open('models/sgd_cls_3months.pkl', 'wb') as pfile:
    pickle.dump(model, pfile, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
import pickle
with open('models/sgd_cls_3months.pkl', 'rb') as pfile:
    model = pickle.load(pfile)

##### Test

In [12]:
batch_size = 1024

preds = np.array([])
labels = np.array([])

for start_ind in tqdm(range(0, X_test.shape[0], batch_size)):
    end_ind = start_ind + batch_size
    X_test_part = X_test.iloc[start_ind:end_ind]
    y_test_part = y_test.iloc[start_ind:end_ind]

    X_test_part_part_encoded = encoder.transform(X_test_part[object_columns]).todense()

    test_categorized = np.concatenate((X_test_part[non_object_columns].values, X_test_part_part_encoded), axis=1)
    pred = model.predict_proba(test_categorized)

    label = y_test.iloc[start_ind:end_ind].values

    preds = np.concatenate((preds, np.array(pred)[:,1]), axis=0)
    labels = np.concatenate((labels, label), axis=0)

100%|██████████| 233/233 [00:01<00:00, 120.76it/s]


#### Recommend

In [35]:
def recommend(model, item_list: np.ndarray, user: int, top_k: int = 5):

    pairs = list()
    for item in item_list:
        pair = (user, item)
        pairs.append(pair)

    pairs_df = pd.DataFrame(data=np.array(pairs)).rename(columns={0:'customer_id', 1:'article_id'})
    
    pairs_df.customer_id = pairs_df.customer_id.astype(str)
    pairs_df.article_id = pairs_df.article_id.astype(int)

    pairs_df = pairs_df.merge(customers, on='customer_id').merge(articles, on='article_id')

    pairs_df.age = pairs_df.age.fillna(0)
    pairs_df = pairs_df.astype(object)
    pairs_df['age'] = pairs_df['age'].astype(int)
    pairs_df['age'] = (pairs_df['age'] - tran_train_feat['age'].min()) / (tran_train_feat['age'].max()- tran_train_feat['age'].min())

    pairs_df = pairs_df[cols_to_use]

    pairs_df_encoded = encoder.transform(pairs_df[object_columns]).todense()

    test_categorized = np.concatenate((pairs_df[non_object_columns].values, pairs_df_encoded), axis=1)
    
    # preds = model.predict_proba(test_categorized)[:,1]
    preds = model.predict(test_categorized)

    idxs = np.array(preds).argsort()[::-1][:top_k]
    values = np.array(preds)[idxs]
    keys = item_list[idxs]

    return  dict(zip(keys, values)) #pd.DataFrame.from_dict(dict(zip(keys, values)), orient='index')

In [72]:
# recommend(model, tran_train_1.article_id.unique(), tran_test.customer_id.values[19], 20)
# tran_test[tran_test.customer_id == tran_test.customer_id.values[19]]
# tran_train_1[tran_train_1.customer_id == tran_train_1.customer_id.values[19]]

{852584001: 0.9688414236359668,
 815434001: 0.9688414236359668,
 473954013: 0.9648446559808451,
 754413001: 0.9630449715112543,
 903864001: 0.9630221418456586,
 855080001: 0.9630221418456586,
 880553002: 0.9630221418456586,
 707269003: 0.9630221418456586,
 473954008: 0.9630221418456586,
 754413002: 0.9625408650718881,
 855080010: 0.9625177361057294,
 707269004: 0.9625177361057294,
 477507001: 0.9580727277036971,
 730683050: 0.9573802771733695,
 855080008: 0.9568677160155147,
 759871002: 0.956401855133409,
 864562001: 0.9563563067968197,
 759871025: 0.9558112874312206,
 868063001: 0.9556094994449988,
 867969002: 0.9553777992296274}

In [30]:
user = tran_test.customer_id.unique()[2]
tran_test[tran_test.customer_id == user]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31548019,2020-09-16,001143bec624de9df82bd687babb382e00b0285e2371cc...,759814034,0.014898,1
31548020,2020-09-16,001143bec624de9df82bd687babb382e00b0285e2371cc...,759814034,0.014898,1
31548021,2020-09-16,001143bec624de9df82bd687babb382e00b0285e2371cc...,837443001,0.013559,1
31548022,2020-09-16,001143bec624de9df82bd687babb382e00b0285e2371cc...,890631002,0.002695,1
31615761,2020-09-18,001143bec624de9df82bd687babb382e00b0285e2371cc...,907951002,0.013542,1
31615762,2020-09-18,001143bec624de9df82bd687babb382e00b0285e2371cc...,912867001,0.010847,1
31615763,2020-09-18,001143bec624de9df82bd687babb382e00b0285e2371cc...,939503001,0.020322,1


In [31]:
# Predictions with seen articles
item_list = tran_test.article_id.unique()
recommend(model, item_list, user, 10)

Unnamed: 0,0
904838002,0.192372
916307002,0.192372
839464002,0.192144
839494001,0.190982
853928001,0.190637
916307001,0.190637
811943001,0.19059
932107001,0.19059
904838001,0.18956
916306001,0.18956


In [23]:
# Predictions with unseen articles
item_list = articles.article_id.unique()
recommend(model, item_list, user, 10)

ValueError: Found unknown categories [483, 196, 366, 464, 284, 349, 351] in column 2 during transform

In [27]:
customers_unseen = set(customers.customer_id.unique()) - set(tran_train_1.customer_id.unique())

In [28]:
# Predictions with unseen articles
item_list = articles.article_id.unique()
recommend(model, item_list, list(customers_unseen)[0], 10)

ValueError: Found unknown categories [483, 196, 366, 464, 284, 349, 351] in column 2 during transform

In [38]:
# Predictions with unseen articles
item_list = articles.article_id.unique()
recommend(model, item_list, list(customers_unseen)[1], 10)

{904838002: 0.19189907196187914,
 916307002: 0.19189907196187914,
 839464002: 0.1916708575011824,
 839494001: 0.19050844669505976,
 853928001: 0.19016350845237026,
 916307001: 0.19016350845237026,
 811943001: 0.19011662336447452,
 932107001: 0.19011662336447452,
 904838001: 0.1890866662096429,
 916306001: 0.1890866662096429}

#### Evaluate

In [44]:
def evaluate(model, tran_test, at_k: int = 5):

    scores = []
    for cust in tqdm(tran_test.customer_id.unique()):
        recs = recommend(model, article_list_test, cust, at_k)
        rec_ids = list(recs.keys())

        rec_group_ids = [id2type.get(rec_id) for rec_id in rec_ids]
        print(rec_ids)
        purchased = tran_test[tran_test.customer_id == cust]['article_id'].values

        purch_group_ids = [id2type.get(purch) for purch in purchased]

        score = np.isin(rec_group_ids, purch_group_ids).sum() / at_k

        scores.append(score)

    return scores

In [46]:
scores = evaluate(model, tran_test, 5)

  0%|          | 1/68984 [00:00<13:56:52,  1.37it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 2/68984 [00:01<13:59:23,  1.37it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 3/68984 [00:02<14:53:35,  1.29it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 4/68984 [00:03<14:53:52,  1.29it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 5/68984 [00:03<14:44:53,  1.30it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 6/68984 [00:04<14:34:31,  1.31it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 7/68984 [00:05<14:22:46,  1.33it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 8/68984 [00:06<14:17:27,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 9/68984 [00:06<14:07:59,  1.36it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 10/68984 [00:07<14:02:27,  1.36it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 11/68984 [00:08<14:05:35,  1.36it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 12/68984 [00:08<14:02:21,  1.36it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 13/68984 [00:09<14:00:41,  1.37it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 14/68984 [00:10<13:56:32,  1.37it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 15/68984 [00:11<14:01:28,  1.37it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 16/68984 [00:11<13:58:16,  1.37it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 17/68984 [00:12<14:02:06,  1.36it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 18/68984 [00:13<14:16:42,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 19/68984 [00:14<14:27:43,  1.32it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 20/68984 [00:14<14:16:00,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 21/68984 [00:15<14:21:13,  1.33it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 22/68984 [00:16<14:16:26,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 23/68984 [00:17<14:17:27,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 24/68984 [00:17<14:17:11,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 25/68984 [00:18<14:18:12,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 26/68984 [00:19<14:10:13,  1.35it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 27/68984 [00:20<14:05:11,  1.36it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 28/68984 [00:20<14:21:12,  1.33it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 29/68984 [00:21<14:32:41,  1.32it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 30/68984 [00:22<14:32:27,  1.32it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 31/68984 [00:23<14:17:15,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 32/68984 [00:23<14:30:01,  1.32it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 33/68984 [00:24<14:17:16,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 34/68984 [00:25<14:18:48,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 35/68984 [00:26<14:13:50,  1.35it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 36/68984 [00:26<14:32:19,  1.32it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 37/68984 [00:27<14:48:55,  1.29it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 38/68984 [00:28<14:27:11,  1.33it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 39/68984 [00:29<14:19:07,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 40/68984 [00:29<14:12:27,  1.35it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 41/68984 [00:30<14:16:13,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 42/68984 [00:31<14:15:09,  1.34it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 43/68984 [00:32<14:37:19,  1.31it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 44/68984 [00:32<14:35:53,  1.31it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 45/68984 [00:33<14:27:18,  1.32it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 46/68984 [00:34<14:42:28,  1.30it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 47/68984 [00:35<14:47:01,  1.30it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 48/68984 [00:35<14:40:55,  1.30it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 49/68984 [00:36<14:49:25,  1.29it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 50/68984 [00:37<14:50:13,  1.29it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 51/68984 [00:38<14:52:57,  1.29it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 52/68984 [00:39<14:36:03,  1.31it/s]

[904838002, 916307002, 839464002, 839494001, 853928001]


  0%|          | 52/68984 [00:39<14:38:53,  1.31it/s]


KeyboardInterrupt: 