In [1]:
import os, pickle
import numpy as np
import pandas as pd
from datetime import date, timedelta
import matplotlib.pyplot as plt

import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
import map_at_k

## Load Data

In [2]:
path = '../data/processed'
customers = pd.read_pickle(os.path.join(path,'customers.pkl'))
articles = pd.read_pickle(os.path.join(path,'articles.pkl'))
df = pd.read_pickle(os.path.join(path,'transactions.pkl'))

# Date Filtering
df = df[df['t_dat'] > '2020-08-21']
print('Max Date: ',df['t_dat'].max())

Max Date:  2020-09-22 00:00:00


In [3]:
# mapping index
path = '../data/processed'
infile = open(os.path.join(path,'index_to_cusId.pkl'),'rb')
index_to_id_dict = pickle.load(infile)
infile.close()

In [4]:
customers["customer_id"] = customers["customer_id"].map(index_to_id_dict)
df["customer_id"] = df["customer_id"].map(index_to_id_dict)
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.0,0.0,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0.0,0.0,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...


In [5]:
ALL_USERS = customers['customer_id'].unique().tolist()
ALL_ITEMS = articles['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

df['user_id'] = df['customer_id'].map(user_map)
df['item_id'] = df['article_id'].map(item_map)

del customers, articles
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_id,item_id
30597413,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597414,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597415,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,923460001,0.042356,2,38,104483
30597416,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,934380001,0.050831,2,38,105214
30597417,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688001,0.033881,2,38,103593


## Create coo_matrix (user x item) and csr matrix (user x item)

In [8]:
row = df['user_id'].values
col = df['item_id'].values
data = np.ones(df.shape[0])
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train

<1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
	with 1190911 stored elements in COOrdinate format>

In [18]:
temp = df.groupby(['user_id','item_id']).agg({'sales_channel_id':min})
temp = temp.drop(columns=['sales_channel_id']).reset_index()
temp['purchaase_flag'] = 1
temp

Unnamed: 0,user_id,item_id,purchaase_flag
0,0,16023,1
1,2,78503,1
2,6,3091,1
3,6,58295,1
4,13,7725,1
...,...,...,...
1051725,1371975,81231,1
1051726,1371977,47663,1
1051727,1371977,71107,1
1051728,1371977,78601,1


## Modelling

In [7]:
def to_user_item_coo(df):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo


def split_data(df, validation_days=7):
    """ Split a pandas dataframe into training and validation data, using <<validation_days>>
    """
    validation_cut = df['t_dat'].max() - pd.Timedelta(validation_days)

    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df, validation_days=7):
    """ Split into training and validation and create various matrices
        
        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    
    """
    df_train, df_val = split_data(df, validation_days=validation_days)
    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val
          }

def map_k(model,csr_train,csr_val)


def validate(matrices, factors=200, iterations=20, regularization=0.01,use_cg = True, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with MAP@12
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 use_cg=use_cg,
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@12 to a library that allows repeated items in prediction
    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12

### Baseline Model

In [14]:
matrices = get_val_matrices(df)
coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']

model = implicit.als.AlternatingLeastSquares(factors=10,regularization=0.0,use_cg = True)
model.fit(coo_train, show_progress=True)
baseline_map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=True, num_threads=4)
print('Baseline score: ',round(baseline_map12,5))

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/10528 [00:00<?, ?it/s]

Baseline score:  0.00491


### Grid Search Hyper-parameter Tuning

In [14]:
best_map12 = 0
map12_list = []
factors = [40, 50, 60, 100, 200, 500]
iterations = [3, 12, 14, 15, 20]

for factor in factors:
#     for iterations in [3, 12, 14, 15, 20]:
    for iteration in [15]:
        for regularization in [0.01]:
            map12 = validate(matrices, factor, iteration, regularization, show_progress=False)
            map12_list.append(map12)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factor, 'iterations': iteration, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")
                
    

Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00528
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 15, 'regularization': 0.01}
Factors:  50 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00534
Best MAP@12 found. Updating: {'factors': 50, 'iterations': 15, 'regularization': 0.01}
Factors:  60 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00576
Best MAP@12 found. Updating: {'factors': 60, 'iterations': 15, 'regularization': 0.01}
Factors: 100 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00621
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 15, 'regularization': 0.01}
Factors: 200 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00641
Best MAP@12 found. Updating: {'factors': 200, 'iterations': 15, 'regularization': 0.01}
Factors: 500 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00590


### Final Model

In [9]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    return model

In [20]:
best_params = {'factors': 500, 'iterations': 3, 'regularization': 0.01}
print(best_params)

matrices = get_val_matrices(df)
coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
model = train(coo_train, **best_params)


final_map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=True, num_threads=4)
print('Baseline score: ',round(final_map12,5))
print('Improvement: {:.2%}'.format(final_map12/baseline_map12))

{'factors': 500, 'iterations': 3, 'regularization': 0.01}


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10528 [00:00<?, ?it/s]

Baseline score:  0.007
Improvement: 142.67%


## Submission File

In [21]:
save_path = '../data/processed'
def submit(model, csr_train):
    preds = []
    batch_size = 2000
    to_generate = np.arange(len(ALL_USERS))
    for startidx in range(0, len(to_generate), batch_size):
        batch = to_generate[startidx : startidx + batch_size]
        ids, scores = model.recommend(batch, csr_train[batch], N=12, filter_already_liked_items=True)
        for i, userid in enumerate(batch):
            customer_id = user_ids[userid]
            user_items = ids[i]
            article_ids = [item_ids[item_id] for item_id in user_items]
            preds.append((customer_id, ' '.join(article_ids)))

    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])

    
    display(df_preds.head())
    print(df_preds.shape)
    
    return df_preds

df_preds = submit(model, csr_train);

df_preds.to_csv(os.path.join(save_path,"submissions_v4.csv"), index=False)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0858856005 0399256001 0779781015 0868823007 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0112679048 0111609001 0111593001 0111586001 01...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321011 0805000007 0614854005 0915529005 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0112679048 0111609001 0111593001 0111586001 01...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0112679048 0111609001 0111593001 0111586001 01...


(1371980, 2)


In [22]:
df_preds.groupby(['prediction']).agg({'customer_id':'nunique'}).sort_values(by=['customer_id'],ascending=False)

Unnamed: 0_level_0,customer_id
prediction,Unnamed: 1_level_1
0112679048 0111609001 0111593001 0111586001 0111565003 0111565001 0110065011 0110065002 0110065001 0108775051 0108775044 0108775015,1120820
0923128001 0908799002 0923028002 0909371001 0923388001 0923028001 0918443003 0903276001 0860285001 0939927001 0908728001 0876415003,202
0767423001 0902388001 0893141001 0868641002 0879605001 0801447001 0874891006 0918836001 0909014001 0898694002 0918292011 0924605001,161
0895487001 0915453002 0706016006 0928206001 0893059003 0821397007 0685813001 0817354001 0845729002 0911214001 0765308002 0610776068,144
0714790021 0714790017 0714790008 0448509018 0746069006 0900169001 0714790024 0799365027 0837939002 0905365001 0799365015 0516859008,121
...,...
0775382001 0810170018 0810169002 0905070001 0843940001 0834063001 0810169018 0810170014 0834063006 0810169020 0910528002 0933802002,1
0775382001 0810170018 0810169002 0905070001 0843940001 0834063001 0810169018 0810170014 0834063006 0810169020 0933802002 0810170013,1
0775382001 0810170018 0843940001 0905070001 0834063001 0810169018 0810170014 0834063006 0810169020 0610776105 0910528002 0752657001,1
0775382001 0810170018 0843940001 0905070001 0834063001 0810169018 0834063006 0810169020 0810170013 0910528002 0933802002 0851936011,1
