# Imports

In [154]:
# !pip install --upgrade implicit

In [36]:
import os; os.environ['OPENBLAS_NUM_THREADS']='1'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k

#source information and code from: 
# http://yifanhu.net/PUB/cf.pdf?fbclid=IwAR1XXu-uOKea06KXjvhR6oDNKqs_X-gk70YltpL4gUrDc0kwGFXivGcRrEI
# https://www.kaggle.com/code/julian3833/h-m-implicit-als-model-0-014
# https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
# https://implicit.readthedocs.io/en/latest/quickstart.html

# Collaborative Filtering

In [37]:
df = pd.read_csv('cleaned_data2.csv', dtype={'StockCode': str}, parse_dates=['InvoiceDate'])

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516528 entries, 0 to 516527
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    516528 non-null  object        
 1   StockCode    516528 non-null  object        
 2   Quantity     516528 non-null  int64         
 3   InvoiceDate  516528 non-null  datetime64[ns]
 4   UnitPrice    516528 non-null  float64       
 5   CustomerID   392980 non-null  float64       
 6   Country      516528 non-null  object        
 7   TotalValue   516528 non-null  float64       
 8   Description  516528 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 35.5+ MB


In [38]:
df['InvoiceDate']=pd.to_datetime(df['InvoiceDate'])

In [39]:
df_cust = df.dropna()
df_cust = df_cust[df_cust["Quantity"]>0]
dfu=pd.DataFrame(df_cust.groupby(["CustomerID","StockCode"],as_index=False)["Quantity"].sum())
dfu.head()

Unnamed: 0,CustomerID,StockCode,Quantity
0,12346.0,23166,74215
1,12347.0,16008,24
2,12347.0,17021,36
3,12347.0,20665,6
4,12347.0,20719,40


In [10]:
# For validation this means 3 weeks of training and 1 week for validation
# For submission, it means 4 weeks of training
#df['InvoiceDate'].max()

Timestamp('2011-12-09 12:50:00')

## Assign autoincrementing ids starting from 0 to both users and items

In [40]:
ALL_USERS = dfu['CustomerID'].unique().tolist()
ALL_ITEMS = dfu['StockCode'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

dfu['user_id'] = dfu['CustomerID'].map(user_map)
dfu['item_id'] = dfu['StockCode'].map(item_map)

dfu

Unnamed: 0,CustomerID,StockCode,Quantity,user_id,item_id
0,12346.0,23166,74215,0,0
1,12347.0,16008,24,1,1
2,12347.0,17021,36,1,2
3,12347.0,20665,6,1,3
4,12347.0,20719,40,1,4
...,...,...,...,...,...
256228,18287.0,84920,4,4328,1952
256229,18287.0,85039a,96,4328,1954
256230,18287.0,85039b,120,4328,2391
256231,18287.0,85040a,48,4328,302


In [74]:
row = dfu['user_id'].values
col = dfu['item_id'].values
data = dfu['Quantity'].values
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train

<4329x2487 sparse matrix of type '<class 'numpy.int64'>'
	with 256233 stored elements in COOrdinate format>

# Check that model works ok with data

In [75]:
%%time
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)

  0%|          | 0/2 [00:00<?, ?it/s]

Wall time: 464 ms


# Validation

## Functions required for validation

In [77]:
def to_user_item_coo(df):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    row = df['user_id'].values
    col = df['item_id'].values
    data = df['Quantity'].values
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo


def split_data(df, validation_days=60):
    """ Split a pandas dataframe into training and validation data, using <<validation_days>>
    """
#    validation_cut = df['InvoiceDate'].max() - pd.Timedelta(validation_days)
    validation_cut = '2011-10-09 12:50:00'

    df_train = df[df['InvoiceDate'] < validation_cut]
    df_val = df[df['InvoiceDate'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df_train, df_val):
    """ Split into training and validation and create various matrices
        
        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    
    """
#    df_train, df_val = split_data(df, validation_days=validation_days)
    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val
          }


def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with MAP@5
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(matrices['coo_train'], show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@5 to a library that allows repeated items in prediction
    map5 = mean_average_precision_at_k(model, csr_train, csr_val, K=5, show_progress=show_progress)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@5: {map5:6.5f}")
    return map5

In [78]:
df_train, df_val= split_data(df_cust, validation_days=60)

In [79]:
df_val['InvoiceDate'].max()

Timestamp('2011-12-09 12:50:00')

In [80]:
df_train = pd.DataFrame(df_train.groupby(["CustomerID","StockCode"],as_index=False)["Quantity"].sum())
df_train['user_id'] = df_train['CustomerID'].map(user_map)
df_train['item_id'] = df_train['StockCode'].map(item_map)

df_val = pd.DataFrame(df_val.groupby(["CustomerID","StockCode"],as_index=False)["Quantity"].sum())
df_val['user_id'] = df_val['CustomerID'].map(user_map)
df_val['item_id'] = df_val['StockCode'].map(item_map)

In [81]:
matrices = get_val_matrices(df_train, df_val)

In [82]:
#precision with preset parameters
mean_average_precision_at_k(model, matrices['csr_train'], matrices['csr_val'], K=5, show_progress=False)
#implicit.evaluation.ranking_metrics_at_k(model, matrices['csr_train'], matrices['csr_val'], K=5, show_progress=False,)
#matrices['coo_train'], matrices['csr_train'], matrices['csr_val']

0.09883524032717536

In [83]:
%%time
best_map5 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map5 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map5 > best_map5:
                best_map5 = map5
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@5 found. Updating: {best_params}")

Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> MAP@5: 0.02632
Best MAP@5 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}
Factors:  40 - Iterations: 12 - Regularization: 0.010 ==> MAP@5: 0.03061
Best MAP@5 found. Updating: {'factors': 40, 'iterations': 12, 'regularization': 0.01}
Factors:  40 - Iterations: 14 - Regularization: 0.010 ==> MAP@5: 0.03073
Best MAP@5 found. Updating: {'factors': 40, 'iterations': 14, 'regularization': 0.01}
Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@5: 0.03058
Factors:  40 - Iterations: 20 - Regularization: 0.010 ==> MAP@5: 0.03108
Best MAP@5 found. Updating: {'factors': 40, 'iterations': 20, 'regularization': 0.01}
Factors:  50 - Iterations:  3 - Regularization: 0.010 ==> MAP@5: 0.02718
Factors:  50 - Iterations: 12 - Regularization: 0.010 ==> MAP@5: 0.03214
Best MAP@5 found. Updating: {'factors': 50, 'iterations': 12, 'regularization': 0.01}
Factors:  50 - Iterations: 14 - Regularization: 0.010 ==> MA

In [13]:
#del matrices

In [106]:
best_params

In [86]:
model = implicit.als.AlternatingLeastSquares(**best_params,
                                            random_state=42)
model.fit(matrices['coo_train'], show_progress=True)

  0%|          | 0/14 [00:00<?, ?it/s]

In [87]:
#presicion with best parameters
mean_average_precision_at_k(model, matrices['csr_train'], matrices['csr_val'], K=5, show_progress=False)

0.037187413840639685

## Testing different timelines

In [96]:
batch_date_start = '2011-06-01 12:50:00'
batch_date_end = '2011-07-01 12:50:00'
df_cust_small = df_cust[(df_cust['InvoiceDate'] >= batch_date_start)&(df_cust['InvoiceDate'] < batch_date_end)]
df_cust_small

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalValue,Description
196850,555234,48185,3,2011-06-01 12:54:00,7.95,14515.0,United Kingdom,23.85,doormat fairy cake
196851,555234,48194,6,2011-06-01 12:54:00,7.95,14515.0,United Kingdom,47.70,doormat hearts
196852,555234,48188,2,2011-06-01 12:54:00,7.95,14515.0,United Kingdom,15.90,doormat welcome puppies
196853,555234,22189,10,2011-06-01 12:54:00,3.95,14515.0,United Kingdom,39.50,cream heart card holder
196854,555234,22178,24,2011-06-01 12:54:00,1.25,14515.0,United Kingdom,30.00,victorian glass hanging t-light
...,...,...,...,...,...,...,...,...,...
232256,558703,23146,4,2011-07-01 12:47:00,3.29,13730.0,United Kingdom,13.16,triple hook antique ivory rose
232257,558703,22726,4,2011-07-01 12:47:00,3.75,13730.0,United Kingdom,15.00,alarm clock bakelike green
232258,558703,22729,4,2011-07-01 12:47:00,3.75,13730.0,United Kingdom,15.00,alarm clock bakelike orange
232259,558703,84879,16,2011-07-01 12:47:00,1.69,13730.0,United Kingdom,27.04,assorted colour bird ornament


In [98]:
validation_cut = '2011-06-22 12:50:00'

df_train = df_cust_small[df_cust_small['InvoiceDate'] < validation_cut]
df_val = df_cust_small[df_cust_small['InvoiceDate'] >= validation_cut]

In [99]:
df_train = pd.DataFrame(df_train.groupby(["CustomerID","StockCode"],as_index=False)["Quantity"].sum())
df_train['user_id'] = df_train['CustomerID'].map(user_map)
df_train['item_id'] = df_train['StockCode'].map(item_map)

df_val = pd.DataFrame(df_val.groupby(["CustomerID","StockCode"],as_index=False)["Quantity"].sum())
df_val['user_id'] = df_val['CustomerID'].map(user_map)
df_val['item_id'] = df_val['StockCode'].map(item_map)

In [100]:
matrices = get_val_matrices(df_train, df_val)

In [101]:
df_val

Unnamed: 0,CustomerID,StockCode,Quantity,user_id,item_id
0,12379.0,20665,6,28,3
1,12379.0,20674,8,28,578
2,12379.0,20676,8,28,250
3,12379.0,20750,12,28,254
4,12379.0,21080,12,28,255
...,...,...,...,...,...
6934,18283.0,84997b,1,4327,95
6935,18283.0,84997d,1,4327,97
6936,18283.0,85099b,3,4327,575
6937,18283.0,85099f,2,4327,577


In [102]:
%%time
best_map5 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map5 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map5 > best_map5:
                best_map5 = map5
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@5 found. Updating: {best_params}")

Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> MAP@5: 0.02278
Best MAP@5 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}
Factors:  40 - Iterations: 12 - Regularization: 0.010 ==> MAP@5: 0.02396
Best MAP@5 found. Updating: {'factors': 40, 'iterations': 12, 'regularization': 0.01}
Factors:  40 - Iterations: 14 - Regularization: 0.010 ==> MAP@5: 0.02341
Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@5: 0.02321
Factors:  40 - Iterations: 20 - Regularization: 0.010 ==> MAP@5: 0.02301
Factors:  50 - Iterations:  3 - Regularization: 0.010 ==> MAP@5: 0.02953
Best MAP@5 found. Updating: {'factors': 50, 'iterations': 3, 'regularization': 0.01}
Factors:  50 - Iterations: 12 - Regularization: 0.010 ==> MAP@5: 0.02769
Factors:  50 - Iterations: 14 - Regularization: 0.010 ==> MAP@5: 0.02797
Factors:  50 - Iterations: 15 - Regularization: 0.010 ==> MAP@5: 0.02835
Factors:  50 - Iterations: 20 - Regularization: 0.010 ==> MAP@5: 0.02674
Factors:  60 -

In [103]:
best_params

{'factors': 50, 'iterations': 3, 'regularization': 0.01}

In [104]:
model = implicit.als.AlternatingLeastSquares(**best_params,
                                            random_state=42)
model.fit(matrices['coo_train'], show_progress=True)

  0%|          | 0/3 [00:00<?, ?it/s]

In [105]:
#for the last month of data
mean_average_precision_at_k(model, matrices['csr_train'], matrices['csr_val'], K=5, show_progress=False)

0.029527600849256905

# Training over the full dataset

In [107]:
coo_train = to_user_item_coo(dfu)
csr_train = coo_train.tocsr()

In [108]:
best_params

{'factors': 200, 'iterations': 14, 'regularization': 0.01}

In [109]:
model = implicit.als.AlternatingLeastSquares(**best_params,
                                            random_state=42)
model.fit(coo_train, show_progress=True)

  0%|          | 0/14 [00:00<?, ?it/s]

# Getting recommendations

In [110]:
customer_reference = dfu[["CustomerID",'user_id']].drop_duplicates()
item_reference = dfu[["StockCode",'item_id']].drop_duplicates()
item_reference=pd.merge(item_reference,df_cust[["StockCode","Description"]], on="StockCode", how="left").drop_duplicates()

In [117]:
#getting user recommendations
#customer_id = 14789
user_id = 1790

# Use the implicit recommender.
recommended = model.recommend(user_id, csr_train[user_id], N=5)

recomendations_list = recommended[0].tolist()
items = []
description = []
score = []


# Get artist names from ids
for item in recomendations_list:
    items.append(dfu["StockCode"][dfu.item_id == item].unique().tolist())

items = [item for sublist in items for item in sublist]
recommendations = pd.DataFrame({'items': items, 'score': recommended[1]})
print(recommendations)

   items     score
0  20972  0.581438
1  22644  0.503945
2  22898  0.499361
3  22617  0.499310
4  22646  0.498556


In [118]:
item_reference[item_reference["StockCode"].isin(items)]

Unnamed: 0,StockCode,item_id,Description
60510,22617,211,baking set spaceboy design
63142,22646,218,ceramic strawberry cake money bank
152491,22898,618,childrens apron apples design
182508,20972,756,pink cream felt craft trinket box
215122,22644,918,ceramic cherry cake money bank


In [119]:
#items purchased
cust_id_lookup=customer_reference["CustomerID"][customer_reference["user_id"]==user_id].tolist()
df_cust[df_cust["CustomerID"]==cust_id_lookup[0]]

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalValue,Description
435145,575737,21108,36,2011-11-11 08:59:00,0.79,14789.0,United Kingdom,28.44,fairy cake flannel assorted colour
435146,575737,22367,16,2011-11-11 08:59:00,1.95,14789.0,United Kingdom,31.2,childrens apron spaceboy design
435147,575737,47591d,16,2011-11-11 08:59:00,1.95,14789.0,United Kingdom,31.2,pink fairy cake childrens apron
435148,575737,20971,24,2011-11-11 08:59:00,1.25,14789.0,United Kingdom,30.0,pink blue felt craft trinket box
435149,575737,22645,12,2011-11-11 08:59:00,1.45,14789.0,United Kingdom,17.4,ceramic heart fairy cake money bank
435150,575737,22138,12,2011-11-11 08:59:00,4.95,14789.0,United Kingdom,59.4,baking set 9 piece retrospot


In [120]:
#getting user recommendations
#customer_id = 13120
user_id = 586

# Use the implicit recommender.
recommended = model.recommend(user_id, csr_train[user_id], N=5)

recomendations_list = recommended[0].tolist()
items = []
description = []
score = []


# Get artist names from ids
for item in recomendations_list:
    items.append(dfu["StockCode"][dfu.item_id == item].unique().tolist())

items = [item for sublist in items for item in sublist]
recommendations = pd.DataFrame({'items': items, 'score': recommended[1]})
print(recommendations)

    items     score
0  82494l  0.638831
1   82483  0.251663
2   82486  0.224138
3   82484  0.194639
4   22766  0.190953


In [121]:
item_reference[item_reference["StockCode"].isin(items)]

Unnamed: 0,StockCode,item_id,Description
139133,82484,569,wood black board ant white finish
197158,22766,843,photo frame cornice
324020,82494l,1649,wooden frame antique white
328763,82483,1698,wood 2 drawer cabinet white finish
329284,82486,1699,wood s/3 cabinet ant white finish


In [122]:
#items purchased
cust_id_lookup=customer_reference["CustomerID"][customer_reference["user_id"]==user_id].tolist()
df_cust[df_cust["CustomerID"]==cust_id_lookup[0]]

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalValue,Description
147137,550198,82482,12,2011-04-15 10:19:00,2.55,13120.0,United Kingdom,30.6,wooden picture frame white finish


In [123]:
#getting user recommendations
#customerID = 12853
user_id = 400

# Use the implicit recommender.
recommended = model.recommend(user_id, csr_train[user_id], N=5)

recomendations_list = recommended[0].tolist()
items = []
description = []
score = []


# Get artist names from ids
for item in recomendations_list:
    items.append(dfu["StockCode"][dfu.item_id == item].unique().tolist())

items = [item for sublist in items for item in sublist]
recommendations = pd.DataFrame({'items': items, 'score': recommended[1]})
print(recommendations)

   items     score
0  22854  0.765817
1  23208  0.729869
2  22726  0.720887
3  48194  0.706198
4  22173  0.662107


In [124]:
item_reference[item_reference["StockCode"].isin(items)]

Unnamed: 0,StockCode,item_id,Description
12214,22726,45,alarm clock bakelike green
50089,48194,179,doormat hearts
132129,23208,533,lunch bag vintage leaf design
319563,22173,1618,metal 4 hook hanger french chateau
358758,22854,2012,cream sweetheart egg holder


In [126]:
#items purchased
pd.set_option('display.max_rows', None)
cust_id_lookup=customer_reference["CustomerID"][customer_reference["user_id"]==user_id].tolist()
df_cust[df_cust["CustomerID"]==cust_id_lookup[0]]

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalValue,Description
69233,542431,22988,12,2011-01-28 10:28:00,1.25,12853.0,United Kingdom,15.0,soldiers egg cup
69234,542431,22652,10,2011-01-28 10:28:00,1.65,12853.0,United Kingdom,16.5,travel sewing kit
69235,542431,84380,12,2011-01-28 10:28:00,1.25,12853.0,United Kingdom,15.0,set of 3 butterfly cookie cutters
69236,542431,84378,12,2011-01-28 10:28:00,1.25,12853.0,United Kingdom,15.0,set of 3 heart cookie cutters
69237,542431,22966,12,2011-01-28 10:28:00,1.25,12853.0,United Kingdom,15.0,gingerbread man cookie cutter
69238,542431,22666,6,2011-01-28 10:28:00,2.95,12853.0,United Kingdom,17.7,recipe box pantry yellow design
69239,542431,22170,4,2011-01-28 10:28:00,6.75,12853.0,United Kingdom,27.0,picture frame wood triple portrait
69240,542431,22169,4,2011-01-28 10:28:00,8.5,12853.0,United Kingdom,34.0,family album white picture frame
69241,542431,22171,2,2011-01-28 10:28:00,8.5,12853.0,United Kingdom,17.0,3 hook photo shelf antique white
69242,542431,22795,4,2011-01-28 10:28:00,6.75,12853.0,United Kingdom,27.0,sweetheart recipe book stand
