# Imports

In [1]:
import os; os.environ['OPENBLAS_NUM_THREADS']='1'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
from implicit.evaluation import ndcg_at_k

# Load dataframes

In [2]:
import pandas as pd
import numpy as np

# Load the raw interaction log and collapse the two feedback columns into a
# single weight: rating = Click + 5 * Purchase (Purchase is weighted 5x Click).
df = pd.read_csv('data/training_set.csv')
df['rating'] = df['Click'] + 5*df['Purchase']
# Disabled experiment: keep only the last 20 rows of each user.
# df = df.groupby('UserId').tail(20)
# Keep only the columns needed to build the sparse matrices.
df = df[['UserId', 'ItemId', 'rating']]

# Distinct raw user ids found in the training data.
user_list = df['UserId'].unique()
# Disabled experiment (user filtering), kept for reference.
# user_list_denoise = user_list[(user_list<=30)].index.to_list()

In [3]:
# Read the public test set with explicit column names: 'user_id' followed by
# 1000 item columns named item_id_1 .. item_id_1000.
test = pd.read_csv('data/public_testset.csv', names=['user_id'] + [f'item_id_{i}' for i in range(1,1001)])
# Raw user ids of the test rows as an array.
test_user_id = test['user_id'].values

In [4]:
# Tabular preview of the raw user id -> positional index assignment.
# NOTE: `user_map` is rebound to a plain dict by the id-assignment cell further down.
user_map = pd.DataFrame(
    [(raw_id, position) for position, raw_id in enumerate(user_list)],
    columns=['UserId', 'index'],
)
user_map.head()

Unnamed: 0,UserId,index
0,tyviMi4b8Q,0
1,PGcvoaV6Gn,1
2,vZK9GTxbNA,2
3,Sr1MiLP6VX,3
4,y9iZADrbdu,4


## Assign autoincrementing ids starting from 0 to both users and items

In [5]:
# Distinct raw ids; an id's position in these lists is its matrix index.
ALL_USERS = df['UserId'].unique().tolist()
ALL_ITEMS = df['ItemId'].unique().tolist()

# index -> raw id
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# raw id -> index (inverse of the lookups above)
user_map = {raw_id: position for position, raw_id in enumerate(ALL_USERS)}
item_map = {raw_id: position for position, raw_id in enumerate(ALL_ITEMS)}

# Overwrite the raw ids in the dataframe with their integer indices.
df['UserId'] = df['UserId'].map(user_map)
df['ItemId'] = df['ItemId'].map(item_map)



In [6]:
# Translate the raw test user ids into matrix row indices via `user_map`.
# NOTE(review): Series.map yields NaN for keys missing from the mapping, so test
# users that never appear in the training data presumably end up as NaN here
# (and the column becomes float) — confirm and handle before indexing with it.
test['user_index'] = test['user_id'].map(user_map)
test_user_index = test['user_index'].values

In [7]:
# Random hold-out split of the interactions (95% for train, 5% for validation)
train_df = df.sample(frac=0.95, random_state=42)  # 95% for train, fixed seed
test_df = df.drop(train_df.index)  # Remaining 5% for validation

## Create coo_matrix (user x item) and csr matrix (user x item)

It is common to use scipy sparse matrices in recommender systems, because the core of the problem is typically modeled as a matrix of users and items whose values represent whether the user purchased (or liked) an item. Since each user purchases only a small fraction of the product catalog, this matrix is almost entirely zeros (i.e. it is sparse).

A recent release of `implicit` introduced a breaking API change, so be aware of it: https://github.com/benfred/implicit/releases
This notebook uses the latest version, so every matrix is in (user x item) format.

**We are using (user x item) matrices, both for training and for evaluating/recommender.**

In previous versions, the training procedure required a COO (item x user) matrix.

For evaluation and prediction, on the other hand, CSR matrices with users x items format should be provided.


### About COO matrices
COO matrices are a kind of sparse matrix.
They store their values as tuples of `(row, column, value)` (the coordinates)

You can read more about them here: 
* https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)
* https://scipy-lectures.org/advanced/scipy_sparse/coo_matrix.html

From https://het.as.utexas.edu/HET/Software/Scipy/generated/scipy.sparse.coo_matrix.html

```python
>>> row  = np.array([0,3,1,0]) # user_ids
>>> col  = np.array([0,3,1,2]) # item_ids
>>> data = np.array([4,5,7,9]) # values, one per (row, col) pair
>>> coo_matrix((data,(row,col)), shape=(4,4)).todense()
matrix([[4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5]])
```

### About CSR matrices
* https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)


In [8]:
# Build the (user x item) interaction matrix from the full dataframe `df`
# (not from `train_df`): rows are user indices, columns are item indices and
# the stored values are the ratings.
row = df['UserId'].values
col = df['ItemId'].values
# Disabled variant: binary feedback (all ones) instead of the rating weights.
# data = np.ones(train_df.shape[0])
data = df['rating'].values
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
# Display the matrix summary as the cell output.
coo_train

<COOrdinate sparse matrix of dtype 'int64'
	with 389923 stored elements and shape (36751, 83102)>

# Validation

## Functions required for validation

In [9]:
def to_user_item_coo(df, shape=None):
    """ Turn a dataframe with transactions into a COO sparse (users x items) matrix

    Args:
        df: dataframe with integer-encoded 'UserId' (row index) and 'ItemId'
            (column index) columns, plus a 'rating' column holding the values.
        shape: optional (n_users, n_items) tuple. When omitted, the sizes of the
            module-level ALL_USERS / ALL_ITEMS lists are used, so matrices built
            from different subsets of the data share the same dimensions.

    Returns:
        A scipy.sparse.coo_matrix with one stored entry per dataframe row.
    """
    if shape is None:
        shape = (len(ALL_USERS), len(ALL_ITEMS))
    row = df['UserId'].values
    col = df['ItemId'].values
    data = df['rating'].values
    return coo_matrix((data, (row, col)), shape=shape)


def split_data(df, validation_days=7):
    """ Split a pandas dataframe into training and validation data, using <<validation_days>>

    The validation part is the last <<validation_days>> days, measured back from
    the most recent 't_dat' value; every earlier row goes to training.

    Args:
        df: dataframe with a datetime 't_dat' column.
        validation_days: length of the validation window, in days.

    Returns:
        (df_train, df_val) tuple of dataframes.
    """
    # `days=` must be explicit: pd.Timedelta reads a bare number as nanoseconds,
    # which would leave only the very latest timestamp in the validation set.
    validation_cut = df['t_dat'].max() - pd.Timedelta(days=validation_days)

    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df, validation_days=7):
    """ Build the sparse matrices used for validation.

        NOTE: <<df>> and <<validation_days>> are currently ignored; the split is
        taken from the module-level `train_df` / `test_df` dataframes.

        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)

    """
    train_coo = to_user_item_coo(train_df)
    val_coo = to_user_item_coo(test_df)

    return {
        'coo_train': train_coo,
        'csr_train': train_coo.tocsr(),
        'csr_val': val_coo.tocsr(),
    }


def validate(model, matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Evaluate an already fitted <<model>> with NDCG@10 on the validation matrix.

    No training happens here. <<factors>>, <<iterations>> and <<regularization>>
    are only echoed in the printed summary line, so pass the values the model was
    actually trained with.

    Args:
        model: fitted recommender passed straight to implicit's ndcg_at_k.
        matrices: dictionary with 'csr_train' and 'csr_val' entries (see get_val_matrices).
        factors, iterations, regularization: hyper-parameters, used for logging only.
        show_progress: forwarded to ndcg_at_k.

    Returns:
        The NDCG@10 score.
    """
    csr_train, csr_val = matrices['csr_train'], matrices['csr_val']

    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: add MAP@12 using a library that allows repeated items in prediction
    ndcg = ndcg_at_k(model, csr_train, csr_val, K=10, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> NDCG@10: {ndcg:6.5f}")
    return ndcg

In [10]:
# Build the train/validation sparse matrices used by validate().
matrices = get_val_matrices(df)

In [11]:
# %%time
# Disabled grid search over the number of factors, kept for reference.
# NOTE(review): this snippet is stale — it calls validate() without the `model`
# argument the current signature requires, and it reads `iterations` before
# that name is assigned in this cell.
# best_map12 = 0
# for factors in [40, 50, 60, 100, 200, 500, 1000]:
#     for regularization in [0.01]:
#         map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
#         if map12 > best_map12:
#             best_map12 = map12
#             best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
#             print(f"Best MAP@12 found. Updating: {best_params}")

# Hyper-parameters used for the final model; unpacked into train() as keyword arguments.
factors = 1000
iterations = 200
regularization = 0.01
best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}

# Training over the full dataset

In [12]:
# Rebuild the interaction matrix from the complete dataframe (training rows plus
# the held-out rows) for the final fit, and keep a CSR copy alongside it.
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()

In [13]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    """ Fit an ALS model on <<coo_train>> and return it.

    <<factors>>, <<iterations>> and <<regularization>> are forwarded to
    implicit's AlternatingLeastSquares; the random seed is fixed to 42.
    """
    als = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    als.fit(coo_train, show_progress=show_progress)
    return als

In [14]:
# Fit the final ALS model on the full-data matrix with the chosen hyper-parameters.
# NOTE(review): the recorded output below shows a KeyboardInterrupt, so this
# particular run apparently did not finish — re-run before relying on `model`.
model = train(coo_train, **best_params)



  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
# Directory that receives the pickled artifacts; created if it does not exist yet.
save_path = "runs/ALS"
os.makedirs(save_path, exist_ok=True)

In [19]:
import pickle

# Persist the trained model together with the lookups and the matrix it was
# trained on. Each entry is written to "<save_path>/<key>.pkl".
artifacts = {
    "user_embedding": model.user_factors,
    # Previously this file was written with model.user_factors (copy-paste bug),
    # so item_embedding.pkl was just a second copy of the user embeddings.
    "item_embedding": model.item_factors,
    "model": model,
    "usermap": user_map,
    "itemmap": item_map,
    "csr_train": csr_train,
}

for artifact_name, artifact in artifacts.items():
    with open(f"{save_path}/{artifact_name}.pkl", "wb") as f:
        pickle.dump(artifact, f)

In [20]:
# Display the index -> raw user id lookup (output below is truncated).
user_ids

{0: 'tyviMi4b8Q',
 1: 'PGcvoaV6Gn',
 2: 'vZK9GTxbNA',
 3: 'Sr1MiLP6VX',
 4: 'y9iZADrbdu',
 5: 'URQmc5J6kH',
 6: '1j704xkcse',
 7: 'X2oDWpC701',
 8: '4KWQyiec7N',
 9: 'acDdQa67Fk',
 10: '6uuqsSXcN7',
 11: 'dCc4KKz7UU',
 12: '9UJHmCRccr',
 13: 'gn1UE5t7kE',
 14: 'C5ihgxLcrb',
 15: 'iNPv8pm7zx',
 16: 'Ff78ahEd7K',
 17: 'lxoL2Zg8Fh',
 18: 'IGVYUR8dM4',
 19: 'oYDmwKZ8UR',
 20: 'KquzOC1dco',
 21: 'r8bCq4T8jB',
 22: 'NQJPIwvdrX',
 23: 'uj0ckoN8zu',
 24: 'Q1hqChod6H',
 25: 'wJP3eZG9Ee',
 26: 'Tb6G6RieM1',
 27: 'ztnTYJA9TO',
 28: 'VCVh0Bcebl',
 29: '2UCuS339j7',
 30: 'Ymt7uwVeqU',
 31: '54bKMox9yr',
 32: 'bMIYogPe6E',
 33: '7fzlGYq9Db',
 34: 'exhyiQIeLy',
 35: 'AFOBAJkATL',
 36: 'hX5PcBCfai',
 37: 'Dpnc43eAi4',
 38: 'j7UpWv5fqR',
 39: 'GQB2ynXAxo',
 40: 'mitFQfzf5B',
 41: 'J0aTsYRADY',
 42: 'pIHgKQsfKv',
 43: 'LaztmIKASI',
 44: 'stg6EAmgae',
 45: 'OBNKg2EBh1',
 46: 'uT5X8vggpO',
 47: 'Rlmkan7Bxl',
 48: 'x3Tx2fZg48',
 49: 'UMBBUX1BCV',
 50: '0esOwPTgKs',
 51: 'WwZbOHuBRE',
 52: '3EHoqAMgZb',
 53

In [None]:
# Score the trained model with NDCG@10 against the held-out interactions.
# NOTE(review): `model` was fitted on the matrix built from the full `df`, which
# also contains the `test_df` rows, so this score is presumably optimistic —
# fit on matrices['coo_train'] instead if an unbiased estimate is needed.
ndcg10 = validate(model, matrices, factors, iterations, regularization, show_progress=False)
print(ndcg10)