# Notebook #1: Designing and evaluating a recommendation algorithm

## Set up the working environment

- Python 3.6
- Requirements: matplotlib, numpy, pandas, scikit-learn, scipy, tensorflow-gpu==2.0

In [1]:
import sys 
import os

# Put the parent folder on the import path; presumably this is where the
# `helpers` and `models` packages imported later in the notebook live.
sys.path.append(os.pardir)

In [2]:
! pip install -r '../requirements.txt'

Could not open requirements file: [Errno 2] No such file or directory: "'../requirements.txt'"
You are using pip version 18.1, however version 20.2b1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## Import packages

In [3]:
import pandas as pd
import numpy as np

In [4]:
from helpers.train_test_splitter import *
from helpers.utils import *

In [5]:
# Base folder (relative to this notebook) under which generated outputs are stored.
data_path = '../data/'

## Split data in train and test sets

Define the train-test split parameters
- **dataset**: csv file present in the data/datasets folder
- **method**: 'uftime' for fixed timestamp split, 'utime' for time-based split per user, 'urandom' for random split per user 
- **percentage**: fraction of data (between 0 and 1) to be included in the train set
- **min_train**: minimum number of train samples for a user to be included  
- **min_test**: minimum number of test samples for a user to be included
- **min_time**: start time of interactions to be included
- **max_time**: end time of interactions to be included
- **step_time**: timestamp step while computing the fixed timestamp splitter (only for method "uftime") 
- **user_field**: name of the user column in the dataset csv file
- **item_field**: name of the item column in the dataset csv file
- **rating_field**: name of the rating column in the dataset csv file
- **time_field**: name of the timestamp column in the dataset csv file

In [6]:
# --- What to split, and how ---
dataset = 'ml1m'      # base name of the csv file in the datasets folder
method = 'utime'      # one of 'uftime', 'utime', 'urandom'
percentage = 0.80     # share of the data assigned to the train set

# --- Minimum number of samples required per user ---
min_train, min_test = 8, 2

# --- Time window and step (step_time is documented as 'uftime'-only) ---
# presumably None means "no bound" -- confirm in the splitter helpers
min_time = max_time = None
step_time = 1000

# --- Column names in the dataset csv file ---
user_field, item_field = 'user_id', 'item_id'
rating_field, time_field = 'rating', 'timestamp'

Load dataset interactions

In [7]:
# Read the raw interactions of the selected dataset.
# The location is derived from `data_path` (instead of repeating the '../data/'
# literal) so that inputs and outputs follow the same configurable base folder.
data = pd.read_csv(os.path.join(data_path, 'datasets/' + dataset + '.csv'), encoding='utf8')

In [8]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,type,type_id
0,1,1193,5.0,2000-12-31 23:12:40,Drama,7
1,2,1193,5.0,2000-12-31 22:33:33,Drama,7
2,12,1193,4.0,2000-12-31 00:49:39,Drama,7
3,15,1193,4.0,2000-12-30 19:01:19,Drama,7
4,17,1193,5.0,2000-12-30 07:41:11,Drama,7


Perform the train and test split (splitting methods are defined in ./helpers/train_test_splitter.py)

In [9]:
# Build the train/test split with the strategy selected by `method`.
# The two per-user strategies receive a single threshold, min_train + min_test;
# the fixed-timestamp strategy receives both thresholds and the time settings.
if method == 'uftime':
    traintest = fixed_timestamp(data, min_train, min_test, min_time, max_time, step_time, user_field, item_field, time_field, rating_field)
elif method == 'utime':
    traintest = user_timestamp(data, percentage, min_train+min_test, user_field, item_field, time_field)
elif method == 'urandom':
    traintest = user_random(data, percentage, min_train+min_test, user_field, item_field)
else:
    # Without this guard an unsupported value would leave `traintest` undefined,
    # or silently keep a stale split from an earlier run of this cell.
    raise ValueError("Unknown split method '" + str(method) + "': expected 'uftime', 'utime' or 'urandom'")

> Parsing user 6000 of 6040
> Mean number of train ratings per learner: 133.07913907284768
> Mean number of test ratings per learner: 32.51837748344371


Print some statistics on train and test sets

In [10]:
# Report the number of interactions, distinct users and distinct items
# for the whole split and for each of its two partitions.
in_train = traintest['set'] == 'train'
in_test = traintest['set'] == 'test'

for title, subset in (
    ('Full dataset', traintest),
    ('Train dataset', traintest[in_train]),
    ('Test dataset', traintest[in_test]),
):
    print(title)
    print('> Interactions', len(subset.index))
    print('> Users', len(np.unique(subset[user_field].values)))
    print('> Items', len(np.unique(subset[item_field].values)))

Full dataset
> Interactions 1000209
> Users 6040
> Items 3706
Train dataset
> Interactions 803798
> Users 6040
> Items 3667
Test dataset
> Interactions 196411
> Users 6040
> Items 3532


Save train and test sets in ./data/outputs/splits

In [11]:
# Persist the split so that the following sections can reload it.
# The target folder is created on demand because `to_csv` does not create
# missing directories and would fail on a fresh checkout.
# NOTE(review): the DataFrame index is written as well, so it comes back as an
# extra unnamed column when the file is read without `index_col` -- confirm
# whether `index=False` is wanted.
split_path = os.path.join(data_path, 'outputs/splits/' + dataset + '_' + method + '.csv')
os.makedirs(os.path.dirname(split_path), exist_ok=True)
traintest.to_csv(split_path)

## Run the model train and test

Define the experiment parameters
- **dataset**: csv file present in the data/datasets folder
- **method**: 'uftime' for fixed timestamp split, 'utime' for time-based split per user, 'urandom' for random split per user 
- **mode**: type of feedback to be used (i.e., 'implicit' or 'explicit')
- **user_field**: name of the user column in the dataset csv file
- **item_field**: name of the item column in the dataset csv file
- **rating_field**: name of the rating column in the dataset csv file
- **type_field**: name of the category id column in the dataset csv file
- **model_type**: identifier of the recommendation model to test
- **cutoffs**: comma-separated list of cutoffs to be used for test

In [12]:
# --- Which pre-computed split to use ---
dataset, method = 'ml1m', 'utime'

# --- Type of feedback: 'implicit' or 'explicit' ---
mode = 'implicit'

# --- Column names in the dataset csv file ---
user_field, item_field = 'user_id', 'item_id'
rating_field, type_field = 'rating', 'type_id'

# --- Comma-separated ranking cutoffs used when testing ---
cutoffs = '5,10,20,50,100,200'

Load pre-computed train and test sets

In [13]:
# Reload the saved split and separate it into its two partitions.
# The file location is built from `data_path` in the same way as when the split
# is written, so reading and writing cannot drift apart if the base folder changes.
traintest = pd.read_csv(os.path.join(data_path, 'outputs/splits/' + dataset + '_' + method + '.csv'), encoding='utf8')
# Independent copies, so that later column assignments do not touch `traintest`.
train = traintest[traintest['set']=='train'].copy()
test = traintest[traintest['set']=='test'].copy()
print('> Loaded', len(train.index), 'train interactions')
print('> Loaded', len(test.index), 'test interactions')

> Loaded 803798 train interactions
> Loaded 196411 test interactions


Show some statistics on users and items

In [14]:
# Sorted lists of the distinct user and item identifiers found in the split.
users = sorted(np.unique(traintest[user_field].values))
items = sorted(np.unique(traintest[item_field].values))

# For each list: size, smallest id, largest id, number of distinct ids, first ten ids.
for label, ids in (('users -', users), ('items -', items)):
    print('> Loaded', len(ids), label, np.min(ids), '-', np.max(ids), '-', len(np.unique(ids)), '-', ids[:10])

> Loaded 6040 users - 0 - 6039 - 6040 - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
> Loaded 3706 items - 0 - 3705 - 3706 - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Load category data for items

In [15]:
# Keep one row per item and read the category id from it.
# - The configured `item_field` is used instead of the hard-coded 'item_id', so the
#   cell follows the column name chosen in the parameters.
# - Rows are ordered by item id after de-duplication: drop_duplicates alone keeps
#   first-appearance order, which need not match the sorted `items` list.
# NOTE(review): assumes the model looks categories up by item position/id -- confirm in NeuMF.
items_metadata = traintest.drop_duplicates(subset=[item_field], keep='first').sort_values(by=item_field)
print('> Retrieved', len(items_metadata.index), 'mapping indexes, one per course')
category_per_item = items_metadata[type_field].values
print('> Loading item categories identifiers -', len(set(category_per_item)), 'categories like', category_per_item[:3])

> Retrieved 3706 mapping indexes, one per course
> Loading item categories identifiers - 18 categories like [ 7 13  3]


Choose the type of feedback you want to work with

In [16]:
# Implicit feedback: the rating column of all three frames is overwritten with the
# constant 1.0. A scalar assignment sets the whole column at once instead of
# calling a Python lambda for every row.
if mode == 'implicit':
    for frame in (train, test, traintest):
        frame[rating_field] = 1.0

Load the model architecture to train and test

In [17]:
# Identifier of the model; it is also part of the output file names built below.
model_type = 'neumf'
# Training-instance generation: 'pair' for pair-wise or 'point' for point-wise.
gen_mode = 'point'

In [18]:
from models.neumf import NeuMF

# Build the recommender from the id lists, the two partitions, the per-item
# categories and the column names.
# NOTE(review): item_field is passed before user_field here -- confirm against
# the NeuMF constructor signature.
model = NeuMF(
    users, items,
    train, test,
    category_per_item,
    item_field, user_field, rating_field,
)

> Initializing user, item, and categories lists
> Initializing observed, unobserved, and predicted relevance scores
> Initializing item popularity lists
> Initializing category per item
> Initializing category preference per user
> Initializing metrics


Train the model

In [19]:
# Train the model; the file name passed in encodes dataset, split method and model type.
model_file = dataset + '_' + method + '_' + model_type + '_model.h5'
model.train(os.path.join(data_path, 'outputs/models/' + model_file))

Generating training instances of type point
> Making instances for interaction 800000 / 803798 of type point
> Making training - Epochs 20 Batch Size 1024 Learning Rate 0.001 Factors 10 Negatives 10 Mode point
Train on 7957600 samples, validate on 884178 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.23358, saving model to ../data/outputs/models/ml1m_utime_neumf_model.h5
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.23358
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.23358
Epoch 00003: early stopping


Model architecture

In [20]:
# Display the layer-by-layer summary of the model.
model.print()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlp_embedding_user (Embedding)  (None, 1, 32)        193280      user_input[0][0]                 
__________________________________________________________________________________________________
mlp_embedding_item (Embedding)  (None, 1, 32)        118592      item_input[0][0]                 
______________________________________________________________________________________________

## Compute user-item relevance scores

In [None]:
# Compute the user-item relevance scores; progress is reported user by user.
# NOTE(review): the recorded run of this cell has no execution count and its
# log stops before the last user -- re-run it to completion before relying on
# the saved predictions.
model.predict()

> Making predictions for user 657 / 6040 / 6040 28 / 6040 29 / 6040 30 / 6040 32 / 6040 37 / 6040 43 / 6040 50 / 6040 56 / 6040 65 / 6040 66 / 6040 105 / 6040 122 / 6040 124 / 6040 125 / 6040 154 / 6040 155 / 6040 165 / 6040 184 / 6040 185 / 6040 186 / 6040 187 / 6040 188 / 6040 193 / 6040 201 / 6040 218 / 6040 248 / 6040 249 / 6040 277 / 6040 283 / 6040 304 / 6040 305 / 6040 335 / 6040 336 / 6040 337 / 6040 361 / 6040 362 / 6040 366 / 6040 367 / 6040 368 / 6040 397 / 6040 400 / 6040 432 / 6040 451 / 6040 459 / 6040 460 / 6040 461 / 6040 462 / 6040 491 / 6040 509 / 6040 515 / 6040 518 / 6040 521 / 6040 523 / 6040 533 / 6040 544 / 6040 546 / 6040 548 / 6040 565 / 6040 566 / 6040 578 / 6040 579 / 6040 582 / 6040 586 / 6040 598 / 6040 599 / 6040 606 / 6040 608 / 6040 613 / 6040 631 / 6040

In [28]:
# Store the predictions under a file name that encodes dataset, split method and model type.
predictions_file = dataset + '_' + method + '_' + model_type + '_pred.h5'
save_obj(model.get_predictions(), os.path.join(data_path, 'outputs/predictions/' + predictions_file))

## Calculate metrics

In [29]:
# Turn the comma-separated cutoff string into an integer array and evaluate the model at those cutoffs.
cutoff_values = np.array(list(map(int, cutoffs.split(','))))
model.test(cutoffs=cutoff_values)

> Making metrics for user 6000 / 6040

In [30]:
# Store the metrics under a file name that encodes dataset, split method and model type.
metrics_file = dataset + '_' + method + '_' + model_type + '_metr.h5'
save_obj(model.get_metrics(), os.path.join(data_path, 'outputs/metrics/' + metrics_file))

In [31]:
# Print the summary of the computed evaluation metrics.
model.show_metrics()

> Precision: 0.135 
 Recall: 0.0319 
 NDCG: 0.1409 
 Hit Rate: 0.4106 
 Avg Popularity: 1679.0851 
 Category Diversity: 0.2066 
 Novelty: 2.0525 
 Item Coverage: 0.23 
 User Coverage: 0.4106
