# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.evaluation import auc_score 
sys.path.append('../../')




In [2]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of data to check for missing users, items ecc...

In [3]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [4]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [6]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

#splitting data into train and validation (from the previous training data)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)





In [7]:
URM_all

<7947x25975 sparse matrix of type '<class 'numpy.float64'>'
	with 113268 stored elements in Compressed Sparse Row format>

In [8]:
ICM_all

<25975x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 490691 stored elements in Compressed Sparse Row format>

In [9]:
#evaluator for validation (used for hyperparameter tuning)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])

In [10]:
## In order to evaluate put it in a dummy recommender class
from Base.BaseRecommender import BaseRecommender
from lightfm import LightFM


class LightFMWrapper(BaseRecommender):
    """LightFMWrapper"""

    RECOMMENDER_NAME = "LightFMWrapper"

    def __init__(self, URM_train, ICM_train):
        super(LightFMWrapper, self).__init__(URM_train)
        
        self.ICM_train = ICM_train.copy()


    def fit(self, ITEM_ALPHA, NUM_COMPONENTS, NUM_EPOCHS, NUM_THREADS):
        
        # Let's fit a WARP model
        self.lightFM_model = LightFM(loss='warp',
                                     item_alpha=ITEM_ALPHA,
                                     no_components=NUM_COMPONENTS)

        self.lightFM_model = self.lightFM_model.fit(URM_train, 
                                       item_features=self.ICM_train, 
                                       epochs=NUM_EPOCHS, 
                                       num_threads=NUM_THREADS)


    def _compute_item_score(self, user_id_array, items_to_compute = None):
        
        # Create a single (n_items, ) array with the item score, then copy it for every user
        if items_to_compute is None:
            items_to_compute = np.arange(self.n_items)
            item_features = self.ICM_train 
        else:     
            items_to_compute = np.array(items_to_compute)
            item_features = self.ICM_train[items_to_compute,:]
        
        item_scores = - np.ones((len(user_id_array), self.n_items)) * np.inf

        for user_index, user_id in enumerate(user_id_array):
            item_scores[user_index] = self.lightFM_model.predict(int(user_id), 
                                                                 items_to_compute,
                                                                 item_features = item_features)

        return item_scores

In [22]:
NUM_THREADS = 4
NUM_COMPONENTS = 50
NUM_EPOCHS = 400
ITEM_ALPHA = 1e-7
recommender = LightFMWrapper(URM_train, ICM_all)
recommender.fit(ITEM_ALPHA, NUM_COMPONENTS, NUM_EPOCHS, NUM_THREADS)

evaluator_validation.evaluateRecommender(recommender)

LightFMWrapper: URM Detected 36 (0.45 %) cold users.
LightFMWrapper: URM Detected 2019 (7.77 %) cold items.
EvaluatorHoldout: Processed 1000 ( 19.97% ) in 54.11 sec. Users per second: 18
EvaluatorHoldout: Processed 2000 ( 39.94% ) in 1.98 min. Users per second: 17
EvaluatorHoldout: Processed 3000 ( 59.90% ) in 3.08 min. Users per second: 16
EvaluatorHoldout: Processed 4000 ( 79.87% ) in 4.01 min. Users per second: 17
EvaluatorHoldout: Processed 5000 ( 99.84% ) in 5.07 min. Users per second: 16
EvaluatorHoldout: Processed 5008 ( 100.00% ) in 5.08 min. Users per second: 16


({10: {'ROC_AUC': 0.09889225651655767,
   'PRECISION': 0.02072683706070269,
   'PRECISION_RECALL_MIN_DEN': 0.0768728764136112,
   'RECALL': 0.07493532748564656,
   'MAP': 0.03243935047209924,
   'MRR': 0.070094990871748,
   'NDCG': 0.05111788286356636,
   'F1': 0.032472029673400606,
   'HIT_RATE': 0.20726837060702874,
   'ARHR': 0.07782904495664072,
   'NOVELTY': 0.004930330981442909,
   'AVERAGE_POPULARITY': 0.08710349155636665,
   'DIVERSITY_MEAN_INTER_LIST': 0.9901818604113986,
   'DIVERSITY_HERFINDAHL': 0.9989984140391348,
   'COVERAGE_ITEM': 0.4073532242540905,
   'COVERAGE_ITEM_CORRECT': 0.02883541867179981,
   'COVERAGE_USER': 0.6301749087706052,
   'COVERAGE_USER_CORRECT': 0.10607776519441299,
   'DIVERSITY_GINI': 0.15178569693479657,
   'SHANNON_ENTROPY': 11.947804253893292}},
 'CUTOFF: 10 - ROC_AUC: 0.0988923, PRECISION: 0.0207268, PRECISION_RECALL_MIN_DEN: 0.0768729, RECALL: 0.0749353, MAP: 0.0324394, MRR: 0.0700950, NDCG: 0.0511179, F1: 0.0324720, HIT_RATE: 0.2072684, ARHR:

In [12]:
test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [43]:
recommender = Recommender(URM_all, model)

user_id = test_users['user_id']
recommendations = recommender.recommend(user_id, at = 10)


AssertionError: 

In [20]:
for index in range(len(recommendations)):
    #print(element)
    recommendations[index]=np.array(recommendations[index])
    #print(type(element))
print(len(recommendations))

7944


In [21]:
test_users['item_list']= recommendations
#test_users['item_list'] =  test_users['item_list'].apply(lambda x: x.replace('[','').replace(']','')) 
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

#convert the string columns to int
#test_users['item_list'] = test_users['item_list'].astype(int)
test_users
test_users.to_csv('submission.csv', index=False)


# Funk SVD test 

In [None]:
from MatrixFactorization.Cython.MatrixFactorization_Cython import MatrixFactorization_FunkSVD_Cython
recommender = MatrixFactorization_FunkSVD_Cython(URM_train)
recommender.fit(epochs = 100, num_factors = 200, use_bias = True,  user_reg = 0.1, item_reg = 0.1)

MatrixFactorization_FunkSVD_Cython_Recommender: URM Detected 34 (0.43 %) cold users.
MatrixFactorization_FunkSVD_Cython_Recommender: URM Detected 2004 (7.72 %) cold items.
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.72 seconds. MSE loss 9.40E-01. Sample per second: 55704
FUNK_SVD: Epoch 1 of 100. Elapsed time 1.40 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 2.34 seconds. MSE loss 7.78E-01. Sample per second: 41105
FUNK_SVD: Epoch 2 of 100. Elapsed time 3.01 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.79 seconds. MSE loss 6.43E-01. Sample per second: 53762
FUNK_SVD: Epoch 3 of 100. Elapsed time 4.46 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 2.09 seconds. MSE loss 5.33E-01. Sample per second: 45869
FUNK_SVD: Epoch 4 of 100. Elapsed time 5.76 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.40 seconds. MSE loss 4.43E-01. Sample per second: 68515
FUNK_SVD: Epoch 5 of 100. Elapsed time 7.07 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.75 seconds. MSE loss 3.68E-01. Sample per second: 54963


In [None]:
evaluator_validation.evaluateRecommender(recommender)