# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.evaluation import auc_score 
sys.path.append('../../')




In [2]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of data to check for missing users, items ecc...

In [3]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [4]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [6]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

#splitting data into train and validation (from the previous training data)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)





In [7]:
URM_all

<7947x25975 sparse matrix of type '<class 'numpy.float64'>'
	with 113268 stored elements in Compressed Sparse Row format>

In [8]:
ICM_all

<25975x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 490691 stored elements in Compressed Sparse Row format>

In [9]:
#evaluator for validation (used for hyperparameter tuning)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])

In [10]:
# Instantiate and train the model
model = LightFM(loss='warp', learning_schedule= 'adadelta', learning_rate = 1e-5)
model.fit(URM_train, epochs=100, num_threads=2, item_features = ICM_all)

<lightfm.lightfm.LightFM at 0x27c7cb88640>

In [11]:
# Evaluate the trained model
train_auc = auc_score(model, URM_train, item_features = ICM_all).mean()
test_auc = auc_score(model, URM_validation, item_features = ICM_all).mean()

print("Train auc WARP: {:.2f}".format(train_auc))
print("Test auc WARP: {:.2f}".format(test_auc))

Train auc WARP: 0.99
Test auc WARP: 0.87


In [42]:
n_items = 25976
class Recommender(object):

    def __init__(self, URM, model):
        self.URM = URM
        self.model = model

    def recommend(self, user_id, at=10, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = model.predict(user_id, np.array(list(range(n_items))), item_features=ICM_all)

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [12]:
test_users = pd.read_csv('../data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [17]:
def sample_recommendation(model, data, user_ids, item_features):
    recommendations = []
    

    n_users, n_items = URM_all.shape

    for user_id in user_ids:
        temp = []
        scores = model.predict(user_id, np.arange(n_items), item_features = item_features)
        top_items = scores.argsort()[::-1]
        
        for x in top_items[:10]:
            temp.append(x)
        recommendations.append(temp)
    return recommendations

In [14]:
type(test_users)

pandas.core.frame.DataFrame

In [18]:
user_id = test_users['user_id']
recommendations=sample_recommendation(model, URM_all, user_id, item_features = ICM_all)

In [19]:
recommendations

[[24075, 19480, 19089, 10269, 13797, 4927, 25693, 9438, 21552, 15830],
 [14640, 25013, 9468, 11623, 16956, 25209, 13602, 15437, 19792, 7494],
 [24075, 9438, 25675, 10269, 25693, 5942, 25013, 5044, 7494, 8538],
 [3997, 24093, 16857, 4668, 17820, 24484, 20990, 10844, 57, 15764],
 [19480, 4513, 7349, 19792, 14165, 3835, 3139, 19089, 17914, 7494],
 [3037, 17819, 8322, 15896, 11089, 24075, 10919, 24958, 15770, 19874],
 [15691, 9555, 7727, 7564, 9992, 16928, 25407, 3812, 25044, 3921],
 [17210, 9972, 17305, 7624, 19532, 16685, 230, 3014, 20478, 10195],
 [24075, 25675, 9438, 10269, 25407, 25013, 19468, 5044, 6327, 7115],
 [7030, 21890, 23069, 24153, 8059, 20130, 7788, 23504, 11024, 13939],
 [23069, 3572, 1628, 16834, 23505, 3711, 7117, 21433, 399, 15377],
 [5078, 4257, 9690, 12952, 18673, 18233, 19966, 23444, 21385, 2570],
 [24199, 13987, 15459, 2090, 23536, 13541, 18190, 15327, 15969, 20472],
 [11533, 25407, 9555, 6734, 8860, 7086, 5952, 1583, 6708, 2426],
 [24075, 25675, 10269, 9438, 6985, 1

In [43]:
recommender = Recommender(URM_all, model)

user_id = test_users['user_id']
recommendations = recommender.recommend(user_id, at = 10)


AssertionError: 

In [20]:
for index in range(len(recommendations)):
    #print(element)
    recommendations[index]=np.array(recommendations[index])
    #print(type(element))
print(len(recommendations))

7944


In [21]:
test_users['item_list']= recommendations
#test_users['item_list'] =  test_users['item_list'].apply(lambda x: x.replace('[','').replace(']','')) 
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

#convert the string columns to int
#test_users['item_list'] = test_users['item_list'].astype(int)
test_users
test_users.to_csv('submission.csv', index=False)


# Funk SVD test 

In [None]:
from MatrixFactorization.Cython.MatrixFactorization_Cython import MatrixFactorization_FunkSVD_Cython
recommender = MatrixFactorization_FunkSVD_Cython(URM_train)
recommender.fit(epochs = 100, num_factors = 200, use_bias = True,  user_reg = 0.1, item_reg = 0.1)

MatrixFactorization_FunkSVD_Cython_Recommender: URM Detected 34 (0.43 %) cold users.
MatrixFactorization_FunkSVD_Cython_Recommender: URM Detected 2004 (7.72 %) cold items.
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.72 seconds. MSE loss 9.40E-01. Sample per second: 55704
FUNK_SVD: Epoch 1 of 100. Elapsed time 1.40 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 2.34 seconds. MSE loss 7.78E-01. Sample per second: 41105
FUNK_SVD: Epoch 2 of 100. Elapsed time 3.01 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.79 seconds. MSE loss 6.43E-01. Sample per second: 53762
FUNK_SVD: Epoch 3 of 100. Elapsed time 4.46 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 2.09 seconds. MSE loss 5.33E-01. Sample per second: 45869
FUNK_SVD: Epoch 4 of 100. Elapsed time 5.76 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.40 seconds. MSE loss 4.43E-01. Sample per second: 68515
FUNK_SVD: Epoch 5 of 100. Elapsed time 7.07 sec
FUNK_SVD: Processed 96000 ( 98.97% ) in 1.75 seconds. MSE loss 3.68E-01. Sample per second: 54963


In [None]:
evaluator_validation.evaluateRecommender(recommender)