# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('../../')


In [2]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of data to check for missing users, items ecc...

In [3]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [4]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [6]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])



In [7]:
URM_train_ICM_all = sps.vstack([URM_train*(1-0.65), ICM_all.T*0.65])

In [11]:
from MatrixFactorization.PureSVDRecommender import PureSVDRecommender
recommender = PureSVDRecommender(URM_train_ICM_all)
recommender.fit(num_factors = 1000)
evaluator_validation.evaluateRecommender(recommender)

PureSVDRecommender: URM Detected 29 (0.10 %) cold users.
PureSVDRecommender: URM Detected 1 (0.00 %) cold items.
PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... Done!
EvaluatorHoldout: Processed 4991 ( 100.00% ) in 6.72 sec. Users per second: 743


({10: {'ROC_AUC': 0.12910162896388133,
   'PRECISION': 0.02600681226207139,
   'PRECISION_RECALL_MIN_DEN': 0.09696708042730871,
   'RECALL': 0.09454216162505298,
   'MAP': 0.0412085364767692,
   'MRR': 0.09128081340062885,
   'NDCG': 0.06494510299640914,
   'F1': 0.04079238783957458,
   'HIT_RATE': 0.26006812262071727,
   'ARHR': 0.10173542535293663,
   'NOVELTY': 0.00537743259496588,
   'AVERAGE_POPULARITY': 0.11139099026897083,
   'DIVERSITY_MEAN_INTER_LIST': 0.9958198665413375,
   'DIVERSITY_HERFINDAHL': 0.9995620343426422,
   'COVERAGE_ITEM': 0.35530317613089507,
   'COVERAGE_ITEM_CORRECT': 0.03753609239653513,
   'COVERAGE_USER': 0.6280357367560085,
   'COVERAGE_USER_CORRECT': 0.1293569900591418,
   'DIVERSITY_GINI': 0.1425147263843011,
   'SHANNON_ENTROPY': 12.116664039086507}},
 'CUTOFF: 10 - ROC_AUC: 0.1291016, PRECISION: 0.0260068, PRECISION_RECALL_MIN_DEN: 0.0969671, RECALL: 0.0945422, MAP: 0.0412085, MRR: 0.0912808, NDCG: 0.0649451, F1: 0.0407924, HIT_RATE: 0.2600681, ARHR: 

In [13]:
item_factor = recommender.ITEM_factors

super_threshold_indices = abs(item_factor) < 0.0001
item_factor[super_threshold_indices] = 0

non_zero = np.count_nonzero(item_factor)
print(non_zero)

zero = np.count_nonzero(item_factor==0)
print(zero)

item_factor = sps.csr_matrix(item_factor)

25328314
646686


In [14]:
item_factor

<25975x1000 sparse matrix of type '<class 'numpy.float32'>'
	with 25328314 stored elements in Compressed Sparse Row format>

In [15]:
URM_train_ICM_all = sps.vstack([URM_train*(1-0.289), ICM_all.T*0.289])
URM_train_ICM_all

<27947x25975 sparse matrix of type '<class 'numpy.float64'>'
	with 586969 stored elements in COOrdinate format>

In [16]:
from GraphBased.RP3betaRecommender import RP3betaRecommender

recommender_rp3Beta = RP3betaRecommender(item_factor.T)
recommender_rp3Beta.fit(topK=430, alpha = 0.47, beta=0.27, implicit = False)

#recommender_rp3Beta.W_sparse.data = np.power(recommender_rp3Beta.W_sparse.data, 0.89)
recommender_rp3Beta.URM_train = URM_train_ICM_all
####recommender_rp3Beta_user.URM_train = URM_train
evaluator_validation.evaluateRecommender(recommender_rp3Beta)

RP3betaRecommender: URM Detected 1 (0.00 %) cold items.


  return self._with_data(data ** n)


RP3betaRecommender: Processed 1200 ( 4.62% ) in 1.04 minutes. Rows per second: 19
RP3betaRecommender: Processed 2600 ( 10.01% ) in 2.16 minutes. Rows per second: 20
RP3betaRecommender: Processed 4000 ( 15.40% ) in 3.27 minutes. Rows per second: 20
RP3betaRecommender: Processed 5400 ( 20.79% ) in 4.41 minutes. Rows per second: 20
RP3betaRecommender: Processed 6800 ( 26.18% ) in 5.54 minutes. Rows per second: 20
RP3betaRecommender: Processed 8200 ( 31.57% ) in 6.68 minutes. Rows per second: 20
RP3betaRecommender: Processed 9600 ( 36.96% ) in 7.83 minutes. Rows per second: 20
RP3betaRecommender: Processed 10800 ( 41.58% ) in 8.92 minutes. Rows per second: 20
RP3betaRecommender: Processed 12000 ( 46.20% ) in 10.05 minutes. Rows per second: 20
RP3betaRecommender: Processed 13200 ( 50.82% ) in 11.15 minutes. Rows per second: 20
RP3betaRecommender: Processed 14400 ( 55.44% ) in 12.21 minutes. Rows per second: 20
RP3betaRecommender: Processed 15600 ( 60.06% ) in 13.23 minutes. Rows per second:

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
test_users = pd.read_csv('data_target_users_test.csv')
test_users

In [None]:
recommender = ItemKNNCBFRecommender(URM_all, ICM_all)
recommender.fit(topK=600, shrink=200, feature_weighting = 'TF-IDF')

user_id = test_users['user_id']
recommendations = recommender.recommend(user_id,cutoff = 10)


In [None]:
for index in range(len(recommendations)):
    recommendations[index]=np.array(recommendations[index])
    
print(len(recommendations))

test_users['item_list']= recommendations
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

test_users
test_users.to_csv('submission.csv', index=False)
