In [3]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('../../')


In [5]:
# Training interactions stored as triplets: `row` is the user, `col` the item
# and `data` the implicit interaction value.
dataset = pd.read_csv(filepath_or_buffer='../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


In [6]:
# Pull the triplet columns out of the interactions frame.
users = dataset['row']
items = dataset['col']
data = dataset['data']

# Assemble the user x item matrix in COO form, then convert to CSR so that
# row (i.e. per-user) slicing is fast.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [7]:
# `indptr` holds the offset at which each row (CSR) or column (CSC) starts;
# the difference between consecutive offsets is therefore the number of stored
# entries in that row / column.
items_per_user = np.ediff1d(URM_all.indptr)          # stored interactions per user (row)
users_per_item = np.ediff1d(URM_all.tocsc().indptr)  # stored interactions per item (column)
URM_all = sps.csr_matrix(URM_all)

In [8]:
# Order both count vectors ascending. Sorting by value discards the position,
# so the link between a count and its user / item index is lost from here on.
items_per_user, users_per_item = np.sort(items_per_user), np.sort(users_per_item)

In [11]:
# Content triplets read from the title/abstract ICM file; same
# `row` / `col` / `data` layout as the interactions file.
ICM_df = pd.read_csv(filepath_or_buffer='../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [12]:
# Pull the triplet columns out of the content frame
# (this rebinds `items` and `data` from the URM cell).
items = ICM_df['row']
features = ICM_df['col']
data = ICM_df['data']

# Assemble the item x feature matrix in COO form, then convert to CSR so that
# row (i.e. per-item) slicing is fast.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

(25975, 20000)

In [13]:
# Column-oriented pass: consecutive `indptr` differences of the CSC form give
# the number of stored entries per feature (column).
ICM_all = sps.csc_matrix(sps.csr_matrix(ICM_all))
items_per_feature = np.ediff1d(ICM_all.indptr)

# Row-oriented pass: back to CSR, where the same differences give the number
# of stored entries per item (row). ICM_all is left in CSR format.
ICM_all = sps.csr_matrix(ICM_all)
features_per_item = np.ediff1d(ICM_all.indptr)

In [14]:
# Order both ICM count vectors ascending (positions no longer map to an
# item / feature index after this).
features_per_item, items_per_feature = np.sort(features_per_item), np.sort(items_per_feature)

In [20]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Seed the RNG before the random hold-out split so that the split -- and every
# validation metric reported below -- is reproducible after a kernel restart.
# NOTE(review): assumes the split helper draws from NumPy's global RNG -- confirm.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# 80% of the interactions go to training, the remainder to validation.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)

# Evaluator over the held-out interactions, scoring recommendation lists at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])




In [32]:
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender

# RP3beta model fitted on the training split only.
recommender_beta = RP3betaRecommender(URM_train)
recommender_beta.fit(
    topK=100,
    alpha=0.45,
    beta=0.15,
    implicit=True,
)

# Score it on the held-out validation interactions.
evaluator_validation.evaluateRecommender(recommender_beta)

RP3betaRecommender: URM Detected 68 (0.86 %) cold users.
RP3betaRecommender: URM Detected 2421 (9.32 %) cold items.
EvaluatorHoldout: Processed 5660 ( 100.00% ) in 4.39 sec. Users per second: 1289


({10: {'ROC_AUC': 0.15300950698300536,
   'PRECISION': 0.03303886925794991,
   'PRECISION_RECALL_MIN_DEN': 0.11264176341914894,
   'RECALL': 0.10830238865839441,
   'MAP': 0.052245149550135435,
   'MRR': 0.11154236076055861,
   'NDCG': 0.07822154012867398,
   'F1': 0.05063190340822059,
   'HIT_RATE': 0.3303886925795053,
   'ARHR': 0.12933493185259978,
   'NOVELTY': 0.005280931900356047,
   'AVERAGE_POPULARITY': 0.05645476688737145,
   'DIVERSITY_MEAN_INTER_LIST': 0.995702208621059,
   'DIVERSITY_HERFINDAHL': 0.9995526289502928,
   'COVERAGE_ITEM': 0.596689124157844,
   'COVERAGE_ITEM_CORRECT': 0.04685274302213667,
   'COVERAGE_USER': 0.7122184472127847,
   'COVERAGE_USER_CORRECT': 0.17365043412608533,
   'DIVERSITY_GINI': 0.2838192131143616,
   'SHANNON_ENTROPY': 12.878131711791845}},
 'CUTOFF: 10 - ROC_AUC: 0.1530095, PRECISION: 0.0330389, PRECISION_RECALL_MIN_DEN: 0.1126418, RECALL: 0.1083024, MAP: 0.0522451, MRR: 0.1115424, NDCG: 0.0782215, F1: 0.0506319, HIT_RATE: 0.3303887, ARHR: 

In [33]:
# Content-based item KNN: built from the training URM plus the item content
# matrix, with TF-IDF feature weighting.
recommender_CBF = ItemKNNCBFRecommender(URM_train, ICM_all)
recommender_CBF.fit(
    topK=625,
    shrink=50,
    feature_weighting='TF-IDF',
)
evaluator_validation.evaluateRecommender(recommender_CBF)

ItemKNNCBFRecommender: URM Detected 68 (0.86 %) cold users.
ItemKNNCBFRecommender: URM Detected 2421 (9.32 %) cold items.
Similarity column 25975 ( 100 % ), 3301.42 column/sec, elapsed time 0.13 min
EvaluatorHoldout: Processed 5660 ( 100.00% ) in 5.04 sec. Users per second: 1124


({10: {'ROC_AUC': 0.09818653598070567,
   'PRECISION': 0.02067137809187256,
   'PRECISION_RECALL_MIN_DEN': 0.07094305625666045,
   'RECALL': 0.06863734608553383,
   'MAP': 0.030806615503811766,
   'MRR': 0.06936150372987816,
   'NDCG': 0.04847911504827861,
   'F1': 0.03177357072839517,
   'HIT_RATE': 0.2067137809187279,
   'ARHR': 0.07808927870323616,
   'NOVELTY': 0.005282353422456795,
   'AVERAGE_POPULARITY': 0.017660811691653244,
   'DIVERSITY_MEAN_INTER_LIST': 0.998054813714918,
   'DIVERSITY_HERFINDAHL': 0.9997878478942177,
   'COVERAGE_ITEM': 0.6460827718960539,
   'COVERAGE_ITEM_CORRECT': 0.03996150144369586,
   'COVERAGE_USER': 0.7122184472127847,
   'COVERAGE_USER_CORRECT': 0.11614445702780923,
   'DIVERSITY_GINI': 0.3253556816784398,
   'SHANNON_ENTROPY': 13.27060751677278}},
 'CUTOFF: 10 - ROC_AUC: 0.0981865, PRECISION: 0.0206714, PRECISION_RECALL_MIN_DEN: 0.0709431, RECALL: 0.0686373, MAP: 0.0308066, MRR: 0.0693615, NDCG: 0.0484791, F1: 0.0317736, HIT_RATE: 0.2067138, ARHR:

In [52]:
from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

# Collaborative item KNN on the training split: cosine similarity with TF-IDF
# feature weighting.
recommender_itemKNN = ItemKNNCFRecommender(URM_train)
recommender_itemKNN.fit(
    topK=500,
    shrink=550,
    similarity='cosine',
    feature_weighting='TF-IDF',
)
evaluator_validation.evaluateRecommender(recommender_itemKNN)

ItemKNNCFRecommender: URM Detected 68 (0.86 %) cold users.
ItemKNNCFRecommender: URM Detected 2421 (9.32 %) cold items.
Similarity column 25975 ( 100 % ), 5118.71 column/sec, elapsed time 0.08 min
EvaluatorHoldout: Processed 5660 ( 100.00% ) in 4.42 sec. Users per second: 1282


({10: {'ROC_AUC': 0.15811360003365338,
   'PRECISION': 0.03397526501766716,
   'PRECISION_RECALL_MIN_DEN': 0.11631162712434827,
   'RECALL': 0.11218593482003421,
   'MAP': 0.05516114221012301,
   'MRR': 0.11860795613887498,
   'NDCG': 0.08229659173897609,
   'F1': 0.05215538557425322,
   'HIT_RATE': 0.33975265017667844,
   'ARHR': 0.13659494363116298,
   'NOVELTY': 0.005095320970285993,
   'AVERAGE_POPULARITY': 0.06655494185049264,
   'DIVERSITY_MEAN_INTER_LIST': 0.9938162356844876,
   'DIVERSITY_HERFINDAHL': 0.9993640649777122,
   'COVERAGE_ITEM': 0.5123772858517806,
   'COVERAGE_ITEM_CORRECT': 0.04423484119345525,
   'COVERAGE_USER': 0.7122184472127847,
   'COVERAGE_USER_CORRECT': 0.17780294450736125,
   'DIVERSITY_GINI': 0.19750835236875353,
   'SHANNON_ENTROPY': 12.295707015664501}},
 'CUTOFF: 10 - ROC_AUC: 0.1581136, PRECISION: 0.0339753, PRECISION_RECALL_MIN_DEN: 0.1163116, RECALL: 0.1121859, MAP: 0.0551611, MRR: 0.1186080, NDCG: 0.0822966, F1: 0.0521554, HIT_RATE: 0.3397527, ARH

In [70]:
from KNN.UserKNNCFRecommender import UserKNNCFRecommender

# Collaborative user KNN on the training split, with TF-IDF feature weighting.
recommender_userKNN = UserKNNCFRecommender(URM_train)
recommender_userKNN.fit(
    topK=100,
    shrink=50,
    feature_weighting="TF-IDF",
)

evaluator_validation.evaluateRecommender(recommender_userKNN)

UserKNNCFRecommender: URM Detected 68 (0.86 %) cold users.
UserKNNCFRecommender: URM Detected 2421 (9.32 %) cold items.
Similarity column 7947 ( 100 % ), 14876.00 column/sec, elapsed time 0.01 min
EvaluatorHoldout: Processed 5660 ( 100.00% ) in 3.70 sec. Users per second: 1532


({10: {'ROC_AUC': 0.1628756730607442,
   'PRECISION': 0.0345406360424021,
   'PRECISION_RECALL_MIN_DEN': 0.1195931488025132,
   'RECALL': 0.11544989338720221,
   'MAP': 0.055473339149165746,
   'MRR': 0.11895226877559048,
   'NDCG': 0.0834954089876725,
   'F1': 0.0531728604970762,
   'HIT_RATE': 0.3454063604240283,
   'ARHR': 0.1366473161702846,
   'NOVELTY': 0.004880389420450504,
   'AVERAGE_POPULARITY': 0.09236068704472408,
   'DIVERSITY_MEAN_INTER_LIST': 0.9899317263785071,
   'DIVERSITY_HERFINDAHL': 0.9989756826780207,
   'COVERAGE_ITEM': 0.4044658325312801,
   'COVERAGE_ITEM_CORRECT': 0.0411934552454283,
   'COVERAGE_USER': 0.7122184472127847,
   'COVERAGE_USER_CORRECT': 0.1833396250157292,
   'DIVERSITY_GINI': 0.12850829006606806,
   'SHANNON_ENTROPY': 11.590050135379803}},
 'CUTOFF: 10 - ROC_AUC: 0.1628757, PRECISION: 0.0345406, PRECISION_RECALL_MIN_DEN: 0.1195931, RECALL: 0.1154499, MAP: 0.0554733, MRR: 0.1189523, NDCG: 0.0834954, F1: 0.0531729, HIT_RATE: 0.3454064, ARHR: 0.136

In [77]:
from KNN.ItemKNNScoresHybridMultipleRecommender import ItemKNNScoresHybridMultipleRecommender

# Hybrid over the three already-fitted models (content-based, item KNN and
# user KNN). `alpha` / `beta` are forwarded to the hybrid's fit(); how they
# weight the three components is defined in that class, not here.
hybridrecommender = ItemKNNScoresHybridMultipleRecommender(
    URM_train,
    recommender_CBF,
    recommender_itemKNN,
    recommender_userKNN,
)
hybridrecommender.fit(alpha=0.6, beta=0.4)

evaluator_validation.evaluateRecommender(hybridrecommender)

ItemKNNScoresHybridMultipleRecommender: URM Detected 68 (0.86 %) cold users.
ItemKNNScoresHybridMultipleRecommender: URM Detected 2421 (9.32 %) cold items.
shape of item_weights: (1000, 25975)
shape of item_weights: (1000, 25975)
shape of item_weights: (1000, 25975)
shape of item_weights: (1000, 25975)
shape of item_weights: (1000, 25975)
shape of item_weights: (660, 25975)
EvaluatorHoldout: Processed 5660 ( 100.00% ) in 7.71 sec. Users per second: 735


({10: {'ROC_AUC': 0.1782835408603966,
   'PRECISION': 0.038745583038868395,
   'PRECISION_RECALL_MIN_DEN': 0.1350507599977569,
   'RECALL': 0.1305995959977029,
   'MAP': 0.06419083923485623,
   'MRR': 0.13433913848224813,
   'NDCG': 0.09544268851151999,
   'F1': 0.05976145905492692,
   'HIT_RATE': 0.3874558303886926,
   'ARHR': 0.15532180716809726,
   'NOVELTY': 0.005109158891532785,
   'AVERAGE_POPULARITY': 0.06344774549155514,
   'DIVERSITY_MEAN_INTER_LIST': 0.9948815389601104,
   'DIVERSITY_HERFINDAHL': 0.9994705764836619,
   'COVERAGE_ITEM': 0.5283926852743022,
   'COVERAGE_ITEM_CORRECT': 0.05370548604427334,
   'COVERAGE_USER': 0.7122184472127847,
   'COVERAGE_USER_CORRECT': 0.20234050585126462,
   'DIVERSITY_GINI': 0.21186161240438306,
   'SHANNON_ENTROPY': 12.459289535961915}},
 'CUTOFF: 10 - ROC_AUC: 0.1782835, PRECISION: 0.0387456, PRECISION_RECALL_MIN_DEN: 0.1350508, RECALL: 0.1305996, MAP: 0.0641908, MRR: 0.1343391, NDCG: 0.0954427, F1: 0.0597615, HIT_RATE: 0.3874558, ARHR: 

In [None]:
# Target users for the submission; the cells below read the `user_id` column.
# NOTE(review): this path is relative to the working directory, whereas the
# other CSVs in this notebook are read from '../' -- confirm the location.
test_users = pd.read_csv(filepath_or_buffer='data_target_users_test.csv')
test_users

In [None]:
# P3alphaRecommender was never imported in this notebook, so this cell raised
# NameError on a fresh kernel.
# NOTE(review): module path mirrors the RP3beta import (GraphBased.<ClassName>) -- confirm.
from GraphBased.P3alphaRecommender import P3alphaRecommender

# Collaborative P3alpha similarity, fitted on ALL interactions (final model,
# no hold-out).
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=475, alpha = 0.45, implicit = True)

# Content P3alpha similarity: the transposed ICM (features x items) is passed
# in place of a URM so that the learned similarity is item-item.
recommender_alpha_ICM = P3alphaRecommender(ICM_all.T)
recommender_alpha_ICM.fit(topK=175, alpha = 0.45)
# NOTE(review): only W_sparse of this model is used below, so this assignment
# looks like a leftover; kept unchanged.
recommender_alpha_ICM.URM_train = URM_train

# Blend the two item-item similarity matrices into a single recommender.
# This rebinds `hybridrecommender`, replacing the score hybrid validated above.
hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_all, recommender_alpha_ICM.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=600, alpha = 0.45)

# Top-10 recommendation list for every target user.
user_id = test_users['user_id']
recommendations = hybridrecommender.recommend(user_id,cutoff = 10)


In [None]:
# Replace each recommendation list, in place, with its NumPy-array equivalent.
for position, item_list in enumerate(recommendations):
    recommendations[position] = np.array(item_list)

# Sanity check: number of recommendation lists produced.
print(len(recommendations))

In [None]:
# Serialise every recommendation list as space-separated item ids.
# The ids are joined explicitly instead of stripping the brackets off
# str(ndarray): NumPy's array repr pads elements to a common width (and may
# wrap long arrays), which leaked leading / repeated spaces into the CSV.
# This works whether each entry is a list or a NumPy array.
test_users['item_list'] = [
    ' '.join(str(item_id) for item_id in item_list)
    for item_list in recommendations
]

# Write the submission file without the pandas index column.
test_users.to_csv('submission.csv', index=False)

# Last expression of the cell, so the final frame is actually displayed.
test_users
