# Recommender with split users

In [57]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('..')

dataset = pd.read_csv(
    'data_train.csv',
    names=["user", "item", "interaction"],
    header=0,
)

# Count the interactions of every user. After count(), each remaining column
# ('item', 'interaction') holds the number of non-null rows for that user, so
# the 'item' column is used below as the per-user interaction count.
grouped_users = dataset.groupby(['user']).count()
splitted_user = []

# Buckets at positions 0..9: users with exactly 1, 2, ..., 10 interactions.
for group_id in range(1, 11):
    has_exact_count = grouped_users['item'] == group_id
    grouped_users_local = grouped_users[has_exact_count].reset_index()['user']
    splitted_user.append(grouped_users_local)
    print("**********: ", group_id, " Lenght: ", len(grouped_users_local))

# Last bucket (position 10): every user with more than 10 interactions.
# Its size is not printed.
has_many_interactions = grouped_users['item'] > 10
grouped_users_local = grouped_users[has_many_interactions].reset_index()['user']
splitted_user.append(grouped_users_local)


**********:  1  Lenght:  93
**********:  2  Lenght:  995
**********:  3  Lenght:  1148
**********:  4  Lenght:  933
**********:  5  Lenght:  605
**********:  6  Lenght:  490
**********:  7  Lenght:  413
**********:  8  Lenght:  607
**********:  9  Lenght:  250
**********:  10  Lenght:  179


# Data pre-processing

In [58]:
# Build the user-item interaction matrix (URM): rows are indexed by user id,
# columns by item id, and the stored values are the interaction values.
users = dataset['user']
items = dataset['item']
data = dataset['interaction']

# Assemble from (value, (row, col)) triplets in COO format, then convert to
# CSR for fast row (i.e. per-user) access.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [59]:
ICM_df = pd.read_csv('data_ICM_title_abstract.csv')

# Build the item-content matrix (ICM): rows are indexed by item, columns by
# feature. NOTE: this rebinds `items` and `data`, which earlier held the
# interaction triplets used for the URM.
items = ICM_df['row']
features = ICM_df['col']
data = ICM_df['data']

# CSR gives fast row access; here a row is an item, not a user.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()

In [60]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Hold out 20% of the interactions for validation.
# NOTE(review): the split helper lives in a "random_holdout" module and no seed
# is set in this notebook, so results presumably change between runs - confirm.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.80,
)

# Evaluator for the low-activity segment: it IGNORES users with more than 10
# interactions, so it scores only users with at most 10 interactions.
users_not_in_group_three = (
    grouped_users[grouped_users['item'] > 10]
    .reset_index()['user']
)
evaluator_validation_three = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[10],
    ignore_users=users_not_in_group_three,
)

# Evaluator for the high-activity segment: it IGNORES users with fewer than 11
# interactions, so it scores only users with more than 10 interactions.
users_not_in_group_other = (
    grouped_users[grouped_users['item'] < 11]
    .reset_index()['user']
)
evaluator_validation_other = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[10],
    ignore_users=users_not_in_group_other,
)

EvaluatorHoldout: Ignoring 2234 Users
EvaluatorHoldout: Ignoring 5713 Users


In [61]:
from GraphBased.P3alphaRecommender import P3alphaRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender

# Collaborative model: P3alpha fitted on the training interactions.
recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=475, alpha = 0.45, implicit = True)

# Content model: the same P3alpha algorithm, fed the transposed ICM in place
# of a URM (features as rows, items as columns).
# NOTE(review): presumably this yields an item-item similarity driven by
# content features - confirm against P3alphaRecommender.
recommender_alpha_ICM = P3alphaRecommender(ICM_all.T)
recommender_alpha_ICM.fit(topK=175, alpha = 0.45)
# After fitting, replace the stored matrix with the real training URM.
# NOTE(review): presumably needed so the model scores against user profiles
# rather than feature rows - confirm.
recommender_alpha_ICM.URM_train = URM_train

# Hybrid model built from the two W_sparse similarity matrices
# (content first, collaborative second).
# NOTE(review): how `alpha` weights the two similarities is defined inside
# ItemKNNSimilarityHybridRecommender - verify which one it applies to.
hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_train, recommender_alpha_ICM.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=600, alpha = 0.45)

P3alphaRecommender: URM Detected 60 (0.76 %) cold users.
P3alphaRecommender: URM Detected 2397 (9.23 %) cold items.
P3alphaRecommender: URM Detected 2 (0.01 %) cold users.
P3alphaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 60 (0.76 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2397 (9.23 %) cold items.


In [62]:
# Evaluate the hybrid on the segment kept by evaluator_validation_three:
# users with at most 10 interactions (those with more than 10 are ignored).
evaluator_validation_three.evaluateRecommender(hybridrecommender)

EvaluatorHoldout: Processed 3440 ( 100.00% ) in 2.34 sec. Users per second: 1471


({10: {'ROC_AUC': 0.13686496400885942,
   'PRECISION': 0.023749999999999917,
   'PRECISION_RECALL_MIN_DEN': 0.1548982558139533,
   'RECALL': 0.1548982558139533,
   'MAP': 0.06889180355604765,
   'MRR': 0.09424245570321148,
   'NDCG': 0.09665051106320217,
   'F1': 0.041185216825319206,
   'HIT_RATE': 0.2375,
   'ARHR': 0.0997895902547064,
   'NOVELTY': 0.004981258280496145,
   'AVERAGE_POPULARITY': 0.07893341853265165,
   'DIVERSITY_MEAN_INTER_LIST': 0.9913587812844459,
   'DIVERSITY_HERFINDAHL': 0.9991070595592212,
   'COVERAGE_ITEM': 0.41162656400384984,
   'COVERAGE_ITEM_CORRECT': 0.02436958614051973,
   'COVERAGE_USER': 0.6021354804831087,
   'COVERAGE_USER_CORRECT': 0.12882898652196745,
   'DIVERSITY_GINI': 0.17870348344804865,
   'SHANNON_ENTROPY': 12.076212813374747}},
 'CUTOFF: 10 - ROC_AUC: 0.1368650, PRECISION: 0.0237500, PRECISION_RECALL_MIN_DEN: 0.1548983, RECALL: 0.1548983, MAP: 0.0688918, MRR: 0.0942425, NDCG: 0.0966505, F1: 0.0411852, HIT_RATE: 0.2375000, ARHR: 0.0997896,

In [63]:
# Evaluate the hybrid on the segment kept by evaluator_validation_other:
# users with more than 10 interactions (those with fewer than 11 are ignored).
evaluator_validation_other.evaluateRecommender(hybridrecommender)

EvaluatorHoldout: Processed 2180 ( 100.00% ) in 1.89 sec. Users per second: 1155


({10: {'ROC_AUC': 0.2935476372506191,
   'PRECISION': 0.07683486238532017,
   'PRECISION_RECALL_MIN_DEN': 0.15523227027814152,
   'RECALL': 0.14273973953589766,
   'MAP': 0.07336384524502951,
   'MRR': 0.2307339449541283,
   'NDCG': 0.12330563749964221,
   'F1': 0.09989669249717853,
   'HIT_RATE': 0.768348623853211,
   'ARHR': 0.2889354885685162,
   'NOVELTY': 0.004697332852374485,
   'AVERAGE_POPULARITY': 0.12090351350583702,
   'DIVERSITY_MEAN_INTER_LIST': 0.9773213451166473,
   'DIVERSITY_HERFINDAHL': 0.9976873032573016,
   'COVERAGE_ITEM': 0.21343599615014436,
   'COVERAGE_ITEM_CORRECT': 0.03942252165543792,
   'COVERAGE_USER': 0.9758281110116384,
   'COVERAGE_USER_CORRECT': 0.4695613249776186,
   'DIVERSITY_GINI': 0.07288270329987032,
   'SHANNON_ENTROPY': 10.591414411052797}},
 'CUTOFF: 10 - ROC_AUC: 0.2935476, PRECISION: 0.0768349, PRECISION_RECALL_MIN_DEN: 0.1552323, RECALL: 0.1427397, MAP: 0.0733638, MRR: 0.2307339, NDCG: 0.1233056, F1: 0.0998967, HIT_RATE: 0.7683486, ARHR: 0.