# Recommender with split users

In [15]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('..')

# Load the training interactions. `header=0` consumes the header row of the file
# so that the column names given in `names` replace the original ones.
dataset = pd.read_csv(
    'data_train.csv',
    names=["user", "item", "interaction"],
    header=0,
)

# Per-user interaction counts: after count() each remaining column holds the
# number of non-null values for that user, so the `item` column is used as the
# number of interactions of the user.
grouped_users = dataset.groupby(['user']).count()

# Users are bucketed by how many interactions they have:
#   splitted_user[k - 1] -> ids of the users with exactly k interactions (k = 1..10)
#   splitted_user[10]    -> ids of the users with more than 10 interactions
splitted_user = list()

for group_id in range(1, 11):
    has_exact_count = grouped_users['item'] == group_id
    grouped_users_local = grouped_users[has_exact_count].reset_index()['user']
    splitted_user.append(grouped_users_local)
    print("**********: ", group_id, " Lenght: ", len(grouped_users_local))

# Last bucket: every user with more than 10 interactions (its size is not printed).
has_long_profile = grouped_users['item'] > 10
grouped_users_local = grouped_users[has_long_profile].reset_index()['user']
splitted_user.append(grouped_users_local)


**********:  1  Lenght:  93
**********:  2  Lenght:  995
**********:  3  Lenght:  1148
**********:  4  Lenght:  933
**********:  5  Lenght:  605
**********:  6  Lenght:  490
**********:  7  Lenght:  413
**********:  8  Lenght:  607
**********:  9  Lenght:  250
**********:  10  Lenght:  179


# Data pre-processing

In [16]:
# Build the user-item interaction matrix (URM): rows are indexed by `user`,
# columns by `item`, values come from `interaction`. No explicit shape is given,
# so the matrix dimensions are inferred from the largest indices in the data.
users = dataset['user']
items = dataset['item']
data = dataset['interaction']

# COO is only the construction format; CSR gives fast row access -> fast access to users.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

(7947, 25975)

In [17]:
ICM_df = pd.read_csv('data_ICM_title_abstract.csv')

# Build the item-content matrix (ICM) from the (row, col, data) triplets of the
# file: `row` is used as the item index and `col` as the feature index.
# NOTE: `items` and `data` are rebound here and no longer refer to the columns
# of the interaction dataset.
items, features, data = ICM_df['row'], ICM_df['col'], ICM_df['data']

# CSR gives fast row access -> here a row is an item, not a user.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()

In [18]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Fix the seed so that the random 80/20 hold-out (and therefore every metric
# computed below) is the same on every Restart & Run All.
# NOTE(review): assumes the split helper draws from NumPy's global RNG - confirm;
# if it uses its own generator this call does not affect the split.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.80)

# Number of interactions of every user, counted on the full dataset
# (`grouped_users` is derived from `dataset`, not from URM_train).
interactions_per_user = grouped_users['item']

# Despite its name, this evaluator ignores the users with more than 10
# interactions, i.e. it only scores users with at most 10 interactions.
users_not_in_group_three = grouped_users[interactions_per_user > 10].reset_index()['user']
evaluator_validation_three = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users=users_not_in_group_three)

# Complementary evaluator: ignores the users with at most 10 interactions,
# i.e. it only scores users with more than 10 interactions.
users_not_in_group_other = grouped_users[interactions_per_user < 11].reset_index()['user']
evaluator_validation_other = EvaluatorHoldout(URM_validation, cutoff_list=[10], ignore_users=users_not_in_group_other)

EvaluatorHoldout: Ignoring 2234 Users
EvaluatorHoldout: Ignoring 5713 Users


In [19]:
from GraphBased.P3alphaRecommender import P3alphaRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender

# Model 1: P3alpha fitted on the training interactions.
recommender_alpha = P3alphaRecommender(URM_train)
recommender_alpha.fit(topK=475, alpha=0.45, implicit=True)

# Model 2: the same algorithm fitted on the transposed ICM (features as rows,
# items as columns) in place of a URM.
recommender_alpha_ICM = P3alphaRecommender(ICM_all.T)
recommender_alpha_ICM.fit(topK=175, alpha=0.45)
# Point the content model back at the real interactions. The hybrid below only
# reads its W_sparse, so this matters only if the model is used on its own.
recommender_alpha_ICM.URM_train = URM_train

# Merge the two similarity matrices into a single hybrid model.
# NOTE(review): presumably `alpha` weights the first similarity against the
# second one - confirm in ItemKNNSimilarityHybridRecommender.
similarity_from_content = recommender_alpha_ICM.W_sparse
similarity_from_interactions = recommender_alpha.W_sparse
hybridrecommender = ItemKNNSimilarityHybridRecommender(
    URM_train,
    similarity_from_content,
    similarity_from_interactions,
)
hybridrecommender.fit(topK=600, alpha=0.45)

P3alphaRecommender: URM Detected 65 (0.82 %) cold users.
P3alphaRecommender: URM Detected 2400 (9.24 %) cold items.
P3alphaRecommender: URM Detected 2 (0.01 %) cold users.
P3alphaRecommender: URM Detected 25 (0.10 %) cold items.
ItemKNNSimilarityHybridRecommender: URM Detected 65 (0.82 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2400 (9.24 %) cold items.


In [20]:
# Validation metrics at cutoff 10 for the users kept by `evaluator_validation_three`
# (it ignores the users with more than 10 interactions).
# The call returns a (metrics-by-cutoff dict, summary string) pair.
metrics_three = evaluator_validation_three.evaluateRecommender(hybridrecommender)
metrics_three

EvaluatorHoldout: Processed 3417 ( 100.00% ) in 2.10 sec. Users per second: 1624


({10: {'ROC_AUC': 0.14360097272739963,
   'PRECISION': 0.023675738952297264,
   'PRECISION_RECALL_MIN_DEN': 0.15562871914935117,
   'RECALL': 0.15562871914935117,
   'MAP': 0.07306810369255493,
   'MRR': 0.10064511708497645,
   'NDCG': 0.10058179367110155,
   'F1': 0.04109908885780849,
   'HIT_RATE': 0.23675738952297337,
   'ARHR': 0.10605643584133474,
   'NOVELTY': 0.005014630554343979,
   'AVERAGE_POPULARITY': 0.0792229026587922,
   'DIVERSITY_MEAN_INTER_LIST': 0.9914424982128893,
   'DIVERSITY_HERFINDAHL': 0.9991152348228045,
   'COVERAGE_ITEM': 0.41482194417709334,
   'COVERAGE_ITEM_CORRECT': 0.02406159769008662,
   'COVERAGE_USER': 0.5981095746542973,
   'COVERAGE_USER_CORRECT': 0.1262033957640469,
   'DIVERSITY_GINI': 0.1819584143270315,
   'SHANNON_ENTROPY': 12.105971567525009}},
 'CUTOFF: 10 - ROC_AUC: 0.1436010, PRECISION: 0.0236757, PRECISION_RECALL_MIN_DEN: 0.1556287, RECALL: 0.1556287, MAP: 0.0730681, MRR: 0.1006451, NDCG: 0.1005818, F1: 0.0410991, HIT_RATE: 0.2367574, ARHR

In [21]:
# Validation metrics at cutoff 10 for the users kept by `evaluator_validation_other`
# (it ignores the users with fewer than 11 interactions).
# The call returns a (metrics-by-cutoff dict, summary string) pair.
metrics_other = evaluator_validation_other.evaluateRecommender(hybridrecommender)
metrics_other

EvaluatorHoldout: Processed 2191 ( 100.00% ) in 1.86 sec. Users per second: 1178


({10: {'ROC_AUC': 0.28532006838944374,
   'PRECISION': 0.07334550433591873,
   'PRECISION_RECALL_MIN_DEN': 0.14771956706005065,
   'RECALL': 0.1355553589888799,
   'MAP': 0.06784646857796482,
   'MRR': 0.2216325081683365,
   'NDCG': 0.11620048567395962,
   'F1': 0.09518750676503931,
   'HIT_RATE': 0.7334550433591968,
   'ARHR': 0.2753828794563618,
   'NOVELTY': 0.004686456863786418,
   'AVERAGE_POPULARITY': 0.12497498235299115,
   'DIVERSITY_MEAN_INTER_LIST': 0.9759862784450294,
   'DIVERSITY_HERFINDAHL': 0.997554082601306,
   'COVERAGE_ITEM': 0.21474494706448508,
   'COVERAGE_ITEM_CORRECT': 0.03726660250240616,
   'COVERAGE_USER': 0.9807520143240823,
   'COVERAGE_USER_CORRECT': 0.4565801253357207,
   'DIVERSITY_GINI': 0.0728484053742267,
   'SHANNON_ENTROPY': 10.554828333711729}},
 'CUTOFF: 10 - ROC_AUC: 0.2853201, PRECISION: 0.0733455, PRECISION_RECALL_MIN_DEN: 0.1477196, RECALL: 0.1355554, MAP: 0.0678465, MRR: 0.2216325, NDCG: 0.1162005, F1: 0.0951875, HIT_RATE: 0.7334550, ARHR: 0.2