# SLIM MODEL
Since KNN_Item_CF is the model that performed best so far, I am now trying a SLIM model, which is basically the same algorithm but machine-learning based: it learns from the training data instead of relying on a heuristic (such as cosine similarity, etc.).
https://notebook.community/trangel/Insight-Data-Science/general-docs/recommendation-validation/.ipynb_checkpoints/recommender_systems-validation-checkpoint

# Data loading 
Next cells are used to load the data we need.

In [None]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('..')


In [None]:
# Interaction data: users (row), items (col) and the implicit interaction value (data).
dataset = pd.read_csv('data_train.csv')
# A bare expression on the last line of a cell makes the notebook render it.
dataset

# Data pre-processing
Pre-processing of the data to check for missing users, items, etc.

In [None]:
# Number of distinct ids versus smallest / largest id, for users (row) and
# items (col). A distinct count well below the id range means some ids never
# appear in the interaction data.
unique_users = len(dataset.row.unique())
min_users = dataset.row.min()
max_users = dataset.row.max()

unique_items = len(dataset.col.unique())
min_items = dataset.col.min()
max_items = dataset.col.max()

print('users stats:', unique_users, min_users, max_users)
print('items stats:', unique_items, min_items, max_items)

If the difference between the unique elements and the max/min ones is really big, it's better to remap users and items in a new dataframe. In this case, it seems I can leave it as it is

In [None]:
# Build the user-item matrix (URM): one row per user id, one column per item id.
users, items, data = dataset.row, dataset.col, dataset.data
# Assemble from (value, (row, col)) triplets in COO form, then convert to CSR
# for fast row access, i.e. fast access to a user's interactions.
URM_all = sps.coo_matrix((data, (users, items))).tocsr()
URM_all.shape

CSR basically stores a pointer to the exact beginning of every possible row. For example, if I have <code>matrix.indptr = [0,1,2,4,6]</code>, then <code>matrix.indptr[n]</code> is where row n begins, and the row "lasts" until the next entry in the array. <br> In this case (for example) my third row goes from 2 to 4. <br>

What do I do with it? <br>

I use this index range (2 to 4) to access the other two arrays of the CSR format: <code>indices</code> and <code>data</code>.
<code>indices</code> gives me the column index and <code>data</code> gives me the actual value stored there.

In [None]:
# The difference between consecutive indptr entries is the number of stored
# entries in that row (CSR) or column (CSC).
items_per_user = np.ediff1d(URM_all.indptr)          # stored interactions per user (row)
users_per_item = np.ediff1d(URM_all.tocsc().indptr)  # stored interactions per item (column)
# Keep working with the CSR representation afterwards.
URM_all = sps.csr_matrix(URM_all)

In [None]:
# Sort the per-user and per-item counts in ascending order.
# After sorting, a position in the array no longer identifies a specific
# user / item: only the distribution of the counts is kept.
items_per_user = np.sort(items_per_user) #sorting based on the single value. Losing reference to the user. 
users_per_item = np.sort(users_per_item)

The following two graphs plot <code>items_per_user</code> and <code>users_per_item</code>. They show that slightly more than 2500 users and 2500 items have a count of 0. This is consistent with the original data we got: 30910-27255 for users and 18494-15277 for items. These differences are the users/items missing from the dataset (basically users/items that have no interaction at all).

In [None]:
# Plot the per-user interaction counts (one red dot per entry of items_per_user).
plt.figure(figsize=(15,12))
# Author's observation: the dataset is biased, but less so than the one seen in the lectures.
plt.plot(items_per_user, 'ro') #biased dataset, but not so much compared with the one seen at lessons 

In [None]:
# Plot the per-item interaction counts (one red dot per entry of users_per_item).
plt.figure(figsize=(15,12))
# Author's observation: the dataset is biased, but less so than the one seen in the lectures.
plt.plot(users_per_item, 'ro') #biased dataset, but not so much compared with the one seen at lessons 

In [None]:
# Item content data read from the title/abstract CSV; the cells below use its
# `row` column as item ids, `col` as feature ids and `data` as the values.
ICM_df = pd.read_csv('data_ICM_title_abstract.csv')
# Bare expression on the last line so the notebook renders the DataFrame.
ICM_df

In [None]:
# Number of distinct ids versus smallest / largest id, for the items (row)
# and the features (col) of the content data.
unique_items = len(ICM_df.row.unique())
min_items = ICM_df.row.min()
max_items = ICM_df.row.max()

unique_features = len(ICM_df.col.unique())
min_features = ICM_df.col.min()
max_features = ICM_df.col.max()

print('items stats:', unique_items, min_items, max_items)
print('features stats:', unique_features, min_features, max_features)

In [None]:
# Build the item-feature matrix (ICM): one row per item id, one column per feature id.
items, features, data = ICM_df.row, ICM_df.col, ICM_df.data
# Assemble from triplets in COO form, then convert to CSR for fast row access,
# i.e. fast access to the features of an item.
ICM_all = sps.coo_matrix((data, (items, features))).tocsr()
ICM_all.shape

In [None]:
# Make sure we work on a CSR matrix, then read the per-row / per-column counts
# from the differences between consecutive indptr entries.
ICM_all = sps.csr_matrix(ICM_all)

# CSR: one indptr slot per row -> number of stored features per item.
features_per_item = np.ediff1d(ICM_all.indptr)
# CSC: one indptr slot per column -> number of stored items per feature.
# The CSC copy is temporary; ICM_all itself stays in CSR format.
items_per_feature = np.ediff1d(ICM_all.tocsc().indptr)

In [None]:
# Sort both count arrays in ascending order for plotting; after sorting, a
# position no longer identifies a specific item / feature.
features_per_item = np.sort(features_per_item)
items_per_feature = np.sort(items_per_feature)

In [None]:
import matplotlib.pyplot as pyplot
%matplotlib inline  

# Plot the sorted number of features per item as red dots.
# NOTE: features_per_item was sorted beforehand, so the x axis is a position
# in the sorted array rather than the original item id.
plt.plot(features_per_item, 'ro')
plt.ylabel('Num features ')
plt.xlabel('Item Index')
plt.show()

In [None]:
# Plot the sorted number of items per feature as red dots.
# NOTE: items_per_feature was sorted beforehand, so the x axis is a position
# in the sorted array rather than the original feature id.
plt.plot(items_per_feature, 'ro')
plt.ylabel('Num items ')
plt.xlabel('Feature Index')
plt.show()

In [35]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.ItemKNNSimilarityHybridRecommender import ItemKNNSimilarityHybridRecommender
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
from KNN.ItemKNNScoresHybridRecommender import ItemKNNScoresHybridRecommender

# Hold out part of the interactions for validation (train_percentage = 0.85).
# NOTE(review): no seed is passed here, so the split presumably differs between
# runs and validation metrics may not be exactly reproducible - confirm whether
# the split function exposes a seed.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)
# Evaluator over the validation interactions, with a single cutoff of 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])




In [36]:

# This is for the evaluation
# All models in this cell are trained on URM_train only, so that they can be
# scored against the held-out validation interactions.

# NOTE(review): despite its name, recommender_alpha holds an RP3betaRecommender
# here, while the final-submission cell builds a P3alphaRecommender with the
# same hyper-parameters - confirm which of the two models is intended.
recommender_alpha = RP3betaRecommender(URM_train)
recommender_alpha.fit(topK=450, alpha = 0.5, implicit = True)

# Content-based item KNN built from the item features in ICM_all.
recommender_itemKNNCBF = ItemKNNCBFRecommender(URM_train, ICM_all)
recommender_itemKNNCBF.fit(topK=200, shrink=250, feature_weighting = 'TF-IDF')

# NOTE(review): the variable name says "CBF", but this is a UserKNNCFRecommender
# built from URM_train only (no ICM is passed).
recommender_userKNNCBF = UserKNNCFRecommender(URM_train)
recommender_userKNNCBF.fit(shrink=50, topK=100, feature_weighting = "TF-IDF")

# Hybrid built from the two item-item similarity matrices (W_sparse) of the
# models fitted above; both must be fitted before this point.
hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_train, recommender_itemKNNCBF.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=500, alpha = 0.57)

RP3betaRecommender: URM Detected 46 (0.58 %) cold users.
RP3betaRecommender: URM Detected 2007 (7.73 %) cold items.
ItemKNNCBFRecommender: URM Detected 46 (0.58 %) cold users.
ItemKNNCBFRecommender: URM Detected 2007 (7.73 %) cold items.
Similarity column 25975 ( 100 % ), 3488.25 column/sec, elapsed time 0.12 min
UserKNNCFRecommender: URM Detected 46 (0.58 %) cold users.
UserKNNCFRecommender: URM Detected 2007 (7.73 %) cold items.
Similarity column 7947 ( 100 % ), 15171.73 column/sec, elapsed time 0.01 min
ItemKNNSimilarityHybridRecommender: URM Detected 46 (0.58 %) cold users.
ItemKNNSimilarityHybridRecommender: URM Detected 2007 (7.73 %) cold items.


In [37]:

# Score-level hybrid of the item-similarity hybrid and the user-based KNN.
hybridrecommender_final = ItemKNNScoresHybridRecommender(URM_train, hybridrecommender, recommender_userKNNCBF)
# NOTE(review): alpha = 0 presumably gives the whole weight to one of the two
# recommenders, so the metrics below would describe a single model rather than
# a blend - confirm against ItemKNNScoresHybridRecommender. Note also that the
# final-submission cell recommends with `hybridrecommender`, not with this model.
hybridrecommender_final.fit(alpha = 0)

# Evaluate on the held-out validation split (cutoff 10).
evaluator_validation.evaluateRecommender(hybridrecommender_final)

ItemKNNScoresHybridRecommender: URM Detected 46 (0.58 %) cold users.
ItemKNNScoresHybridRecommender: URM Detected 2007 (7.73 %) cold items.
EvaluatorHoldout: Processed 4978 ( 100.00% ) in 4.56 sec. Users per second: 1091


({10: {'ROC_AUC': 0.10268745974350647,
   'PRECISION': 0.020891924467657506,
   'PRECISION_RECALL_MIN_DEN': 0.08881810123272556,
   'RECALL': 0.08715369815069404,
   'MAP': 0.0391128521554299,
   'MRR': 0.0711919748480616,
   'NDCG': 0.0583341716133528,
   'F1': 0.03370443771281591,
   'HIT_RATE': 0.20891924467657694,
   'ARHR': 0.07891938816506912,
   'NOVELTY': 0.005397437973363721,
   'AVERAGE_POPULARITY': 0.017223221197659976,
   'DIVERSITY_MEAN_INTER_LIST': 0.9980816940731705,
   'DIVERSITY_HERFINDAHL': 0.9997881195540813,
   'COVERAGE_ITEM': 0.6101636188642926,
   'COVERAGE_ITEM_CORRECT': 0.03684311838306063,
   'COVERAGE_USER': 0.6263998993330817,
   'COVERAGE_USER_CORRECT': 0.1049452623631559,
   'DIVERSITY_GINI': 0.3154960823620233,
   'SHANNON_ENTROPY': 13.22763761274088}},
 'CUTOFF: 10 - ROC_AUC: 0.1026875, PRECISION: 0.0208919, PRECISION_RECALL_MIN_DEN: 0.0888181, RECALL: 0.0871537, MAP: 0.0391129, MRR: 0.0711920, NDCG: 0.0583342, F1: 0.0337044, HIT_RATE: 0.2089192, ARHR: 0

In [None]:
# This is for the final evaluation
# Retrain on ALL the interactions (URM_all) and compute the recommendations
# for the target users of the submission.

# NOTE(review): the evaluation cell fits an RP3betaRecommender and validates a
# score-level hybrid, while this cell trains a P3alphaRecommender and submits
# the similarity hybrid - confirm that the submitted model is the validated one.
recommender_alpha = P3alphaRecommender(URM_all)
recommender_alpha.fit(topK=450, alpha = 0.5, implicit = True)

recommender_itemKNNCBF = ItemKNNCBFRecommender(URM_all, ICM_all)
recommender_itemKNNCBF.fit(topK=200, shrink=250, feature_weighting = 'TF-IDF')

# Hybrid of the two item-item similarity matrices fitted above.
hybridrecommender = ItemKNNSimilarityHybridRecommender(URM_all, recommender_itemKNNCBF.W_sparse, recommender_alpha.W_sparse)
hybridrecommender.fit(topK=500, alpha = 0.57)

# Load the target users here: they are needed right below, and relying on a
# later cell to define `test_users` raises a NameError when the notebook is
# run from top to bottom.
test_users = pd.read_csv('data_target_users_test.csv')
user_id = test_users['user_id']
recommendations = hybridrecommender.recommend(user_id,cutoff = 10)


In [None]:
# Target users for which the submission must contain recommendations.
test_users = pd.read_csv('data_target_users_test.csv')
# Bare expression on the last line so the notebook renders the DataFrame.
test_users

In [None]:
# Turn every per-user recommendation list into a numpy array, in place.
for position, recommended_items in enumerate(recommendations):
    recommendations[position] = np.array(recommended_items)
# Sanity check: one recommendation list per target user is expected.
print(len(recommendations))

In [None]:
# Serialise each user's recommendations as item ids separated by one space.
# Joining the ids explicitly avoids str(np.array(...)), whose output pads the
# elements to a common width (e.g. '[  1 100]') and can wrap over several
# lines, leaving stray spaces / newlines in the submission file.
test_users['item_list'] = [
    ' '.join(str(item) for item in recommended_items)
    for recommended_items in recommendations
]

test_users.to_csv('submission.csv', index=False)
# Bare expression on the last line so the notebook renders the final table.
test_users
