# Data loading 
Next cells are used to load the data we need.

In [1]:
import pandas as pd
import numpy as np 
import scipy.sparse as sps
import os
import sys
import matplotlib.pyplot as plt
sys.path.append('../../')


In [2]:
#data which contains users (row), items(col) and implicit interaction (data)
dataset = pd.read_csv('../data_train.csv')
dataset

Unnamed: 0,row,col,data
0,0,10080,1.0
1,0,19467,1.0
2,1,2665,1.0
3,1,7494,1.0
4,1,17068,1.0
...,...,...,...
113263,7945,2476,1.0
113264,7945,12319,1.0
113265,7945,21384,1.0
113266,7946,8699,1.0


# Data pre-processing
Pre-processing of data to check for missing users, items ecc...

In [3]:
users = dataset.row
items = dataset.col
data = dataset.data
URM_all = sps.coo_matrix((data, (users, items)))
URM_all = URM_all.tocsr() #fast row access -> fast access to users 
URM_all.shape

(7947, 25975)

In [4]:
ICM_df = pd.read_csv('../data_ICM_title_abstract.csv')
ICM_df

Unnamed: 0,row,col,data
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


In [5]:
items = ICM_df.row
features = ICM_df.col
data = ICM_df.data
ICM_all = sps.coo_matrix((data, (items, features)))
ICM_all = ICM_all.tocsr() #fast row access -> fast access to users 
ICM_all.shape

(25975, 20000)

In [6]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])



In [7]:
URM_train_ICM_all = sps.vstack([URM_train*(1-0.289), ICM_all.T*0.289])

In [9]:
from KNN.UserKNNCFRecommender import UserKNNCFRecommender

recommender = UserKNNCFRecommender(URM_train_ICM_all)
recommender.fit(shrink=100, topK=150, feature_weighting = "TF-IDF")
evaluator_validation.evaluateRecommender(recommender)

UserKNNCFRecommender: URM Detected 79 (0.28 %) cold users.
UserKNNCFRecommender: URM Detected 4 (0.02 %) cold items.
Similarity column 27947 ( 100 % ), 2487.15 column/sec, elapsed time 0.19 min
EvaluatorHoldout: Processed 5622 ( 100.00% ) in 6.51 sec. Users per second: 864


({10: {'ROC_AUC': 0.18787272083029866,
   'PRECISION': 0.041337602276768894,
   'PRECISION_RECALL_MIN_DEN': 0.1415110986882604,
   'RECALL': 0.1370342412650141,
   'MAP': 0.06739762735304788,
   'MRR': 0.14065321610679157,
   'NDCG': 0.1008316255171264,
   'F1': 0.06351525948527875,
   'HIT_RATE': 0.4133760227676983,
   'ARHR': 0.16499268745800266,
   'NOVELTY': 0.005330878198660603,
   'AVERAGE_POPULARITY': 0.13634728645401004,
   'DIVERSITY_MEAN_INTER_LIST': 0.9888821085689553,
   'DIVERSITY_HERFINDAHL': 0.9988706213494504,
   'COVERAGE_ITEM': 0.44153994225216553,
   'COVERAGE_ITEM_CORRECT': 0.05201154956689124,
   'COVERAGE_USER': 0.7074367685919215,
   'COVERAGE_USER_CORRECT': 0.20850635459921982,
   'DIVERSITY_GINI': 0.14263662037814617,
   'SHANNON_ENTROPY': 11.632074137353067}},
 'CUTOFF: 10 - ROC_AUC: 0.1878727, PRECISION: 0.0413376, PRECISION_RECALL_MIN_DEN: 0.1415111, RECALL: 0.1370342, MAP: 0.0673976, MRR: 0.1406532, NDCG: 0.1008316, F1: 0.0635153, HIT_RATE: 0.4133760, ARHR:

In [12]:
test_users = pd.read_csv('data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
7939,7942
7940,7943
7941,7944
7942,7945


In [13]:
recommender = UserKNNCFRecommender(URM_all)
recommender.fit(shrink=100, topK=150, feature_weighting = "TF-IDF")

user_id = test_users['user_id']
recommendations = recommender.recommend(user_id,cutoff = 10)


UserKNNCFRecommender: URM Detected 1079 (4.15 %) cold items.
Similarity column 7947 ( 100 % ), 15535.90 column/sec, elapsed time 0.01 min


In [14]:
for index in range(len(recommendations)):
    #print(element)
    recommendations[index]=np.array(recommendations[index])
    #print(type(element))
print(len(recommendations))

test_users['item_list']= recommendations
test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])

test_users
test_users.to_csv('submission.csv', index=False)


7944
