In [76]:
## RECSYS CHALLENGE 2020 ##
import numpy as np
import scipy.sparse as sps
from Notebooks_utils.data_splitter import train_test_holdout

# Evaluation or scoring?
# True  -> later cells build a hold-out evaluator and report MAP at cutoff 10.
# False -> later cells train on the full URM and write the submission file.
# NOTE(review): this name shadows Python's built-in `eval`. Every later cell
# reads it, so a rename has to be applied to the whole notebook at once.
eval = True

In [77]:
# Loading from CSV files...
# NOTE(review): names used later without an explicit import (e.g.
# `contentMatrixLoad`, `coo_matrix`) presumably reach the notebook through a
# star import such as this one — confirm and consider importing them by name.
from RecLib.DataLoad import *
# dataLoad() yields seven objects, unpacked in this order: user-age and
# item-subclass/asset/price feature matrices, the user-region matrix, the list
# of users to recommend for, and the user-rating matrix (URM).
UCM_age, ICM_subclass, ICM_asset, ICM_price, UCM_region, target_users, URM = dataLoad()

In [78]:
# Convert the user-rating matrix to CSR format before it is split.
URM = URM.tocsr()

In [79]:
# Split the interactions with train_perc = 0.8.
# (Author's note: a train % of .9999 gives similar performance on the test set
# and on the competition set.)
URM_train, URM_test = train_test_holdout(URM, train_perc = 0.8)

if eval:
    # Evaluation mode: score every recommender on the held-out split at cutoff 10.
    from Base.Evaluation.Evaluator import EvaluatorHoldout
    from RecLib.Evaluate import *
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10])
else:
    # Scoring mode: ignore the split and train on every available interaction.
    URM_train = URM

In [80]:
# Load Item & User Content matrix
# ICM and UCM are the feature matrices handed to the content-based
# recommenders in later cells.
# NOTE(review): the meaning of the trailing positional `True` is not visible
# from this notebook — confirm against contentMatrixLoad's signature.
ICM, UCM = contentMatrixLoad(URM_train, ICM_subclass, ICM_price, ICM_asset, UCM_age, UCM_region, True)

In [None]:
# SLIM ElasticNet Recommender
if True:
    from itertools import product
    from SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender
    SLIMElasticNet = SLIMElasticNetRecommender(URM_train.tocsr())
    # Manual toggle: the leading `True` forces a re-fit on every run. Drop it to
    # re-fit only in scoring mode and load the saved model otherwise.
    if True or not eval:
        MAP_LIST = []
        # Hyper-parameter grid: one list per parameter, every combination is fitted.
        tklist = [60]
        penalties = [1]
        alphas = [1e-4]
        tol = [1e-5]
        max_iter = [100]
        positives = [True]
        warm_start = [True]
        # product() iterates in the same order as the equivalent nested loops
        # (right-most list varies fastest).
        grid = product(tklist, penalties, alphas, tol, max_iter, positives, warm_start)
        for tk, penalty, alpha, t, iters, positivity, ws in grid:
            # Pass the grid values for tol / max_iter: they were previously
            # hard-coded, so the values in the lists above (and in the logged
            # label) were not the ones actually used for fitting.
            SLIMElasticNet.fit(l1_ratio=penalty, alpha=alpha, positive_only=positivity,
                               topK=tk, warm_start=ws, tol=t, max_iter=iters)
            if eval:
                dict_scores = (evaluator_validation.evaluateRecommender(SLIMElasticNet))[0][10]
                MAP_LIST.append(('tol, positives, warm_start, iterations :' + str(t) + ' ' + str(positivity) + ' ' + str(ws) + ' ' + str(iters), dict_scores['MAP']))
        if eval:
            sortMap(MAP_LIST)
        # Persist the model fitted last so the `else` branch can reload it.
        SLIMElasticNet.save_model('.')
    else:
        SLIMElasticNet.load_model('.')

In [None]:
# SLIM BPR Recommender
if True:
    from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
    slim_rec = SLIM_BPR_Cython(URM_train, recompile_cython=False, verbose=False)
    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    epochsList = [300]
    batchSize = [50]
    tklist = [10]
    lrs = [1e-3]

    for epochsN in epochsList:
        for bs in batchSize:
            for tk in tklist:
                for lr in lrs:
                    # The return value is bound to a name only so that it is not displayed.
                    do_not_display_hystory = slim_rec.fit(
                        epochs=epochsN,
                        batch_size=bs,
                        sgd_mode='adagrad',
                        learning_rate=lr,
                        topK=tk,
                    )
                    if not eval:
                        continue
                    # Metrics at cutoff 10 from the first element of the evaluator result.
                    dict_scores = evaluator_validation.evaluateRecommender(slim_rec)[0][10]
                    label = 'epoch, batch, topK, lr :{} {} {} {}'.format(epochsN, bs, tk, lr)
                    MAP_LIST.append((label, dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)

In [82]:
# ItemKNN Content Based Filtering
# The ICM is augmented with a one-hot "popularity bin" feature per item before
# the content-based similarity is computed.
if True:
    from sklearn import preprocessing
    from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
    MAP_LIST = []
    tklist = [3]
    shrinklist = [10]
    n_items = URM_train.shape[1]
    # Interactions per item: differences of the CSC column pointers, as a
    # single-column 2-D array (the shape the discretizer expects).
    X = np.reshape(np.ediff1d(URM_train.tocsc().indptr), (-1, 1))
    for bins in [5]:
        # The augmented ICM depends only on `bins`, so build it once here
        # instead of once per (topK, shrink) pair.
        le = preprocessing.KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='kmeans')
        # Ordinal bin labels come back as floats; cast them to integers before
        # using them as column indices.
        classList = le.fit_transform(X).reshape(-1).astype(np.int64)
        ones = np.ones(len(classList))
        # Use the explicitly imported scipy.sparse namespace rather than a
        # `coo_matrix` name leaked by a star import.
        ICM_pop = sps.coo_matrix((ones, (np.arange(0, n_items), classList)), shape=(n_items, bins))
        ICM_pop = ICM_pop.tocsr()
        ICM_mod = sps.hstack([ICM, ICM_pop], format='csr')
        for tk in tklist:
            for sr in shrinklist:
                itemKNNCBF = ItemKNNCBFRecommender(URM_train, ICM_mod)
                itemKNNCBF.fit(shrink=sr, topK=tk, similarity='cosine')
                if eval:
                    dict_scores = (evaluator_validation.evaluateRecommender(itemKNNCBF))[0][10]
                    MAP_LIST.append(('topK, shrink, bins :' + str(tk) + ' ' + str(sr) + ' '+str(bins), dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)

Similarity column 18495 ( 100 % ), 9237.09 column/sec, elapsed time 0.03 min
EvaluatorHoldout: Processed 20335 ( 100.00% ) in 9.67 sec. Users per second: 2103
('topK, shrink, bins :3 10 5', 0.01380680616553551)


In [None]:
# PureSVD Recommender
if True:
    from MatrixFactorization.PureSVDRecommender import PureSVDRecommender
    pureSVD = PureSVDRecommender(URM_train)
    MAP_LIST = []
    # Candidate numbers of latent factors.
    nfactorlist = [400]

    for n in nfactorlist:
        pureSVD.fit(num_factors=n, random_seed=None)
        if not eval:
            continue
        # Metrics at cutoff 10 from the first element of the evaluator result.
        dict_scores = evaluator_validation.evaluateRecommender(pureSVD)[0][10]
        MAP_LIST.append(('num factors :{}'.format(n), dict_scores['MAP']))
    if eval:
        print(MAP_LIST)

In [85]:
# UserKNN Content Based Filtering
# When True, the UCM is extended with a one-hot "profile length bin" feature.
augment_with_profile_lenght = False

if eval:
    # Evaluate only on users with no interactions in the training set:
    # every user with a non-empty training profile is passed as "ignored".
    to_compute_mask = np.ediff1d(URM_train.tocsr().indptr) == 0
    to_ignore_mask = np.invert(to_compute_mask)
    to_ignore = np.arange(URM_train.shape[0])[to_ignore_mask]
    evalTest = EvaluatorHoldout(URM_test, cutoff_list=[10], ignore_users=to_ignore)

if True:
    # Imported here so the augmentation branch does not depend on another
    # cell having been run first.
    from sklearn import preprocessing
    from KNN.UserKNNCBFRecommender import UserKNNCBFRecommender
    MAP_LIST = []
    tklist = [2500]
    shrinklist = [10]

    for tk in tklist:
        for sr in shrinklist:
            for bins in [106]:
                if augment_with_profile_lenght:
                    le = preprocessing.KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='kmeans')
                    # Interactions per user: differences of the CSR row pointers,
                    # as a single-column 2-D array.
                    X = np.reshape(np.ediff1d(URM_train.tocsr().indptr), (-1, 1))
                    # Ordinal bin labels come back as floats; cast them to
                    # integers before using them as column indices.
                    classList = le.fit_transform(X).reshape(-1).astype(np.int64)
                    ones = np.ones(len(classList))
                    # Use the explicitly imported scipy.sparse namespace rather
                    # than a `coo_matrix` name leaked by a star import.
                    UCM_profile = sps.coo_matrix((ones, (np.arange(0, URM_train.shape[0]), classList)), shape=(URM_train.shape[0], bins))
                    UCM_profile = UCM_profile.tocsr()
                    UCM_mod = sps.hstack([UCM, UCM_profile], format='csr')
                    userKNNCBF = UserKNNCBFRecommender(URM_train, UCM_mod)
                else:
                    userKNNCBF = UserKNNCBFRecommender(URM_train, UCM)
                userKNNCBF.fit(shrink=sr, topK=tk, similarity='cosine')
                if eval:
                    dict_scores = (evalTest.evaluateRecommender(userKNNCBF))[0][10]
                    MAP_LIST.append(('topK, shrink, bins :' + str(tk) + ' ' + str(sr) + ' '+str(bins), dict_scores['MAP']))
    if eval:
        print(MAP_LIST)
        

EvaluatorHoldout: Ignoring 26641 Users
Similarity column 30911 ( 100 % ), 1943.81 column/sec, elapsed time 0.27 min
EvaluatorHoldout: Processed 614 ( 100.00% ) in 0.64 sec. Users per second: 964
[('topK, shrink, bins :2500 10 106', 0.014604553366768353)]


In [62]:



#ICM_pop = coo_matrix((np.ones(len(item_pop)), (np.arange(0, URM_train.shape[1]), item_pop)))
#profile_length = np.ediff1d(URM_train.indptr)
#UCM_profile = coo_matrix((np.ones(len(profile_length)), (np.arange(0, URM_train.shape[0]), profile_length)))
#UCM_mod = sps.hstack([UCM, UCM_profile], format='csr')
#ICM_mod = sps.hstack([ICM, ICM_pop], format='csr')

#entropy = np.ediff1d(URM_train.tocsc().indptr)
#ICM_pop = coo_matrix((np.ones(len(item_pop)), (np.arange(0, URM_train.shape[1]), item_pop)))
#ICM_entropy = sps.hstack([ICM, ICM_pop], format='csr')

#popularity_bias = URM_train.toarray().dot(item_pop)

In [None]:
#print(popularity_bias//profile_length)

In [84]:
# LightFM Recommender (currently disabled through the `if False` guard)
if False:
    from RecLib.LFMRec import LFM
    lfm_rec = LFM(URM_train, None, ICM)
    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    epochsList = [20]
    batchSize = [50]
    tklist = [10]
    lrs = [0.05]
    losses = ['bpr', 'warp']

    for epochsN in epochsList:
        for loss in losses:
            for tk in tklist:
                for lr in lrs:
                    # The return value is bound to a name only so that it is not displayed.
                    do_not_display_hystory = lfm_rec.fit(
                        epochs=epochsN,
                        no_components=10,
                        k=5,
                        n=10,
                        learning_schedule="adagrad",
                        loss=loss,
                        learning_rate=lr,
                        rho=0.95,
                        epsilon=1e-6,
                        item_alpha=0.0,
                        user_alpha=0.0,
                        max_sampled=10,
                        random_state=None,
                        num_threads=1,
                        verbose=False,
                    )
                    if not eval:
                        continue
                    dict_scores = evaluator_validation.evaluateRecommender(lfm_rec)[0][10]
                    label = 'epoch, loss, lr :{} {} {}'.format(epochsN, loss, lr)
                    MAP_LIST.append((label, dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)

EvaluatorHoldout: Processed 13001 ( 63.93% ) in 31.04 sec. Users per second: 419
EvaluatorHoldout: Processed 20335 ( 100.00% ) in 45.53 sec. Users per second: 447
EvaluatorHoldout: Processed 13001 ( 63.93% ) in 31.37 sec. Users per second: 414
EvaluatorHoldout: Processed 20335 ( 100.00% ) in 46.02 sec. Users per second: 442
('epoch, loss, lr :20 warp 0.05', 0.0002989437116631022)
('epoch, loss, lr :20 bpr 0.05', 0.006133635959916304)


In [None]:
# UserKNN Collaborative Filtering (currently disabled through the `if False` guard)
if False:
    from KNN.UserKNNCFRecommender import UserKNNCFRecommender
    userKNNCF = UserKNNCFRecommender(URM_train)
    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    tklist = [600]
    shrinklist = [5]

    for tk in tklist:
        for sr in shrinklist:
            userKNNCF.fit(shrink=sr, topK=tk, similarity='cosine')
            if not eval:
                continue
            dict_scores = evaluator_validation.evaluateRecommender(userKNNCF)[0][10]
            MAP_LIST.append(('topK, shrink :{} {}'.format(tk, sr), dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)

In [None]:
# ItemKNN Collaborative Filtering
if True:
    from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
    itemKNN = ItemKNNCFRecommender(URM_train)

    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    normalize = [True]
    tklist = [5]
    shrinkList = [25]

    for simil in normalize:
        for tk in tklist:
            for shrink in shrinkList:
                itemKNN.fit(shrink=shrink, topK=tk, normalize=simil)
                if not eval:
                    continue
                dict_scores = evaluator_validation.evaluateRecommender(itemKNN)[0][10]
                label = 'Normalize, topK, shrink :{} {} {}'.format(simil, tk, shrink)
                MAP_LIST.append((label, dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)


In [None]:
#TEST REMOVE TOP POP
# Disabled experiment: fit ItemKNN while filtering out the x most popular
# items, for x = 0..10, and keep every fitted model in hybridList.
if False:
    from Base.NonPersonalizedRecommender import TopPop
    from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

    hybridList = []
    top = TopPop(URM_train)
    top.fit()

    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    normalize = [True]
    tklist = [5]
    shrinkList = [25]

    for simil in normalize:
        for tk in tklist:
            for shrink in shrinkList:
                for x in range(0, 11):
                    # A fresh recommender per x, configured to drop the top-x popular items.
                    itemKNN = ItemKNNCFRecommender(URM_train)
                    itemKNN.filterTopPop = True
                    itemKNN.filterTopPop_ItemsID = top.recommend(0, cutoff=x)
                    itemKNN.RECOMMENDER_NAME = 'Item' + str(x)
                    itemKNN.fit(shrink=shrink, topK=tk, normalize=simil)
                    if eval:
                        dict_scores = evaluator_validation.evaluateRecommender(itemKNN)[0][10]
                        label = 'Normalize, topK, shrink, topKremoved :{} {} {} {}'.format(simil, tk, shrink, x)
                        MAP_LIST.append((label, dict_scores['MAP']))
                    hybridList.append(itemKNN)
    if eval:
        sortMap(MAP_LIST)

In [None]:
# Graph Based: P3alpha
if True:
    from GraphBased.P3alphaRecommender import P3alphaRecommender
    p3alpha_rec = P3alphaRecommender(URM_train)
    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    similList = [True]
    tklist = [30]
    alphaList = [0.4]

    for simil in similList:
        for tk in tklist:
            for alpha in alphaList:
                # The return value is bound to a name only so that it is not displayed.
                do_not_display_hystory = p3alpha_rec.fit(
                    topK=tk,
                    alpha=alpha,
                    normalize_similarity=simil,
                )
                if not eval:
                    continue
                dict_scores = evaluator_validation.evaluateRecommender(p3alpha_rec)[0][10]
                label = 'Normalize, topK, alpha :{} {} {}'.format(simil, tk, alpha)
                MAP_LIST.append((label, dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)
    

In [None]:
# Graph Based: RP3beta
if True:
    from GraphBased.RP3betaRecommender import RP3betaRecommender
    p3beta_rec = RP3betaRecommender(URM_train)
    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    betaList = [0.1]
    similList = [True]
    tklist = [50]
    alphaList = [0.4]

    for beta in betaList:
        for simil in similList:
            for tk in tklist:
                for alpha in alphaList:
                    # The return value is bound to a name only so that it is not displayed.
                    do_not_display_hystory = p3beta_rec.fit(
                        topK=tk,
                        alpha=alpha,
                        beta=beta,
                        normalize_similarity=simil,
                    )
                    if not eval:
                        continue
                    dict_scores = evaluator_validation.evaluateRecommender(p3beta_rec)[0][10]
                    label = 'Normalize, topK, alpha, beta :{} {} {} {}'.format(simil, tk, alpha, beta)
                    MAP_LIST.append((label, dict_scores['MAP']))

    if eval:
        sortMap(MAP_LIST)

In [None]:
# CFW-D feature weighting, built from the ItemKNN similarity matrix
if True:
    from FeatureWeighting.CFW_D_Similarity_Linalg import CFW_D_Similarity_Linalg
    # Uses the similarity of whichever `itemKNN` model is currently in the kernel.
    W_sparse_CF = itemKNN.W_sparse
    CFW_weithing = CFW_D_Similarity_Linalg(URM_train, ICM, W_sparse_CF)
    MAP_LIST = []
    # Hyper-parameter grid (one list per parameter).
    quotas = [0.1]
    similList = [False]
    tklist = [700]

    for quota in quotas:
        for normalization in similList:
            for tk in tklist:
                CFW_weithing.fit(topK=tk, add_zeros_quota=quota, normalize_similarity=normalization)
                if not eval:
                    continue
                dict_scores = evaluator_validation.evaluateRecommender(CFW_weithing)[0][10]
                label = 'Normalize, topK, quota :{} {} {}'.format(normalization, tk, quota)
                MAP_LIST.append((label, dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)

In [None]:
# Hybrid Recommender
# Combines the recommenders fitted in the cells above; each entry of `params`
# is unpacked positionally into HybridRecommender.fit.
#hybridList = []
from RecLib.HybridRecommender import *
if True:
    params = [(0.7, 0.1, 0.4, 0.05, 0.5, 1, 2 , 0.5)]
    MAP_LIST = []
    for param in params:
        hybridrecommender = HybridRecommender(
            URM_train,
            userKNNCBF,
            itemKNN,
            itemKNNCBF,
            slim_rec,
            pureSVD,
            p3alpha_rec,
            p3beta_rec,
            SLIMElasticNet,
            CFW_weithing,
        )
        hybridrecommender.fit(*param)
        if not eval:
            continue
        dict_scores = evaluator_validation.evaluateRecommender(hybridrecommender)[0][10]
        MAP_LIST.append(('params :' + str(param), dict_scores['MAP']))
    if eval:
        sortMap(MAP_LIST)


In [75]:
# Evaluate performance of a recommender against users with 0,1,...,itrMax interactions
# NOTE(review): the output recorded for this cell reports MAP only for 0 and 1
# interactions with itrMax = 2, so the upper bound looks exclusive — confirm
# in evaluateAgainstUsers.
rec_to_eval = userKNNCBF
itrMax = 2

if eval:
    evaluateAgainstUsers(rec_to_eval, itrMax, URM_train, URM_test)

EvaluatorHoldout: Ignoring 26568 Users
EvaluatorHoldout: Processed 687 ( 100.00% ) in 0.68 sec. Users per second: 1005
MAP at 0 interactions: 0.011992560245835355
EvaluatorHoldout: Ignoring 27843 Users
EvaluatorHoldout: Processed 842 ( 100.00% ) in 0.77 sec. Users per second: 1096
MAP at 1 interactions: 0.009028131244580174


In [None]:
# Compare different recommenders
# Recommenders passed to compare(); only evaluated when `eval` is truthy.
recommendersToCompare = [hybridrecommender]

if eval:
    # Same star import as in the split cell, repeated here.
    from RecLib.Evaluate import *
    # NOTE(review): enableTop / enablePureSVD presumably toggle extra baseline
    # recommenders in the comparison — confirm in RecLib.Evaluate.
    compare(URM_train, URM_test, recommendersToCompare, enableTop = False, enablePureSVD = False)

In [None]:
# Predict
# Recommender whose top-10 lists go into the submission.
final_Rec = hybridrecommender

if not eval:
    # One (user_id, recommended items) pair per target user, in target_users order.
    output = [
        (user_id, final_Rec.recommend(user_id, cutoff=10))
        for user_id in target_users
    ]

In [None]:
# Write down results

if not eval:
    import csv
    with open('submission.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["user_id", "item_list"])
        # Each row holds the user id and its recommended items joined by single spaces.
        for row in output:
            writer.writerow([row[0], ' '.join(str(val) for val in row[1])])