In [None]:
import os
import sys
# Put the parent of the current working directory on the import path
# (os.path.abspath('') resolves to the cwd). Presumably this is where the
# project modules imported in the next cell live -- confirm.
sys.path.append(os.path.abspath('') + '/..')

# imports

In [None]:
import pandas as pd 
import joblib

from data import ImplicitData, getBucketsHoldouts
from eval_implicit import EvaluateHoldouts 
from recommenders_implicit import UserKNN

from plot_utils import recall_heatmap
from data_utils.transfer_learning_scores import *
from dataset_evaluation_utils import * 


import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's 'whitegrid' style to the figures drawn afterwards.
sns.set_style('whitegrid')

# Optional plotly plotting backend, currently disabled.
# import plotly.offline as py
# pd.options.plotting.backend = "plotly"
# py.init_notebook_mode() # graphs charts inline (IPython).

# Figure size in inches matching an A4 page in landscape orientation.
a4_dims = (11.7, 8.27)

# functions

In [None]:
def run_evaluate_UKNN(data:pd.DataFrame,
                        interval_type:str, 
                        intervals_path:str, 
                        use_data_unique_users:bool,
                        frequent_users_path:str,
                        buckets_path:str,
                        holdouts_path:str,
                        cold_start_buckets:int,
                        to_grid_search:bool,
                        k:int,
                        similarity:str,
                        random_seed:int,
                        results_matrix_path:str,
                        recall_heatmap_title:str,
                        recall_heatmap_path:str,
                        incrementalTraining_time_record_path:str,
                        evaluateHoldouts_time_record_path:str,
                        eval_files_path:str, 
                        save_eval_files:bool,
                        user_col:str = 'user_id',
                        item_col:str = 'item_id'):
    '''
    Split `data` into buckets/holdouts, train and evaluate a UserKNN model on
    them, persist the artefacts and return the recall matrix with its metrics.

    Parameters
    ----------
    data : pd.DataFrame
        Interaction log; assumes columns ['user_id', 'item_id', 'date', 'timestamp'].
    interval_type : str
        'Q' or 'S' select entry 1 or 2 of the stored frequent users, load the
        interval boundaries from `intervals_path` and pass 'QS' on to
        getBucketsHoldouts. Any other value is treated as monthly ('M'):
        entry 0 is used and no intervals are loaded.
    intervals_path : str
        joblib file with the intervals; only read for 'Q' and 'S'.
    use_data_unique_users : bool
        If True, every distinct user in `data[user_col]` is used instead of the
        stored frequent users (and `frequent_users_path` is not read).
    frequent_users_path : str
        joblib file holding a 3-element sequence of user-id collections,
        indexed as described under `interval_type`.
    buckets_path, holdouts_path : str
        Where the generated buckets / holdouts are dumped with joblib.
    cold_start_buckets : int
        Forwarded unchanged to getBucketsHoldouts.
    to_grid_search : bool
        If True, run `grid_search` on the first 5% of `data` and print the
        best configuration. The result is only reported; the model is always
        built from `k` and `similarity`.
    k, similarity
        Forwarded to the UserKNN constructor.
    random_seed : int
        Forwarded to `grid_search`.
    results_matrix_path : str
        CSV destination of the results matrix.
    recall_heatmap_title, recall_heatmap_path : str
        Title and output file handed to `recall_heatmap`.
    incrementalTraining_time_record_path, evaluateHoldouts_time_record_path : str
        joblib destinations of the evaluator's two time records.
    eval_files_path : str
        Passed to `Train_Evaluate_Save` when `save_eval_files` is True.
    save_eval_files : bool
        Chooses between `Train_Evaluate_Save` and `Train_Evaluate`.
    user_col, item_col : str
        Names of the user and item columns in `data`.

    Returns
    -------
    tuple
        (rm_df, arecall, (BWT_lr, meanBWT_lr), (BWT_r, meanBWT_r), FWT_r)
    '''

    # Position of the matching user collection inside the stored frequent
    # users: 0 = default/monthly, 1 = 'Q', 2 = 'S'.
    if interval_type == 'Q':
        frequent_users_idx = 1
    elif interval_type == 'S':
        frequent_users_idx = 2
    else:
        frequent_users_idx = 0

    if frequent_users_idx == 0:
        # assumes monthly interval
        interval_type = 'M'
        intervals = None
    else:
        # 'Q' and 'S' both go to getBucketsHoldouts as 'QS' together with
        # the explicit interval boundaries loaded from disk.
        interval_type = 'QS'
        intervals = joblib.load(intervals_path)

    if use_data_unique_users:
        selected_users = data[user_col].unique()
    else:
        # Build a new list instead of assigning into the loaded object, so a
        # tuple on disk works as well as a list.
        frequent_users = joblib.load(frequent_users_path)
        selected_users = list(map(int, frequent_users[frequent_users_idx]))

    print('\nSTAGE -> getBucketsHoldouts')
    buckets, holdouts = getBucketsHoldouts( data = data,
                                            user_col = user_col,
                                            item_col = item_col,
                                            frequent_users = selected_users,
                                            interval_type = interval_type,
                                            intervals = intervals, 
                                            cold_start_buckets = cold_start_buckets)

    joblib.dump(buckets, buckets_path)
    joblib.dump(holdouts, holdouts_path)

    print('\nBucket size, number of users, number of items\n', [(b.size, len(b.userset), len(b.itemset)) for b in buckets])
    print('Holdouts size, number of users, number of items\n', [(h.size, len(h.userset), len(h.itemset)) for h in holdouts])

    print('\n\nSTAGE -> to_grid_search')
    if to_grid_search:
        # Tune on the first 5% of the interactions only.
        prop = 0.05 
        hp_sample = data.iloc[:round( data.shape[0]*prop )]
        stream = ImplicitData(hp_sample[user_col], hp_sample[item_col]) 
        # NOTE(review): `grid_search` and `np` are not imported explicitly in
        # this notebook; presumably the star imports provide them -- confirm.
        grid, results = grid_search(model = UserKNN, 
                                    stream = stream, 
                                    random_seed = random_seed, 
                                    interleaved = 100 )
        num_factors, num_iter, learn_rate, regularization, _ = grid[ np.argmax( results ) ]
        # Report inside the branch: these names only exist when the search ran.
        # NOTE(review): the winning values are not fed into the model below.
        print('num_factors, num_iter, learn_rate, regularization\n', (num_factors, num_iter, learn_rate, regularization))

    print('\n\nSTAGE -> UKNN model')
    empty_stream = ImplicitData([], []) 
    model = UserKNN(empty_stream, k=k, similarity=similarity)

    print('\n\nSTAGE -> EvaluateHoldouts')
    evaluator = EvaluateHoldouts(model = model,
                                 buckets = buckets,
                                 holdouts = holdouts)

    if save_eval_files:
        evaluator.Train_Evaluate_Save(eval_files_path, N_recommendations=20, exclude_known_items=True, default_user='none')
    else:
        evaluator.Train_Evaluate(N_recommendations=20, exclude_known_items=True, default_user='none')

    rm_df = pd.DataFrame(evaluator.results_matrix)
    rm_df.to_csv(results_matrix_path, index=False)
    print(rm_df)

    recall_heatmap( rm_df,
                    round_point = 4,
                    title = recall_heatmap_title,
                    filepath = recall_heatmap_path) 

    print('\n\nstage -> metrics')    
    arecall = avg_recall(rm_df)
    BWT_lr, meanBWT_lr = compute_BWT_lopes_ranzato(rm_df)
    BWT_r, meanBWT_r = compute_BWT_rodrigues(rm_df)
    FWT_r = compute_FWT_rodrigues(rm_df)

    print('avg recall', arecall.round(6))
    print('BWT (v. Lopez-Paz e Ranzato GEM 2017), meanBWT', (BWT_lr, meanBWT_lr))
    print('BWT (v. Díaz-Rodriguez et al. 2018), meanBWT', (BWT_r, meanBWT_r))
    print('FWT', FWT_r.round(6))

    joblib.dump(evaluator.IncrementalTraining_time_record, incrementalTraining_time_record_path)
    joblib.dump(evaluator.EvaluateHoldouts_time_record, evaluateHoldouts_time_record_path)

    return rm_df, arecall, (BWT_lr, meanBWT_lr), (BWT_r, meanBWT_r), FWT_r


def print_heatmap(results_matrix_path, recall_heatmap_title, filepath=False):
    '''
    Re-draw the recall heatmap from a results matrix previously saved as CSV.

    results_matrix_path: CSV file to read the matrix from
    recall_heatmap_title: title handed to recall_heatmap
    filepath: forwarded to recall_heatmap as its `filepath` argument (default False)
    '''
    heatmap_options = {
        'round_point': 4,
        'title': recall_heatmap_title,
        'filepath': filepath,
    }
    recall_heatmap(pd.read_csv(results_matrix_path), **heatmap_options)


# model name

In [None]:
# Short model tag; embedded in the result / time-record file names and in the
# heatmap title built in the run cell.
MODEL_NAME = 'UKNN'

# paths

In [None]:
dataset_name = 'LastFM1b'
dataset_folderpath = '../datasets/lastfm1b/'

# Dataset-level dump folder; only the second path returned by the helper is kept.
dump_foldername = 'lastfm1b_dump/'
_, base_outputpath, _, _, _ = get_folderpaths(dump_foldername)

# Sampled period as [first, last] 'YYYY-MM' strings; it names the sample sub-folder.
period = ['2011-07', '2014-01']
sample = 'sample_{}_until_{}'.format(period[0], period[1])

filename = 'tracks_inter_merged_coldstart_11M'
sample_version_dump_foldername = '{}{}/'.format(dump_foldername, sample)

# rule: what/which_data_set/sample_version/what/
(images_path,
 output_path,
 heatmaps_path,
 diversity_graphpath,
 diversity_filepath) = get_folderpaths(sample_version_dump_foldername)

# load data

In [None]:
# Read the interaction sample and parse its 'date' strings (YYYY-MM-DD).
# pd.to_datetime is vectorised and avoids relying on a `datetime` name that
# this notebook never imports explicitly.
interactions_df = pd.read_csv(output_path+filename+'_interactions_df.csv', index_col=0)
interactions_df['date'] = pd.to_datetime(interactions_df['date'], format='%Y-%m-%d')
interactions_df.head()

In [None]:
# Rename the columns by position to the names documented in run_evaluate_UKNN.
# NOTE(review): assumes the CSV holds exactly these four columns in this order -- confirm.
interactions_df.columns = ['user_id','item_id','timestamp','date']

In [None]:
# Bucket / holdout construction settings ('S' selects the semester branch of
# run_evaluate_UKNN).
interval_type = 'S'
cold_start_buckets = 1
use_data_unique_users = False
frequent_users_thr = 0.75  # part of the frequent-users file name

# Hyper-parameter search settings.
to_grid_search = False
random_seed = 10

# UserKNN settings.
k = 10
similarity = 'cosine'

# Both period bounds paired with their strptime-style format string.
sample_year_month = [(year_month, '%Y-%m') for year_month in period]

In [None]:
# Display the common prefix of the output files for this sample (sanity check).
output_path+filename

In [None]:
# Display the folder prefix under which the evaluation files are saved.
diversity_filepath

# run evaluate

In [None]:
# Semester-level UKNN run. The settings are taken from the configuration cell
# so that editing them there actually changes this run.
# NOTE: the file names below are hard-coded for the semester interval
# ('_semesters', '_semesterly_'); adjust them if interval_type changes.
run_evaluate_UKNN(data = interactions_df,
             interval_type = interval_type,
             intervals_path = output_path+filename+'_semesters.joblib',
             use_data_unique_users = use_data_unique_users,
             frequent_users_path = output_path+filename+'_frequent_users_'+str(frequent_users_thr)+'.joblib',
             buckets_path = output_path+filename+'_semesterly_buckets.joblib',
             holdouts_path = output_path+filename+'_semesterly_holdouts.joblib',
             cold_start_buckets = cold_start_buckets,
             to_grid_search = to_grid_search,
             k=k,
             similarity=similarity,
             random_seed = random_seed,
             results_matrix_path = output_path+filename+'_semesterly_bucket_'+MODEL_NAME+'_results.csv',
             recall_heatmap_title = 'Recall@20 for '+MODEL_NAME+' checkpoints across Holdouts (2013/2014) - '+dataset_name,
             recall_heatmap_path = heatmaps_path+filename+'_semesterly_bucket_'+MODEL_NAME+'_heatmap.png',
             incrementalTraining_time_record_path = output_path+filename+'_semesterly_bucket_'+MODEL_NAME+'_training_time.joblib',
             evaluateHoldouts_time_record_path = output_path+filename+'_semesterly_bucket_'+MODEL_NAME+'_eval_time.joblib',
             eval_files_path = diversity_filepath+MODEL_NAME+'/'+filename+'_',
             save_eval_files = True)