# Calculating Diversity - Goodreads

In [None]:
import os
import sys
sys.path.append(os.path.abspath('') + '/..')

# imports

In [None]:
import pandas as pd 
import joblib
from sklearn.preprocessing import MinMaxScaler


from dataset_evaluation_utils import * 
from data_utils.transfer_learning_scores import *

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

# import plotly.offline as py
# pd.options.plotting.backend = "plotly"
# py.init_notebook_mode() # graphs charts inline (IPython).

a4_dims = (11.7, 8.27)

# functions

In [None]:
def read_all_reclists_to_frame(filename:str, n_holdouts:int, rec_list_as_rows:bool=False, top_n:int=20):
    '''
        Reads all recommended lists output by the model at the different buckets and holdouts
        to a pd.DataFrame with bucket_idx, holdout_idx, reclist_idx as the DataFrame's index
        (does not keep the score for each item in the recommended list)

        filename: expects 'path/to/file/<file_name>'
                    string is completed with '_b<i>_h<j>.joblib' in this function
                    full string rule : 'path/to/file/<file_name>_b<i>_h<j>.joblib'

                  the file shape is assumed to be 
                    [ 
                        [ ['B001PZ06PS' '0.5036084630168064']
                          ['B001386NZE' '0.504954758011992']
                          ['B00PJU8VFM' '0.5300628103145297'] ... ],
                        [ ... ], 
                      ... 
                    ]

        n_holdouts: number of holdouts; the same value is also used as the number of
                    buckets, i.e. files b0..b<n-1> x h0..h<n-1> are read
        rec_list_as_rows: selects the layout of the returned frame (see below)
        top_n: length of a recommended list (default 20, the value that used to be
               hard-coded); empty or shorter lists are padded with None

        if rec_list_as_rows: returns a pd.Dataframe like so (one row per recommended list)
                                            0	          1	          2          # rank of item in the Top N Rec List
        bucket_idx	holdout_idx	reclist_idx			
        b0	        h0	        0	        B00PBW27VW  B00RDEZFN8	B009G3S0F4 # 1st list of rec items of '<file_name>_b0_h0.joblib'
                                1	        B00PBW27VW	B00RDEZFN8	B009G3S0F4
                    h1	        0	        B00SYTTBMC	B00KJJS6HQ	B001L0TTS2
                                1	        B00SYTTBMC	B00KJJS6HQ	B001L0TTS2
                                2	        B00SYTTBMC	B00KJJS6HQ	B001L0TTS2
                                3	        B00SYTTBMC	B00KJJS6HQ	<item_id>
        else: returns (one row per recommended item)
                                              item_id
        bucket_idx	holdout_idx	reclist_idx	
        b0	        h0	        0	            B001PZ06PS
                                0	            B001386NZE
                                0	            B00PJU8VFM
                                0	            B00PBW27VW
                                0	            B00RDEZFN8

        Note: the three index levels hold strings, so the final sort_index() orders
        reclist_idx lexicographically ('0', '1', '10', ..., '2').

        Raises ValueError when a recommended list holds more than top_n items.
    '''

    # every recommended list contributes 1 index entry in "rows" layout and
    # top_n entries (one per item) in the long layout
    entries_per_list = 1 if rec_list_as_rows else top_n

    loaded_blocks = []
    bucket_labels = []
    holdout_labels = []
    reclist_labels = []

    for bucket_i in range(n_holdouts):
        print(bucket_i)
        for holdout_j in range(n_holdouts):
            raw_reclists = joblib.load(filename+'_b'+str(bucket_i)+'_h'+str(holdout_j)+'.joblib')

            # object arrays start filled with None, so empty / short rec lists end up
            # padded with None and every file yields a regular (n_reclists, top_n, 2) block
            reclists_bi_hj = np.empty((len(raw_reclists), top_n, 2), dtype=object)
            for pos, rec_list in enumerate(raw_reclists):
                n_items = len(rec_list)
                if n_items > top_n:
                    raise ValueError('recommended list with '+str(n_items)+' items, expected at most '+str(top_n))
                if n_items:
                    reclists_bi_hj[pos, :n_items] = np.asarray(rec_list, dtype=object)

            n_reclists = len(reclists_bi_hj)
            n_entries = n_reclists*entries_per_list

            bucket_labels.extend(['b'+str(bucket_i)]*n_entries)
            holdout_labels.extend(['h'+str(holdout_j)]*n_entries)
            reclist_labels.extend(np.repeat(np.arange(n_reclists), entries_per_list).astype(str).tolist())

            loaded_blocks.append(reclists_bi_hj)

    # single concatenation after the loop instead of growing the array file by file
    if loaded_blocks:
        all_rec_lists = np.concatenate(loaded_blocks)
    else:
        all_rec_lists = np.empty((0, top_n, 2), dtype=object)

    # keep the item ids only (position 0 of each [item_id, score] pair)
    item_ids = all_rec_lists[:, :, 0]

    df_index = pd.MultiIndex.from_arrays([bucket_labels, holdout_labels, reclist_labels],
                                         names=['bucket_idx', 'holdout_idx', 'reclist_idx'])

    if rec_list_as_rows:
        df = pd.DataFrame(item_ids, index=df_index, columns=list(range(top_n)))

        print('how many rec lists are empty: (empty: True)')
        # a rec list is empty when all of its top_n positions are missing
        print(df.isna().all(axis=1).value_counts()/df.shape[0])
    else:
        df = pd.DataFrame(item_ids.reshape(-1), index=df_index, columns=['item_id'])

    df.sort_index(inplace=True)
    return df


In [None]:
def get_user_profile_div_frame(user_div_col:str, 
                               user_profile_size_col:str, 
                               model_data:ImplicitData, 
                               bucket_idx:str, 
                               meta_df:pd.DataFrame):
    '''
        Builds one row per user of `model_data` with the columns
        ['bucket_idx', 'user_id', <user_profile_size_col>, <user_div_col>].

        user_div_col: column of `meta_df` that is summed over the items of the user profile;
                      it is also the name of the resulting diversity column
        user_profile_size_col: name of the column holding the number of items in the profile
        model_data: object exposing `userset` and `GetUserItems(user, internal=False)`
        bucket_idx: label written in the 'bucket_idx' column of every row
        meta_df: item metadata, must contain 'item_id' and `user_div_col`

        Each row goes through np.array([...]) with mixed types, so numpy coerces the
        values of a row to a common type (strings when the ids/labels are strings).
    '''
    frame_cols = ['bucket_idx', 'user_id', user_profile_size_col, user_div_col]

    def _profile_row(user):
        # (bucket_idx, user, n_seen_items, sum=diversity)
        seen_items = model_data.GetUserItems(user, internal=False)
        is_seen = meta_df['item_id'].isin(seen_items)
        diversity = meta_df.loc[is_seen, user_div_col].sum()
        return np.array([bucket_idx, user, len(seen_items), diversity])

    # the empty object block keeps the result 2d (and dtype object) even without users
    stacked_rows = [np.empty((0, len(frame_cols)), dtype=object)]
    stacked_rows.extend(_profile_row(user) for user in model_data.userset)

    return pd.DataFrame(np.vstack(stacked_rows), columns=frame_cols)




def get_column_binned(df:pd.DataFrame, col_name:str, cut:bool=False):
    ''' 
        Adds a new column '<col_name>_binned' to the given dataframe, holding for each row
        the right edge of the bin its `col_name` value falls into.

        df: frame modified IN PLACE: `col_name` is cast to int and the binned column is added
        col_name: column to bin, must be castable to int
        cut: if True uses pd.cut (equal-width bins), otherwise pd.qcut (quantile bins)

        Binning is attempted with 4 bins, then 3 bins, then 4 bins dropping duplicated
        bin edges; the first attempt that does not raise ValueError is used.

        returns the name of the new column
        raises ValueError/TypeError when the column cannot be cast to int,
               ValueError when none of the binning attempts succeeds
    '''

    try:
        df[col_name] = df[col_name].astype(int)
    except (ValueError, TypeError):
        print('@get_column_binned(): Aborted! Needs to be a column dtype int!')
        raise

    binner = pd.cut if cut else pd.qcut
    # (number of bins, extra keyword arguments), tried in this order
    attempts = ((4, {}), (3, {}), (4, {'duplicates': 'drop'}))

    for attempt_idx, (n_bins, extra_kwargs) in enumerate(attempts):
        try:
            bin_codes, bin_edges = binner(df[col_name], n_bins, retbins=True, labels=False, **extra_kwargs)
            break
        except ValueError:
            # e.g. duplicated bin edges; only the last attempt is allowed to fail
            if attempt_idx == len(attempts) - 1:
                raise

    col_bin_name = col_name+'_binned'
    # bin_edges[0] is the lower bound of the first bin, so bin i ends at bin_edges[1:][i]
    df[col_bin_name] = bin_edges[1:][bin_codes.to_numpy()]

    return col_bin_name #, df




def get_avg_recdiv_per_bin(df:pd.DataFrame, rec_div_col:str, bin_col:str):
    '''
       get average diversity of the recommended lists in each bin

       df: frame with the columns 'bucket_idx', 'holdout_idx', `bin_col` and `rec_div_col`;
           `rec_div_col` is cast to int IN PLACE
       rec_div_col: column with the diversity of each recommended list
       bin_col: column with the bin each row belongs to

       returns (frame, column_name): the frame has one row per
               (bucket_idx, holdout_idx, bin) with the mean of `rec_div_col` stored in
               the column 'avg_<rec_div_col>_per_<bin_col>', whose name is returned as well
       raises ValueError/TypeError when `rec_div_col` cannot be cast to int
    '''
    try:
        df[rec_div_col] = df[rec_div_col].astype(int)
    except (ValueError, TypeError):
        print('@get_avg_recdiv_per_bin(): Aborted! Needs to be a column dtype int!')
        raise

    group_cols = ['bucket_idx', 'holdout_idx', bin_col]
    avg_recdiv_bin_col = 'avg_'+rec_div_col+'_per_'+bin_col

    avg_recdiv_per_bin = df.groupby(group_cols, as_index=False)[rec_div_col]\
                            .mean()\
                                .rename(columns={rec_div_col: avg_recdiv_bin_col})
    return avg_recdiv_per_bin, avg_recdiv_bin_col




def validate_folderpath(folderpath:str):
    '''
        Makes sure `folderpath` exists, creating it (and any missing parent) when needed.

        raises FileExistsError when `folderpath` exists but is not a directory
    '''
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(folderpath, exist_ok=True)


def save_plot(title:str, bi:int, hj:int, df:pd.DataFrame, x:str, y:str, path:str):
    '''
        Scatter plot of column `y` against column `x`, restricted to the rows of `df`
        that belong to bucket 'b<bi>' and holdout 'h<hj>', saved as a png.

        title: plot title, completed with '- B<bi>H<hj>'
        df: frame with the columns 'bucket_idx', 'holdout_idx', `x` and `y`
        path: prefix of the output file; the file is '<path><full title>.png'
    '''
    full_title = f'{title}- B{bi}H{hj}'

    in_bucket = df.bucket_idx == f'b{bi}'
    in_holdout = df.holdout_idx == f'h{hj}'
    selection = df.loc[in_bucket & in_holdout, [x, y]]

    ax = selection.plot(x=x, y=y, kind='scatter', marker='+', title=full_title)

    fig = ax.get_figure()
    fig.savefig(path+full_title+'.png')
    # close the figure so the figures do not pile up when called in a loop
    plt.close(fig)
    

# variables

## model name

In [None]:
# Name of the model under analysis; used below as the sub-folder of the
# diversity file/graph paths.
MODEL_NAME = 'UKNN'

In [None]:
# --- dataset identification ---
dataset_folderpath = '../datasets/goodreads/'
dataset_name = 'Goodreads'
filename = 'inter_dedup_coldstart_3stars_4x714k'

# --- sampled period: [first month, last month] as 'YYYY-MM' strings ---
period = ['2012-07', '2015-01']
sample = 'sample_{}_until_{}'.format(*period)
sample_year_month = [(month, '%Y-%m') for month in period]

# --- experiment settings ---
n_holdouts = 5
interval_type = 'S'
drop_user_empty_profiles = False

# --- folders ---
# get_folderpaths returns 5 paths; only the base output path is needed from the dataset-level dump folder
dump_foldername = 'goodreads_dump/'
_, base_outputpath, _, _, _ = get_folderpaths(dump_foldername)

# rule: what/which_data_set/sample_version/what/
sample_version_dump_foldername = f'goodreads_dump/pos_rates_only/{sample}/'
images_path, output_path, heatmaps_path, diversity_graphpath, diversity_filepath = get_folderpaths(sample_version_dump_foldername)

# model specific prefixes: '<folder><MODEL_NAME>/<filename>_'
model_diversity_filepath = f'{diversity_filepath}{MODEL_NAME}/{filename}_'
model_diversity_graphpath = f'{diversity_graphpath}{MODEL_NAME}/{filename}_'

# Meta DF

In [None]:
# Column naming: `div_col` is the base name of the diversity measure and
# `metainfo_col` the name given to the metadata column.
div_col = 'n_genre' # TODO: UNCOMMENT
metainfo_col = 'genres'
item_div_col = 'item_'+div_col
# Item metadata table. The columns are renamed positionally right below, so the CSV
# is assumed to hold exactly three columns besides the index column -- TODO confirm.
item_meta_onecol_df = pd.read_csv(base_outputpath+'item_meta_onecol_df.csv', index_col=0)
item_meta_onecol_df.columns = ['item_id', metainfo_col, item_div_col]
# item_meta_onecol_df.head()

# Work on a copy so the frame as loaded stays untouched.
meta_df = item_meta_onecol_df.copy()
meta_df.head()

# STAGE: load all recommended lists to frame

In [None]:
#########################################################################################
print('STAGE: load all recommended lists to frame')
# Long layout (rec_list_as_rows defaults to False): a single 'item_id' column indexed by
# (bucket_idx, holdout_idx, reclist_idx).
all_reclist_df = read_all_reclists_to_frame(filename=model_diversity_filepath+'rec_lists',
                                            n_holdouts=n_holdouts)

# Sanity check: every distinct recommended item id should also be present in meta_df.
print('does the meta dataframe covers all items recommended?', len(set(meta_df.item_id.unique()).intersection(all_reclist_df.item_id.unique())) == all_reclist_df.item_id.nunique())

# number of distinct (bucket_idx, holdout_idx, reclist_idx) index entries
total_n_reclists = len(all_reclist_df.index.drop_duplicates())
# n_holdouts = len(all_reclist_df.index.levels[0])

print('Create meta_recommendations frame')
# Left merge keeps every recommended item. No `on=` is given, so pandas joins on the
# columns both frames share (presumably only 'item_id' -- verify).
meta_rec_df = pd.merge(all_reclist_df.reset_index(), 
                    meta_df, 
                    how='left')


print('fill in Na values')
# Placeholders: a missing item id / metadata value becomes the string '0',
# a missing item diversity becomes the number 0.
meta_rec_df.loc[meta_rec_df['item_id'].isna(), 'item_id'] = '0'
meta_rec_df.loc[meta_rec_df[item_div_col].isna(), item_div_col] = 0
meta_rec_df.loc[meta_rec_df[metainfo_col].isna(), [metainfo_col]] = '0'

print('scale diversity column between 0 and 1')
item_div_col_minmax = item_div_col+'_minmax'
# the scaler is fit on the whole column, i.e. over all buckets and holdouts at once
scaler = MinMaxScaler(feature_range=(0,1))
meta_rec_df[item_div_col_minmax] = scaler.fit_transform(meta_rec_df[[item_div_col]])

# bc we have all values in one df it won't make a difference
# item_div_col_perc = item_div_col+'_perc'
# meta_rec_df[item_div_col_perc] = meta_rec_df[item_div_col]/meta_rec_df[item_div_col].max()

# NOTE(review): the dump path is built from diversity_filepath, without MODEL_NAME, so
# running the notebook for another model writes to the same file -- confirm this is intended.
joblib.dump(meta_rec_df, diversity_filepath+filename+'_meta_rec_df.joblib')

In [None]:
# preview of the recommendations frame merged with the item metadata
meta_rec_df.head()

# STAGE: calculate the recommended lists diversity 
(aka intra list diversity)

In [None]:
# Base name of the "unique count" diversity columns; the "repeat" variant is disabled.
# repeat_div_col = 'repeat_'+div_col
unique_div_col = 'unique_'+div_col

## unique count

In [None]:
#########################################################################################
print('STAGE: calculate the recommended lists diversity aka intra list diversity')
print('Note: this is a 1st approach to measuring diveristy, We are only counting the number of different values  ')

# (disabled) "repeat" variant:
# this df will sum all genres occurences in the items, considering repeated genres
# _ = meta_rec_df[['bucket_idx','holdout_idx','reclist_idx','item_id' ,item_div_col]].drop_duplicates()
# reclist_repeatdiv_df = _[['bucket_idx','holdout_idx','reclist_idx',item_div_col]]\
#                 .groupby(['bucket_idx','holdout_idx','reclist_idx'])\
#                     .sum().reset_index()
# rec_repdiv_col = 'intra_list_'+repeat_div_col
# reclist_repeatdiv_df.columns = ['bucket_idx','holdout_idx','reclist_idx', rec_repdiv_col]


# this df will sum all different genres occurences in the list, NOT considering repeated genres
# What is counted is the number of distinct `metainfo_col` values per recommended list
# (duplicates are dropped before the count).
# NOTE(review): missing metadata was replaced by the placeholder '0' in the loading stage,
# so the placeholder counts as one distinct value here -- confirm this is intended.
reclist_uniquediv_df = meta_rec_df[['bucket_idx','holdout_idx','reclist_idx',metainfo_col]]\
            .drop_duplicates()\
                .groupby(['bucket_idx','holdout_idx','reclist_idx'])\
                    .count().reset_index()
rec_uniqdiv_col = 'intra_list_'+unique_div_col
reclist_uniquediv_df.columns = ['bucket_idx','holdout_idx','reclist_idx', rec_uniqdiv_col]


# reclist_div_df = pd.merge(reclist_repeatdiv_df, reclist_uniquediv_df)
# same object as reclist_uniquediv_df (no copy): the columns added below land on both names
reclist_div_df = reclist_uniquediv_df

print('scale intra list diversity column between 0 and 1')
# rec_repdiv_col_minmax = rec_repdiv_col+'_minmax'
# scaler = MinMaxScaler(feature_range=(0,1))
# reclist_div_df[rec_repdiv_col_minmax] = scaler.fit_transform(reclist_div_df[[rec_repdiv_col]])


rec_uniqdiv_col_minmax = rec_uniqdiv_col+'_minmax'
# the scaler is fit over the recommended lists of all buckets and holdouts together
scaler = MinMaxScaler(feature_range=(0,1))
reclist_div_df[rec_uniqdiv_col_minmax] = scaler.fit_transform(reclist_div_df[[rec_uniqdiv_col]])

joblib.dump(reclist_div_df, diversity_filepath+filename+'_reclist_div_df.joblib')

In [None]:
# preview of the per-list diversity frame
reclist_div_df.head()

# STAGE: create diversity df

In [None]:
def get_unique_diversity_count(df:pd.DataFrame, user_list:list, col:str):
    '''
        Number of distinct (non missing) values of `col` among the rows of `df`
        whose 'item_id' is in `user_list`.

        df: item metadata, must contain 'item_id' and `col`
        user_list: item ids (e.g. the items in a user profile)
        returns an int, 0 when no row matches
    '''
    in_user_list = df['item_id'].isin(user_list)
    # Series.nunique already ignores duplicates and NaN; no positional [0] lookup needed
    return df.loc[in_user_list, col].nunique()
    
def get_repeat_diversity_count(df:pd.DataFrame, user_list:list, col:str):
    '''
        Sum of `col` over the items of `user_list`, every item being counted once
        (values repeated across different items are all added).

        df: item metadata, must contain 'item_id' and `col`
        user_list: item ids (e.g. the items in a user profile)
        returns the sum, 0 when no row matches
    '''
    in_user_list = df['item_id'].isin(user_list)
    # de-duplicate by item, not by value: two items with the same count must both be added
    per_item = df.loc[in_user_list, ['item_id', col]].drop_duplicates(subset='item_id')
    return per_item[col].sum()


def count_diversity_df_NA(div_df:pd.DataFrame, rec_div_col:str, rec_div_col_minmax:str, user_profile_size_col:str, user_div_col:str, drop_user_empty_profiles:bool):
    '''
        Prints the number of missing values of the four given columns, handles the
        missing values, prints the counts again and returns the resulting frame.

        drop_user_empty_profiles:
            True  -> rows without a value in `user_profile_size_col` are filtered out
                     (a new frame is returned, `div_df` itself is left as it was)
            False -> missing values of the four columns are replaced by the string '0',
                     directly on `div_df`
    '''
    reported_cols = (
        ('rec_div_col', rec_div_col),
        ('rec_div_col_minmax', rec_div_col_minmax),
        ('user_profile_size_col', user_profile_size_col),
        ('user_div_col', user_div_col),
    )

    def _print_na_counts(frame):
        for label, column in reported_cols:
            print(label+' ('+str(column)+') ', frame[column].isna().sum())

    print('\nNA count')
    _print_na_counts(div_df)

    if drop_user_empty_profiles:
        has_profile = div_df[user_profile_size_col].notna()
        div_df = div_df.loc[has_profile, :]
    else:
        for _, column in reported_cols:
            is_missing = div_df[column].isna()
            div_df.loc[is_missing, column] = '0'

    print('\n')
    _print_na_counts(div_df)

    return div_df


In [None]:
# 165min
#########################################################################################
print('STAGE: create diversity df')

# sort the frame, so the user in the holdout paired with the respective recommended list
# (so the order of the holdouts user list is compatible with it)
# reclist_idx is cast to int first, otherwise the sort would be lexicographic ('10' < '2')
# NOTE: this moves the three columns into the index, so the cell cannot be re-run as is
reclist_div_df['reclist_idx'] = reclist_div_df['reclist_idx'].astype(int)
reclist_div_df.set_index(['bucket_idx','holdout_idx','reclist_idx'], inplace=True)
reclist_div_df.sort_index(inplace=True)



# user_repdiv_col = 'user_'+repeat_div_col
# user_profile_size_col = 'n_seen_items'
# user_profile_cols = ['bucket_idx', 'user_id', user_profile_size_col, user_repdiv_col]
# user_profile_div = np.empty((0, len(user_profile_cols)), dtype=object)

user_uniqdiv_col = 'user_'+unique_div_col
user_profile_size_col = 'n_seen_items'
user_profile_cols = ['bucket_idx', 'user_id', user_profile_size_col, user_uniqdiv_col]
# one row per (bucket, user); the rows are collected in a list and stacked once after the
# loop, since stacking inside the loop copies the whole array for every user
user_profile_rows = [np.empty((0, len(user_profile_cols)), dtype=object)]


# index slice to access multi-index
islice = pd.IndexSlice

# initialize user id column
reclist_div_df['user_id'] = None

holdouts = joblib.load(output_path+filename+'_semesterly_holdouts.joblib')
for hj, holdoutj in enumerate(holdouts): 

    # load user profile @holdout_j -> read model that has seen all *buckets* until *jth holdout*, aka bj
    model_data_bucketj = joblib.load(model_diversity_filepath+'model_data_b'+str(hj)+'.joblib')
    print('\tcalculate user diversity @'+str(hj))
    for u in (model_data_bucketj.userset):
        # df (bucket_idx, u, n_seen_items, sum=diversity)
        user_profile = model_data_bucketj.GetUserItems(u, internal=False)

        div = get_unique_diversity_count(meta_df, user_profile, metainfo_col)
        # div = get_repeat_diversity_count(meta_df, user_profile, item_div_col)

        # np.array on mixed types coerces the row to a common type (strings for string ids)
        user_profile_rows.append(np.array(['b'+str(hj), u, len(user_profile), div]))

    print('\tadd user to the respective rec lists @'+str(hj))
    for bi in range(n_holdouts):
        # the rec lists of holdout hj are assumed to be in the same order as its user list
        reclist_div_df.loc[islice['b'+str(bi), 'h'+str(hj), :], 'user_id'] = list(holdoutj.userlist)


# merge the user profile info to the diversity df
user_profile_div = np.vstack(user_profile_rows)
user_profile_div_df = pd.DataFrame(user_profile_div, columns=user_profile_cols)
# the columns shared by both frames are spelled out as the merge keys
diversity_df = pd.merge(reclist_div_df.reset_index(),
                        user_profile_div_df,
                        how='left',
                        on=['bucket_idx', 'user_id'])


diversity_df = count_diversity_df_NA(diversity_df, 
                                     rec_uniqdiv_col, 
                                     rec_uniqdiv_col_minmax, 
                                     user_profile_size_col, 
                                     user_uniqdiv_col, 
                                     False)

# save the merged frame (the '_diversity_df' file used to receive reclist_div_df instead)
joblib.dump(diversity_df, diversity_filepath+filename+'_diversity_df.joblib')

In [None]:
# preview of the diversity frame (rec list diversity + user profile info)
diversity_df.head()

# STAGE: bin columns

# unique

In [None]:
#########################################################################################
print('STAGE: bin columns - Qcut')

# qcut function requires an int type column, binning the standardized column will put all values in the 0 bin

# USER PROFILE
# adds '<user_profile_size_col>_binned' to diversity_df (in place) and returns that name
user_profile_size_bin_col = get_column_binned(diversity_df, user_profile_size_col)

# min max
# _, avg_recrepdiv_profilebin_col_minmax = get_avg_recdiv_per_bin(diversity_df,
#                                                              rec_repdiv_col_minmax, 
#                                                              user_profile_size_bin_col)
# diversity_df = pd.merge(diversity_df, _)

# not scaled
# NOTE: the variable names say "rep" (repeat) but the column averaged is the
# *unique* count diversity (rec_uniqdiv_col)
_, avg_recrepdiv_profilebin_col = get_avg_recdiv_per_bin(diversity_df,
                                                      rec_uniqdiv_col, 
                                                      user_profile_size_bin_col)
# no `on=` given: the per-bin averages are joined on the columns both frames share
diversity_df = pd.merge(diversity_df, _)




# USER DIVERSITY
# same two steps, binning by the diversity of the user profile
user_repdiv_bin_col = get_column_binned(diversity_df, user_uniqdiv_col)

# minmax
# _, avg_recrepdiv_userrepdivbin_col_minmax = get_avg_recdiv_per_bin(diversity_df,
#                                                                 rec_repdiv_col_minmax, 
#                                                                 user_repdiv_bin_col)
# diversity_df = pd.merge(diversity_df, _)

# not scaled
_, avg_recrepdiv_userrepdivbin_col = get_avg_recdiv_per_bin(diversity_df,
                                                        rec_uniqdiv_col, 
                                                        user_repdiv_bin_col)
diversity_df = pd.merge(diversity_df, _)


joblib.dump(diversity_df, diversity_filepath+filename+'_diversity_df+binned_cols.joblib')

In [None]:
# preview of the diversity frame with the binned columns and the per-bin averages
diversity_df.head()

# STAGE: plot diversity values

## unique

In [None]:
# shows the path prefix that the plots of the next cell are saved under
model_diversity_graphpath

In [None]:
#########################################################################################
print('STAGE: plot diversity values')

# model_diversity_graphpath is a file-name prefix ('<folder>/<MODEL_NAME>/<filename>_'):
# make sure its folder exists before the first figure is saved
os.makedirs(os.path.dirname(model_diversity_graphpath), exist_ok=True)

# The standardized (min-max) variants of both plots are not produced: the *_minmax
# per-bin averages are disabled in the binning stage.
for bi in range(n_holdouts):
    for hj in range(n_holdouts):
        # x: profile size bin -> y: average computed per profile size bin
        save_plot(title='Profile size vs Avg intra-list diversity',
                  bi=bi,
                  hj=hj,
                  df=diversity_df,
                  x=user_profile_size_bin_col,
                  y=avg_recrepdiv_profilebin_col,
                  path=model_diversity_graphpath)

        # x: user diversity bin -> y: average computed per user diversity bin
        save_plot(title='User diversity vs Avg intra-list diversity',
                  bi=bi,
                  hj=hj,
                  df=diversity_df,
                  x=user_repdiv_bin_col,
                  y=avg_recrepdiv_userrepdivbin_col,
                  path=model_diversity_graphpath)