In [1]:
from datetime import datetime
import settings.config as cfg
import pandas as pd
import numpy as np


# --- Settings pulled from settings/config.py into notebook-level names ---
# Not every alias is used by the cells shown below; they are kept so that the
# notebook namespace stays the same.

# Folder holding ratings.csv, group_composition.pkl and the per-fold sub-folders.
preprocessed_dataset_folder = cfg.preprocessed_dataset_folder

# Group-related settings.
group_types = cfg.group_types
aggregation_strategies = cfg.aggregation_strategies
group_rs_evaluation_folds_k = cfg.group_rs_evaluation_folds_k

# Individual-recommender settings.
individual_rs_strategy = cfg.individual_rs_strategy
individual_rs_validation_folds_k = cfg.individual_rs_validation_folds_k

# Evaluation settings.
recommendations_number = cfg.recommendations_number
evaluation_strategy = cfg.evaluation_strategy
metrics = cfg.metrics

# Show which raw and preprocessed dataset folders this run points at.
display(cfg.dataset_folder, preprocessed_dataset_folder)

'ml-1m'

'preprocessed_dataset'

In [2]:
import pandas as pd
import pickle

# Ratings table produced by the preprocessing step.
ratings_df = pd.read_csv(preprocessed_dataset_folder+"/ratings.csv")

# Group definitions: a dict keyed by group index whose values carry (at least)
# "group_members", "group_size" and "group_similarity".
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE: pickle can execute arbitrary code on load; only load files produced by
# this project's own pipeline.
with open(preprocessed_dataset_folder+"/group_composition.pkl", "rb") as group_file:
    group_composition = pickle.load(group_file)
len(group_composition)

240

# Train individual RS, Prepare groundtruth, Construct group recs.
- It is expected that the individual RSs are already trained and stored as pickle (`.pkl`) files.
- It is expected that the group recommendations are already generated and stored as pickle (`.pkl`) files.

# Evaluate GRS for Individualists vs. Collectivist tendencies
- An individualist group RS will more often recommend items from the individual members' top-k lists (ranked by the individual RS's predicted ratings).
- Evaluated with hit rate, relative Borda and twin DCG (Borda and DCG give less weight to items at lower ranks).


In [47]:
import math
def jaccard_sim(a,b):
    """Return the Jaccard similarity of two sets: |a & b| / |a | b|.

    Parameters
    ----------
    a, b : set
        The two item sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. When both sets are empty the union is empty too;
        this is reported as 0.0 (no shared items) instead of raising
        ZeroDivisionError.
    """
    union_size = len(a.union(b))
    if union_size == 0:
        return 0.0
    return len(a.intersection(b)) / union_size


def _top_k_profile(test_pred_df, user, topk):
    """Build the rank-weighted top-k list of a single user.

    Returns a tuple ``(items, weighted, idcg, i_borda)``:
    the set of the user's top-k items, the top-k rows indexed by item with
    ``weight`` (1/log2(rank+2)) and ``weight_borda`` ((topk-rank)/topk)
    columns, and the two normalisers used by the twin-DCG and relative-Borda
    scores.
    """
    top_rows = (
        test_pred_df.loc[test_pred_df.user == user]
        .sort_values("predicted_rating", ascending=False)
        .iloc[:topk]
    )
    ranks = range(len(top_rows))
    dcg_weights = [1/math.log2(i+2) for i in ranks]
    borda_weights = [(topk-i)/topk for i in ranks]

    # `1/math.log2(i+2)**2` parses as 1/(log2(i+2)**2): the squared DCG weight,
    # i.e. the score obtained when the group list equals the user's list.
    idcg = np.array([1/math.log2(i+2)**2 for i in ranks]).sum()
    # NOTE(review): unlike idcg, this normaliser is the plain (unsquared) sum of
    # Borda weights, so a perfectly matching list scores below 1 - confirm that
    # this is intended.
    i_borda = np.array(borda_weights).sum()

    items = set(top_rows["item"].values)
    # .assign works on a new frame, so no column is written onto a slice of
    # test_pred_df (the original in-place assignment raised
    # SettingWithCopyWarning).
    weighted = top_rows.assign(weight=dcg_weights, weight_borda=borda_weights).set_index("item")
    return items, weighted, idcg, i_borda


def process_ind_vs_collect(group_composition,group_recommendations,test_pred_df,topk=20):
    """Score how closely each group recommendation follows its members' own top-k lists.

    For every group, every group-recommendation algorithm and every group
    member, the member's top-``topk`` items (by ``predicted_rating``) are
    compared with the group list using three scores:

    * "hit rate": Jaccard similarity between the two item sets
      (via ``jaccard_sim``);
    * twin DCG: sum over shared items of the product of both lists'
      1/log2(rank+2) weights, divided by the member's ideal value;
    * relative Borda: the same with (topk-rank)/topk weights.

    Parameters
    ----------
    group_composition : dict
        Maps an integer group index to a dict that has a "group_members" entry.
    group_recommendations : dict
        Maps the same group index to ``{algorithm id: ranked list of items}``.
    test_pred_df : pandas.DataFrame
        Needs the columns "user", "item" and "predicted_rating".
    topk : int, default 20
        Length of the individual lists and scale of the Borda weights.

    Returns
    -------
    tuple
        ``(hitRate, hitRateAVG, twinDCG, twinDCGAVG, relativeBorda,
        relativeBordaAVG)``. Every element is a dict indexed as
        ``[group index][algorithm id]``; the plain variants hold one value per
        member (in "group_members" order), the ``AVG`` variants their mean.

    Notes
    -----
    * Prints a progress line for every group index divisible by 50.
    * A member without any prediction scores 0.0 for twin DCG and relative
      Borda (instead of 0/0 = NaN, which would turn the whole group average
      into NaN).
    * A group without members keeps averages of 0.0 instead of raising
      ZeroDivisionError.
    """
    hitRate = {}
    twinDCG = {}
    relativeBorda = {}
    hitRateAVG = {}
    twinDCGAVG = {}
    relativeBordaAVG = {}

    # A member's top-k profile depends only on (test_pred_df, user, topk), so it
    # is computed once and reused for every algorithm and every group.
    profiles = {}

    for (idx, g) in group_composition.items():
        if idx % 50 == 0:
            print("done "+str(idx))

        members = g["group_members"]
        n_members = len(members)
        groupRec = group_recommendations[idx]

        hitRate[idx] = {}
        twinDCG[idx] = {}
        relativeBorda[idx] = {}
        hitRateAVG[idx] = {}
        twinDCGAVG[idx] = {}
        relativeBordaAVG[idx] = {}

        for (idAlg, groupAlg) in groupRec.items():
            groupItems = set(groupAlg)
            # Rank weights of the group list, indexed by item for the join below.
            groupAlgDF = pd.DataFrame({
                "items_g": groupAlg,
                "weight_g": [1/math.log2(i+2) for i in range(len(groupAlg))],
                "weight_borda_g": [(topk-i)/topk for i in range(len(groupAlg))],
            }).set_index("items_g")

            overlaps = []
            dcgScores = []
            bordaScores = []

            for gm in members:
                if gm not in profiles:
                    profiles[gm] = _top_k_profile(test_pred_df, gm, topk)
                userItems, userData, idcg, iBorda = profiles[gm]

                # Compare the group list against this member's individual list.
                overlaps.append(jaccard_sim(userItems, groupItems))

                # Inner join keeps only the items present in both lists.
                shared = userData.join(groupAlgDF, how="inner", rsuffix="_r")
                dcgRaw = (shared["weight"] * shared["weight_g"]).sum()
                bordaRaw = (shared["weight_borda"] * shared["weight_borda_g"]).sum()

                # The normalisers are 0 only when the member has no predictions.
                dcgScores.append(dcgRaw / idcg if idcg > 0 else 0.0)
                bordaScores.append(bordaRaw / iBorda if iBorda > 0 else 0.0)

            hitRate[idx][idAlg] = overlaps
            twinDCG[idx][idAlg] = dcgScores
            relativeBorda[idx][idAlg] = bordaScores

            if n_members:
                hitRateAVG[idx][idAlg] = sum(overlaps, 0.0) / n_members
                twinDCGAVG[idx][idAlg] = sum(dcgScores, 0.0) / n_members
                relativeBordaAVG[idx][idAlg] = sum(bordaScores, 0.0) / n_members
            else:
                hitRateAVG[idx][idAlg] = 0.0
                twinDCGAVG[idx][idAlg] = 0.0
                relativeBordaAVG[idx][idAlg] = 0.0

    return (hitRate,hitRateAVG,twinDCG,twinDCGAVG,relativeBorda,relativeBordaAVG)

In [48]:

import os
import warnings

# NOTE(review): this silences *every* warning for the rest of the kernel
# session, not only the ones raised in this cell.
warnings.filterwarnings('ignore')

# res[fold name][recommender name] -> results tuple (see below).
res = {}

# Individual recommenders whose outputs are evaluated; the same for every fold.
recommenders = ["LENSKIT_ALS","LENSKIT_CF_USER", "LENSKIT_CF_ITEM"]

lst = os.listdir(preprocessed_dataset_folder)
# os.listdir returns entries in arbitrary order; sorting makes the fold that is
# processed first (and hence the one picked up by the `break` below)
# reproducible across machines.
folds = sorted(i for i in lst if (os.path.isdir(preprocessed_dataset_folder+"/"+i) and i.startswith("fold")))

for f in folds:
    current_fold = int(f.replace("fold_",""))
    path_to_fold = preprocessed_dataset_folder+"/"+f
    res[f] = {}
    for r in recommenders:
        print(f,r)

        path_to_recommender = path_to_fold + "/" +r
        # Files are opened in `with` blocks so that handles are closed (and the
        # written results flushed) deterministically.
        # NOTE: pickle can execute arbitrary code on load; only load files
        # produced by this project's own pipeline.
        with open(path_to_recommender+"/test_pred_df.pkl", "rb") as fh:
            test_pred_df = pickle.load(fh)
        with open(path_to_recommender+"/group_recommendations.pkl", "rb") as fh:
            group_recommendations = pickle.load(fh)

        (hitRate,hitRateAVG,twinDCG,twinDCGAVG,relativeBorda,relativeBordaAVG) = process_ind_vs_collect(group_composition,group_recommendations,test_pred_df)

        # Group size and group type, in group_composition order, so that they
        # line up with the rows of the per-group result dicts.
        gs = [g["group_size"] for g in group_composition.values()]
        gt = [g["group_similarity"] for g in group_composition.values()]

        # Tuple layout: 0 hitRate, 1 hitRateAVG, 2 twinDCG, 3 twinDCGAVG,
        # 4 relativeBorda, 5 relativeBordaAVG, 6 group sizes, 7 group types.
        results = (hitRate,hitRateAVG,twinDCG,twinDCGAVG,relativeBorda,relativeBordaAVG,gs,gt)
        with open(path_to_recommender+"/results.pkl", "wb") as fh:
            pickle.dump(results, fh)
        res[f][r] = results
    # NOTE(review): only the first fold is evaluated. Later cells read `res[f]`
    # through the loop variable `f` left behind here; remove this `break` to
    # evaluate every fold.
    break
        

fold_0 LENSKIT_ALS
done 0
done 50
done 100
done 150
done 200
fold_0 LENSKIT_CF_USER
done 0
done 50
done 100
done 150
done 200
fold_0 LENSKIT_CF_ITEM
done 0
done 50
done 100
done 150
done 200


In [None]:
#TODO: make it relative w.r.t. similarity of user-pairs in the RS?

In [67]:
# Pick the LENSKIT_ALS results of the fold still referenced by `f` (the loop
# variable left over from the evaluation cell) and keep the per-group averages
# together with the group size / group type lists.
# Tuple positions: 1 hitRateAVG, 3 twinDCGAVG, 5 relativeBordaAVG, 6 sizes, 7 types.
als_results = res[f]["LENSKIT_ALS"]
hitRateAVG, twinDCGAVG, relativeBordaAVG = als_results[1], als_results[3], als_results[5]
gs, gt = als_results[6], als_results[7]

In [68]:
# One row per group, one column per group-recommendation algorithm.
hitRateDF = pd.DataFrame(hitRateAVG).T
hitRateDF["group_size"] = gs
hitRateDF["group_type"] = gt
# Mean hit rate over groups of 8 of type "similar_one_divergent".
# numeric_only=True skips the string column group_type; without it
# pandas >= 2.0 raises TypeError instead of silently dropping the column.
hitRateDF.loc[((hitRateDF.group_size==8)&(hitRateDF.group_type=="similar_one_divergent"))].mean(numeric_only=True)
#hitRateDF.mean(numeric_only=True)

ADD           0.020115
MUL           0.016931
LMS           0.013526
MPL           0.018678
GFAR          0.016754
EPFuzzDA      0.016950
group_size    8.000000
dtype: float64

In [69]:
# One row per group, one column per group-recommendation algorithm.
twinDCGDF = pd.DataFrame(twinDCGAVG).T
twinDCGDF["group_size"] = gs
twinDCGDF["group_type"] = gt
# Mean twin DCG over groups of 8 of type "divergent".
# numeric_only=True skips the string column group_type; without it
# pandas >= 2.0 raises TypeError instead of silently dropping the column.
twinDCGDF.loc[((twinDCGDF.group_size==8)&(twinDCGDF.group_type=="divergent"))].mean(numeric_only=True)
#twinDCGDF.mean(numeric_only=True)

ADD           0.034500
MUL           0.032967
LMS           0.023692
MPL           0.033260
GFAR          0.021221
EPFuzzDA      0.030792
group_size    8.000000
dtype: float64

In [70]:
# One row per group, one column per group-recommendation algorithm.
relativeBordaDF = pd.DataFrame(relativeBordaAVG).T
relativeBordaDF["group_size"] = gs
relativeBordaDF["group_type"] = gt
# Mean relative Borda over all groups of 8 (any group type).
# numeric_only=True skips the string column group_type; without it
# pandas >= 2.0 raises TypeError instead of silently dropping the column.
relativeBordaDF.loc[relativeBordaDF.group_size==8].mean(numeric_only=True)
#relativeBordaDF.mean(numeric_only=True)

ADD           0.023905
MUL           0.021156
LMS           0.014118
MPL           0.021602
GFAR          0.015783
EPFuzzDA      0.019940
group_size    8.000000
dtype: float64