In [1]:
from pathlib import Path
import sys, os
from typing import Dict, List, NamedTuple, Tuple
import numpy as np
import pandas as pd
from collections import defaultdict
import time
from scipy import sparse



In [2]:
def get_folds(data_dir: str) -> List[Tuple[int, str]]:
    """Discover the fold directories stored directly inside ``data_dir``.

    A sub-directory counts as a fold when its name consists solely of decimal
    digits (e.g. ``1``, ``2``, ``10``); files and other directories are ignored.

    Args:
        data_dir: Directory that contains one sub-directory per fold.

    Returns:
        ``(fold_number, fold_directory_path)`` tuples sorted by fold number.
    """
    folds = []
    for entry in Path(data_dir).iterdir():
        # isdecimal() only accepts characters that int() can parse, whereas
        # isnumeric() also accepts e.g. "²" or "½", which make int() raise.
        if entry.is_dir() and entry.name.isdecimal():
            folds.append((int(entry.name), str(entry)))
    folds.sort()
    return folds


In [3]:
# Legacy loader: returns the 2-D numpy array stored for a fold; the first index
# is the userId, the second index is the itemId, and the values are float ratings.
def load_data_old(data_dir: str, fold: int) -> np.ndarray:
    """Load ``<data_dir>/<fold>/mf_data.npy`` and return it unchanged.

    Unlike ``load_data`` this does not look at ``test.csv`` at all.
    """
    matrix_path = os.path.join(data_dir, str(fold), "mf_data.npy")
    return np.load(matrix_path)

def _load_binarized_ratings(data_dir: str, fold: int, csv_name: str, threshold: float = 4) -> np.ndarray:
    """Build a dense 0/1 relevance matrix from one rating CSV of a fold.

    The CSV is read without a header as ``uid,oid,val`` triplets. The result
    has the same shape as ``<data_dir>/<fold>/mf_data.npy``; an entry is 1 when
    the rating is ``>= threshold`` and 0 otherwise (including every user/item
    pair that does not occur in the CSV).

    ``threshold`` is expected to be positive: pairs missing from the CSV hold 0
    and must not pass the comparison.
    """
    fold_dir = os.path.join(data_dir, str(fold))
    # Only the shape of the full matrix is needed, so memory-map the file
    # instead of reading all of it.
    shape = np.load(os.path.join(fold_dir, "mf_data.npy"), mmap_mode="r").shape
    ratings = pd.read_csv(os.path.join(fold_dir, csv_name), sep=",", names=["uid", "oid", "val"])
    # NOTE: coo_matrix sums the values of repeated (uid, oid) pairs when it is
    # densified, so the CSV is expected to hold each pair at most once.
    dense = sparse.coo_matrix(
        (ratings.val.tolist(), (ratings.uid.tolist(), ratings.oid.tolist())),
        shape=shape,
    ).toarray()
    return (dense >= threshold).astype(dense.dtype)


def load_data(data_dir: str, fold: int, threshold: float = 4) -> np.ndarray:
    """Return the binarized user x item relevance matrix built from ``test.csv``.

    Args:
        data_dir: Dataset root containing one sub-directory per fold.
        fold: Fold number (name of the sub-directory).
        threshold: Minimum rating that counts as relevant (default 4).

    Returns:
        2-D array indexed as ``[userId, itemId]`` holding 1 for relevant
        test ratings and 0 everywhere else.
    """
    return _load_binarized_ratings(data_dir, fold, "test.csv", threshold)


def load_train_data(data_dir: str, fold: int, threshold: float = 4) -> np.ndarray:
    """Return the binarized user x item relevance matrix built from ``train.csv``.

    Same contract as ``load_data`` but reads the training split of the fold.
    """
    return _load_binarized_ratings(data_dir, fold, "train.csv", threshold)

class Group(NamedTuple):
    """A group of users: the group's identifier plus the ids of its members."""
    # identifier of the group (first column of a line in the group file)
    id: int
    # user ids of the group members (remaining columns of that line)
    members: List[int]
    
class GroupWeights(NamedTuple):
    """A list of float values attached to a group id.

    NOTE(review): presumably one weight per group member, parallel to
    ``Group.members``; nothing visible in this notebook constructs or reads
    this type, so confirm before relying on that.
    """
    # identifier of the group
    id: int
    # float values for the group (presumably per-member weights - TODO confirm)
    members: List[float]

In [4]:
def calculate_inverse_propensity_score(train_data, propensity_gama):
    """Compute a popularity-based propensity for every item.

    For item ``i`` the propensity is ``n_i ** ((propensity_gama + 1) / 2)``,
    where ``n_i`` is the sum of column ``i`` of ``train_data``. A propensity of
    exactly 0 is replaced by 1.0 so that callers can safely divide by it.

    Returns:
        dict mapping item index -> propensity (the value itself, not its inverse).
    """
    exponent = (propensity_gama + 1) / 2
    propensity_by_item = {}
    for item_idx in range(train_data.shape[1]):
        column_total = train_data[:, item_idx].sum()
        score = column_total ** exponent
        propensity_by_item[item_idx] = 1.0 if score == 0.0 else score
    return propensity_by_item

In [5]:
def calculate_per_user_normalization(test_data, propensities):
    """Compute a per-user normalization factor from inverse item propensities.

    For every user (row of ``test_data``) the inverse propensities of the items
    with a non-zero entry are summed; the factor is the reciprocal of that sum.
    Users without any non-zero entry get the factor 1.0.

    Returns:
        dict mapping user index -> normalization factor.
    """
    normalization_by_user = {}
    for user_idx in range(test_data.shape[0]):
        liked_items = np.nonzero(test_data[user_idx])[0]
        liked_propensities = np.array([propensities[item] for item in liked_items])
        inverse_total = (1 / liked_propensities).sum()
        # An empty sum would otherwise lead to a division by zero below.
        denominator = 1.0 if inverse_total == 0.0 else inverse_total
        normalization_by_user[user_idx] = 1 / denominator
    return normalization_by_user

In [6]:
# Ad-hoc sanity check on fold 1 of data/ml1m: load the binarized test matrix,
# the legacy full matrix and the binarized train matrix ...
newData = load_data("data/ml1m", 1)
oldData = load_data_old("data/ml1m", 1)
trainData = load_train_data("data/ml1m", 1)
# ... then compute item propensities (gamma = 1.5) from the train matrix and the
# per-user normalization factors from the test matrix.
propensities = calculate_inverse_propensity_score(trainData, 1.5)
normalizations = calculate_per_user_normalization(newData,propensities)
# Displays the whole {user index: normalization factor} dict as the cell output.
normalizations

{0: 204.15573382446823,
 1: 76.74909293304869,
 2: 200.30599483339122,
 3: 885.7028931237724,
 4: 8.375187847191103,
 5: 38.406231385104796,
 6: 331.64783081212045,
 7: 44.207808991287166,
 8: 265.2422322758035,
 9: 2.985149975235497,
 10: 122.39746897527256,
 11: 81.08270311667331,
 12: 24.029368357160866,
 13: 1647.6814045131134,
 14: 117.27097418374795,
 15: 521.4651782180755,
 16: 22.18851295619805,
 17: 27.343005541954263,
 18: 9.25406086359854,
 19: 1110.5941436671933,
 20: 832.8651063723461,
 21: 4.1175279564515685,
 22: 55.1866149678813,
 23: 47.88224683570105,
 24: 317.90797926642944,
 25: 6.5290185448636935,
 26: 234.41481212554393,
 27: 85.34463770944662,
 28: 123.72436073711783,
 29: 140.98579768182532,
 30: 39.13661832686568,
 31: 362.994441075813,
 32: 11.3665447589832,
 33: 19.50423042616921,
 34: 23.445498046168996,
 35: 12.403566485169982,
 36: 19.699753154503764,
 37: 104.5660120230046,
 38: 569.0471672427352,
 39: 43.33157848261902,
 40: 244.33111944835005,
 41: 32.3

In [7]:
# Group files hold one group per line, formatted as groupId, userId1, userId2, ...
# separated by tabs.
def load_group_data(data_dir: str, group_type: str, group_size: int) -> List[Group]:
    """Read the groups stored in ``<data_dir>/<group_type>_group_<group_size>``.

    Args:
        data_dir: Directory that contains the group file.
        group_type: Prefix of the group file name (e.g. ``"sim"`` or ``"div"``).
        group_size: Expected minimum number of members per group; also part of
            the file name.

    Returns:
        One ``Group`` per non-blank line, in file order.

    Raises:
        ValueError: If a line holds fewer than ``group_size + 1`` fields or a
            field is not an integer.
    """
    path = os.path.join(data_dir, group_type + "_group_" + str(group_size))
    groups = []
    with open(path) as group_file:
        for line_number, line in enumerate(group_file, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) carry no group.
                continue
            fields = [int(field) for field in stripped.split("\t")]
            # Validate before storing so that a malformed group is never kept.
            if len(fields) < group_size + 1:
                raise ValueError("Group file invalid: " + path + " (line " + str(line_number) + ")")
            groups.append(Group(fields[0], fields[1:]))
    return groups

In [8]:
def get_recommendation_files(data_dir: str, fold: int, group: str, group_size: int) -> List[str]:
    """List the regular files directly inside ``<data_dir>/<fold>/<group>/<group_size>``.

    Returns:
        The file paths as strings, in the order the directory yields them;
        sub-directories are skipped.
    """
    rec_dir = Path(os.path.join(data_dir, str(fold), group, str(group_size)))
    file_paths = []
    for entry in rec_dir.iterdir():
        if entry.is_file():
            file_paths.append(str(entry))
    return file_paths

class AlgRecommendations(NamedTuple):
    """Recommendations of one aggregation algorithm for all groups."""
    # name identifying the algorithm
    alg_name: str
    # dict indexed by groupId; each value is the list of recommended item ids
    # NOTE(review): the `{}` default is one dict object shared by every instance
    # that is created without this argument - always pass the mapping explicitly.
    group_recommendations: Dict[int, List[int]] = {} 


In [9]:
# Items are sorted from best to worst.
# Returns one AlgRecommendations per selected file: the algorithm name and the
# recommendations indexed by group id.
def load_agregated_recommendations(data_dir: str, fold: int, group: str, group_size: int) -> List[AlgRecommendations]:
    """Load the recommendation lists of all whitelisted algorithms for one fold.

    A file is selected when its name contains at least one whitelist token and
    does not contain the blacklist token. Each selected file is read as
    tab-separated lines whose first two fields are ``group_id`` and ``item_id``;
    further fields are ignored and blank lines are skipped.

    Returns:
        One ``AlgRecommendations`` per selected file. ``alg_name`` is the file's
        base name and ``group_recommendations`` is a ``defaultdict(list)``, so
        looking up a group without recommendations yields an empty list.
    """
    whitelist = ['GFAR', '_AVG', 'FuzzyDHondtDirectOptimize_1', 'GreedyLM',  'FuzzyDHondt_1',  'SPGreedy',  'fai',  'xpo']
    blacklist = "rec_rel"

    selected_files = []
    for file in get_recommendation_files(data_dir, fold, group, group_size):
        # Match against the file name only, so that directory names cannot
        # accidentally whitelist or blacklist every file.
        file_name = os.path.basename(file)
        if blacklist in file_name:
            continue
        # any() selects a file once even when several whitelist tokens match;
        # otherwise the same algorithm would be evaluated repeatedly.
        if any(token in file_name for token in whitelist):
            selected_files.append(file)

    returnList = []
    for file in selected_files:
        recommendationsMap = defaultdict(list)
        with open(file) as recommendation_file:
            for line in recommendation_file:
                stripped = line.strip()
                if not stripped:
                    continue
                group_id, item_id = (int(field) for field in stripped.split("\t")[:2])
                recommendationsMap[group_id].append(item_id)
        returnList.append(AlgRecommendations(os.path.basename(file), recommendationsMap))
    return returnList


In [10]:
# Calculates the discounted cumulative gain of an array of relevances.
def calculate_dcg(values):
    """Return ``sum(values[k] / log2(k + 2))`` over positions ``k = 0, 1, ...``.

    An empty input yields 0.0.
    """
    relevances = np.array(values)
    if not relevances.size:  # safety check for empty input
        return 0.0
    discounts = np.log2(np.arange(2, relevances.size + 2))
    return np.sum(relevances / discounts)

# Orders the items of each user, cuts the best topk_size and calculates the DCG of the cut.
def calculate_per_user_IDCG(test_data, topk_size, propensities):
    """Compute the ideal DCG of every user on propensity-weighted ratings.

    Args:
        test_data: user x item matrix of ratings.
        topk_size: Number of best items per user on which the IDCG is calculated.
        propensities: Mapping item index -> propensity; must contain every
            column index of ``test_data``.

    Returns:
        dict ``{user index: IDCG value}``.
    """
    # Built by explicit item index, so the result does not depend on the
    # ordering of the propensities dict.
    propensity_array = np.array([propensities[i] for i in range(test_data.shape[1])])
    idcg_per_user = {}
    for user in range(test_data.shape[0]):
        ratings_with_IPS = test_data[user] / propensity_array
        # Best-first ordering, cut to the requested list length (this cut used
        # to be hard-coded to 20 regardless of topk_size).
        best_items = np.sort(ratings_with_IPS)[::-1][:topk_size]
        idcg_per_user[user] = calculate_dcg(best_items)
    return idcg_per_user
        
    

In [11]:
class Result(NamedTuple):
    """One metric value for one user of one group under one algorithm."""
    # name of the algorithm that produced the recommendation
    alg: str
    # group identifier as text; compute_metrics fills it with "<group id>_<fold>"
    group_id: str
    # id of the group member the value belongs to
    user_id: int
    # metric name; compute_metrics emits "AR" and "nDCG"
    metric: str
    # value of the metric
    result: float



In [12]:
def compute_metrics(fold, test_data: np.ndarray, train_data: np.ndarray, groups: List[Group],  
                    alg_data: AlgRecommendations, propensity_gama:float) -> List[Result]:
    """Score one algorithm's group recommendations for every group member.

    For each group that has recommendations and for each of its members, two
    propensity-weighted metrics are produced from the recommended items:

    * ``"AR"``: sum of ``rating / item_propensity``, multiplied by the user's
      normalization factor and divided by the length of the recommendation list.
    * ``"nDCG"``: DCG of the ``rating / item_propensity`` values divided by the
      user's ideal DCG (0.0 when the ideal DCG is 0).

    Args:
        fold: Fold number; only used to build the ``group_id`` of the results.
        test_data: user x item matrix (NOT a list of triplets) of test ratings.
        train_data: user x item matrix used to derive the item propensities.
        groups: Groups to evaluate.
        alg_data: Recommendations of the evaluated algorithm.
        propensity_gama: Gamma passed to the propensity computation.

    Returns:
        Two ``Result`` entries (AR, then nDCG) per member of every group that
        has a non-empty recommendation list.
    """
    # Reviewer note kept from the original (translated): test_data is a
    # user_id x item_id matrix, which is why indexing it by [user, item] works.
    propensities = calculate_inverse_propensity_score(train_data, propensity_gama)
    # Normalize against the test data of THIS call; a notebook-level global
    # would always describe the fold that happened to be loaded interactively.
    normalizations = calculate_per_user_normalization(test_data, propensities)
    idcg_per_user = calculate_per_user_IDCG(test_data, 20, propensities)

    results = []
    for group in groups:
        # .get() works for plain dicts as well as defaultdicts and does not
        # insert empty entries for groups without recommendations.
        rec_for_group = alg_data.group_recommendations.get(group.id, [])
        if len(rec_for_group) == 0:
            continue

        result_group_id = str(group.id) + "_" + str(fold)
        for group_user_id in group.members:
            weighted_ratings = [
                test_data[group_user_id, item_id] / propensities[item_id]
                for item_id in rec_for_group
            ]

            dcg = calculate_dcg(weighted_ratings)
            idcg = idcg_per_user[group_user_id]
            # IDCG already normalizes the nDCG value.
            ndcg = dcg / idcg if idcg != 0 else 0.0

            # Open question from the original author: is this the correct normalization?
            user_sum = sum(weighted_ratings, 0.0) * normalizations[group_user_id]
            mean_rating = user_sum / len(rec_for_group)

            results.append(Result(alg_data.alg_name, result_group_id, group_user_id, "AR", mean_rating))
            results.append(Result(alg_data.alg_name, result_group_id, group_user_id, "nDCG", ndcg))

    return results

In [13]:
def process_fold(groups: List[Group],  data_dir: str, fold: int, group: str, group_size: int, gamma:float) -> List[Result]:
    """Evaluate every loaded algorithm on one fold.

    Loads the aggregated recommendations plus the test and train matrices of
    the fold and concatenates the metrics computed for each algorithm, in the
    order the algorithms were loaded.
    """
    algs_data = load_agregated_recommendations(data_dir, fold, group, group_size)
    test_data = load_data(data_dir, fold)
    train_data = load_train_data(data_dir, fold)
    return [
        result
        for alg_data in algs_data
        for result in compute_metrics(fold, test_data, train_data, groups, alg_data, gamma)
    ]

In [14]:
def main(data_folder, group_type, group_size, gamma):
    """Evaluate all folds for one group configuration and return the results as CSV text.

    Args:
        data_folder: Dataset root containing the fold directories and group files.
        group_type: Group type, used for the group file name and the
            recommendation sub-directory.
        group_size: Group size; converted with ``int()``, so text is accepted.
        gamma: Propensity gamma forwarded to the metric computation.

    Returns:
        One line ``alg,group_id,user_id,metric,result`` per result, each
        terminated by a newline; no header line is included.
    """
    print(data_folder, group_type, group_size, gamma)
    folds = get_folds(data_folder)
    groups: List[Group] = load_group_data(data_folder, group_type, int(group_size))

    results = []
    for fold, _ in folds:
        results.extend(process_fold(groups,  data_folder, fold, group_type, int(group_size), gamma))

    # Build the text in one pass instead of growing a string line by line.
    return "".join(
        ",".join(str(field) for field in result) + "\n"
        for result in results
    )

In [15]:
# Run the evaluation for every gamma and write the raw results: one combined
# file per gamma plus one file per (gamma, group type, group size).
gammas = [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
# The output directory must exist before the files can be opened for writing.
os.makedirs("results", exist_ok=True)
for gamma in gammas:
    # gamma is encoded in file names as gamma * 10 (e.g. 1.5 -> "15").
    str_gamma = str(int(gamma*10))
    # `with` guarantees the files are flushed and closed even if a run fails.
    with open("results/result_raw_coupled_unbiased_"+str_gamma,"w") as f:
        f.write("alg,group_id,user_id,metric,result\n")
        # The "random" group type is currently excluded from the run.
        for group_type in ["sim", "div"]:
            for group_size in ["2","3","4","8"]:
                results = main("data/ml1m", group_type, group_size, gamma)
                f.write(results)
                with open("results/resultRaw_coupled_unbiased_"+str_gamma+"_"+group_type+"_"+group_size,"w") as f2:
                    f2.write(results)

data/ml1m sim 2 0.0
data/ml1m sim 3 0.0
data/ml1m sim 4 0.0
data/ml1m sim 8 0.0
data/ml1m div 2 0.0
data/ml1m div 3 0.0
data/ml1m div 4 0.0
data/ml1m div 8 0.0
data/ml1m sim 2 0.5
data/ml1m sim 3 0.5
data/ml1m sim 4 0.5
data/ml1m sim 8 0.5
data/ml1m div 2 0.5
data/ml1m div 3 0.5
data/ml1m div 4 0.5
data/ml1m div 8 0.5
data/ml1m sim 2 1.0
data/ml1m sim 3 1.0
data/ml1m sim 4 1.0
data/ml1m sim 8 1.0
data/ml1m div 2 1.0
data/ml1m div 3 1.0
data/ml1m div 4 1.0
data/ml1m div 8 1.0
data/ml1m sim 2 1.5
data/ml1m sim 3 1.5
data/ml1m sim 4 1.5
data/ml1m sim 8 1.5
data/ml1m div 2 1.5
data/ml1m div 3 1.5
data/ml1m div 4 1.5
data/ml1m div 8 1.5
data/ml1m sim 2 2.0
data/ml1m sim 3 2.0
data/ml1m sim 4 2.0
data/ml1m sim 8 2.0
data/ml1m div 2 2.0
data/ml1m div 3 2.0
data/ml1m div 4 2.0
data/ml1m div 8 2.0
data/ml1m sim 2 2.5
data/ml1m sim 3 2.5
data/ml1m sim 4 2.5
data/ml1m sim 8 2.5
data/ml1m div 2 2.5
data/ml1m div 3 2.5
data/ml1m div 4 2.5
data/ml1m div 8 2.5
data/ml1m sim 2 3.0
data/ml1m sim 3 3.0


In [16]:
# Display the (fold number, directory) pairs found under data/ml1m.
get_folds("data/ml1m")

[(1, 'data\\ml1m\\1'),
 (2, 'data\\ml1m\\2'),
 (3, 'data\\ml1m\\3'),
 (4, 'data\\ml1m\\4'),
 (5, 'data\\ml1m\\5')]