## Training/generating and other long running code

In [2]:
# Project settings module; the values used by this section are copied into
# notebook-level names so later cells can reference them directly.
import settings.config as cfg

# Base folder for the preprocessed dataset; cells below read ratings.csv from it
# and write their pickles/CSVs into it.
preprocessed_dataset_folder = cfg.preprocessed_dataset_folder
# Parameters forwarded to the group generators in the "Groups generation" section.
group_sizes_to_create = cfg.group_sizes_to_create
group_similarity_to_create = cfg.group_similarity_to_create
group_number = cfg.group_number

In [3]:
import pandas as pd

# Read the preprocessed ratings table. The rendered frame shows an
# "Unnamed: 0" column next to user/item/rating, i.e. the CSV carries a
# saved index column that is loaded as regular data here.
ratings_df = pd.read_csv(f"{preprocessed_dataset_folder}/ratings.csv")
ratings_df

Unnamed: 0,user,item,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
942220,6040,1091,1
942221,6040,1094,5
942222,6040,562,5
942223,6040,1096,4


In [None]:
# Do the split immediately so that we can generate groups based on similarity on training data only, vs on full data
import pickle
from sklearn.model_selection import train_test_split

# 80/20 shuffled split with a fixed seed, stratified on the "user" column.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=ratings_df["user"],
)

# Persist both parts; the GRS section reloads them from these exact paths.
# Context managers make sure the files are flushed and closed.
with open(preprocessed_dataset_folder+"/train_df.pkl", "wb") as train_file:
    pickle.dump(train_df, train_file)
with open(preprocessed_dataset_folder+"/test_df.pkl", "wb") as test_file:
    pickle.dump(test_df, test_file)

In [None]:
from lenskit.algorithms.als import BiasedMF, ImplicitMF
from lenskit.algorithms import Recommender

In [None]:
# Explicit-feedback ALS matrix factorisation, wrapped as a recommender and
# fitted on the training split only.
biased_mf = BiasedMF(features=30, iterations=20, reg=0.1, rng_spec=42)
recsys_biased_mf = Recommender.adapt(biased_mf).fit(train_df)

In [None]:
# Implicit-feedback ALS matrix factorisation with the same hyper-parameters,
# wrapped as a recommender and fitted on the training split only.
implicit_mf = ImplicitMF(features=30, iterations=20, reg=0.1, rng_spec=42)
recsys_implicit_mf = Recommender.adapt(implicit_mf).fit(train_df)

In [None]:
# Persist both fitted recommenders; the GRS section loads them back as
# "<folder>/biasedMf.pkl" and "<folder>/implicitMf.pkl".
# Context managers make sure the files are flushed and closed.
with open(preprocessed_dataset_folder+"/biasedMf.pkl", "wb") as biased_file:
    pickle.dump(recsys_biased_mf, biased_file)
with open(preprocessed_dataset_folder+"/implicitMf.pkl", "wb") as implicit_file:
    pickle.dump(recsys_implicit_mf, implicit_file)

## Groups generation

In [None]:
def gen_baseline_groups(sm, sm_text, sim_threshold_override):
    """Generate synthetic groups of every configured similarity type from one similarity matrix.

    ``cfg.similar_threshold`` is temporarily replaced by ``sim_threshold_override``
    while the generators run and is restored afterwards, even if generation raises.

    Relies on notebook-level names: ``cfg``, ``pd``, ``display``,
    ``group_similarity_to_create``, ``group_sizes_to_create``, ``group_number``,
    ``user_id_indexes`` and ``user_id_set``.

    Args:
        sm: similarity matrix; each generator receives its own ``sm.copy()``.
        sm_text: label of the matrix, used only in the progress output.
        sim_threshold_override: value assigned to ``cfg.similar_threshold`` for
            the duration of this call.

    Returns:
        list: the groups returned by ``generateGroups`` for every entry of
        ``group_similarity_to_create``, concatenated in iteration order.
    """
    # Import before touching the config so a failed import cannot leave the
    # threshold overridden.
    from synthetic_groups_generation.groups_generators import GroupsGenerator

    old_sim_threshold = cfg.similar_threshold
    cfg.similar_threshold = sim_threshold_override
    try:
        print(f"####################### Using similarity matrix = {sm_text}")

        group_list = []
        for group_type in group_similarity_to_create:
            print("###########################################################################")
            print(f"#################### group_type={group_type} #############################")
            print("###########################################################################")
            grpGenerator = GroupsGenerator.getGroupsGenerator(group_type)
            current_list = grpGenerator.generateGroups(user_id_indexes, user_id_set, sm.copy(), group_sizes_to_create, group_number)

            display(pd.DataFrame.from_records(current_list))

            # In-place extend avoids re-copying the accumulated list on every iteration.
            group_list.extend(current_list)
    finally:
        # Always restore the shared config value, including on error.
        cfg.similar_threshold = old_sim_threshold

    return group_list

In [None]:
# Similarity matrices keyed by the short identifier that later ends up in the
# exported file names (group_composition_<identifier>.pkl / .csv).
similarity_dict = dict(
    sim_full=sim_matrix_full,
    sim_train=sim_matrix_train,
    sim_biased_mf_cos=sim_matrix_biased_mf_cos,
    sim_biased_mf_l2=sim_matrix_biased_mf_l2,
    sim_implicit_mf_cos=sim_matrix_implicit_mf_cos,
    sim_implicit_mf_l2=sim_matrix_implicit_mf_l2,
)

# Human-readable label -> short identifier used as key in similarity_dict.
similarity_raw_to_name = dict([
    ("Pearson's Correlation Coeficient FULL", "sim_full"),
    ("Pearson's Correlation Coeficient TRAIN", "sim_train"),
    ("Biased MF Cosine", "sim_biased_mf_cos"),
    ("Biased MF L2", "sim_biased_mf_l2"),
    ("Implicit MF Cosine", "sim_implicit_mf_cos"),
    ("Implicit MF L2", "sim_implicit_mf_l2"),
])

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy

# Per-matrix similarity threshold, keyed by the short identifier (e.g. "sim_full")
# because that is the key the group-generation loop looks it up with.
similarity_threshold_dict = {}


for sim_name, sim_raw_name in similarity_raw_to_name.items():
    # sim_name is the human-readable label, sim_raw_name the key into similarity_dict.
    sim_matrix = similarity_dict[sim_raw_name]
    # Flatten to 1-D so the percentiles are taken over all matrix entries.
    data = np.asarray(sim_matrix).ravel()
    perc = np.percentile(data, [50, 75, 80, 90, 95, 99])
    print(f"Percentiles for {sim_name} = {perc}")
    # The last requested percentile (99th) becomes the threshold for this matrix.
    # NOTE(review): np.percentile returns NaN if the matrix contains NaN entries — confirm the matrices are NaN-free.
    similarity_threshold_dict[sim_raw_name] = perc[-1]

In [None]:
# Baseline groups per similarity matrix, each generated with that matrix's own threshold.
group_lists = {}
for sim_name, sim_matrix in similarity_dict.items():
    group_lists[sim_name] = gen_baseline_groups(
        sm=sim_matrix,
        sm_text=sim_name,
        sim_threshold_override=similarity_threshold_dict[sim_name],
    )

In [None]:
from copy import deepcopy

In [None]:
def gen_outlier_groups(group_list, sm, sm_text):
    """Derive outlier variants from every group marked 'similar'.

    For each such group two deep copies are made; in each copy the first
    ``n_outliers`` members are replaced by outliers found relative to the
    remaining members, so every new group corresponds to one original group.

    Relies on notebook-level names ``deepcopy`` and ``user_id_indexes``.

    Args:
        group_list: iterable of group dicts with the keys 'group_similarity',
            'group_size' and 'group_members'.
        sm: similarity matrix; every helper call receives its own ``sm.copy()``.
        sm_text: label of the matrix, used only in the progress output.

    Returns:
        dict: ``{"centered_outlier_groups": [...], "diverged_outlier_groups": [...]}``.
    """
    print(f"####################### Using similarity matrix = {sm_text}")

    from synthetic_groups_generation.groups_generators import Outliers
    from synthetic_groups_generation.groups_generators import GroupsGenerator

    new_centered_outlier_groups = []
    new_diverged_outlier_groups = []

    # (label written into the clone, outlier finder, list collecting the result)
    variants = (
        ('similar_with_centered_outliers', Outliers.find_outlier_centered_group, new_centered_outlier_groups),
        ('similar_with_diverged_outliers', Outliers.find_outlier_diverged_group, new_diverged_outlier_groups),
    )

    for group in group_list:
        if group['group_similarity'] != 'similar':
            continue

        n_outliers = Outliers.get_num_outliers(group['group_size'])

        for similarity_label, find_outliers, collected in variants:
            group_clone = deepcopy(group)
            group_clone['group_similarity'] = similarity_label
            # Outliers are searched relative to the members that stay in the group.
            group_clone['group_members'][:n_outliers] = find_outliers(group['group_members'][n_outliers:], n_outliers, sm.copy(), user_id_indexes)
            group_clone['avg_similarity'] = GroupsGenerator.compute_average_similarity(group_clone['group_members'], user_id_indexes, sm.copy())
            collected.append(group_clone)
            # The replacement must have changed the membership.
            assert group_clone["group_members"] != group["group_members"], f"{group_clone['group_members']}, {group['group_members']}"

        print(f"lens: {len(new_centered_outlier_groups)}, {len(new_diverged_outlier_groups)}")

    return {
        "centered_outlier_groups": new_centered_outlier_groups,
        "diverged_outlier_groups": new_diverged_outlier_groups
    }

In [None]:
# Outlier variants per similarity matrix, derived from that matrix's baseline groups.
new_groups = {}
for sim_name, sim_matrix in similarity_dict.items():
    new_groups[sim_name] = gen_outlier_groups(
        group_list=group_lists[sim_name],
        sm=sim_matrix,
        sm_text=sim_name,
    )

In [None]:
# gen_outlier_groups takes (group_list, sm, sm_text) and returns a dict, so the
# group list is passed explicitly and the two result lists are read by key
# (tuple-unpacking the dict would only yield its key strings).
# NOTE(review): `group_list` and `sim_matrix_mf` are not defined in any cell visible
# here — confirm both exist before running this cell.
mf_outlier_groups = gen_outlier_groups(group_list, sim_matrix_mf, "MF Sim matrix")
new_centered_outlier_groups = mf_outlier_groups["centered_outlier_groups"]
new_diverged_outlier_groups = mf_outlier_groups["diverged_outlier_groups"]

In [None]:
# Append both outlier variants to the existing list in place
# (re-running this cell appends them again).
group_list += new_centered_outlier_groups
group_list += new_diverged_outlier_groups

In [None]:
# Assign each group a sequential integer id based on its position in group_list.
group_dict = {}
for group_id, group in enumerate(group_list):
    group_dict[group_id] = group
display(group_dict)

In [None]:
import pickle
# Dump standard similarity version
#pickle.dump(group_dict, open(preprocessed_dataset_folder+"/group_composition.pkl", "wb"))

# Dump MF similarity version
#pickle.dump(group_dict, open(preprocessed_dataset_folder+"/group_composition_mf.pkl", "wb"))

# Write one pickle and one CSV per similarity matrix.
for sim_name in similarity_dict.keys():
    # Baseline groups first, then the centered and diverged outlier variants.
    g = group_lists[sim_name] + new_groups[sim_name]["centered_outlier_groups"] + new_groups[sim_name]["diverged_outlier_groups"]

    # group_id is the position of the group in the combined list.
    group_dict = dict(enumerate(g))

    # Context manager makes sure the pickle is flushed and closed.
    with open(preprocessed_dataset_folder+f"/group_composition_{sim_name}.pkl", "wb") as group_file:
        pickle.dump(group_dict, group_file)

    # Same content as a flat table, one row per group.
    groups_list = [
        {
            'group_id': group_id,
            'group_size': group['group_size'],
            'group_similarity': group['group_similarity'],
            'group_members': group['group_members'],
            'avg_similarity': group['avg_similarity']
        }
        for group_id, group in group_dict.items()
    ]

    groups_df = pd.DataFrame.from_records(groups_list)
    groups_df.to_csv(preprocessed_dataset_folder+f"/group_composition_{sim_name}.csv")

    print("Done with:", sim_name)

In [None]:
# Flatten group_dict (group_id -> group record) into one row per group and preview it.
groups_list = []
for group, group_info in group_dict.items():
    groups_list.append(
        dict(
            group_id=group,
            group_size=group_info['group_size'],
            group_similarity=group_info['group_similarity'],
            group_members=group_info['group_members'],
            avg_similarity=group_info['avg_similarity'],
        )
    )

groups_df = pd.DataFrame.from_records(groups_list)
display(groups_df.head(10))

In [None]:
# Alternative target for the standard-similarity variant:
#groups_df.to_csv(preprocessed_dataset_folder+"/group_composition.csv")
# Write the current groups_df as the MF-similarity composition table.
groups_df.to_csv(f"{preprocessed_dataset_folder}/group_composition_mf.csv")

# GRS

In [None]:
from datetime import datetime
# NOTE: this rebinds `cfg`; the first section of the notebook imported
# settings.config under the same name.
import settings.config_movie_lens as cfg
import pandas as pd
import numpy as np

pd.options.display.max_rows = 500 # Changes the number of rows displayed (default is 60)

# Values copied from the config module into notebook-level names used by the cells below.
preprocessed_dataset_folder = cfg.preprocessed_dataset_folder
recommendations_number = cfg.recommendations_number
group_types = cfg.group_types

cfg.top_k = None # Predict for everything, do not restrict anything

## Train individual RS / Prepare groundtruth
Takes a long time; only needed if you don't have the test_pred_dfs.pkl file

In [None]:
import pickle
import os
from individual_rs.individual_rs import IndividualRS
from utils.utility_functions import create_per_user_group_choices

import warnings
warnings.filterwarnings('ignore')
# General pipeline

# creating train-test folds
# split stratified on the users

from sklearn.model_selection import StratifiedKFold
import itertools

# Names of the pickled individual recommenders ("<folder>/<name>.pkl") to evaluate.
individual_rs_names = [
    "biasedMf",
    "implicitMf"
]

if group_types == "SYNTHETIC":

    # load train and test df (context managers close the files after loading)
    print(datetime.now(), "Load dataset splits")
    with open(preprocessed_dataset_folder+"/train_df.pkl", "rb") as train_file:
        train_df = pickle.load(train_file)
    with open(preprocessed_dataset_folder+"/test_df.pkl", "rb") as test_file:
        test_df = pickle.load(test_file)

    # getting user-items pairs in the training set
    print(datetime.now(), "Get user-item pairs in the training set")
    train_set_pairs = set(zip(train_df['user'].values, train_df['item'].values))

    # create test_complete_df with all the possible user-items pairs in the test_df
    print(datetime.now(), "Create complete_df with all possible pairs")
    user_set = set(test_df['user'].values)
    item_set = set(test_df['item'].values)
    all_ui_values = list(itertools.product(user_set, item_set))

    # Training pairs that also occur in the user x item product. A pair is in the
    # product exactly when its user is in user_set and its item is in item_set, so
    # this avoids building a second set of the whole product, and it is the same
    # for every recommender, so it is computed once.
    train_set_pairs_fixed = [
        pair for pair in train_set_pairs
        if pair[0] in user_set and pair[1] in item_set
    ]

    test_pred_dfs = {}
    for rs_name in individual_rs_names:
        individual_rs_path = f"{preprocessed_dataset_folder}/{rs_name}.pkl"
        test_pred_df = pd.DataFrame(all_ui_values, columns=['user', 'item'])

        # load individual RS
        print(datetime.now(), f"Load individual RS ({rs_name})")
        with open(individual_rs_path, "rb") as rs_file:
            rs = pickle.load(rs_file)

        # Get predictions
        print(datetime.now(), "Get predictions")
        test_pred_df['predicted_rating'] = rs.predict(test_pred_df)

        #correction for train set records (assuming repeated recommendations provide no value, therefore predicted_rating=0)
        print(datetime.now(), "Do the correction")
        test_pred_df.set_index(["user","item"], inplace=True)
        test_pred_df.loc[train_set_pairs_fixed,"predicted_rating"] = 0.0
        test_pred_df.reset_index(inplace=True)

        test_pred_dfs[rs_name] = test_pred_df

    path_to_fold = preprocessed_dataset_folder+"/singlefold"

    # Create the output folder if needed; exist_ok avoids the check-then-create race.
    os.makedirs(path_to_fold, exist_ok=True)

    print(datetime.now(), "Dump all the results")
    with open(path_to_fold+"/test_pred_dfs.pkl", "wb") as out_file:
        pickle.dump(test_pred_dfs, out_file)

else:
    print("ERROR: incorrect config file!")
print(datetime.now(), "Done!")

# Drop the notebook-level reference to the last per-recommender frame
# (the frames themselves remain reachable through test_pred_dfs).
test_pred_df = None

# Construct group recommendations
Takes a lot of time, only needed if you don't have the group_recommendations files

In [None]:
from utils.utility_functions import generate_group_recommendations_forall_groups

import os
import warnings
warnings.filterwarnings('ignore')
lst = os.listdir(preprocessed_dataset_folder)

# Output folder and dataset splits do not depend on the algorithm, so they are
# prepared once instead of on every loop iteration.
path_to_fold = preprocessed_dataset_folder+"/"+"singlefold"
if group_types == "SYNTHETIC":
    with open(preprocessed_dataset_folder+"/train_df.pkl", "rb") as train_file:
        train_df = pickle.load(train_file)
    with open(preprocessed_dataset_folder+"/test_df.pkl", "rb") as test_file:
        test_df = pickle.load(test_file)
else:
    print("ERROR: incorrect config file!")

# NOTE(review): `group_compositions` is not defined in any cell visible here;
# it is iterated as a mapping of similarity name -> group composition — confirm where it is built.
for algo_name, tst_pred_df in test_pred_dfs.items():
    print(datetime.now(), f"Processing algorithm {algo_name}")
    print(datetime.now(), "Generate GRS for all the aggregation strategies and all the groups")

    for sim_name, group_comp in group_compositions.items():
        for pred_k in cfg.recommendations_number:
            print(f"Generating for algo_name={algo_name}, sim_name={sim_name}, k={pred_k}")

            group_recommendations = generate_group_recommendations_forall_groups(tst_pred_df, group_comp, pred_k)
            print(datetime.now(), f": Done algo_name={algo_name}, sim_name={sim_name}, k={pred_k}")

            # One output file per (k, similarity, algorithm); the context manager closes it.
            out_path = path_to_fold+"/"+f"predk{pred_k}_group_recommendations_{sim_name}_{algo_name}.pkl"
            with open(out_path, "wb") as out_file:
                pickle.dump(group_recommendations, out_file)