In [1]:
from datetime import datetime
import settings.config as cfg
import pandas as pd
import numpy as np


# Notebook-level aliases for the values read from the project configuration
# module (imported above as `cfg`), so the following cells can use short names.

# Folder the preprocessed ratings are read from and the fold data is written to.
preprocessed_dataset_folder = cfg.preprocessed_dataset_folder

# Fold counts for the two cross-validation stages.
individual_rs_validation_folds_k = cfg.individual_rs_validation_folds_k
group_rs_evaluation_folds_k = cfg.group_rs_evaluation_folds_k

# Strategy / evaluation settings.
individual_rs_strategy = cfg.individual_rs_strategy
aggregation_strategies = cfg.aggregation_strategies
evaluation_strategy = cfg.evaluation_strategy
recommendations_number = cfg.recommendations_number
metrics = cfg.metrics
group_types = cfg.group_types

# Show the raw and the preprocessed dataset folders as a quick sanity check.
display(cfg.dataset_folder, preprocessed_dataset_folder)

'ml-1m'

'preprocessed_dataset'

In [2]:
import pandas as pd
import pickle
# Load the ratings table from the preprocessed dataset folder.
ratings_csv_path = f"{preprocessed_dataset_folder}/ratings.csv"
ratings_df = pd.read_csv(ratings_csv_path)

## Train individual recommender systems (RS) / Prepare ground truth

In [3]:
# Identifiers of the individual recommenders that are trained for every fold.
# Further identifiers currently left disabled: "LENSKIT_CF_USER", "LENSKIT_CF_ITEM".
recommenders = [
    "LENSKIT_ALS",
]

import pickle
import os
from individual_rs.individual_rs import IndividualRS
from utils.utility_functions import create_per_user_group_choices

import warnings
warnings.filterwarnings('ignore')
# General pipeline

# creating train-test folds
# split stratified on the users 

from sklearn.model_selection import StratifiedKFold
import itertools

def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``; the file is closed even if dumping raises."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


# For every cross-validation fold: persist the train/test split, let each
# recommender score the full user x item grid of the test fold, and persist
# those predictions under <fold folder>/<recommender>/test_pred_df.pkl.
if group_types == "SYNTHETIC":
    print(datetime.now(), "Creating folds")
    # Shuffled splitter with a fixed seed, so the fold assignment is reproducible.
    skf = StratifiedKFold(n_splits=group_rs_evaluation_folds_k, random_state=42, shuffle=True)

    print(datetime.now(), "Folds created!")
    current_fold = 0
    # The user column is used as the stratification label.
    for train_index, test_index in skf.split(ratings_df, ratings_df['user']):
        print(">>> Start processing fold: Train", len(train_index), "Test:", len(test_index))

        # split train and test df
        train_df = ratings_df.iloc[train_index]
        test_df = ratings_df.iloc[test_index]

        # every (user, item) combination of the users and items present in the test fold
        user_set = set(test_df['user'].values)
        item_set = set(test_df['item'].values)
        all_ui_values = list(itertools.product(user_set, item_set))
        test_pairs_df = pd.DataFrame(all_ui_values, columns=['user', 'item'])

        # Training pairs that also occur in that grid. Filtering by membership in
        # user_set / item_set is equivalent to intersecting with the full product,
        # without materialising a set of every user x item combination.
        train_set_pairs = set(zip(train_df['user'].values, train_df['item'].values))
        train_set_pairs = {
            (user, item)
            for user, item in train_set_pairs
            if user in user_set and item in item_set
        }
        # .loc needs a list: set indexers are rejected by recent pandas versions.
        seen_pairs = list(train_set_pairs)

        print(datetime.now(), "Train individual RS and get predictions")

        path_to_fold = preprocessed_dataset_folder+"/fold_"+str(current_fold)
        os.makedirs(path_to_fold, exist_ok=True)

        _save_pickle(train_df, path_to_fold+"/train_df.pkl")
        _save_pickle(test_df, path_to_fold+"/test_df.pkl")

        for recommender in recommenders:
            # Each recommender gets its own copy of the clean pair grid, so the
            # output of one recommender is never fed into the next one.
            test_pred_df = IndividualRS.train_individual_rs_and_get_predictions(
                recommender, train_df, test_pairs_df.copy()
            )

            # correction for train set records (assuming repeated recommendations
            # provide no value, therefore predicted_rating=0)
            test_pred_df = test_pred_df.set_index(["user", "item"])
            if seen_pairs:
                test_pred_df.loc[seen_pairs, "predicted_rating"] = 0.0
            test_pred_df = test_pred_df.reset_index()

            path_to_rec = path_to_fold+"/"+recommender
            os.makedirs(path_to_rec, exist_ok=True)

            _save_pickle(test_pred_df, path_to_rec+"/test_pred_df.pkl")

        current_fold = current_fold + 1

else:
    print("ERROR: incorrect config file!")
print(datetime.now(), "Done!")


2022-10-18 22:18:55.508939 Creating folds
2022-10-18 22:18:55.508939 Folds created!
>>> Start processing fold: Train 753780 Test: 188445
2022-10-18 22:19:06.618034 Train individual RS and get predictions
LENSKIT_ALS
training


Numba is using threading layer omp - consider TBB
BLAS using multiple threads - can cause oversubscription
found 2 potential runtime problems - see https://boi.st/lkpy-perf


features: 200, iterations: 2, regularization: 0.1, damping: 50000
('LENSKIT_ALS', '[200, 2, 0.1, 50000]')
evaluating predictions
Done!
>>> Start processing fold: Train 753780 Test: 188445
2022-10-18 22:40:39.275745 Train individual RS and get predictions
LENSKIT_ALS
training
features: 200, iterations: 2, regularization: 0.002, damping: 50000
('LENSKIT_ALS', '[200, 2, 0.002, 50000]')
evaluating predictions
Done!
>>> Start processing fold: Train 753780 Test: 188445
2022-10-18 23:01:55.049136 Train individual RS and get predictions
LENSKIT_ALS
training
features: 200, iterations: 2, regularization: 0.1, damping: 50000
('LENSKIT_ALS', '[200, 2, 0.1, 50000]')
evaluating predictions
Done!
>>> Start processing fold: Train 753780 Test: 188445
2022-10-18 23:23:10.802209 Train individual RS and get predictions
LENSKIT_ALS
training
features: 200, iterations: 3, regularization: 0.1, damping: 50000
('LENSKIT_ALS', '[200, 3, 0.1, 50000]')
evaluating predictions
Done!
>>> Start processing fold: Train 