### Some notes: 
- We are no longer splitting off the biased test set the way we used to (sampling, for each user, a number of interactions on a random 30% of item ids from the train set).
- Some (user, item) pairs of the random test set are also present in the training set. Is this OK?

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys

# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy.random as npr
from scipy import sparse, stats

In [None]:
# Set the seed for reproducibility
seed = 2384795
np.random.seed(seed=seed)

# Prepare the folder for output data. exist_ok=True makes this safe to re-run
# and avoids the separate "exists?" check before creating the directory.
output_name = "./generated_data/"
os.makedirs(output_name, exist_ok=True)

In [None]:
# Folder holding the raw input files (train.ascii / test.ascii) read below
DATA_DIR = './original_files/'

## Loading Datasets

In [None]:
# Both files are space-separated and have no header row, so they share the
# same parser options.
_read_options = dict(sep=" ", header=None, engine="python")
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'train.ascii'), **_read_options)
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.ascii'), **_read_options)

In [None]:
# Convert the loaded matrix to COO form once and reuse it for all three
# columns (row index -> userId, column index -> songId, stored value -> rating).
_train_coo = sparse.coo_matrix(raw_data)
tr_vd_data = pd.DataFrame({"userId": _train_coo.row,
                           "songId": _train_coo.col,
                           "rating": _train_coo.data})

In [None]:
# Convert the loaded test matrix to COO form once and reuse it for all three
# columns (row index -> userId, column index -> songId, stored value -> rating).
_test_coo = sparse.coo_matrix(test_data)
test_data = pd.DataFrame({"userId": _test_coo.row,
                          "songId": _test_coo.col,
                          "rating": _test_coo.data})

## Defining functions

# NOTA BENE

## Qui stiamo usando la funzione di split inclusa nel preproc script fornito con il dataset, NON stiamo seguendo il processo fatto per gli altri dataset.

## ciò nonostante possiamo dividere il dataset originale in 70 / 30 ed avere il 70% in trainingset e 30% in biased test set per ottenere un risultato simile

In [None]:
def split_train_test_proportion(data, uid, test_prop=0.5, random_seed=0):
    """Split every user's rows into a kept part and a held-out part.

    Args:
        data: DataFrame of interactions.
        uid: Name of the column used to group rows by user.
        test_prop: Fraction of each eligible user's rows that goes to the
            held-out (second) output; the count is truncated with ``int``.
        random_seed: Seed applied to NumPy's global RNG before sampling.

    Returns:
        ``(data_tr, data_te)``. Users with fewer than 5 rows are kept
        entirely in ``data_tr``. When nothing ends up in one of the parts,
        that part is an empty frame with the same columns as ``data``.
    """
    # Reseeds the *global* NumPy generator so the split is reproducible.
    np.random.seed(random_seed)

    tr_list, te_list = [], []
    for u, (_, group) in enumerate(data.groupby(uid)):
        n_items_u = len(group)

        if n_items_u >= 5:
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(
                n_items_u, size=int(test_prop * n_items_u), replace=False
            ).astype('int64')
            held_out[picked] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            # Too few rows to split meaningfully: keep them all for training.
            tr_list.append(group)

        if u % 5000 == 0:
            print("%d users sampled" % u)
            sys.stdout.flush()

    # pd.concat raises on an empty list, so fall back to an empty slice that
    # preserves the input's columns and dtypes.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0]
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0]

    return data_tr, data_te

In [None]:
def get_count(tp, id):
    """Count the rows of ``tp`` for each distinct value of column ``id``.

    Args:
        tp: DataFrame of interactions.
        id: Name of the column to group by.

    Returns:
        A Series of row counts whose index holds the distinct values of the
        ``id`` column (in sorted order, as produced by ``groupby``).
    """
    # Group with the key as the index: with as_index=False, size() returns a
    # DataFrame on a fresh RangeIndex, so callers reading `.index` would get
    # positions instead of the actual ids.
    return tp.groupby(id).size()

## Make dataset implicit

In [None]:
tr_vd_data.head(10)

In [None]:
test_data.head(10)

In [None]:
# Ratings at or above this value are treated as positive feedback
# (threshold suggested in the original Yahoo! paper)
POSITIVE_THRESHOLD = 4

# Add the binarized rating (1 = positive, 0 = otherwise) to both frames
for _frame in (tr_vd_data, test_data):
    _frame['ImplicitRating'] = np.where(_frame['rating'] >= POSITIVE_THRESHOLD, 1, 0)

In [None]:
tr_vd_data.head(10)

In [None]:
# Replace the explicit rating with the binarized one under the same column name
tr_vd_data = tr_vd_data.drop(columns=['rating']).rename(columns={"ImplicitRating": "rating"})

In [None]:
tr_vd_data.head(10)

In [None]:
test_data.head(10)

In [None]:
# Replace the explicit rating with the binarized one under the same column name
test_data = test_data.drop(columns=['rating']).rename(columns={"ImplicitRating": "rating"})

In [None]:
test_data.head(10)

## Some stats

In [None]:
tr_vd_data.head(), tr_vd_data.shape

In [None]:
test_data.head(), test_data.shape

In [None]:
test_data.info()

In [None]:
# Number of rows per user and per song in the training/validation frame
user_activity = get_count(tr_vd_data, 'userId')
item_popularity = get_count(tr_vd_data, 'songId')

In [None]:
# Take the distinct ids from the index of the count results.
# NOTE(review): this only yields the original ids if get_count returns a
# result indexed by the id column — confirm with the installed pandas version.
unique_uid = user_activity.index
unique_sid = item_popularity.index

In [None]:
# Sizes of the user and song id indexes
n_users = len(unique_uid)
n_items = len(unique_sid)

In [None]:
n_users, n_items

## Removing any songs and users from the test set that are not present in the training set

In [None]:
# Map every original id to its 0-based position in the matching id index
song2id = {sid: position for position, sid in enumerate(unique_sid)}
user2id = {uid: position for position, uid in enumerate(unique_uid)}

In [None]:
# For the test set, only keep rows whose user AND song both occur in the
# training set.
_seen_user = test_data['userId'].isin(unique_uid)
_seen_song = test_data['songId'].isin(unique_sid)
test_data = test_data.loc[_seen_user & _seen_song]

## Turn userId and songId to 0-based index

In [None]:
def numerize(tp):
    """Return a copy of ``tp`` with ids replaced by their 0-based indices.

    Looks up every ``userId`` in the module-level ``user2id`` map and every
    ``songId`` in ``song2id``.

    Args:
        tp: DataFrame with ``userId``, ``songId`` and ``rating`` columns.

    Returns:
        A new DataFrame with columns ``user_id``, ``item_id``, ``rating`` and
        the same index as ``tp``. The input frame is left untouched, which
        also avoids SettingWithCopyWarning when ``tp`` is a filtered slice.

    Raises:
        KeyError: If an id in ``tp`` is missing from the lookup maps.
    """
    uid = [user2id[x] for x in tp['userId']]
    sid = [song2id[x] for x in tp['songId']]
    return pd.DataFrame(
        # to_numpy() keeps rows positional, so no index re-alignment happens
        {'user_id': uid, 'item_id': sid, 'rating': tp['rating'].to_numpy()},
        index=tp.index,
    )

In [None]:
# Re-express both frames as (user_id, item_id, rating) with 0-based indices
tr_vd_data = numerize(tr_vd_data)
test_data = numerize(test_data)

## Do we need the validation for our purpose?

In [None]:
#train_data, vad_data = split_train_test_proportion(tr_vd_data, 'user_id', test_prop=0.7, random_seed=12345)
#obs_test_data, vad_data = split_train_test_proportion(vad_data, 'user_id', test_prop=0.5, random_seed=12345)
# Single split, no validation set: the first output is the training part, the
# second is the held-out (biased) test part.
# NOTE(review): test_prop is the fraction routed to the SECOND output, so with
# 0.7 roughly 70% of each eligible user's rows land in obs_test_data and ~30%
# in train_data. The notebook text describes a 70% train / 30% biased-test
# split — confirm which one is intended.
train_data, obs_test_data = split_train_test_proportion(tr_vd_data, 'user_id', test_prop=0.7, random_seed=12345)


In [None]:
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" % (len(pd.unique(train_data['user_id'])), len(unique_uid)))

In [None]:
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" % (len(pd.unique(train_data['item_id'])), len(unique_sid)))

In [None]:
def move_to_fill(part_data_1, part_data_2, unique_id, key):
    """Move rows from ``part_data_2`` into ``part_data_1`` to cover all ids.

    Every row of ``part_data_2`` whose ``key`` value is listed in
    ``unique_id`` but absent from ``part_data_1`` is appended to
    ``part_data_1`` and removed from ``part_data_2``.

    Args:
        part_data_1: DataFrame that should end up containing every id.
        part_data_2: DataFrame that donates the missing rows.
        unique_id: Iterable of all ids expected in ``part_data_1[key]``.
        key: Name of the id column to compare on.

    Returns:
        The updated ``(part_data_1, part_data_2)`` pair; the inputs are not
        modified in place.
    """
    present_ids = set(pd.unique(part_data_1[key]))
    missing_ids = [_id for _id in unique_id if _id not in present_ids]

    move_idx = part_data_2[key].isin(missing_ids)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    part_data_1 = pd.concat([part_data_1, part_data_2[move_idx]])
    part_data_2 = part_data_2[~move_idx]
    return part_data_1, part_data_2

The move_to_fill function is used to ensure that train_data ends up with a complete set of unique IDs as specified by unique_id, by "moving" the necessary rows from another dataset (part_data_2 like vad_data or obs_test_data) and updating both DataFrames accordingly.

In [None]:
#train_data, vad_data = move_to_fill(train_data, vad_data, np.arange(n_items), 'item_id')
# Pull held-out rows back into train_data for every item index in
# 0..n_items-1 that does not occur in train_data yet.
train_data, obs_test_data = move_to_fill(train_data, obs_test_data, np.arange(n_items), 'item_id')

In [None]:
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" % (len(pd.unique(train_data['item_id'])), len(unique_sid)))

## Store datasets in csv files

In [None]:
# Write the training split and the full pre-split frame as CSV (no index column)
#vad_data.to_csv(os.path.join(output_name, 'validation.csv'), index=False)
for _frame, _filename in ((train_data, 'train.csv'), (tr_vd_data, 'train_full.csv')):
    _frame.to_csv(os.path.join(output_name, _filename), index=False)

In [None]:
# Write the held-out split and the separately loaded test set as CSV (no index column)
for _frame, _filename in ((obs_test_data, 'obs_test_full.csv'), (test_data, 'test_full.csv')):
    _frame.to_csv(os.path.join(output_name, _filename), index=False)

Now *obs_test_data* is our biased test set, extracted from the original dataset, while *test_data* is our unbiased test set.

In [None]:
obs_test_data

## Build files for creating dataset for the openrec library

# Biased

In [None]:
# Init empty placeholders; both names are rebound below before they are used
pos_test_set = []
neg_test_set = []

In [None]:
# Create masks for positive (rating == 1) and non-positive ratings
pos_mask = obs_test_data['rating'] == 1
neg_mask = obs_test_data['rating'] != 1

# Extract the (user_id, item_id) pairs for each group as 2-column arrays.
# to_numpy() skips the Python-list round trip and keeps the (0, 2) shape when
# a group is empty, so the column naming done afterwards still works.
pos_test_set = obs_test_data.loc[pos_mask, ['user_id', 'item_id']].to_numpy()
neg_test_set = obs_test_data.loc[neg_mask, ['user_id', 'item_id']].to_numpy()

In [None]:
pos_test_set

In [None]:
# Wrap the pair arrays in DataFrames with named user/item columns
_pair_columns = ["user_id", "item_id"]
pos_test_set_df = pd.DataFrame(pos_test_set, columns=_pair_columns)
neg_test_set_df = pd.DataFrame(neg_test_set, columns=_pair_columns)

# Convert to record arrays (index dropped) so the field names travel with the data
structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)

# Save positives and negatives of the biased test set
np.save(output_name + "biased-test_arr_pos.npy", structured_data_pos_test_set)
np.save(output_name + "biased-test_arr_neg.npy", structured_data_neg_test_set)

# Unbiased

In [None]:
# Init empty placeholders; both names are rebound below before they are used
pos_test_set = []
neg_test_set = []

In [None]:
# Create masks for positive (rating == 1) and non-positive ratings
pos_mask = test_data['rating'] == 1
neg_mask = test_data['rating'] != 1

# Extract the (user_id, item_id) pairs for each group as 2-column arrays.
# to_numpy() skips the Python-list round trip and keeps the (0, 2) shape when
# a group is empty, so the column naming done afterwards still works.
pos_test_set = test_data.loc[pos_mask, ['user_id', 'item_id']].to_numpy()
neg_test_set = test_data.loc[neg_mask, ['user_id', 'item_id']].to_numpy()

In [None]:
# Wrap the pair arrays in DataFrames with named user/item columns
_pair_columns = ["user_id", "item_id"]
pos_test_set_df = pd.DataFrame(pos_test_set, columns=_pair_columns)
neg_test_set_df = pd.DataFrame(neg_test_set, columns=_pair_columns)

# Convert to record arrays (index dropped) so the field names travel with the data
structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)

# Save positives and negatives of the unbiased test set
np.save(output_name + "unbiased-test_arr_pos.npy", structured_data_pos_test_set)
np.save(output_name + "unbiased-test_arr_neg.npy", structured_data_neg_test_set)

# Trainset

In [None]:
# Keep only the rows with a non-zero (positive) rating; the rating column is
# then redundant and is dropped, leaving (user_id, item_id) pairs.
positive_trainset = train_data[train_data['rating'] != 0].drop(columns=['rating'])

# Convert the DataFrame to a structured array
positive_trainset = positive_trainset.to_records(index=False)

# Save the structured positive pairs. Passing train_data here would write the
# whole frame, zero-rated rows included, as a plain 2-D array and leave
# positive_trainset unused.
np.save(output_name + "training_arr.npy", positive_trainset)