# **GET TRAINING SET**

## Import

In [1]:
import os
import numpy as np

## Init

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

# Fix the global NumPy seed so that the random hold-out and the train/validation
# split performed further down are reproducible on Restart & Run All.
seed = 76424236
np.random.seed(seed=seed)

# output_folder: where the generated .npy files are written.
# folder_name:   where the original dataset files are read from.
output_folder = "./generated_data/"
folder_name = "./original_files/"

# exist_ok=True makes this a no-op when the folder already exists and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

## Load training set

In [3]:
# The training ratings are read as a header-less, tab-separated file with one
# (UserID, SongID, Rating) triple per row.
file_path = 'ydata-ymusic-rating-study-v1_0-train.txt'

df_train = pd.read_csv(
    folder_name + file_path,
    sep='\t',
    header=None,
    names=["UserID", "SongID", "Rating"],
)
df_train.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,14,5
1,1,35,1
2,1,46,1
3,1,83,1
4,1,93,1
5,1,94,1
6,1,153,5
7,1,170,4
8,1,184,5
9,1,194,5


## Convert to implicit

"We treat items rated greater than or equal to 4 as relevant, and others as irrelevant, as suggested by prior literature."

In [4]:
# Ratings at or above this value are encoded as relevant (1); all others as irrelevant (0).
POSITIVE_THRESHOLD = 4

is_relevant = df_train['Rating'] >= POSITIVE_THRESHOLD
df_train['ImplicitRating'] = np.where(is_relevant, 1, 0)

df_train.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,14,5,1
1,1,35,1,0
2,1,46,1,0
3,1,83,1,0
4,1,93,1,0
5,1,94,1,0
6,1,153,5,1
7,1,170,4,1
8,1,184,5,1
9,1,194,5,1


## Check the number of users and items in the training set

"The training set contains 300K ratings given by 15.4K users against 1K songs through natural interactions."

In [5]:
# Smallest and largest ids seen in the training set; they are reused further down
# to enumerate every user and every item.
min_user, max_user = df_train["UserID"].min(), df_train["UserID"].max()
min_item, max_item = df_train["SongID"].min(), df_train["SongID"].max()

max_item, max_user

(1000, 15400)

# **GET UNBIASED TESTSET**

## Load the unbiased testset and convert it to implicit

In [6]:
# Same loading and binarisation steps as for the training set, applied to the test file.
file_path = folder_name + 'ydata-ymusic-rating-study-v1_0-test.txt'
df_test = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=["UserID", "SongID", "Rating"],
)
test_is_relevant = df_test['Rating'] >= POSITIVE_THRESHOLD
df_test['ImplicitRating'] = np.where(test_is_relevant, 1, 0)
df_test.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,49,1,0
1,1,126,1,0
2,1,138,1,0
3,1,141,1,0
4,1,177,1,0
5,1,268,3,0
6,1,511,1,0
7,1,587,1,0
8,1,772,5,1
9,1,941,1,0


## Check the number of users and items in the unbiased test set

"The testing set is collected by asking a subset of 5.4K users to rate 10 randomly selected songs."

In [7]:
# (largest UserID, largest SongID, number of test rows divided by the largest UserID)
(
    df_test["UserID"].max(),
    df_test["SongID"].max(),
    int(len(df_test) / df_test["UserID"].max()),
)

(5400, 1000, 10)

## Filter unbiased testset

"We filter the testing set by retaining users who have at least a relevant and an irrelevant song in the testing set and two relevant songs in the training set."

In [8]:
# A user is retained only when all three conditions hold:
#   1. at least one irrelevant song in the unbiased test set,
#   2. at least one relevant song in the unbiased test set,
#   3. at least two relevant songs in the training set.

# 1. users with an irrelevant test interaction
usersWithNegativeInteractionInTest = df_test.loc[df_test["ImplicitRating"] == 0, "UserID"].unique()

# 2. users with a relevant test interaction
usersWithPositiveInteractionInTest = df_test.loc[df_test["ImplicitRating"] == 1, "UserID"].unique()

# 3. users with two or more relevant training interactions: attach to every relevant
#    training row the size of its user's group, then keep the rows whose group has >= 2
train_positives = df_train[df_train["ImplicitRating"] == 1]
positives_of_row_user = train_positives.groupby("UserID")["UserID"].transform("size")
usersWithTwoPositiveInteractions = train_positives.loc[positives_of_row_user >= 2, "UserID"].unique()

# Users satisfying every condition
valid_users_testset = (
    set(usersWithNegativeInteractionInTest)
    & set(usersWithPositiveInteractionInTest)
    & set(usersWithTwoPositiveInteractions)
)

# Restrict the test set to those users
keep_row = df_test["UserID"].isin(valid_users_testset)
df_test_filtered = df_test[keep_row]

"2296 users satisfy these requirements."

In [9]:
# Number of users that satisfy all three filtering conditions.
len(valid_users_testset)

2296

## Shape the unbiased test set

In [10]:
# Split the filtered unbiased test set into two arrays of [UserID, SongID] rows:
# one holding the relevant interactions (ImplicitRating == 1) and one holding the
# irrelevant ones (ImplicitRating == 0).
pair_columns = ["UserID", "SongID"]
is_positive = df_test_filtered["ImplicitRating"] == 1
is_negative = df_test_filtered["ImplicitRating"] == 0

unbiased_pos_test_set = df_test_filtered.loc[is_positive, pair_columns].values
unbiased_neg_test_set = df_test_filtered.loc[is_negative, pair_columns].values

## Save unbiased test set

In [11]:
# Give the two columns of each array explicit names so that to_records() produces
# a structured array with `user_id` / `item_id` fields.
pair_fields = ["user_id", "item_id"]
unbiased_pos_test_set_df = pd.DataFrame(unbiased_pos_test_set, columns=pair_fields)
unbiased_neg_test_set_df = pd.DataFrame(unbiased_neg_test_set, columns=pair_fields)

# Drop the DataFrame index: only the (user_id, item_id) fields are stored
structured_data_pos_test_set_unbiased = unbiased_pos_test_set_df.to_records(index=False)
structured_data_neg_test_set_unbiased = unbiased_neg_test_set_df.to_records(index=False)

# Persist positives and negatives as separate .npy files
np.save(
    output_folder + "unbiased-test_arr_pos.npy",
    structured_data_pos_test_set_unbiased,
)
np.save(
    output_folder + "unbiased-test_arr_neg.npy",
    structured_data_neg_test_set_unbiased,
)

# **GET BIASED TESTSET**

## Extract the biased test set and shape it

"We additionally held out a biased testing set (biased-testing) from the training set by randomly sampling 300 songs for each user."

In [12]:
# NOTE: this cell rewrites df_train["ImplicitRating"] in place. Re-running it without
# reloading df_train holds out a further sample from the already-reduced positives.

# Precompute, for each user, the set of songs with a relevant rating
user_positive_ratings = df_train[df_train["ImplicitRating"] == 1].groupby("UserID")["SongID"].apply(set)

# Candidate item ids; this array is shuffled in place once per user below
items_ids = np.arange(min_item, max_item + 1)
# Number of songs held out for each user
SONGS_FOR_BIASED_TEST = 300

# MAN hypothesis (presumably "Missing As Negative" -- TODO confirm the acronym):
# every sampled song that is not a relevant training interaction of the user is
# stored as a negative, whether it was rated below the threshold or not rated at all.

pos_test_set = []
neg_test_set = []

for user_id in range(min_user, max_user + 1):
    # Shuffle-then-slice is kept as-is so the sequence of random draws for a given
    # seed does not change.
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    # Named `item_id` rather than `id`, which would shadow the builtin from here on
    for item_id in test_items:
        if item_id in pos_ids:
            pos_test_set.append([user_id, item_id])
        else:
            neg_test_set.append([user_id, item_id])

pos_test_set = np.array(pos_test_set)
neg_test_set = np.array(neg_test_set)

# Held-out positives must no longer be used for training: reset their ImplicitRating
# to 0. The loop never reads df_train, so all (user, song) pairs are reset in a single
# vectorised pass here instead of rescanning the whole frame once per user.
if pos_test_set.size:
    held_out_pairs = pd.MultiIndex.from_arrays([pos_test_set[:, 0], pos_test_set[:, 1]])
    train_pairs = pd.MultiIndex.from_arrays([df_train["UserID"], df_train["SongID"]])
    df_train.loc[train_pairs.isin(held_out_pairs), "ImplicitRating"] = 0

## Save the biased test set

In [13]:
# Name the two columns of each array so that to_records() produces a structured
# array with `user_id` / `item_id` fields.
biased_pair_fields = ["user_id", "item_id"]
pos_test_set_df = pd.DataFrame(pos_test_set, columns=biased_pair_fields)
neg_test_set_df = pd.DataFrame(neg_test_set, columns=biased_pair_fields)

# Drop the DataFrame index: only the (user_id, item_id) fields are stored
structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)

# Persist positives and negatives as separate .npy files
np.save(
    output_folder + "biased-test_arr_pos.npy",
    structured_data_pos_test_set,
)
np.save(
    output_folder + "biased-test_arr_neg.npy",
    structured_data_neg_test_set,
)

# **SPLIT TRAIN AND VALIDATION**

## Keep only the relevant (user, item) pairs

In [14]:
# Only take the couples (user, item) with relevant rating
new_df = df_train[df_train['ImplicitRating'] != 0]
new_df = new_df.drop(columns=['Rating', 'ImplicitRating'])

# Define a dictionary for renaming columns
rename_dict = {
    'UserID': 'user_id',
    'SongID': 'item_id'
}

# Rename the columns
new_df = new_df.rename(columns=rename_dict)

# Convert the DataFrame to a structured array
structured_data = new_df.to_records(index=False) 

## Split dataset

In [15]:
# Fractions of the relevant interactions assigned to training and to validation.
TRAINING_QUOTA = 0.85
VALIDATIION_QUOTA = 0.15

# The two quotas must cover the whole dataset. Compare with a tolerance: an exact
# `== 1` on floats can fail for valid decimal pairs because of binary rounding.
assert abs(TRAINING_QUOTA + VALIDATIION_QUOTA - 1) < 1e-9, \
    "TRAINING_QUOTA and VALIDATIION_QUOTA must sum to 1"

# Positions of the relevant interactions, shuffled in place with the seeded global RNG
indicesRelevantInteractions = np.arange(structured_data.size)
np.random.shuffle(indicesRelevantInteractions)

# The first n_train_interactions shuffled positions go to training, the rest to validation
n_train_interactions = round(indicesRelevantInteractions.shape[0] * TRAINING_QUOTA)
indices_for_train = indicesRelevantInteractions[0:n_train_interactions]
indices_for_validation = indicesRelevantInteractions[n_train_interactions:]

train_data = structured_data[indices_for_train]
validation_data = structured_data[indices_for_validation]

## Save the training and validation sets

In [16]:
# Write the two splits next to the test-set files, training first and then validation.
for split_file, split_records in (
    ("training_arr.npy", train_data),
    ("validation_arr.npy", validation_data),
):
    np.save(output_folder + split_file, split_records)