# **GET TRAINING SET**

## Import

In [1]:
import os
import numpy as np

## Init

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

# Fixed seed for numpy's global RNG: the random hold-out and the train/validation
# split further down both draw from it.
seed = 76424236
np.random.seed(seed=seed)

# output_folder receives the generated .npy files; folder_name holds the input CSVs.
output_folder = "./generated_data/"
folder_name = "./original_files/"

# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process creating it.
os.makedirs(output_folder, exist_ok=True)

## Load training set

KuaiRec is a real-world dataset collected from the recommendation logs of the video-sharing mobile app Kuaishou. For now, it is the first dataset that contains a fully observed user-item interaction matrix. For the term “fully observed”, we mean there are almost no missing values in the user-item matrix, i.e., each user has viewed each video and then left feedback.

In [7]:
# Read the training interactions (the "big matrix") into a DataFrame and preview them.
file_path = 'big_matrix.csv'
df_train = pd.read_csv(f"{folder_name}{file_path}")
df_train.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078
5,0,8228,13484,8576,2020-07-05 01:00:25.5,20200705,1593882000.0,1.572295
6,0,6789,2327,13267,2020-07-05 03:28:02.32,20200705,1593891000.0,0.175398
7,0,6812,23731,10728,2020-07-05 22:22:11.813,20200705,1593959000.0,2.212062
8,0,183,796,6100,2020-07-06 00:14:06.245,20200706,1593966000.0,0.130492
9,0,169,13735,9767,2020-07-06 00:14:48.8,20200706,1593966000.0,1.406266


## Convert to implicit

"We treat items with a watch_ratio greater than or equal to 2 as relevant, and others as irrelevant, as suggested by prior literature."

In [12]:
# Binarise the feedback: a watch_ratio at or above the threshold is relevant (1),
# anything else is irrelevant (0).
POSITIVE_THRESHOLD = 2.0  # Suggested on dataset webpage
df_train['ImplicitRating'] = np.where(df_train['watch_ratio'].ge(POSITIVE_THRESHOLD), 1, 0)

df_train.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,ImplicitRating
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397,0
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082,0
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613,0
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885,0
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078,0
5,0,8228,13484,8576,2020-07-05 01:00:25.5,20200705,1593882000.0,1.572295,0
6,0,6789,2327,13267,2020-07-05 03:28:02.32,20200705,1593891000.0,0.175398,0
7,0,6812,23731,10728,2020-07-05 22:22:11.813,20200705,1593959000.0,2.212062,1
8,0,183,796,6100,2020-07-06 00:14:06.245,20200706,1593966000.0,0.130492,0
9,0,169,13735,9767,2020-07-06 00:14:48.8,20200706,1593966000.0,1.406266,0


## Check the number of users and items in the training set

"The training set contains 12,530,806 ratings given by 7,176 users against 10,728 videos through natural interactions."

In [13]:
# Smallest and largest ids seen in the training interactions; later cells use them
# to enumerate every user and every candidate item.
min_user, max_user = df_train["user_id"].min(), df_train["user_id"].max()
min_item, max_item = df_train["video_id"].min(), df_train["video_id"].max()

(max_item, max_user)

(10727, 7175)

# **GET UNBIASED TESTSET**

## Load the unbiased testset and convert it to implicit

In [14]:
# Read the test interactions (the "small matrix") and binarise them with the same
# threshold that was applied to the training set.
file_path = f"{folder_name}small_matrix.csv"
df_test = pd.read_csv(file_path)  # default separator; pass sep='\t' only for tab-separated files
df_test['ImplicitRating'] = np.where(df_test['watch_ratio'].ge(POSITIVE_THRESHOLD), 1, 0)
df_test.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,ImplicitRating
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,0.722103,0
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,1.907377,0
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,2.063311,1
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,0.566388,0
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0.418364,0
5,14,6789,8607,13267,2020-07-05 05:36:00.773,20200705.0,1593899000.0,0.648753,0
6,14,1963,8613,9590,2020-07-05 05:36:47.741,20200705.0,1593899000.0,0.898123,0
7,14,175,11640,46514,2020-07-05 05:49:27.965,20200705.0,1593899000.0,0.250247,0
8,14,1973,4572,7400,2020-07-05 05:49:41.762,20200705.0,1593899000.0,0.617838,0
9,14,171,8518,5217,2020-07-05 05:57:26.581,20200705.0,1593900000.0,1.632739,0


## Check the number of users and items in the unbiased test set

Note that the density of the small matrix is 99.6% instead of 100% because some users explicitly indicated that they did not want to receive recommendations from certain authors, i.e., they blocked those authors' videos.

In [22]:
# Distinct users and videos in the test interactions, followed by the fraction of
# the users x videos grid that actually has a row.
n_test_users = len(df_test['user_id'].unique())
n_test_items = len(df_test["video_id"].unique())
n_test_users, n_test_items, len(df_test) / (n_test_users * n_test_items)

(1411, 3327, 0.9962024941648523)

## Shape the unbiased test set

In [23]:
# Split the test interactions into two arrays of [user_id, video_id] rows:
# one for the relevant pairs (ImplicitRating == 1) and one for the irrelevant
# pairs (ImplicitRating == 0).
unbiased_pos_test_set = df_test.loc[df_test["ImplicitRating"].eq(1), ["user_id", "video_id"]].to_numpy()
unbiased_neg_test_set = df_test.loc[df_test["ImplicitRating"].eq(0), ["user_id", "video_id"]].to_numpy()

## Save unbiased test set

In [24]:
# Wrap the pair arrays in named two-column frames so they can be exported as
# structured (record) arrays with "user_id" / "item_id" fields.
unbiased_pos_test_set_df = pd.DataFrame(unbiased_pos_test_set, columns=["user_id", "item_id"])
unbiased_neg_test_set_df = pd.DataFrame(unbiased_neg_test_set, columns=["user_id", "item_id"])

structured_data_pos_test_set_unbiased = unbiased_pos_test_set_df.to_records(index=False)
structured_data_neg_test_set_unbiased = unbiased_neg_test_set_df.to_records(index=False)

# Positives first, then negatives.
np.save(output_folder + "unbiased-test_arr_pos.npy", structured_data_pos_test_set_unbiased)
np.save(output_folder + "unbiased-test_arr_neg.npy", structured_data_neg_test_set_unbiased)

# **GET BIASED TESTSET**

## Extract the biased test set and shape it

"We additionally held out a biased testing set (biased-testing) from the training set by randomly sampling 300 songs for each user."

In [25]:
def _pair_keys(user_ids, item_ids, item_offset, key_stride):
    """Pack (user, item) id pairs into single int64 keys.

    Each pair maps to ``user * key_stride + (item - item_offset)``. The mapping
    is collision-free as long as every ``item - item_offset`` lies in
    ``[0, key_stride)``.

    Args:
        user_ids: array-like of integer user ids.
        item_ids: array-like of integer item ids, same length as ``user_ids``.
        item_offset: smallest possible item id.
        key_stride: number of distinct item ids (max - min + 1).

    Returns:
        A 1-D int64 numpy array with one key per pair.
    """
    users = np.asarray(user_ids, dtype=np.int64)
    items = np.asarray(item_ids, dtype=np.int64)
    return users * key_stride + (items - item_offset)


# NOTE: this cell overwrites df_train['ImplicitRating'] in place. Re-running it
# without first re-running the "Convert to implicit" cell holds out additional
# positives on top of the ones already removed.

# Precompute, for each user, the set of videos with a relevant rating
user_positive_ratings = df_train[df_train["ImplicitRating"] == 1].groupby("user_id")["video_id"].apply(set)

# Candidate item ids; the array is re-shuffled in place once per user below
items_ids = np.arange(min_item, max_item + 1)

# Number of items sampled per user for the biased test set (the constant says
# "SONGS" although the items in this dataset are videos).
# TODO(review): is 300 the right size here, or should it be 3576?
SONGS_FOR_BIASED_TEST = 300

# MAN hypothesis (presumably "Missing As Negative" -- TODO confirm): every sampled
# item that is not a recorded positive for the user goes to the negative test set,
# whether or not the user has a row for it in df_train.

pos_test_set = []
neg_test_set = []

for user_id in range(min_user, max_user + 1):
    # Same RNG calls, in the same order, as one shuffle per user: the last
    # SONGS_FOR_BIASED_TEST entries of the shuffled ids are this user's sample.
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    # `item_id` rather than `id`, so the builtin id() is not shadowed in the kernel.
    for item_id in test_items:
        if item_id in pos_ids:
            pos_test_set.append([user_id, item_id])
        else:
            neg_test_set.append([user_id, item_id])

# reshape(-1, 2) keeps a two-column layout even when a list is empty.
pos_test_set = np.array(pos_test_set).reshape(-1, 2)
neg_test_set = np.array(neg_test_set).reshape(-1, 2)

# Set the held-out positives to 0 so that they will no longer be used in the
# training set. This is done once for all users: building a boolean mask over the
# whole frame inside the per-user loop would re-scan every row for every user.
key_stride = int(max_item) - int(min_item) + 1
held_out_mask = np.isin(
    _pair_keys(df_train["user_id"], df_train["video_id"], int(min_item), key_stride),
    _pair_keys(pos_test_set[:, 0], pos_test_set[:, 1], int(min_item), key_stride),
)
df_train.loc[held_out_mask, "ImplicitRating"] = 0

## Save the biased test set

In [26]:
# Give the held-out pair arrays named columns, convert them to structured
# (record) arrays and write them next to the other generated files.
pos_test_set_df = pd.DataFrame(pos_test_set, columns=["user_id", "item_id"])
neg_test_set_df = pd.DataFrame(neg_test_set, columns=["user_id", "item_id"])

structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)

# Positives first, then negatives.
np.save(output_folder + "biased-test_arr_pos.npy", structured_data_pos_test_set)
np.save(output_folder + "biased-test_arr_neg.npy", structured_data_neg_test_set)

# **SPLIT TRAIN AND VALIDATION**

## Keep only the relevant (user, item) pairs

In [34]:
# Preview the training frame again; by this point ImplicitRating has been set to 0
# for the positives that were moved into the biased test set.
df_train.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,ImplicitRating
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397,0
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082,0
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613,0
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885,0
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078,0
5,0,8228,13484,8576,2020-07-05 01:00:25.5,20200705,1593882000.0,1.572295,0
6,0,6789,2327,13267,2020-07-05 03:28:02.32,20200705,1593891000.0,0.175398,0
7,0,6812,23731,10728,2020-07-05 22:22:11.813,20200705,1593959000.0,2.212062,1
8,0,183,796,6100,2020-07-06 00:14:06.245,20200706,1593966000.0,0.130492,0
9,0,169,13735,9767,2020-07-06 00:14:48.8,20200706,1593966000.0,1.406266,0


In [35]:
# Column mapping applied below: only video_id actually changes name.
rename_dict = {
    'user_id': 'user_id',
    'video_id': 'item_id'
}

# Keep the rows whose implicit rating is non-zero, discard the listed feature
# columns, then apply the renaming.
new_df = (
    df_train.loc[df_train['ImplicitRating'] != 0]
    .drop(columns=['watch_ratio', 'ImplicitRating', 'play_duration', 'video_duration', 'time', 'date', 'timestamp'])
    .rename(columns=rename_dict)
)

# Export as a structured (record) array without the DataFrame index
structured_data = new_df.to_records(index=False)

## Split dataset

In [36]:
# Fractions of the relevant interactions assigned to each split.
TRAINING_QUOTA = 0.85
VALIDATIION_QUOTA = 0.15

# Compare with a tolerance: exact equality on floats can fail for quota pairs
# whose binary representations do not add up to exactly 1.0.
assert np.isclose(TRAINING_QUOTA + VALIDATIION_QUOTA, 1.0), "TRAINING_QUOTA and VALIDATIION_QUOTA must sum to 1"

# Shuffle the positions of the relevant interactions (uses numpy's global RNG)
indicesRelevantInteractions = np.arange(structured_data.size)
np.random.shuffle(indicesRelevantInteractions)

# The first n_train_interactions shuffled positions go to training, the rest to validation
n_train_interactions = round(indicesRelevantInteractions.shape[0] * TRAINING_QUOTA)
indices_for_train = indicesRelevantInteractions[0:n_train_interactions]
indices_for_validation = indicesRelevantInteractions[n_train_interactions:]

train_data = structured_data[indices_for_train]
validation_data = structured_data[indices_for_validation]

# The two splits must partition the relevant interactions exactly.
assert train_data.size + validation_data.size == structured_data.size

## Save the training and validation sets

In [37]:
# Write both splits to the output folder, training first and validation second.
for file_name, records in (("training_arr.npy", train_data), ("validation_arr.npy", validation_data)):
    np.save(output_folder + file_name, records)