In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

In [2]:
# Path (relative to the notebook) of the tab-separated training ratings file
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

# The file has no header row, so the three column names are supplied explicitly
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=["UserID", "SongID", "Rating"],
)

# Preview the first ten rows
df.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,14,5
1,1,35,1
2,1,46,1
3,1,83,1
4,1,93,1
5,1,94,1
6,1,153,5
7,1,170,4
8,1,184,5
9,1,194,5


In [3]:
# Ratings at or above this value count as a positive (implicit) interaction
POSITIVE_THRESHOLD = 4

# Binarise the explicit rating: 1 when rating >= threshold, otherwise 0
df['ImplicitRating'] = np.where(df['Rating'].ge(POSITIVE_THRESHOLD), 1, 0)

In [4]:
# Preview the first ten rows again, now with the ImplicitRating column added
df.head(n=10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,14,5,1
1,1,35,1,0
2,1,46,1,0
3,1,83,1,0
4,1,93,1,0
5,1,94,1,0
6,1,153,5,1
7,1,170,4,1
8,1,184,5,1
9,1,194,5,1


In [5]:
# Report how many distinct raw user IDs exist and the range they span
print(
    f"UserID Count: {np.unique(df['UserID']).shape[0]}",
    f"UserID Min: {df['UserID'].min()}",
    f"UserID Max: {df['UserID'].max()}",
    sep="\n",
)

UserID Count: 15400
UserID Min: 1
UserID Max: 15400


In [6]:
# Shift 1-based user IDs to 0-based so they can be used directly as sparse-matrix
# row indices. The guard keeps this cell idempotent: if it is re-run after the
# shift (minimum already 0) the IDs are left untouched instead of becoming -1.
if df['UserID'].min() == 1:
    df['UserID'] = df['UserID'] - 1
print(f"UserID Count: {np.unique(df['UserID']).shape[0]}")
print(f"UserID Min: {df['UserID'].min()}")
print(f"UserID Max: {df['UserID'].max()}")

UserID Count: 15400
UserID Min: 0
UserID Max: 15399


In [7]:
# Report how many distinct raw song IDs exist and the range they span
print(
    f"SongID Count: {np.unique(df['SongID']).shape[0]}",
    f"SongID Min: {df['SongID'].min()}",
    f"SongID Max: {df['SongID'].max()}",
    sep="\n",
)

SongID Count: 1000
SongID Min: 1
SongID Max: 1000


In [8]:
# Shift 1-based song IDs to 0-based so they can be used directly as sparse-matrix
# column indices. The guard keeps this cell idempotent: if it is re-run after the
# shift (minimum already 0) the IDs are left untouched instead of becoming -1.
if df['SongID'].min() == 1:
    df['SongID'] = df['SongID'] - 1
print(f"SongID Count: {np.unique(df['SongID']).shape[0]}")
print(f"SongID Min: {df['SongID'].min()}")
print(f"SongID Max: {df['SongID'].max()}")

SongID Count: 1000
SongID Min: 0
SongID Max: 999


In [9]:
# Store the implicit rating as float32, the dtype the sparse matrix will carry
df["ImplicitRating"] = df["ImplicitRating"].astype(np.float32)

# Sanity check: both index columns have one entry per rating row
(df['UserID'].shape, df['SongID'].shape)

((311704,), (311704,))

In [10]:
# Assemble the user-by-song interaction matrix in COO form from (value, (row, col)) triplets.
# Every DataFrame row becomes a stored entry, including those whose ImplicitRating is 0.
URM_all = sps.coo_matrix(
    (
        df["ImplicitRating"].to_numpy(),
        (df["UserID"].to_numpy(), df["SongID"].to_numpy()),
    )
)

In [11]:
# Display the sparse matrix summary (shape, dtype, number of stored elements)
URM_all

<15400x1000 sparse matrix of type '<class 'numpy.float32'>'
	with 311704 stored elements in COOrdinate format>

# **Global (Interaction-Wise) Random Split**

In [12]:
# Global-sample split function taken from DaCrema's code (the original source reference was left incomplete)
def split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.1, seed = None):
    """
    Split the stored interactions of a URM into two disjoint matrices, sampling globally.

    All stored entries of ``URM_all`` are shuffled together (not per user). The first
    ``round(nnz * train_percentage)`` shuffled entries go to the first returned matrix,
    the remaining ones to the second.

    :param URM_all: sparse user-item matrix (anything ``sps.coo_matrix`` accepts that exposes ``.shape``)
    :param train_percentage: fraction in [0.0, 1.0] of the stored interactions assigned to the first returned matrix
    :param seed: optional seed for a private random generator, giving a reproducible split;
                 when None (default) the global ``np.random`` state is used
    :return: tuple ``(URM_train, URM_validation)``, both converted to CSR format
    :raises AssertionError: if ``train_percentage`` is outside [0.0, 1.0]
    """

    assert 0.0 <= train_percentage <= 1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)


    from  MFDLib.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)


    # COO format exposes parallel row/col/data arrays that can be fancy-indexed
    URM_coo = sps.coo_matrix(URM_all)

    # Random permutation of the positions of all stored interactions
    indices_for_sampling = np.arange(0, URM_coo.nnz, dtype=int)
    if seed is None:
        np.random.shuffle(indices_for_sampling)
    else:
        np.random.default_rng(seed).shuffle(indices_for_sampling)

    n_train_interactions = round(URM_coo.nnz * train_percentage)

    # Slice the permutation once. Indexing the permutation with itself
    # (perm[perm[:n]]) is still a partition but is not uniformly random:
    # e.g. with 2 interactions and n=1 it always selects position 0.
    indices_for_train = indices_for_sampling[:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]


    URM_train_builder.add_data_lists(URM_coo.row[indices_for_train],
                                     URM_coo.col[indices_for_train],
                                     URM_coo.data[indices_for_train])

    URM_validation_builder.add_data_lists(URM_coo.row[indices_for_validation],
                                          URM_coo.col[indices_for_validation],
                                          URM_coo.data[indices_for_validation])


    URM_train = sps.csr_matrix(URM_train_builder.get_SparseMatrix())
    URM_validation = sps.csr_matrix(URM_validation_builder.get_SparseMatrix())

    # A zero difference between consecutive indptr values marks a row (user) with no stored entries
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation/num_users*100, num_users))


    return URM_train, URM_validation

In [13]:
# Hold out 15% of all interactions for Test, then 15% of all interactions
# (i.e. 0.15/0.85 of the remainder) for Validation; the remaining 70% is Train.
TEST_FRACTION = 0.15
VALIDATION_FRACTION = 0.15
SPLIT_SEED = 42

# The split function shuffles through the global NumPy RNG; seed it so the
# split is reproducible after Restart & Run All.
np.random.seed(SPLIT_SEED)

# The first returned matrix receives `train_percentage` of the interactions,
# so pass the fraction to KEEP, not the fraction to hold out.
urm_train_validation, urm_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 1.0 - TEST_FRACTION)
urm_train, urm_validation = split_train_in_two_percentage_global_sample(urm_train_validation, train_percentage = 1.0 - VALIDATION_FRACTION / (1.0 - TEST_FRACTION))

