In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

seed = 42
np.random.seed(seed=seed)

In [2]:
# Replace 'your_file.txt' with your file path
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

# Load the file into a DataFrame
df = pd.read_csv(file_path, sep='\t',names=["UserID","SongID","Rating"], header=None)  # sep='\t' for tab-separated values

# Display the DataFrame
df.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,14,5
1,1,35,1
2,1,46,1
3,1,83,1
4,1,93,1
5,1,94,1
6,1,153,5
7,1,170,4
8,1,184,5
9,1,194,5


In [3]:
POSITIVE_THRESHOLD = 4

df['ImplicitRating'] = np.where(df['Rating'] >= POSITIVE_THRESHOLD, 1, 0)

In [4]:
df.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,14,5,1
1,1,35,1,0
2,1,46,1,0
3,1,83,1,0
4,1,93,1,0
5,1,94,1,0
6,1,153,5,1
7,1,170,4,1
8,1,184,5,1
9,1,194,5,1


# **Making IDs 0-based**

In [5]:
print(f"UserID Count: {np.unique(df['UserID']).shape[0]}")
print(f"UserID Min: {df['UserID'].min()}")
print(f"UserID Max: {df['UserID'].max()}")

UserID Count: 15400
UserID Min: 1
UserID Max: 15400


In [6]:
#df['UserID'] = df['UserID'] - 1
print(f"UserID Count: {np.unique(df['UserID']).shape[0]}")
print(f"UserID Min: {df['UserID'].min()}")
print(f"UserID Max: {df['UserID'].max()}")
max_user = df['UserID'].max()

UserID Count: 15400
UserID Min: 1
UserID Max: 15400


In [7]:
print(f"SongID Count: {np.unique(df['SongID']).shape[0]}")
print(f"SongID Min: {df['SongID'].min()}")
print(f"SongID Max: {df['SongID'].max()}")

SongID Count: 1000
SongID Min: 1
SongID Max: 1000


In [8]:
#df['SongID'] = df['SongID'] - 1
print(f"SongID Count: {np.unique(df['SongID']).shape[0]}")
print(f"SongID Min: {df['SongID'].min()}")
print(f"SongID Max: {df['SongID'].max()}")
max_item = df['SongID'].max()

SongID Count: 1000
SongID Min: 1
SongID Max: 1000


In [9]:
#df["ImplicitRating"] = df["ImplicitRating"].values.astype(np.float32)
df['UserID'].shape, df['SongID'].shape

((311704,), (311704,))

# **Creating a URM coo format**

In [10]:
URM_all = sps.coo_matrix((df["ImplicitRating"].values, 
                          (df["UserID"].values, df["SongID"].values)))

In [11]:
URM_all

<15401x1001 sparse matrix of type '<class 'numpy.int64'>'
	with 311704 stored elements in COOrdinate format>

# **Try To Split Global Wise**

**Creating array for OpenRec**

In [12]:
TRAINING_QUOTA = 0.7

#Get relevant interactions indexes
indicesRelevantInteractions = np.where(df["ImplicitRating"] == 1)[0]

print(f"Total positive interactions: {indicesRelevantInteractions.shape[0]}")

#Shuffle them
np.random.shuffle(indicesRelevantInteractions)
n_train_interactions = round(indicesRelevantInteractions.shape[0] * TRAINING_QUOTA)

print(f"Training sampled positive interactions: {n_train_interactions}")

#Sample training indexes
indices_for_train = indicesRelevantInteractions[0:n_train_interactions]
indices_for_test_validation = indicesRelevantInteractions[n_train_interactions:]

print(f"Training indeces length: {indices_for_train.shape[0]}")
print(f"Training validation+test length: {indices_for_test_validation.shape[0]}")


#Split remaining
n_validation_interactions = round(len(indices_for_test_validation) / 2)

print(f"Validation sampled positive interactions: {n_validation_interactions}")

indices_for_validation = indices_for_test_validation[:n_validation_interactions]
indices_for_test = indices_for_test_validation[n_validation_interactions:]

assert len(indices_for_train) + len(indices_for_validation) + len(indices_for_test) == indicesRelevantInteractions.shape[0]
len(indices_for_train), len(indices_for_validation), len(indices_for_test)


Total positive interactions: 125077
Training sampled positive interactions: 87554
Training indeces length: 87554
Training validation+test length: 37523
Validation sampled positive interactions: 18762


(87554, 18762, 18761)

In [13]:
train_data = np.ones(indices_for_train.shape[0], dtype={'names':('user_id', 'item_id'),
                          'formats':('i4', 'i4')})
train_data['user_id'] = df["UserID"][indices_for_train]
train_data['item_id'] = df["SongID"][indices_for_train]


In [14]:
validation_data = np.ones(indices_for_validation.shape[0], dtype={'names':('user_id', 'item_id'),
                          'formats':('i4', 'i4')})
validation_data['user_id'] = df["UserID"][indices_for_validation]
validation_data['item_id'] = df["SongID"][indices_for_validation]

In [15]:
test_data = np.ones(indices_for_test.shape[0], dtype={'names':('user_id', 'item_id'),
                          'formats':('i4', 'i4')})
test_data['user_id'] = df["UserID"][indices_for_test]
test_data['item_id'] = df["UserID"][indices_for_test]

** Splitting URM **

In [16]:
#Import DaCrema's function from 
def split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.1):
    """
    The function splits an URM in two matrices selecting the number of interactions globally
    :param URM_all:
    :param train_percentage:
    :param verbose:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage<=1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(train_percentage)


    from  MFDLib.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items, auto_create_col_mapper=False, auto_create_row_mapper=False)


    URM_train = sps.coo_matrix(URM_all)

    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    indices_for_train = indices_for_sampling[indices_for_sampling[0:n_train_interactions]]
    indices_for_validation = indices_for_sampling[indices_for_sampling[n_train_interactions:]]


    URM_train_builder.add_data_lists(URM_train.row[indices_for_train],
                                     URM_train.col[indices_for_train],
                                     URM_train.data[indices_for_train])

    URM_validation_builder.add_data_lists(URM_train.row[indices_for_validation],
                                          URM_train.col[indices_for_validation],
                                          URM_train.data[indices_for_validation])


    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    URM_validation = sps.csr_matrix(URM_validation)

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(user_no_item_train, user_no_item_train/num_users*100, num_users))
    if user_no_item_validation != 0:
        print("Warning: {} ({:.2f} %) of {} users have no sampled items".format(user_no_item_validation, user_no_item_validation/num_users*100, num_users))


    return URM_train, URM_validation

In [17]:
# Use 15% for Test and 15% for Validation

urm_train, urm_test_validation = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.7)
urm_test, urm_validation = split_train_in_two_percentage_global_sample(urm_test_validation, train_percentage = 0.5)
urm_train_validation = urm_train + urm_validation



# **Lets USE openrec**

In [18]:
#Import the same libraries of the Section 4
#openrec.legacy now moved to openrec.tf1.legacy
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import PMF
from openrec.tf1.legacy.utils.evaluators import AUC, Recall, Precision, NDCG
from openrec.tf1.legacy.utils.samplers import PointwiseSampler

In [19]:
raw_data = dict()
raw_data['train_data'] = train_data
raw_data['val_data'] = validation_data
raw_data['test_data'] = test_data
raw_data['max_user'] = max_user + 1
raw_data['max_item'] = max_item + 1
batch_size = 8000
test_batch_size = 200
display_itr = 5000

In [20]:
train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train')
val_dataset = ImplicitDataset(raw_data['val_data'], raw_data['max_user'], raw_data['max_item'], name='Val')
test_dataset = ImplicitDataset(raw_data['test_data'], raw_data['max_user'], raw_data['max_item'], name='Test')

In [21]:
import tensorflow as tf
tf.compat.v1.reset_default_graph()
model = PMF(batch_size=batch_size, max_user=train_dataset.max_user(), max_item=train_dataset.max_item(), 
            dim_embed=50, opt='Adam', sess_config=None, l2_reg=0.0)



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
keep_dims is deprecated, use keepdims instead






2023-12-26 15:12:18.135766: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2023-12-26 15:12:18.151869: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe196cb78b0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-12-26 15:12:18.151882: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


In [22]:
sampler = PointwiseSampler(batch_size=batch_size, dataset=train_dataset, pos_ratio=0.2, num_process=5)
model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size, 
    train_dataset=train_dataset, model=model, sampler=sampler, 
    eval_save_prefix="./pmf-yahoo")
auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
dcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

In [23]:
#model_trainer.train(num_itr=50001, display_itr=display_itr, eval_datasets=[val_dataset],
#                    evaluators=[auc_evaluator, recall_evaluator, dcg_evaluator], num_negatives=200)

# **Explore TestSet**

We filter the testing set by retaining users who have at least a relevant and an irrelevant song in the testing set and two relevant songs in the training set (2,296 users satisfy these requirements). 

In [24]:
# Replace 'your_file.txt' with your file path
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

# Load the file into a DataFrame
df_train = pd.read_csv(file_path, sep='\t',names=["UserID","SongID","Rating"], header=None)  # sep='\t' for tab-separated values

# Display the DataFrame
df.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,14,5,1
1,1,35,1,0
2,1,46,1,0
3,1,83,1,0
4,1,93,1,0
5,1,94,1,0
6,1,153,5,1
7,1,170,4,1
8,1,184,5,1
9,1,194,5,1


In [25]:
# Replace 'your_file.txt' with your file path
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-test.txt'

# Load the file into a DataFrame
df_test = pd.read_csv(file_path, sep='\t',names=["UserID","SongID","Rating"], header=None)  # sep='\t' for tab-separated values

# Display the DataFrame
df_test.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,49,1
1,1,126,1
2,1,138,1
3,1,141,1
4,1,177,1
5,1,268,3
6,1,511,1
7,1,587,1
8,1,772,5
9,1,941,1


In [26]:
POSITIVE_THRESHOLD = 4

df_train['ImplicitRating'] = np.where(df_train["Rating"] >= POSITIVE_THRESHOLD, 1, 0)

df_test['ImplicitRating'] = np.where(df_test['Rating'] >= POSITIVE_THRESHOLD, 1, 0)
#df["ImplicitRating"] = df["ImplicitRating"].values.astype(np.float32)
df_test['UserID'].shape, df_test['SongID'].shape

((54000,), (54000,))

In [27]:
# Display the DataFrame
df_test.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,49,1,0
1,1,126,1,0
2,1,138,1,0
3,1,141,1,0
4,1,177,1,0
5,1,268,3,0
6,1,511,1,0
7,1,587,1,0
8,1,772,5,1
9,1,941,1,0


In [28]:
#Select UserID of users with at least an irrelevant song in testset
usersWithNegativeInteractionTestSet = df_test[df_test["ImplicitRating"] == 0]["UserID"].unique()

In [29]:
#Select UserID of users with at least a relevant song in testset
usersWithPositiveInteractionTestSet = df_test[df_test["ImplicitRating"] == 1]["UserID"].unique()

In [30]:
#Select UserID of users with at least two relevant song in trainset
valid_users_trainset = df_train[df_train["ImplicitRating"] == 1].groupby("UserID").filter(lambda x: len(x) >= 2)['UserID'].unique()

In [31]:
# Converting arrays to sets
set1 = set(usersWithNegativeInteractionTestSet)
set2 = set(usersWithPositiveInteractionTestSet)

set_train = set(valid_users_trainset)

# Finding the intersection
valid_users_testset = set1 & set2 & set_train

In [32]:
len(valid_users_testset)

2296

In [33]:
df_test_filtered = df_test[df_test["UserID"].isin(valid_users_testset)]

In [34]:
df_test_filtered.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,49,1,0
1,1,126,1,0
2,1,138,1,0
3,1,141,1,0
4,1,177,1,0
5,1,268,3,0
6,1,511,1,0
7,1,587,1,0
8,1,772,5,1
9,1,941,1,0
