In [1]:
import numpy as np
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import CML
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

# Seed NumPy's global RNG so every run of the notebook draws the same
# random sequence (the train/validation/test shuffle relies on it).
seed = 42
np.random.seed(seed)

In [3]:
# Location of the tab-separated training ratings file.
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

# The file is read without a header row, so column names are supplied here.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=["UserID", "SongID", "Rating"],
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,14,5
1,1,35,1
2,1,46,1
3,1,83,1
4,1,93,1
5,1,94,1
6,1,153,5
7,1,170,4
8,1,184,5
9,1,194,5


In [4]:
# Ratings at or above this value count as a positive (implicit) interaction.
POSITIVE_THRESHOLD = 4

# Binarise the explicit rating: 1 for positive interactions, 0 otherwise.
df['ImplicitRating'] = np.where(df['Rating'].ge(POSITIVE_THRESHOLD), 1, 0)

# Show the length of both ID columns side by side.
(df['UserID'].shape, df['SongID'].shape)

((311704,), (311704,))

In [5]:
# Largest user and song identifiers that appear in the ratings table.
max_user, max_item = df["UserID"].max(), df["SongID"].max()

In [6]:
# Mapping from the raw column names to the user_id / item_id names used
# for the exported records.
rename_dict = {'UserID': 'user_id', 'SongID': 'item_id'}

# Keep only the positive interactions, discard the rating columns and
# rename the identifier columns in one chained expression.
new_df = (
    df.loc[df['ImplicitRating'] != 0]
    .drop(columns=['Rating', 'ImplicitRating'])
    .rename(columns=rename_dict)
)

# Convert to a NumPy record array, leaving the DataFrame index out.
structured_data = new_df.to_records(index=False)

In [7]:
# Number of elements in the record array.
structured_data.size

125077

In [8]:
# Shape of the record array.
structured_data.shape

(125077,)

In [9]:
# Fractions of the positive interactions assigned to each split.
TRAINING_QUOTA = 0.7
VALIDATIION_QUOTA = 0.15
TEST_QUOTA = 0.15

# Compare with a tolerance: exact float equality is brittle for decimal
# fractions (e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999), so an
# equally valid choice of quotas could otherwise fail this check.
assert np.isclose(TRAINING_QUOTA + VALIDATIION_QUOTA + TEST_QUOTA, 1.0), \
    "TRAINING_QUOTA + VALIDATIION_QUOTA + TEST_QUOTA must sum to 1"

#Get relevant interactions indexes
indicesRelevantInteractions = np.arange(structured_data.size)

print(f"Total positive interactions: {indicesRelevantInteractions.shape[0]}")

#Shuffle them (in place; uses NumPy's global RNG state)
np.random.shuffle(indicesRelevantInteractions)
n_train_interactions = round(indicesRelevantInteractions.shape[0] * TRAINING_QUOTA)

print(f"Training sampled positive interactions: {n_train_interactions}")

#Sample training indexes: the first chunk of the shuffled indices is the
#training set, the remainder is shared between validation and test
indices_for_train = indicesRelevantInteractions[0:n_train_interactions]
indices_for_validation_test = indicesRelevantInteractions[n_train_interactions:]

print(f"Training indeces length: {indices_for_train.shape[0]}")
print(f"Test+validation length: {indices_for_validation_test.shape[0]}")

#Validation's share of the held-out remainder, relative to validation+test
n_validation_interactions = round(indices_for_validation_test.shape[0] * VALIDATIION_QUOTA / (VALIDATIION_QUOTA+TEST_QUOTA))

indices_for_validation = indices_for_validation_test[:n_validation_interactions]
indices_for_test = indices_for_validation_test[n_validation_interactions:]

print(f"Validation length: {indices_for_validation.shape[0]}")
print(f"Test length: {indices_for_test.shape[0]}")

#Sanity check: the three index sets together cover every interaction once
assert (
    indices_for_train.shape[0]
    + indices_for_validation.shape[0]
    + indices_for_test.shape[0]
) == indicesRelevantInteractions.shape[0]

Total positive interactions: 125077
Training sampled positive interactions: 106315
Training indeces length: 106315
Training validation length: 18762


In [10]:
# Pick the records at the training indices.
train_data = structured_data[indices_for_train]

In [11]:
# Shape of the training records.
train_data.shape

(106315,)

In [12]:
# Pick the records at the validation indices.
validation_data = structured_data[indices_for_validation]

In [13]:
# Shape of the validation records.
validation_data.shape

(18762,)

In [None]:
# Pick the records at the test indices.
test_data = structured_data[indices_for_test]

In [None]:
# Shape of the test records.
test_data.shape

In [14]:
# Write each split to its own .npy file in the current working directory.
np.save(file="training_arr.npy", arr=train_data)
np.save(file="validation_arr.npy", arr=validation_data)
np.save(file="test_arr.npy", arr=test_data)