In [1]:
import numpy as np
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import CML
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

seed = 42
np.random.seed(seed=seed)

In [3]:
# Replace 'your_file.txt' with your file path
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

# Load the file into a DataFrame
df = pd.read_csv(file_path, sep='\t',names=["UserID","SongID","Rating"], header=None)  # sep='\t' for tab-separated values

# Display the DataFrame
df.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,14,5
1,1,35,1
2,1,46,1
3,1,83,1
4,1,93,1
5,1,94,1
6,1,153,5
7,1,170,4
8,1,184,5
9,1,194,5


In [4]:
POSITIVE_THRESHOLD = 4

df['ImplicitRating'] = np.where(df['Rating'] >= POSITIVE_THRESHOLD, 1, 0)
#df["ImplicitRating"] = df["ImplicitRating"].values.astype(np.float32)
df['UserID'].shape, df['SongID'].shape

((311704,), (311704,))

In [5]:
max_user = df["UserID"].max()
max_item = df["SongID"].max()

# **GET BIASED TESTSET**

In [6]:
min_user = df["UserID"].min()
max_user = df["UserID"].max()

min_item = df["SongID"].min()
max_item = df["SongID"].max()

In [7]:
items_ids = np.arange(min_item, max_item + 1)

SONGS_FOR_BIASED_TEST = 300

# Precompute the songs each user has rated implicitly (ImplicitRating == 1)
user_positive_ratings = df[df["ImplicitRating"] == 1].groupby("UserID")["SongID"].apply(set)

pos_test_set = []
neg_test_set = []

for user_id in range(min_user, max_user + 1):
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    #set those to 0 so that they will no longer be used in training set
    df.loc[(df['SongID'].isin(pos_ids)) & (df['UserID'] == user_id), 'ImplicitRating'] = 0

    for id in test_items:
        if id in pos_ids:
            pos_test_set.append([user_id, id])
        else:
            neg_test_set.append([user_id, id])

pos_test_set = np.array(pos_test_set)
neg_test_set = np.array(neg_test_set)

pos_test_set.shape, neg_test_set.shape

((37646, 2), (4582354, 2))

In [8]:
pos_test_set_df = pd.DataFrame(pos_test_set)
neg_test_set_df = pd.DataFrame(neg_test_set)

In [9]:
pos_test_set_df.columns = ["user_id","item_id"]
neg_test_set_df.columns = ["user_id","item_id"]

In [10]:
pos_test_set_df

Unnamed: 0,user_id,item_id
0,1,14
1,1,684
2,1,184
3,1,786
4,1,492
...,...,...
37641,15399,194
37642,15399,244
37643,15399,252
37644,15400,214


In [11]:
structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_pos_test_set

rec.array([(    1,  14), (    1, 684), (    1, 184), ..., (15399, 252),
           (15400, 214), (15400, 242)],
          dtype=[('user_id', '<i8'), ('item_id', '<i8')])

In [12]:
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)
structured_data_neg_test_set

rec.array([(    1,   2), (    1,   5), (    1, 521), ..., (15400, 502),
           (15400, 503), (15400, 509)],
          dtype=[('user_id', '<i8'), ('item_id', '<i8')])

In [13]:
np.save("biased-test_arr_pos.npy", structured_data_pos_test_set)
np.save("biased-test_arr_neg.npy", structured_data_neg_test_set)

# **GET TRAIN AND VALIDATION SET**

In [14]:
new_df = df[df['ImplicitRating'] != 0]
new_df = new_df.drop(columns=['Rating', 'ImplicitRating'])

# Define a dictionary for renaming columns
rename_dict = {
    'UserID': 'user_id',
    'SongID': 'item_id'
}

# Rename the columns
new_df = new_df.rename(columns=rename_dict)


structured_data = new_df.to_records(index=False)

In [15]:
structured_data.size

87431

In [16]:
structured_data.shape

(87431,)

In [17]:
TRAINING_QUOTA = 0.85
VALIDATIION_QUOTA = 0.15
TEST_QUOTA = 0

assert TRAINING_QUOTA+VALIDATIION_QUOTA+TEST_QUOTA == 1

#Get relevant interactions indexes
indicesRelevantInteractions = np.arange(structured_data.size)

print(f"Total positive interactions: {indicesRelevantInteractions.shape[0]}")

#Shuffle them
np.random.shuffle(indicesRelevantInteractions)
n_train_interactions = round(indicesRelevantInteractions.shape[0] * TRAINING_QUOTA)

print(f"Training sampled positive interactions: {n_train_interactions}")

#Sample training indexes
indices_for_train = indicesRelevantInteractions[0:n_train_interactions]
indices_for_validation_test = indicesRelevantInteractions[n_train_interactions:]

print(f"Training indeces length: {indices_for_train.shape[0]}")
print(f"Test+validation length: {indices_for_validation_test.shape[0]}")

n_validation_interactions = round(indices_for_validation_test.shape[0] * VALIDATIION_QUOTA / (VALIDATIION_QUOTA+TEST_QUOTA))

indices_for_validation = indices_for_validation_test[:n_validation_interactions]
indices_for_test = indices_for_validation_test[n_validation_interactions:]

print(f"Validation length: {indices_for_validation.shape[0]}")
print(f"Test length: {indices_for_test.shape[0]}")

Total positive interactions: 87431
Training sampled positive interactions: 74316
Training indeces length: 74316
Test+validation length: 13115
Validation length: 13115
Test length: 0


In [18]:
train_data = structured_data[indices_for_train]

In [19]:
train_data.shape

(74316,)

In [20]:
validation_data = structured_data[indices_for_validation]

In [21]:
validation_data.shape

(13115,)

In [22]:
test_data = structured_data[indices_for_test]

In [23]:
test_data.shape

(0,)

In [24]:
np.save("training_arr.npy", train_data)
np.save("validation_arr.npy", validation_data)
if TEST_QUOTA > 0:
    np.save("test_arr.npy", test_data)

# **GET UNBIASED TESTSET**

In [25]:
# Replace 'your_file.txt' with your file path
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

# Load the file into a DataFrame
df_train = pd.read_csv(file_path, sep='\t',names=["UserID","SongID","Rating"], header=None)  # sep='\t' for tab-separated values

# Display the DataFrame
df.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,14,5,0
1,1,35,1,0
2,1,46,1,0
3,1,83,1,0
4,1,93,1,0
5,1,94,1,0
6,1,153,5,1
7,1,170,4,1
8,1,184,5,0
9,1,194,5,1


In [26]:
# Replace 'your_file.txt' with your file path
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-test.txt'

# Load the file into a DataFrame
df_test = pd.read_csv(file_path, sep='\t',names=["UserID","SongID","Rating"], header=None)  # sep='\t' for tab-separated values

# Display the DataFrame
df_test.head(10)

Unnamed: 0,UserID,SongID,Rating
0,1,49,1
1,1,126,1
2,1,138,1
3,1,141,1
4,1,177,1
5,1,268,3
6,1,511,1
7,1,587,1
8,1,772,5
9,1,941,1


In [27]:
df_train['ImplicitRating'] = np.where(df_train["Rating"] >= POSITIVE_THRESHOLD, 1, 0)

df_test['ImplicitRating'] = np.where(df_test['Rating'] >= POSITIVE_THRESHOLD, 1, 0)
#df["ImplicitRating"] = df["ImplicitRating"].values.astype(np.float32)
df_test['UserID'].shape, df_test['SongID'].shape

((54000,), (54000,))

In [28]:
# Display the DataFrame
df_test.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,49,1,0
1,1,126,1,0
2,1,138,1,0
3,1,141,1,0
4,1,177,1,0
5,1,268,3,0
6,1,511,1,0
7,1,587,1,0
8,1,772,5,1
9,1,941,1,0


In [29]:
#Select UserID of users with at least an irrelevant song in testset
usersWithNegativeInteractionTestSet = df_test[df_test["ImplicitRating"] == 0]["UserID"].unique()

In [30]:
#Select UserID of users with at least a relevant song in testset
usersWithPositiveInteractionTestSet = df_test[df_test["ImplicitRating"] == 1]["UserID"].unique()

In [31]:
#Select UserID of users with at least two relevant song in trainset
valid_users_trainset = df_train[df_train["ImplicitRating"] == 1].groupby("UserID").filter(lambda x: len(x) >= 2)['UserID'].unique()

In [32]:
# Converting arrays to sets
set1 = set(usersWithNegativeInteractionTestSet)
set2 = set(usersWithPositiveInteractionTestSet)

set_train = set(valid_users_trainset)

# Finding the intersection
valid_users_testset = set1 & set2 & set_train

In [33]:
len(valid_users_testset)

2296

In [34]:
df_test_filtered = df_test[df_test["UserID"].isin(valid_users_testset)]

In [35]:
df_test_filtered.head(10)

Unnamed: 0,UserID,SongID,Rating,ImplicitRating
0,1,49,1,0
1,1,126,1,0
2,1,138,1,0
3,1,141,1,0
4,1,177,1,0
5,1,268,3,0
6,1,511,1,0
7,1,587,1,0
8,1,772,5,1
9,1,941,1,0


In [36]:
items_ids = np.arange(min_item, max_item + 1)

SONGS_FOR_BIASED_TEST = 300

# Precompute the songs each user has rated implicitly (ImplicitRating == 1)
user_positive_ratings = df_test_filtered[df_test_filtered["ImplicitRating"] == 1].groupby("UserID")["SongID"].apply(set)

unbiased_pos_test_set = []
unbiased_neg_test_set = []

for user_id in range(min_user, max_user + 1):
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    for id in test_items:
        if id in pos_ids:
            unbiased_pos_test_set.append([user_id, id])
        else:
            unbiased_neg_test_set.append([user_id, id])

unbiased_pos_test_set = np.array(pos_test_set)
unbiased_neg_test_set = np.array(neg_test_set)

unbiased_pos_test_set.shape, unbiased_neg_test_set.shape

((37646, 2), (4582354, 2))

In [37]:
unbiased_pos_test_set_df = pd.DataFrame(unbiased_pos_test_set)
unbiased_neg_test_set_df = pd.DataFrame(unbiased_neg_test_set)

In [38]:
unbiased_pos_test_set_df.columns = ["user_id","item_id"]
unbiased_neg_test_set_df.columns = ["user_id","item_id"]

In [39]:
unbiased_pos_test_set_df


Unnamed: 0,user_id,item_id
0,1,14
1,1,684
2,1,184
3,1,786
4,1,492
...,...,...
37641,15399,194
37642,15399,244
37643,15399,252
37644,15400,214


In [40]:
structured_data_pos_test_set_unbiased = unbiased_pos_test_set_df.to_records(index=False)
structured_data_pos_test_set_unbiased

rec.array([(    1,  14), (    1, 684), (    1, 184), ..., (15399, 252),
           (15400, 214), (15400, 242)],
          dtype=[('user_id', '<i8'), ('item_id', '<i8')])

In [41]:
structured_data_neg_test_set_unbiased = unbiased_neg_test_set_df.to_records(index=False)
structured_data_neg_test_set_unbiased

rec.array([(    1,   2), (    1,   5), (    1, 521), ..., (15400, 502),
           (15400, 503), (15400, 509)],
          dtype=[('user_id', '<i8'), ('item_id', '<i8')])

In [42]:
np.save("unbiased-test_arr_pos.npy", structured_data_pos_test_set_unbiased)
np.save("unbiased-test_arr_neg.npy", structured_data_neg_test_set_unbiased)