In [1]:
import numpy as np
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import CML
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sps

# Seed NumPy's global RNG so the np.random.shuffle calls further down are repeatable.
seed = 42
np.random.seed(seed=seed)

In [None]:
# Training split of the ratings dataset: tab-separated, no header row,
# three columns per line (user, song, rating).
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=["UserID", "SongID", "Rating"],
)

# Preview the first rows as a sanity check on the parse.
df.head(10)

In [None]:
# Ratings at or above this value are treated as positive (implicit) feedback.
POSITIVE_THRESHOLD = 4

# 1 for a positive interaction, 0 otherwise.
df['ImplicitRating'] = np.where(df['Rating'].ge(POSITIVE_THRESHOLD), 1, 0)

# Shapes of the two id columns (one entry per rating row).
df['UserID'].shape, df['SongID'].shape

In [None]:
# Largest user id and song id present in the training ratings.
# NOTE(review): both values are computed again, together with the minimums,
# in the first cell of the "GET BIASED TESTSET" section.
max_user = df["UserID"].max()
max_item = df["SongID"].max()

# **GET BIASED TESTSET**

In [None]:
# Identifier ranges of the training ratings; the test-set builders below
# iterate over [min_user, max_user] and sample songs from [min_item, max_item].
min_user, max_user = df["UserID"].min(), df["UserID"].max()
min_item, max_item = df["SongID"].min(), df["SongID"].max()

In [None]:
# Build the "biased" test set from the training ratings.
#
# For every integer user id in [min_user, max_user] a random sample of up to
# SONGS_FOR_BIASED_TEST song ids is drawn. Sampled songs the user rated
# positively become positive test pairs and are removed from the training
# positives; every other sampled song becomes a negative test pair.
items_ids = np.arange(min_item, max_item + 1)

SONGS_FOR_BIASED_TEST = 300

# Precompute, per user, the set of positively rated songs (ImplicitRating == 1).
# This snapshot is taken before any rating is zeroed out below.
user_positive_ratings = df[df["ImplicitRating"] == 1].groupby("UserID")["SongID"].apply(set)

pos_test_set = []
neg_test_set = []

for user_id in range(min_user, max_user + 1):
    # The shuffle is in place, so each user's permutation starts from the
    # previous one; the tail of the array is this user's sample.
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    # `item_id` rather than `id`, so the builtin is not shadowed notebook-wide.
    for item_id in test_items:
        if item_id in pos_ids:
            pos_test_set.append([user_id, item_id])
        else:
            neg_test_set.append([user_id, item_id])

# reshape(-1, 2) keeps a two-column shape even when a list is empty, so the
# later DataFrame column assignment does not fail on a (0,)-shaped array.
pos_test_set = np.array(pos_test_set, dtype=np.int64).reshape(-1, 2)
neg_test_set = np.array(neg_test_set, dtype=np.int64).reshape(-1, 2)

# Set the held-out positives to 0 so they are no longer used in the training set.
# Done once for all users instead of scanning the whole frame once per user.
held_out_pairs = pd.MultiIndex.from_arrays([pos_test_set[:, 0], pos_test_set[:, 1]])
row_pairs = pd.MultiIndex.from_frame(df[["UserID", "SongID"]])
df.loc[row_pairs.isin(held_out_pairs), "ImplicitRating"] = 0

pos_test_set.shape, neg_test_set.shape

In [None]:
# Wrap the positive and negative pair arrays in DataFrames (column names are assigned in the next cell).
pos_test_set_df = pd.DataFrame(pos_test_set)
neg_test_set_df = pd.DataFrame(neg_test_set)

In [None]:
# Name the two columns; these become the field names of the record arrays built below.
pos_test_set_df.columns = ["user_id","item_id"]
neg_test_set_df.columns = ["user_id","item_id"]

In [None]:
# Display the positive test pairs.
pos_test_set_df

In [None]:
# Convert the positive pairs to a NumPy record array (index omitted) and display it.
structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_pos_test_set

In [None]:
# Convert the negative pairs to a NumPy record array (index omitted) and display it.
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)
structured_data_neg_test_set

In [None]:
# Write both record arrays to .npy files in the current working directory.
np.save("biased-test_arr_pos.npy", structured_data_pos_test_set)
np.save("biased-test_arr_neg.npy", structured_data_neg_test_set)

# **GET TRAIN AND VALIDATION SET**

In [None]:
# Mapping from the raw column names to the field names used in the saved arrays.
rename_dict = {'UserID': 'user_id', 'SongID': 'item_id'}

# Keep only the rows still flagged as positive and reduce them to
# (user_id, item_id) pairs.
new_df = (
    df.loc[df['ImplicitRating'] != 0]
    .drop(columns=['Rating', 'ImplicitRating'])
    .rename(columns=rename_dict)
)

# Record array with named fields and no index column.
structured_data = new_df.to_records(index=False)

In [None]:
# Number of records in structured_data.
structured_data.size

In [None]:
# Shape of the record array.
structured_data.shape

In [None]:
# Randomly split the positive interactions into train / validation / test
# index sets according to the quotas below.
TRAINING_QUOTA = 0.85
VALIDATIION_QUOTA = 0.15
TEST_QUOTA = 0

# Compare with a tolerance: float quotas such as 0.7 + 0.2 + 0.1 do not sum
# to exactly 1 even though they are a valid split.
assert np.isclose(TRAINING_QUOTA + VALIDATIION_QUOTA + TEST_QUOTA, 1.0), "split quotas must sum to 1"

# One index per positive interaction.
indicesRelevantInteractions = np.arange(structured_data.size)

print(f"Total positive interactions: {indicesRelevantInteractions.shape[0]}")

# Shuffle in place, then take the leading share as the training indices.
np.random.shuffle(indicesRelevantInteractions)
n_train_interactions = round(indicesRelevantInteractions.shape[0] * TRAINING_QUOTA)

print(f"Training sampled positive interactions: {n_train_interactions}")

indices_for_train = indicesRelevantInteractions[0:n_train_interactions]
indices_for_validation_test = indicesRelevantInteractions[n_train_interactions:]

print(f"Training indeces length: {indices_for_train.shape[0]}")
print(f"Test+validation length: {indices_for_validation_test.shape[0]}")

# Divide the remainder between validation and test in proportion to their
# quotas. When both are 0 (everything goes to training) there is nothing to
# divide, and the ratio would otherwise be a division by zero.
held_out_quota = VALIDATIION_QUOTA + TEST_QUOTA
if held_out_quota > 0:
    n_validation_interactions = round(indices_for_validation_test.shape[0] * VALIDATIION_QUOTA / held_out_quota)
else:
    n_validation_interactions = 0

indices_for_validation = indices_for_validation_test[:n_validation_interactions]
indices_for_test = indices_for_validation_test[n_validation_interactions:]

print(f"Validation length: {indices_for_validation.shape[0]}")
print(f"Test length: {indices_for_test.shape[0]}")

In [None]:
# Select the training records by their shuffled indices.
train_data = structured_data[indices_for_train]

In [None]:
# Shape of the training record array.
train_data.shape

In [None]:
# Select the validation records by their shuffled indices.
validation_data = structured_data[indices_for_validation]

In [None]:
# Shape of the validation record array.
validation_data.shape

In [None]:
# Select the test records; indices_for_test is whatever remains after the validation slice.
test_data = structured_data[indices_for_test]

In [None]:
# Shape of the test record array.
test_data.shape

In [None]:
# Write the splits to .npy files; the test split is written only when TEST_QUOTA is non-zero.
np.save("training_arr.npy", train_data)
np.save("validation_arr.npy", validation_data)
if TEST_QUOTA > 0:
    np.save("test_arr.npy", test_data)

# **GET UNBIASED TESTSET**

In [None]:
# Reload the raw training ratings into a separate frame (tab-separated, no
# header row). `df` had its ImplicitRating column modified while the biased
# test set was built, so a fresh copy is read here.
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-train.txt'

df_train = pd.read_csv(file_path, sep='\t', names=["UserID","SongID","Rating"], header=None)

# Preview the frame that was just loaded (the original previewed `df` by mistake).
df_train.head(10)

In [None]:
# Test split of the ratings dataset: same tab-separated, header-less layout
# as the training file.
file_path = 'Dataset/yahoo_ymusic_v1/ydata-ymusic-rating-study-v1_0-test.txt'

df_test = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=["UserID", "SongID", "Rating"],
)

# Preview the first rows as a sanity check on the parse.
df_test.head(10)

In [None]:
# Binarise both splits with the same rule: 1 when Rating >= POSITIVE_THRESHOLD, else 0.
for ratings_frame in (df_train, df_test):
    ratings_frame['ImplicitRating'] = np.where(ratings_frame['Rating'] >= POSITIVE_THRESHOLD, 1, 0)

# Shapes of the test split's id columns.
df_test['UserID'].shape, df_test['SongID'].shape

In [None]:
# Preview the test ratings again, now including the ImplicitRating column.
df_test.head(10)

In [None]:
# Users with at least one non-relevant song (ImplicitRating == 0) in the test split.
usersWithNegativeInteractionTestSet = df_test.loc[df_test["ImplicitRating"].eq(0), "UserID"].unique()

In [None]:
# Users with at least one relevant song (ImplicitRating == 1) in the test split.
usersWithPositiveInteractionTestSet = df_test.loc[df_test["ImplicitRating"].eq(1), "UserID"].unique()

In [None]:
# Users with at least two relevant songs (ImplicitRating == 1) in the training split.
# Each positive row is tagged with its user's positive-row count, rows of users
# below two are dropped, and the remaining ids are de-duplicated in order of appearance.
valid_users_trainset = (
    df_train.loc[df_train["ImplicitRating"] == 1, "UserID"]
    .pipe(lambda users: users[users.groupby(users).transform("size") >= 2])
    .unique()
)

In [None]:
# Turn the three id arrays into sets for fast intersection.
set1, set2 = set(usersWithNegativeInteractionTestSet), set(usersWithPositiveInteractionTestSet)
set_train = set(valid_users_trainset)

# A user is kept only if present in all three sets: a negative and a positive
# in the test split, and enough positives in the training split.
valid_users_testset = set_train.intersection(set1, set2)

In [None]:
# Number of users that satisfy all three conditions.
len(valid_users_testset)

In [None]:
# Keep only the test ratings that belong to the valid users.
df_test_filtered = df_test[df_test["UserID"].isin(valid_users_testset)]

In [None]:
# Preview the filtered test ratings.
df_test_filtered.head(10)

In [None]:
# Build the "unbiased" test set from the filtered test ratings.
#
# Same sampling scheme as the biased builder: for every integer user id in
# [min_user, max_user] a random sample of up to SONGS_FOR_BIASED_TEST song ids
# is drawn. Sampled songs the user rated positively in df_test_filtered become
# positive pairs; every other sampled song becomes a negative pair.
items_ids = np.arange(min_item, max_item + 1)

# Sample size per user; the name is kept because the constant is shared with
# the biased builder.
SONGS_FOR_BIASED_TEST = 300

# Precompute, per user, the set of positively rated test songs (ImplicitRating == 1).
user_positive_ratings = df_test_filtered[df_test_filtered["ImplicitRating"] == 1].groupby("UserID")["SongID"].apply(set)

unbiased_pos_test_set = []
unbiased_neg_test_set = []

# NOTE(review): this walks every user id in the training range, not only
# valid_users_testset; users filtered out above contribute negative pairs
# only. Confirm whether the loop should be restricted to the valid users.
for user_id in range(min_user, max_user + 1):
    # The shuffle is in place, so each user's permutation starts from the
    # previous one; the tail of the array is this user's sample.
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    # `item_id` rather than `id`, so the builtin is not shadowed notebook-wide.
    for item_id in test_items:
        if item_id in pos_ids:
            unbiased_pos_test_set.append([user_id, item_id])
        else:
            unbiased_neg_test_set.append([user_id, item_id])

# Convert the lists built in THIS cell. The original converted the biased
# lists (pos_test_set / neg_test_set), so the saved "unbiased" arrays were
# copies of the biased ones.
# reshape(-1, 2) keeps a two-column shape even when a list is empty.
unbiased_pos_test_set = np.array(unbiased_pos_test_set, dtype=np.int64).reshape(-1, 2)
unbiased_neg_test_set = np.array(unbiased_neg_test_set, dtype=np.int64).reshape(-1, 2)

unbiased_pos_test_set.shape, unbiased_neg_test_set.shape

In [None]:
# Wrap unbiased_pos_test_set / unbiased_neg_test_set in DataFrames (column names are assigned in the next cell).
unbiased_pos_test_set_df = pd.DataFrame(unbiased_pos_test_set)
unbiased_neg_test_set_df = pd.DataFrame(unbiased_neg_test_set)

In [None]:
# Name the two columns; these become the field names of the record arrays built below.
unbiased_pos_test_set_df.columns = ["user_id","item_id"]
unbiased_neg_test_set_df.columns = ["user_id","item_id"]

In [None]:
# Display the contents of unbiased_pos_test_set_df.
unbiased_pos_test_set_df


In [None]:
# Convert unbiased_pos_test_set_df to a NumPy record array (index omitted) and display it.
structured_data_pos_test_set_unbiased = unbiased_pos_test_set_df.to_records(index=False)
structured_data_pos_test_set_unbiased

In [None]:
# Convert unbiased_neg_test_set_df to a NumPy record array (index omitted) and display it.
structured_data_neg_test_set_unbiased = unbiased_neg_test_set_df.to_records(index=False)
structured_data_neg_test_set_unbiased

In [None]:
# Write both record arrays to .npy files in the current working directory.
np.save("unbiased-test_arr_pos.npy", structured_data_pos_test_set_unbiased)
np.save("unbiased-test_arr_neg.npy", structured_data_neg_test_set_unbiased)