In [18]:
import pandas as pd
import os

# Dataset directory, relative to the notebook's working directory: ratings.csv is
# read from here and the train/val/test CSV splits are written back into it.
filepath = "../../datasets/ml-20m"


In [19]:
# Read the raw ratings table from the dataset directory.
ratings_csv = os.path.join(filepath, "ratings.csv")
df = pd.read_csv(ratings_csv)
# Last expression of the cell: displays (n_rows, n_columns).
df.shape

(20000263, 4)

In [20]:
# Plain-text preview of the first five rows of the ratings table.
print(df.head(n=5))

   userId  movieId  rating   timestamp
0       1        2     3.5  1112486027
1       1       29     3.5  1112484676
2       1       32     3.5  1112484819
3       1       47     3.5  1112484727
4       1       50     3.5  1112484580


In [21]:
# Binarize the explicit ratings into implicit feedback: a rating of four or higher
# becomes click = 1 and anything lower becomes click = 0.
# NOTE(review): rows with click = 0 stay in the frame (nothing is dropped here), so the
# activity filters in later cells count every rated movie — confirm that is intended.
POSITIVE_RATING_THRESHOLD = 4

# The guard makes the cell safe to re-run: once 'rating' has been replaced by 'click'
# there is nothing left to convert, and a second pass would raise KeyError.
if 'rating' in df.columns:
    # Vectorized comparison instead of a per-row Python lambda; assign() overwrites the
    # column in its original position, so the column order is unchanged after rename().
    df = (
        df.assign(rating=(df['rating'] >= POSITIVE_RATING_THRESHOLD).astype('int64'))
          .rename(columns={'rating': 'click'})
    )
print(df.head())

   userId  movieId  click   timestamp
0       1        2      0  1112486027
1       1       29      0  1112484676
2       1       32      0  1112484819
3       1       47      0  1112484727
4       1       50      0  1112484580


In [22]:
# Keep only users that have at least MIN_MOVIES_PER_USER rows in df (one row per
# user/movie pair that was rated).
MIN_MOVIES_PER_USER = 5

# transform('size') broadcasts each user's row count onto that user's rows, so the
# filter becomes a single boolean mask. This replaces groupby().filter(lambda ...),
# which invokes a Python callable once per user; the surviving rows, their order and
# their index labels are the same.
rows_per_user = df.groupby('userId')['userId'].transform('size')
df = df[rows_per_user >= MIN_MOVIES_PER_USER]
print(df.shape)

(20000263, 4)


In [23]:
# Keep only movies that have at least MIN_INTERACTIONS_PER_MOVIE rows in df.
MIN_INTERACTIONS_PER_MOVIE = 5

# transform('size') broadcasts each movie's row count onto that movie's rows, so the
# filter becomes a single boolean mask. This replaces groupby().filter(lambda ...),
# which invokes a Python callable once per movie; the surviving rows, their order and
# their index labels are the same.
rows_per_movie = df.groupby('movieId')['movieId'].transform('size')
df = df[rows_per_movie >= MIN_INTERACTIONS_PER_MOVIE]
print(df.shape)

(19984024, 4)


In [24]:
# A dense user x movie interaction matrix (users as rows, movies as columns, e.g. via
# df.pivot_table) is deliberately NOT built here: it would be a huge table.
# Alternative: the table is loaded lazily when needed, as defined in vae.ipynb.

# Number of cells such a dense matrix would contain: distinct users x distinct movies.
n_distinct_users = len(df['userId'].unique())
n_distinct_movies = len(df['movieId'].unique())
print(n_distinct_users * n_distinct_movies)

2540654085


In [25]:
# Number of users held out from training (20000, according to the paper).
holdout_size = 20000

# Draw the holdout users at random from the distinct user ids; the fixed seed keeps
# the selection reproducible.
distinct_userids = df['userId'].drop_duplicates()
holdout_userid = distinct_userids.sample(n=holdout_size, random_state=42)

# Train: the complete history of every user that is not in the holdout set.
is_holdout_row = df['userId'].isin(holdout_userid)
train_df = df[~is_holdout_row]
train_csv = os.path.join(filepath, "train.csv")
train_df.to_csv(train_csv, index=False)
print("Train: ", train_df.shape)


Train:  (17065612, 4)


In [None]:
# Holdout: every row that belongs to one of the sampled holdout users.
# (The variable keeps its original spelling, "houldout_df", because it is a
# notebook-level name that other cells may reference.)
holdout_row_mask = df['userId'].isin(holdout_userid)
houldout_df = df[holdout_row_mask]

# Split the holdout users evenly: a random half goes to validation, the rest to test.
holdout_users = houldout_df['userId'].drop_duplicates()
val_userid = holdout_users.sample(n=holdout_size // 2, random_state=42)
is_val_row = houldout_df['userId'].isin(val_userid)
val_df = houldout_df[is_val_row]
test_df = houldout_df[~is_val_row]

# Persist both splits next to the training file, then report their shapes.
val_csv = os.path.join(filepath, "val.csv")
test_csv = os.path.join(filepath, "test.csv")
val_df.to_csv(val_csv, index=False)
test_df.to_csv(test_csv, index=False)
print("val_df:", val_df.shape)
print("test_df:", test_df.shape)