In [2]:
import pandas as pd
import os

filepath = "../../datasets/ml-20m"


In [2]:
df = pd.read_csv(os.path.join(filepath, "ratings.csv"))
df.shape

(20000263, 4)

In [3]:
print(df.head())

   userId  movieId  rating   timestamp
0       1        2     3.5  1112486027
1       1       29     3.5  1112484676
2       1       32     3.5  1112484819
3       1       47     3.5  1112484727
4       1       50     3.5  1112484580


In [4]:
# binarize the explicit data by keeping ratings of four or higher and interpret them as implicit feedback
df['rating'] = df['rating'].apply(lambda x: 1 if x >= 4 else 0)
df.rename(columns={'rating': 'click'}, inplace=True)
print(df.head())

   userId  movieId  click   timestamp
0       1        2      0  1112486027
1       1       29      0  1112484676
2       1       32      0  1112484819
3       1       47      0  1112484727
4       1       50      0  1112484580


In [5]:
# We only keep users who have watched at least 5 movies
df = df.groupby('userId').filter(lambda x: len(x) >= 5)
print(df.shape)

(20000263, 4)


In [6]:
# We only keep movies with at least 5 interactions
df = df.groupby('movieId').filter(lambda x: len(x) >= 5)
print(df.shape)

(19984024, 4)


In [7]:
# Create a user-movie interaction matrix, oh wait, this is a huuuge table!!!!!
# user_movie_matrix = df.pivot_table(index='userId', columns='movieId', values='rating', aggfunc='sum', fill_value=0)
# user_movie_matrix.head()

# alternative solution, lazy load table when needed, defined in vae.ipynb
print(len(df['userId'].unique())*len(df['movieId'].unique()))

2540654085


In [8]:
holdout_size = 20000
# randomly select 20000 holdout users, according to the paper
holdout_userid = df['userId'].drop_duplicates().sample(n=holdout_size, random_state=42)

# Train
# Create train data based on all history of non-holdout userid
train_df = df[~df['userId'].isin(holdout_userid)]
# train_df.to_csv(os.path.join(filepath, "train.csv"), index=False)
print("Train: ", train_df.shape)


Train:  (17065612, 4)


In [12]:
# holdout (20000 users)
houldout_df = df[df['userId'].isin(holdout_userid)]

# split houldout equally between val and test
val_userid = houldout_df['userId'].drop_duplicates().sample(n=holdout_size // 2, random_state=42)
val_df = houldout_df[houldout_df['userId'].isin(val_userid)]
test_df = houldout_df[~houldout_df['userId'].isin(val_userid)]
# val_df.to_csv(os.path.join(filepath, "val.csv"), index=False)
# test_df.to_csv(os.path.join(filepath, "test.csv"), index=False)
# print("val_df:", val_df.shape)
# print("test_df:", test_df.shape)

In [13]:
import numpy as np
import pandas as pd

def split_train_test_proportion(data, test_prop=0.2, random_state=42):
    """
    Splits each user's interactions randomly into a training and a test set
    based on the specified proportion (test_prop).
    """
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    np.random.seed(random_state)

    for _, group in data_grouped_by_user:
        n_items_u = len(group)

        # Only split users with a minimum number of interactions (e.g., 5)
        # Note: If you filtered users earlier, this check might be less crucial.
        if n_items_u >= 5:
            # Create a boolean array to index the test items
            idx = np.zeros(n_items_u, dtype='bool')
            
            # Randomly select indices for the test items
            test_indices = np.random.choice(
                n_items_u, 
                size=int(test_prop * n_items_u), 
                replace=False
            ).astype('int64')
            
            idx[test_indices] = True

            # Append 80% to train list, 20% to test list
            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            # Keep users with < 5 items entirely in the training set (tr)
            tr_list.append(group)

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)

    return data_tr, data_te

In [14]:
# 1. Split Validation User Interactions (80% for observation, 20% for evaluation)
val_df_tr, val_df_te = split_train_test_proportion(val_df, test_prop=0.2)

# 2. Split Test User Interactions (80% for observation, 20% for evaluation)
test_df_tr, test_df_te = split_train_test_proportion(test_df, test_prop=0.2)

# Now, save the new files
# The original 'train.csv' (from non-holdout users) will be merged with 
# val_df_tr and test_df_tr later when building the final training matrix.
# For now, save the necessary components:

# val_df_tr.to_csv(os.path.join(filepath, "val_tr.csv"), index=False)
val_df_te.to_csv(os.path.join(filepath, "val.csv"), index=False)

# test_df_tr.to_csv(os.path.join(filepath, "test_tr.csv"), index=False)
test_df_te.to_csv(os.path.join(filepath, "test.csv"), index=False)

print(f"Validation observation set (val_df_tr) shape: {val_df_tr.shape}")
print(f"Validation held-out set (val_df_te) shape: {val_df_te.shape}")
print(f"Test observation set (test_df_tr) shape: {test_df_tr.shape}")
print(f"Test held-out set (test_df_te) shape: {test_df_te.shape}")

Validation observation set (val_df_tr) shape: (1171109, 4)
Validation held-out set (val_df_te) shape: (287915, 4)
Test observation set (test_df_tr) shape: (1171298, 4)
Test held-out set (test_df_te) shape: (288090, 4)


In [15]:
pd.concat([train_df, val_df_tr, test_df_tr], ignore_index=True).to_csv(os.path.join(filepath, "train.csv"), index=False)

Official Approach

In [3]:
raw_data = pd.read_csv(os.path.join(filepath, 'ratings.csv'), header=0)
raw_data = raw_data[raw_data['rating'] > 3.5]


In [6]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

def filter_triplets(tp, min_uc=5, min_sc=0):
   # 1. Filter items (min_sc)
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        # Corrected line: Get the index of 'movieId's where count >= min_sc
        valid_items = itemcount[itemcount >= min_sc].index
        tp = tp[tp['movieId'].isin(valid_items)]
    
    # 2. Filter users (min_uc)
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        # Corrected line: Get the index of 'userId's where count >= min_uc
        valid_users = usercount[usercount >= min_uc].index
        tp = tp[tp['userId'].isin(valid_users)]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, itemcount

In [7]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data)
sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 9980096 watching events from 138081 users and 20688 movies (sparsity: 0.349%)


In [9]:
import numpy as np
unique_uid = user_activity.index

np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

# create train/validation/test users
n_users = unique_uid.size
n_heldout_users = 10000

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]
unique_sid = pd.unique(train_plays['movieId'])

show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

pro_dir = os.path.join(filepath, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

In [10]:
import sys

def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]



In [11]:
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [12]:
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

In [13]:
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [15]:
def numerize(tp):
    uid = list(map(lambda x: profile2id[x], tp['userId']))
    sid = list(map(lambda x: show2id[x], tp['movieId']))
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

train_data = numerize(train_plays)
train_data.to_csv(os.path.join(pro_dir, 'train_official.csv'), index=False)

vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

In [16]:
train_data.head()

Unnamed: 0,uid,sid
0,66756,0
1,66756,1
2,66756,2
3,66756,3
4,66756,4
