In [None]:
import pandas as pd
import os
from tqdm import tqdm
import pickle 
import numpy as np


In [None]:
import sys
# Add the parent of the current working directory to the module search path
# so that the project-local `paths` package can be imported from this notebook.
sys.path.append('..')
# Star import of the project's path constants.
# NOTE(review): BASE_FOLDER (used below) is not defined in this notebook and is
# presumably provided by this import — confirm in paths/paths.py.
from paths.paths import *

# Prepare data (10-core filtering) and temporal split into train/test/validation

In [None]:
# Folder that holds the dataset files used by this notebook.
# BASE_FOLDER is not defined here; presumably it comes from the star import of paths.paths.
data_path = f'{BASE_FOLDER}/actr_data'

In [None]:
# Location of the bz2-compressed, tab-separated file with all listening events.
FULL_EVENTS_PATH = f'{data_path}/listening_events.tsv.bz2'
# NOTE(review): counts_columns is not referenced in any of the visible cells, and the
# load below only reads user_id/track_id/timestamp (no album_id) — confirm it is still needed.
counts_columns = ['user_id', 'track_id', 'album_id', 'timestamp']

In [None]:
# Read the tab-separated event log, keeping only the three columns
# that the rest of the notebook works with.
listening_events = pd.read_csv(
    FULL_EVENTS_PATH,
    sep='\t',
    usecols=['user_id', 'track_id', 'timestamp'],
)

In [None]:
# Keep only the events whose timestamp is after 2020-02-19 (the last month of data).
is_recent = listening_events['timestamp'] > '2020-02-19'
listening_events_ = listening_events[is_recent]

# Drop users whose number of listening events reaches the 99th percentile over
# all users (radio-stations). The per-user count table is computed once and reused.
per_user_counts = listening_events_.groupby(['user_id']).count().reset_index()
users = per_user_counts.sort_values(by='track_id')
quant = per_user_counts['track_id'].quantile(q=0.99)
normal_users = users.loc[users['track_id'] < quant, 'user_id'].values.tolist()

is_normal_user = listening_events_['user_id'].isin(normal_users)
listening_events_filtered = listening_events_[is_normal_user]
listening_events = listening_events_filtered

In [None]:
# Independent working copy of the filtered events; the core-filtering loop below
# repeatedly rebinds this name while `listening_events` keeps the pre-filtering data.
# NOTE(review): the "5" in the name does not match the thresholds of 10 used below.
tracks_5_users_listening_events = listening_events.copy()

In [None]:
# 10-core filtering: keep only users with at least `users_thresh` listening events (LE)
# and tracks with at least `tracks_thresh` unique listeners.
# This way, we are sure that there are listening events in the train, val, and test sets.
# Removing tracks can push users below the threshold again, so both filters are
# re-applied until no cold user is left.

users_thresh = 10
tracks_thresh = 10

# The body always runs at least once, so the track filter is applied even when
# the input contains no cold users to begin with.
while True:
    # Users with at least users_thresh LE. `>=` matches the stopping criterion below
    # (`< users_thresh`), so a user with exactly users_thresh LE is consistently kept.
    core_users_series = tracks_5_users_listening_events.user_id.value_counts()
    core_users_series = core_users_series[core_users_series >= users_thresh]
    core_users = set(core_users_series.index)

    # Restrict to those users
    core_5_listening_events = tracks_5_users_listening_events[
        tracks_5_users_listening_events.user_id.isin(core_users)
    ]

    # Number of unique listeners of each track
    gb_tracks = core_5_listening_events.groupby('track_id')['user_id'].nunique()

    # Select tracks that have at least tracks_thresh unique listeners
    tracks_5_users = set(gb_tracks[gb_tracks >= tracks_thresh].index)
    tracks_5_users_listening_events = core_5_listening_events[
        core_5_listening_events.track_id.isin(tracks_5_users)
    ]

    # Cold users: users left with fewer than users_thresh LE after the track filter.
    # When there are none, every remaining user and track satisfies its threshold.
    cold_users = (tracks_5_users_listening_events.user_id.value_counts() < users_thresh).sum()
    if cold_users == 0:
        break

In [None]:
# Convert the timestamp column to pandas datetimes so that events can be ordered
# chronologically. `assign` builds a new frame instead of writing a column into the
# result of a boolean filter, which avoids pandas' SettingWithCopyWarning.
tracks_5_users_listening_events = tracks_5_users_listening_events.assign(
    timestamp=pd.to_datetime(tracks_5_users_listening_events.timestamp)
)

In [None]:
# Temporal splitting into train test val (per user)
# - Train 60% [first portion]
# - Val 20%   [mid portion]
# - Test 20%  [last portion]
#
# Train is used for user similarity, and training and validation of VAE, GRU4Rec, BPR
# Val is used for hyperparam selection
# Test is used for the evaluation protocol (rolling session completion)

train_list = []
val_list = []
test_list = []

# A single groupby pass replaces one boolean scan of the whole frame per user.
# sort=False yields the users in order of first appearance, like `.unique()` did.
events_by_user = tracks_5_users_listening_events.groupby('user_id', sort=False)

for user, user_events in tqdm(events_by_user, total=events_by_user.ngroups):
    # LE of the current user in chronological order
    user_df = user_events.sort_values(by=['timestamp']).reset_index(drop=True)
    user_interactions = len(user_df)

    # Val and test each receive one fifth of the interactions (rounded down)
    n_test_int = user_interactions // 5

    # Explicit positive boundaries: with negative slicing, n_test_int == 0 would turn
    # `[:-0]` into an empty train set and `[-0:]` into a test set holding everything.
    val_start = user_interactions - 2 * n_test_int
    test_start = user_interactions - n_test_int

    train_list.append(user_df.iloc[:val_start])
    val_list.append(user_df.iloc[val_start:test_start])
    test_list.append(user_df.iloc[test_start:])
    

In [None]:
# Stack the per-user pieces into one frame per split
train, val, test = (pd.concat(parts) for parts in (train_list, val_list, test_list))

# Store the training timestamps as integer seconds: the int64 view of the datetimes is
# integer-divided by 10**9 (assumes nanosecond-resolution datetimes — TODO confirm).
raw_epoch_values = train['timestamp'].values.astype(np.int64)
train['timestamp'] = raw_epoch_values // 10 ** 9

# Train set handling
We do the following:
 1. Binarize the training set
 2. Select unique (user, item) pairs with a play-count threshold of 1 (i.e. pairs with at least one listening event)
 3. Select users that listened to at least 6 unique tracks
 4. Select corresponding items
 5. Filter the non-binarized training set on these sets of users and items
The rest of the splitting for GRU4Rec, MultVAE and BPR is done by RecBole.

RecBole splits temporally for GRU4Rec and randomly for MultVAE and BPR (on the binarized version).

In [None]:
# Collapse the training events to one row per (user, track) pair. After count(),
# the `timestamp` column holds the number of (non-null) events of that pair.
binarization_threshold = 0
playcounts_df = train.groupby(['user_id', 'track_id']).count().reset_index()
exceeds_threshold = playcounts_df['timestamp'] > binarization_threshold
binarized_df = playcounts_df[exceeds_threshold].reset_index(drop=True)
# Binarize: every retained pair gets the constant value 1
binarized_df['timestamp'] = 1

# Users with at least 6 binarized LE (distinct tracks) in the training set
# (used for BPR and VAE)
group = binarized_df.groupby(['user_id'])['track_id'].count().reset_index()
group = group[group['track_id'] > 5]
users_above_threshold = list(group['user_id'].unique())

# For BPR and MultVAE: binarized interactions of the selected users and their items
selected_user_rows = binarized_df['user_id'].isin(users_above_threshold)
train_bin = binarized_df[selected_user_rows]
items_above_threshold = list(train_bin['track_id'].unique())
train_bin = train_bin[train_bin['track_id'].isin(items_above_threshold)]

# For GRU4Rec: the non-binarized training events of the same users and items
train_gru = train[train['user_id'].isin(users_above_threshold)]
train_gru = train_gru[train_gru['track_id'].isin(items_above_threshold)]

In [None]:
# Use RecBole column naming (`<field>:<type>` headers)
recbole_columns = {
    'user_id': 'user_id:token',
    'track_id': 'item_id:token',
    'timestamp': 'timestamp:float',
}

# Rename the *filtered* binarized frame. Renaming `binarized_df` here would silently
# discard the user/item selection that was applied when `train_bin` was built.
train_bin = train_bin.rename(columns=recbole_columns)

train_gru = train_gru.rename(columns=recbole_columns)

In [None]:
# train_bin.to_csv(data_path + '/actr_data_bin/actr_data_bin_2.inter', index=False, sep='\t')
# train_gru.to_csv(data_path + '/actr_data_gru/actr_data_gru_2.inter', index=False, sep='\t')
# val = val.rename(columns={'user_id': 'user_id:token', 'track_id': 'item_id:token', 'timestamp': 'timestamp:float'})
# val.to_csv(data_path + '/actr_val_data/actr_val_data_2.inter', index=False, sep='\t')

# END OF DATA PREPARATION
Now go to FinalPrepareSpotify