In [1]:
import os
import sys
import argparse
import pickle

import pandas as pd
import scipy.sparse as ssp
import torch  # used directly below; do not rely on the star import to provide it

sys.path.append('../src')

from data_utils import *
from builder import PandasGraphBuilder

Using backend: pytorch


In [2]:
# Folder that holds the raw nowplaying-RS CSV files (relative to this notebook).
directory = '../data/nowplaying_rs_dataset'
# Target path for the processed dataset; presumably the pickle is written
# there by a later cell -- TODO confirm.
output_path = '../data/nowplaying_out.pkl'

In [3]:
# Raw listening events together with per-track content features.
features_csv = os.path.join(directory, 'context_content_features.csv')
data = pd.read_csv(features_csv)

In [4]:
# Transpose the first row so every column is listed with a sample value.
data.head(1).T

Unnamed: 0,0
coordinates,
instrumentalness,0.00479
liveness,0.18
speechiness,0.0294
danceability,0.634
valence,0.342
loudness,-8.345
tempo,125.044
acousticness,0.00035
energy,0.697


In [5]:
# (rows, columns) of the raw frame.
data.shape

(11614671, 23)

## Clean Data

In [6]:
# Per-track feature columns, listed by name.  A positional slice such as
# data.columns[1:13] only matches these columns on the raw frame; because this
# cell overwrites `data`, re-running it would then select the wrong columns.
# Naming them makes the cell safe to re-run.
track_feature_cols = [
    'instrumentalness', 'liveness', 'speechiness', 'danceability', 'valence',
    'loudness', 'tempo', 'acousticness', 'energy', 'mode', 'key', 'artist_id',
]

# Keep only the ids, the timestamp and the track features, then drop every
# event that has a missing value in any of them.
data = data[['user_id', 'track_id', 'created_at'] + track_feature_cols].dropna()

In [7]:
# Columns that remain after the cleaning step.
data.columns

Index(['user_id', 'track_id', 'created_at', 'instrumentalness', 'liveness',
       'speechiness', 'danceability', 'valence', 'loudness', 'tempo',
       'acousticness', 'energy', 'mode', 'key', 'artist_id'],
      dtype='object')

## Users

In [38]:
# User node table: one row per distinct user_id, first occurrence kept.
users = data.loc[:, ['user_id']].drop_duplicates(subset='user_id', keep='first')
users.head()

Unnamed: 0,user_id
0,81496937
1,2205686924
2,132588395
3,97675221
4,17945688


## Tracks

In [36]:
# Track node table: one row per distinct (track_id, features) combination.
tracks = data.loc[:, ['track_id', *track_feature_cols]].drop_duplicates()

# A track id that occurs more than once would mean the same track carries
# conflicting feature rows.
max_rows_per_track = tracks['track_id'].value_counts().max()
assert max_rows_per_track == 1

# mode/key are stored as integers, artist_id as a categorical.
track_dtypes = {'mode': 'int64', 'key': 'int64', 'artist_id': 'category'}
tracks = tracks.astype(track_dtypes)
tracks.head()

Unnamed: 0,track_id,instrumentalness,liveness,speechiness,danceability,valence,loudness,tempo,acousticness,energy,mode,key,artist_id
0,cd52b3e5b51da29e5893dba82a418a4b,0.00479,0.18,0.0294,0.634,0.342,-8.345,125.044,0.00035,0.697,1,6,b2980c722a1ace7a30303718ce5491d8
1,da3110a77b724072b08f231c9d6f7534,0.0177,0.0638,0.0624,0.769,0.752,-8.252,95.862,0.267,0.826,1,7,5cddcd0e314e2f2223ab21937d2c8778
2,ba84d88c10fb0e42d4754a27ead10546,0.0,0.086,0.0436,0.675,0.775,-4.432,97.03,0.217,0.885,0,1,e41273f43af504714d85465294f1f369
3,33f95122281f76e7134f9cbea3be980f,0.0,0.143,0.0292,0.324,0.333,-5.647,74.101,0.239,0.574,1,7,557ce373bd29743eb00a3723ab19ebe8
4,b5c42e81e15cd54b9b0ee34711dedf05,0.000183,0.362,0.0524,0.767,0.808,-5.011,114.237,0.0364,0.739,0,10,77bd64b4bf77e10001fd02964985ae0f


## Events

In [85]:
# Listening events: (user, track, time).  The timestamp is converted to whole
# seconds stored as int64.
events = data[['user_id', 'track_id', 'created_at']].assign(
    created_at=lambda frame: frame['created_at'].values.astype('datetime64[s]').astype('int64')
)
events.head()

Unnamed: 0,user_id,track_id,created_at
0,81496937,cd52b3e5b51da29e5893dba82a418a4b,1388555661
1,2205686924,da3110a77b724072b08f231c9d6f7534,1388555662
2,132588395,ba84d88c10fb0e42d4754a27ead10546,1388555662
3,97675221,33f95122281f76e7134f9cbea3be980f,1388555664
4,17945688,b5c42e81e15cd54b9b0ee34711dedf05,1388555664


## Build Graph

In [39]:
graph_builder = PandasGraphBuilder()

# Node types: (entity table, primary-key column, node type name).
for table, key_col, ntype in (
    (users, 'user_id', 'user'),
    (tracks, 'track_id', 'track'),
):
    graph_builder.add_entities(table, key_col, ntype)

# Each listening event is registered twice, once per direction, so the graph
# has both a user->track and a track->user relation.
for src_col, dst_col, etype in (
    ('user_id', 'track_id', 'listened'),
    ('track_id', 'user_id', 'listened-by'),
):
    graph_builder.add_binary_relations(events, src_col, dst_col, etype)

g = graph_builder.build()

In [40]:
# Show the graph summary (node/edge counts per type and the metagraph).
g

Graph(num_nodes={'track': 346122, 'user': 138721},
      num_edges={('track', 'listened-by', 'user'): 11609883, ('user', 'listened', 'track'): 11609883},
      metagraph=[('track', 'user', 'listened-by'), ('user', 'track', 'listened')])

## Get Track Features

In [49]:
# Copy the per-track attributes onto the 'track' nodes of the graph.
# NOTE(review): this relies on the rows of `tracks` being in node-id order --
# confirm against PandasGraphBuilder.

# float64 columns are gathered into a single feature matrix further down.
float_cols = [
    name for name in tracks.columns
    if name not in ('track_id', 'artist_id') and tracks.dtypes[name] == 'float64'
]

for col in tracks.columns:
    if col == 'artist_id':
        # Categorical -> integer codes; the copy presumably avoids handing
        # torch a read-only array -- TODO confirm.
        g.nodes['track'].data[col] = torch.LongTensor(tracks[col].cat.codes.values.copy())
    elif col != 'track_id' and tracks.dtypes[col] == 'int64':
        g.nodes['track'].data[col] = torch.LongTensor(tracks[col].values)

# add min-max normalize song features
g.nodes['track'].data['song_features'] = torch.FloatTensor(linear_normalize(tracks[float_cols].values))

In [51]:
# Store the event timestamp on every edge, in both directions.
# NOTE(review): assumes the edges are in the same order as the rows of
# `events` -- confirm against PandasGraphBuilder.
for etype in ('listened', 'listened-by'):
    g.edges[etype].data['created_at'] = torch.LongTensor(events['created_at'].values)

In [92]:
# Split the listening events into train / validation / test index sets.
# NOTE(review): train_test_split_by_time comes from the data_utils star import
# and is not shown here; presumably it orders each user's events by
# 'created_at' and holds out the most recent ones -- confirm in src/data_utils.py.
train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'user_id')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  .apply(train_test_split) \


           user_id                          track_id  created_at  train_mask  \
0         81496937  cd52b3e5b51da29e5893dba82a418a4b  1388555661        True   
919       81496937  29cb3f8f366888158226c810b3fee372  1388558532        True   
1079      81496937  f62b0e51fc59cca8af70942e12554765  1388558890        True   
1543      81496937  d99853238d61833e3158a0fe76425ca2  1388560382        True   
1661      81496937  977837bd00f5374b5d0586ba2538523c  1388560804        True   
...            ...                               ...         ...         ...   
11613299  81496937  f15ae084e89942084df0f4f989247de9  1419316321        True   
11613401  81496937  a5ad13a10a9a6ad7ba3784d185de4fec  1419316516        True   
11613508  81496937  9b664ec984823d11e70287a5201ede35  1419316748        True   
11613984  81496937  744c145d74cf4f65162a749bebda0db7  1419317816       False   
11614273  81496937  c4eb7de403a7e55b968defca65398f78  1419318413       False   

          val_mask  test_mask  
0      

In [104]:
# Build the training graph from g and the training indices.
# NOTE(review): build_train_graph is defined outside this notebook; presumably
# it keeps only the training edges of both relations -- confirm.
train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
# Every user must keep at least one outgoing 'listened' edge in the training graph.
assert train_g.out_degrees(etype='listened').min() > 0

In [105]:
# Build the validation and test matrices from the held-out indices.
# NOTE(review): build_val_test_matrix is defined outside this notebook;
# presumably each result is a sparse user x track matrix -- confirm.
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'track', 'listened')

In [109]:
# Bundle everything built above, plus the schema names of the graph, into one
# dictionary.
dataset = {
    # Graph and evaluation artefacts
    'train-graph': train_g,
    'val-matrix': val_matrix,
    'test-matrix': test_matrix,

    # Text and image item features are left empty here
    'item-texts': {},
    'item-images': None,

    # Node type and edge type names used in the graph
    'user-type': 'user',
    'item-type': 'track',
    'user-to-item-type': 'listened',
    'item-to-user-type': 'listened-by',

    # Edge data column that holds the event time
    'timestamp-edge-column': 'created_at',
}