In [1]:
# import os
import pyspark

In [2]:
# Prefix (inside the bucket) under which the raw dataset files live
path_to_dataset = 'data'
# Google Cloud Storage bucket holding both the raw dataset and the outputs
bucket_name = 'thesis_spotify_apc_bucket'
# Output location; derived from bucket_name so the two cannot drift apart
path_to_write = 'gs://' + bucket_name + '/df_data'

In [3]:
# Fields copied from every training playlist into the metadata table
playlist_cols = [
    'pid',
    'name',
    'modified_at',
    'num_artists',
    'num_albums',
    'num_tracks',
    'num_followers',
    'num_edits',
    'duration_ms',
    'collaborative',
]

# Fields copied from every distinct track into the track table
track_cols = [
    'album_name',
    'album_uri',
    'artist_name',
    'artist_uri',
    'duration_ms',
    'track_name',
    'track_uri',
]

# Fields copied from every challenge (test) playlist
playlist_test_cols = [
    'name',
    'num_holdouts',
    'num_samples',
    'num_tracks',
    'pid',
]

In [4]:
# Create the Spark session used by every later cell (or reuse a running one)

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Load Dataset")
    .getOrCreate()
)

In [5]:
# In-memory buffers filled while reading the training slices:
#   data_playlists - one row of playlist metadata per training playlist
#   data_tracks    - one row of track metadata per distinct track
#   playlists      - one [pid, track_uri, pos] row per playlist entry
data_playlists, data_tracks, playlists = [], [], []

# URIs of every track already recorded in data_tracks (used for de-duplication)
tracks = set()

In [6]:
%%time

# Load train data to lists
#
# Every slice file under train_path is downloaded and parsed; its playlists,
# playlist entries and previously unseen tracks are buffered in memory and
# flushed to numbered temporary ORC parts every `files_per_part` files.

from google.cloud import storage
import json

train_path = path_to_dataset + '/mpd/data/'

client = storage.Client()

# Stop after reading this many slice files
limit = 200
# Number of slice files buffered in memory before they are flushed to ORC
files_per_part = 100


def save_temp_part(part_name, rows, schema, part_count):
    """Write `rows` to <path_to_write>/temp_<part_name>_<part_count>.orc.

    Nothing is written when `rows` is empty, because createDataFrame cannot
    infer column types from an empty list.
    """
    if not rows:
        return
    target = path_to_write + '/temp_' + part_name + '_' + str(part_count) + '.orc'
    spark.createDataFrame(data=rows, schema=schema).write.mode('overwrite').orc(target)


file_count = 0

for blob in client.list_blobs(bucket_name, prefix=train_path):
    file_count += 1
    # Read file
    mpd_slice = json.loads(blob.download_as_string(client=None))
    for playlist in mpd_slice['playlists']:
        data_playlists.append([playlist[field] for field in playlist_cols])
        for track in playlist['tracks']:
            playlists.append([playlist['pid'], track['track_uri'], track['pos']])
            # Record each track's metadata only the first time its URI is seen
            if track['track_uri'] not in tracks:
                data_tracks.append([track[field] for field in track_cols])
                tracks.add(track['track_uri'])
    # After every full batch of files, save and reset
    if file_count % files_per_part == 0:
        print(file_count)
        # Save
        part_count = file_count // files_per_part
        save_temp_part('playlists_metadata', data_playlists, playlist_cols, part_count)
        save_temp_part('tracks', data_tracks, track_cols, part_count)
        save_temp_part('playlists', playlists, ['pid', 'track_uri', 'pos'], part_count)
        # Reset
        data_playlists = []
        playlists = []
        data_tracks = []
    if file_count == limit:
        break

# Flush playlists read since the last full batch; without this they would be
# dropped whenever the loop ends on a file count that is not a multiple of
# files_per_part. data_tracks is deliberately left untouched: the test-set
# cell appends to it and writes it out as temp_tracks_11.orc.
if data_playlists or playlists:
    part_count = file_count // files_per_part + 1
    save_temp_part('playlists_metadata', data_playlists, playlist_cols, part_count)
    save_temp_part('playlists', playlists, ['pid', 'track_uri', 'pos'], part_count)
    data_playlists = []
    playlists = []

100
200
CPU times: user 6min 35s, sys: 12.7 s, total: 6min 48s
Wall time: 8min 59s


In [8]:
# In-memory buffers filled while reading the challenge (test) set:
#   data_playlists_test - one metadata row per test playlist
#   playlists_test      - one [pid, track_uri, pos] row per test playlist entry
data_playlists_test, playlists_test = [], []

In [9]:
%%time

# Load test data to lists
#
# Downloads the challenge set, buffers its playlist metadata and playlist
# entries, and appends any track not seen so far to data_tracks.

test_path = path_to_dataset + '/challenge_set.json'

bucket = client.get_bucket(bucket_name)
blob = bucket.blob(test_path)
mpd_slice = json.loads(blob.download_as_string(client=None))

for playlist in mpd_slice['playlists']:
    # Metadata fields missing from a playlist default to ''
    data_playlists_test.append([playlist.get(field, '') for field in playlist_test_cols])
    for track in playlist['tracks']:
        playlists_test.append([playlist['pid'], track['track_uri'], track['pos']])
        # Record each track's metadata only the first time its URI is seen
        if track['track_uri'] not in tracks:
            data_tracks.append([track[field] for field in track_cols])
            tracks.add(track['track_uri'])

# Save data_tracks into new temp_tracks
# An empty list is skipped: the previous `is not None` check was always true
# for a list, and createDataFrame cannot infer a schema from zero rows.
if data_tracks:
    temp_tracks = spark.createDataFrame(data=data_tracks, schema=track_cols)
    temp_tracks.write.mode('overwrite').orc(path_to_write + '/temp_tracks_11.orc')

CPU times: user 2.44 s, sys: 705 ms, total: 3.15 s
Wall time: 7.06 s


In [10]:
%%time

# Turn lists into dataframes
# Playlist and track metadata dataframes

from pyspark.sql.functions import col, when

df_playlists_metadata = spark.read.orc(path_to_write + '/temp_playlists_metadata_*.orc')
# Encode 'collaborative' as 1 / 0; any value other than 'true' or 'false'
# becomes null because the when-chain has no otherwise().
df_playlists_metadata = df_playlists_metadata.withColumn(
    'collaborative',
    when(col('collaborative') == 'true', 1).when(col('collaborative') == 'false', 0))

df_tracks = spark.read.orc(path_to_write + '/temp_tracks_*.orc')

# Generate tid from index
# zipWithIndex pairs every row with its position; toDF() then yields a struct
# column '_1' (the original row) and a long column '_2' (the index).
# NOTE(review): the index depends on row order at evaluation time and this
# dataframe is re-evaluated by every later action - consider persisting it or
# confirm that the ORC read order is stable.
tracks_indexed = df_tracks.rdd.zipWithIndex().toDF()

# Unpack the struct back into the original columns, with tid first.
# The comprehension variable is deliberately not called `col`: a module-level
# `for col in ...` loop rebinds the imported col() function to a string.
df_tracks = tracks_indexed.select(
    col('_2').alias('tid'),
    *[col('_1').getItem(track_col).alias(track_col) for track_col in track_cols])

CPU times: user 44.4 ms, sys: 298 µs, total: 44.7 ms
Wall time: 8.3 s


In [11]:
%%time

# Turn lists into dataframes
# Train playlist dataframes

from pyspark.sql.functions import col

# '[0-9]*' restricts the glob to the numbered temp_playlists_<n>.orc parts.
# A bare 'temp_playlists_*.orc' also matches temp_playlists_metadata_<n>.orc,
# whose rows carry no track_uri / pos values.
df_playlists = spark.read.orc(path_to_write + '/temp_playlists_[0-9]*.orc')

# Remove rows in df_playlists with null values (kept as a safety net)
df_playlists = df_playlists.filter(col('pos').isNotNull())

CPU times: user 2 ms, sys: 3.93 ms, total: 5.93 ms
Wall time: 567 ms


In [None]:
# CHECKING - Check for nan and null values in df_playlists

# from pyspark.sql.functions import when, isnan, count, col

# df_playlists.select([count(when(isnan(c), c)).alias(c) for c in df_playlists.columns]).show()
# df_playlists.select([count(when(col(c).isNull(), c)).alias(c) for c in df_playlists.columns]).show()

In [13]:
%%time

# Swap each playlist entry's track_uri for the numeric tid from df_tracks.
# Left join: entries whose URI is absent from df_tracks keep a null tid.

df_playlists = (
    df_playlists
    .join(df_tracks.select('track_uri', 'tid'), on=['track_uri'], how='left')
    .drop('track_uri')
)

CPU times: user 7.47 ms, sys: 118 µs, total: 7.59 ms
Wall time: 47.7 ms


In [14]:
%%time

# Build the Spark dataframes for the challenge (test) set from the
# in-memory lists: playlist metadata first, then playlist entries.

df_challenge_playlists_metadata = spark.createDataFrame(
    data=data_playlists_test,
    schema=playlist_test_cols,
)

df_challenge_playlists = spark.createDataFrame(
    data=playlists_test,
    schema=['pid', 'track_uri', 'pos'],
)

CPU times: user 5.24 s, sys: 15.5 ms, total: 5.26 s
Wall time: 5.31 s


In [15]:
%%time

# Swap each challenge playlist entry's track_uri for the numeric tid from
# df_tracks. Left join: entries whose URI is absent keep a null tid.

df_challenge_playlists = (
    df_challenge_playlists
    .join(df_tracks.select('track_uri', 'tid'), on=['track_uri'], how='left')
    .drop('track_uri')
)

CPU times: user 7.52 ms, sys: 0 ns, total: 7.52 ms
Wall time: 39.6 ms


In [20]:
%%time

# Persist every final dataframe under path_to_write as ORC, replacing any
# previous output at the same location. Order matches the listing below.

for _frame, _suffix in (
    (df_tracks, '/df_tracks.orc'),
    (df_playlists_metadata, '/df_playlists_metadata.orc'),
    (df_playlists, '/df_playlists.orc'),
    (df_challenge_playlists, '/df_challenge_playlists.orc'),
    (df_challenge_playlists_metadata, '/df_challenge_playlists_metadata.orc'),
):
    _frame.write.mode('overwrite').orc(path_to_write + _suffix)

CPU times: user 18.6 ms, sys: 150 µs, total: 18.7 ms
Wall time: 52.6 s


In [None]:
# Save dataframes as HDF files

# import pandas as pd

# train = df_train.toPandas()
# val1 = df_val1.toPandas()
# val2 = df_val2.toPandas()

# train.to_hdf(path_to_write + '/train.hdf', key='abc')
# val1.to_hdf(path_to_write + '/val1.hdf', key='abc')
# val2.to_hdf(path_to_write + '/val2.hdf', key='abc')