### Setup & Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# configure
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

# sklearn libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# file handling, progress bars and JSON parsing for the playlist slice files.
import os
from tqdm import tqdm 
import json

In [None]:
# Mount Google Drive under /content/drive (Google Colab only) so files stored
# on Drive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the dataset archive from the mounted Drive into the working directory,
# extract it quietly, then delete the local archive copy.
# NOTE(review): the archive is presumably expected to unpack into a `data/`
# directory containing the mpd.slice.*.json files - confirm.
!cp "/content/drive/MyDrive/data.zip" .
!unzip -q data.zip
!rm data.zip

### Data Preprocessing

In [None]:
def get_playlists_df(number_of_files, data_dir='data', slice_size=1000):
    """
    Load consecutive playlist slice files into a single DataFrame.

    Slice files are named ``mpd.slice.<start>-<end>.json`` where ``start`` is a
    multiple of ``slice_size`` and ``end = start + slice_size - 1``; each file
    holds a JSON object whose ``'playlists'`` key is a list of playlist records.

    Parameters
    ----------
    number_of_files : int
        How many slice files to read, starting with the slice that begins at 0.
    data_dir : str, optional
        Directory that contains the slice files (default ``'data'``).
    slice_size : int, optional
        Number of playlists covered by one slice file name (default 1000).

    Returns
    -------
    pandas.DataFrame
        One row per playlist with a fresh 0..n-1 index. An empty DataFrame is
        returned when ``number_of_files`` is 0.

    Raises
    ------
    FileNotFoundError
        If one of the expected slice files does not exist.
    """
    list_of_df = []

    for file_number in range(number_of_files):
        start = file_number * slice_size
        end = start + slice_size - 1
        path = os.path.join(data_dir, 'mpd.slice.{}-{}.json'.format(start, end))

        # The context manager closes the file handle deterministically, and
        # the explicit encoding keeps non-ASCII artist/track names intact
        # regardless of the platform's default locale.
        with open(path, 'r', encoding='utf-8') as slice_file:
            slice_json = json.load(slice_file)

        list_of_df.append(pd.DataFrame(slice_json['playlists']))

    # pd.concat raises on an empty list, so handle "no files requested" explicitly.
    if not list_of_df:
        return pd.DataFrame()

    return pd.concat(list_of_df).reset_index(drop=True)

In [None]:
def get_all_songs_df(playlists_df):
    """
    Flatten the playlists into one row per (playlist, track) occurrence.

    Each row of ``playlists_df`` must provide a ``'pid'`` value and a
    ``'tracks'`` list of dicts carrying the track, artist and album URI/name
    keys. The result has those six track fields followed by the owning ``pid``.
    """
    track_fields = ['track_uri',
                    'track_name',
                    'artist_uri',
                    'artist_name',
                    'album_uri',
                    'album_name']

    # One output record per track, tagged with the playlist it came from.
    song_records = [
        [song[field] for field in track_fields] + [playlist['pid']]
        for _, playlist in playlists_df.iterrows()
        for song in playlist['tracks']
    ]

    return pd.DataFrame(song_records, columns=track_fields + ['pid'])

In [None]:
# get playlists
# Reads the first 10 slice files through get_playlists_df and keeps the
# combined result as the working playlist table.
playlists_df = get_playlists_df(number_of_files=10)
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,false,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,false,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,rap,false,9995,1491782400,34,26,1,"[{'pos': 0, 'artist_name': 'Lecrae', 'track_ur...",16,8530582,15,
9996,Blues,false,9996,1482364800,57,48,1,"[{'pos': 0, 'artist_name': 'Robert Johnson', '...",15,13010049,30,
9997,game songs,false,9997,1508371200,27,24,4,"[{'pos': 0, 'artist_name': 'NateWantsToBattle'...",23,5104068,7,
9998,country,false,9998,1466208000,12,12,1,"[{'pos': 0, 'artist_name': 'Little Big Town', ...",3,2459585,12,


In [None]:
# apply the threshold to playlists:
# keep only playlists with strictly more than `threshold_num_tracks` tracks.
threshold_num_tracks = 50
playlists_df = playlists_df[playlists_df.num_tracks > threshold_num_tracks]
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
5,Wedding,false,5,1430956800,80,71,1,"[{'pos': 0, 'artist_name': 'Cali Swag District...",3,19156557,56,
7,2017,false,7,1509321600,53,52,1,"[{'pos': 0, 'artist_name': 'Fink', 'track_uri'...",38,12674796,48,
...,...,...,...,...,...,...,...,...,...,...,...,...
9992,Likes,false,9992,1509062400,74,69,3,"[{'pos': 0, 'artist_name': 'High Highs', 'trac...",67,17521523,65,
9993,October,false,9993,1478563200,58,56,2,"[{'pos': 0, 'artist_name': 'Two Door Cinema Cl...",10,12661986,55,
9994,alone,false,9994,1387065600,56,37,1,"[{'pos': 0, 'artist_name': 'Britt Nicole', 'tr...",11,13389556,21,
9996,Blues,false,9996,1482364800,57,48,1,"[{'pos': 0, 'artist_name': 'Robert Johnson', '...",15,13010049,30,


In [None]:
# get all songs: one row per (playlist, track) pair of the remaining playlists
all_songs_df = get_all_songs_df(playlists_df)
all_songs_df

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,pid
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),spotify:artist:2wIVse2owClT7go1WT98tk,Missy Elliott,spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,The Cookbook,0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,In The Zone,0
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Beyoncé,spotify:album:25hVFAxTlDvXbx2X2QkUkE,Dangerously In Love (Alben für die Ewigkeit),0
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,spotify:artist:31TPClRtHm23RisEBtV3X7,Justin Timberlake,spotify:album:6QPkyl04rXwTGlGlcYaRoW,Justified,0
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,spotify:artist:5EvFsr3kj42KNv97ZEnqij,Shaggy,spotify:album:6NmFmPX56pcLBOFMhIiKvF,Hot Shot,0
...,...,...,...,...,...,...,...
526060,spotify:track:6ZOPiKQeibCn7fP8dncucL,Blue Ain't Your Color,spotify:artist:0u2FHSq3ln94y5Q57xazwf,Keith Urban,spotify:album:1r7ABqzNXQnUPAH3ZjrHMn,Ripcord,9999
526061,spotify:track:7pxhKtuTwofDIdgHx2DcVK,Seein' Red,spotify:artist:1dID9zgn0OV0Y8ud7Mh2tS,Dustin Lynch,spotify:album:23cuZhPWDfX1uKD4qwuv7t,Current Mood,9999
526062,spotify:track:7mldq42yDuxiUNn08nvzHO,Body Like A Back Road,spotify:artist:2kucQ9jQwuD8jWdtR9Ef38,Sam Hunt,spotify:album:2N7kidh1wA9EoLdf16QWrz,Body Like A Back Road,9999
526063,spotify:track:23TxRN09aR1RB0G0tFoT0b,Better Man,spotify:artist:3CygdxquGHurS7f9LjNLkv,Little Big Town,spotify:album:2aQOzEjLzPkffXDwREXdAh,The Breaker,9999


#### Prepare training data

In [None]:
# get df that we will use in training // artist_uri and album_uri can be added
# .copy() gives training_df its own data instead of a slice of all_songs_df,
# so adding columns to it later does not raise pandas' SettingWithCopyWarning.
training_df = all_songs_df[['pid', 'track_uri']].copy()
training_df

Unnamed: 0,pid,track_uri
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H
...,...,...
526060,9999,spotify:track:6ZOPiKQeibCn7fP8dncucL
526061,9999,spotify:track:7pxhKtuTwofDIdgHx2DcVK
526062,9999,spotify:track:7mldq42yDuxiUNn08nvzHO
526063,9999,spotify:track:23TxRN09aR1RB0G0tFoT0b


In [None]:
# Label every (playlist, track) row as a positive interaction.
# assign() returns a new DataFrame, so nothing is written into a slice of
# another frame (the in-place column assignment raised SettingWithCopyWarning).
training_df = training_df.assign(interaction=1)
training_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pid,track_uri,interaction
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1
...,...,...,...
526060,9999,spotify:track:6ZOPiKQeibCn7fP8dncucL,1
526061,9999,spotify:track:7pxhKtuTwofDIdgHx2DcVK,1
526062,9999,spotify:track:7mldq42yDuxiUNn08nvzHO,1
526063,9999,spotify:track:23TxRN09aR1RB0G0tFoT0b,1


In [None]:
# unique songs in all songs: the distinct track URIs across every row of training_df
all_unique_songs = training_df['track_uri'].unique()

In [None]:
# Compare the number of playlist entries (rows) with the number of distinct tracks.
print("Although we have {} tracks in all playlists we include, there are {} unique tracks.".format(training_df.shape[0], len(all_unique_songs)))

Although we have 526065 tracks in all playlists we include, there are 146399 unique tracks.


In [None]:
def get_negative_samples(training_df, all_unique_songs, number_of_neg_sample, random_state=None):

  """
  Draw random negative (playlist, track) pairs, i.e. tracks that a playlist does not contain.

  training_df          : DataFrame with at least the columns 'pid' and 'track_uri'
                         (one row per track contained in a playlist).
  all_unique_songs     : array-like of track URIs the negatives are drawn from.
  number_of_neg_sample : number of negative samples will be added for each playlist,
                         or assign 'same' to add negative samples as much as number of positive samples for each playlist.
  random_state         : optional integer seed for a private random generator; when None
                         (default) NumPy's global random state is used, so np.random.seed applies.

  Returns a DataFrame with the columns 'pid', 'track_uri' and 'interaction' (always 0).

  Negatives of one playlist are distinct whenever enough candidate tracks exist; only when
  more samples are requested than candidates are available are tracks drawn with replacement.
  Playlists without any candidate track contribute no rows.

  Raises ValueError if number_of_neg_sample is a string other than 'same'.
  """

  if isinstance(number_of_neg_sample, str) and number_of_neg_sample != 'same':
    raise ValueError("number_of_neg_sample must be an integer or 'same'")

  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # Build the candidate pool and a URI -> position lookup once, instead of
  # rebuilding a set of every song for each playlist.
  candidate_pool = pd.Series(all_unique_songs, dtype=object).unique()
  pool_position = {track_uri: position for position, track_uri in enumerate(candidate_pool)}

  all_neg_samples_list = []

  # A single groupby pass replaces one full-frame boolean filter per playlist;
  # sort=False keeps playlists in order of first appearance.
  tracks_by_playlist = training_df.groupby('pid', sort=False)['track_uri']

  for playlist_id, playlist_tracks in tqdm(tracks_by_playlist, position=0, leave=True):

    # mask out the tracks that already belong to this playlist
    is_candidate = np.ones(len(candidate_pool), dtype=bool)
    in_playlist = [pool_position[uri] for uri in playlist_tracks.unique() if uri in pool_position]
    is_candidate[np.asarray(in_playlist, dtype=np.intp)] = False
    possible_neg_samples = candidate_pool[is_candidate]

    if number_of_neg_sample == 'same':
      wanted = len(playlist_tracks)
    else:
      wanted = number_of_neg_sample

    # nothing to draw, or nothing to draw from
    if wanted <= 0 or len(possible_neg_samples) == 0:
      continue

    # Prefer distinct negatives; fall back to drawing with replacement only
    # when the request exceeds the number of available candidates.
    with_replacement = wanted > len(possible_neg_samples)
    picked = rng.choice(len(possible_neg_samples), size=wanted, replace=with_replacement)

    for a_track in possible_neg_samples[picked]:
      all_neg_samples_list.append([playlist_id, a_track])

  all_neg_samples_df = pd.DataFrame(data = all_neg_samples_list, columns=['pid', 'track_uri'])
  all_neg_samples_df['interaction'] = 0

  return all_neg_samples_df

In [None]:
# Draw 20 random tracks per playlist that the playlist does not contain;
# the returned rows carry interaction = 0.
neg_samples_df = get_negative_samples(training_df, all_unique_songs, number_of_neg_sample=20)

100%|██████████| 4907/4907 [12:43<00:00,  6.43it/s]


In [None]:
# Stack the negative samples under the existing rows.
# NOTE: training_df is rebound to the combined frame, so re-running this cell
# appends the negatives a second time.
training_df = pd.concat([training_df, neg_samples_df])

In [None]:
# order rows by playlist, with the positive rows (interaction = 1) ahead of the negatives
training_df = training_df.sort_values(by=['pid', 'interaction'], ascending=[True, False])

In [None]:
# renumber the rows 0..n-1 and discard the old index
training_df = training_df.reset_index(drop=True)

#### Prepare test data

In [None]:
def get_test_samples(training_df, number_of_test_sample, random_state=None):

  """
  Pick random positive rows of every playlist to hold out as test samples.

  training_df           : DataFrame with at least the columns 'pid' and 'interaction';
                          only rows with interaction == 1 are eligible.
  number_of_test_sample : number of test samples will be selected for each playlist
                          (capped at the number of positive rows the playlist has).
  random_state          : optional integer seed for a private random generator; when None
                          (default) NumPy's global random state is used, so np.random.seed applies.

  Returns a list with the index labels of the selected rows. The rows chosen for one
  playlist are distinct, so the list contains no duplicates when the index is unique.
  Playlists without any positive row are skipped.
  """

  rng = np.random if random_state is None else np.random.RandomState(random_state)

  all_test_samples_indices = []

  # A single groupby pass over the positives replaces one full-frame boolean
  # filter per playlist; sort=False keeps playlists in order of appearance.
  positives_by_playlist = training_df[training_df.interaction == 1].groupby('pid', sort=False)

  for _, playlist_rows in tqdm(positives_by_playlist, position=0, leave=True):

    # index labels of the positive tracks in the corresponding playlist
    track_indices = playlist_rows.index.values

    # randomly select n distinct tracks (never more than the playlist has)
    sample_size = min(number_of_test_sample, len(track_indices))
    chosen_indices = rng.choice(track_indices, size=sample_size, replace=False)

    all_test_samples_indices.extend(chosen_indices)

  return all_test_samples_indices

In [None]:
# Choose one positive row per playlist to hold out for testing.
test_sample_indices = get_test_samples(training_df, number_of_test_sample=1)

100%|██████████| 4907/4907 [00:17<00:00, 281.18it/s]


In [None]:
# test_sample_indices holds index *labels* (they are dropped by label from the
# training data), so select the rows by label with .loc rather than by
# position with .iloc; the two only coincide while the index is 0..n-1.
test_df = training_df.loc[test_sample_indices].reset_index(drop=True)

In [None]:
# remove the held-out rows from the training data and renumber what remains
training_df = training_df.drop(index=test_sample_indices).reset_index(drop=True)

In [None]:
# Persist both splits as CSV files in the working directory, without the index column.
training_df.to_csv('training_df.csv', index=False) 
test_df.to_csv('test_df.csv', index=False) 

In [None]:
# preview the training split (positive and negative rows, held-out rows removed)
training_df

Unnamed: 0,pid,track_uri,interaction
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1
...,...,...,...
619293,9999,spotify:track:4DIdJkGctTM9v5tM4oit4I,0
619294,9999,spotify:track:3lTxnm4oWx096MakGmQSWD,0
619295,9999,spotify:track:5oLWKwAejXRkOv8bKaTBO7,0
619296,9999,spotify:track:1J1QKnproZR0JkQ7Der8IW,0


In [None]:
# preview the held-out test rows
test_df

Unnamed: 0,pid,track_uri,interaction
0,0,spotify:track:19Js5ypV6JKn4DMExHQbGc,1
1,2,spotify:track:74tqql9zP6JjF5hjkHHUXp,1
2,3,spotify:track:6J7c3Fg5Bey55neqioLLvd,1
3,5,spotify:track:6i0eXvRAbVKD1EZFrXvur8,1
4,7,spotify:track:7Js278ET3O52ymQd8LU5bq,1
...,...,...,...
4902,9992,spotify:track:0Y3PJ1a7GdPc5Xw1PEhExT,1
4903,9993,spotify:track:7ATrP2pE9EyaDDU58MkHUn,1
4904,9994,spotify:track:4gpgyMGxZFbPKf8meMNb53,1
4905,9996,spotify:track:5sKLUVHc6FR1LHdoc0cgl1,1
