# Create Spotify dataset

We will create a Spotify playlists dataset based on a list of Last.fm users.

We do it this way because it is not easy to gather Spotify users directly, but many Last.fm users are also Spotify users.

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.notebook import tqdm
import pandas as pd
import random
import spotipy
import time

In [None]:
# Client-credentials authentication. No client id/secret is passed explicitly
# here, so they have to be supplied from outside this notebook (presumably via
# the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables --
# confirm against the spotipy documentation).
auth_manager = SpotifyClientCredentials()
# API client shared by every request made below.
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
# Target number of users to collect playlists for: the collection loop below
# stops as soon as this many users have been fetched successfully, and the
# progress bar uses it as its total.
LEN_USERS = 50

## Obtain users

We will gather Last.fm users and test if they are Spotify users too.

In [None]:
# Load the candidate Last.fm user names, one per line. Blank lines are
# skipped: with a plain split on '\n', a trailing newline yields an empty
# user name that would later be sent to the API as if it were a real user.
with open('../../data/users.txt') as f:
    users = [line.strip() for line in f if line.strip()]

# Shuffle in place so that the order in which candidates are tried is random.
random.shuffle(users)

## Test users and obtain playlists

We now test each user and, when the lookup succeeds, gather their playlists in the same step.

In [None]:
# Walk the candidates until LEN_USERS of them have been fetched successfully
# from Spotify, collecting every page of each user's playlists.
sp_users = []
playlists = []
pbar = tqdm(total=LEN_USERS)
for i, user in enumerate(users):
    # Throttle on every 100th candidate, whether or not earlier lookups
    # succeeded (a failed lookup is still a request).
    if i > 0 and i % 100 == 0:
        time.sleep(5)
    # Buffer this user's pages locally so that a failure halfway through the
    # pagination does not leave a partial set of playlists in the dataset.
    user_playlists = []
    try:
        page = sp.user_playlists(user)
        user_playlists.extend(page['items'])
        while page['next']:
            page = sp.next(page)
            user_playlists.extend(page['items'])
    except Exception:
        # Best effort: any failed lookup means "skip this candidate".
        # Unlike a bare `except`, this lets KeyboardInterrupt through, so the
        # cell can still be interrupted by hand.
        continue
    playlists.extend(user_playlists)
    sp_users.append(user)
    pbar.update()
    if len(sp_users) >= LEN_USERS:
        break
# Release the progress bar once the collection is over.
pbar.close()

In [None]:
# Report how many playlist entries were collected (duplicates included).
print('We have now {} playlists!'.format(len(playlists)))

## Save Spotify users to file

In [None]:
# Persist the users that were found on Spotify, one name per line.
with open('../../data/sp_users.txt', 'w') as f:
    f.writelines('{}\n'.format(name) for name in sp_users)

### Optional: get number of followers info

Getting the number of followers of a playlist seems to require a second pass over all playlists, with one extra request per playlist, just for that single piece of information.

In [None]:
# for i, playlist in tqdm(enumerate(playlists.copy()), total=len(playlists.copy())):
#     playlists[i] = sp.playlist(playlists[i]['id'])

## Treat playlists dataset

Now we clean the playlists dataset, keeping only the columns we want. We also extract the owner's id from the `owner` column and remove duplicate playlists.

In [None]:
# Keep only the playlist fields we need. Fields deliberately left out:
# external_urls, followers, href, images, snapshot_id, type, uri.
playlists = pd.DataFrame(playlists, columns=[
    'collaborative',
    'description',
    'id',
    'name',
    'owner',
    'primary_color',
    'public',
    'tracks',
])

# Replace the nested owner dict by the owner's id. A plain per-row lookup is
# used instead of `apply(pd.Series)`, which builds one Series per row (slow on
# large tables) and does not produce an 'id' column for an empty dataset.
playlists['owner_id'] = playlists['owner'].map(
    lambda owner: owner.get('id') if isinstance(owner, dict) else None
)
playlists.drop(columns='owner', inplace=True)

# The same playlist can show up more than once; keep its first occurrence.
playlists.drop_duplicates('id', inplace=True)

# Renumber the rows 0..n-1 after the duplicates were removed.
playlists.reset_index(drop=True, inplace=True)

## Write playlists dataset to file

In [None]:
# Save the cleaned playlists table (the row index is written as well).
playlists.to_csv(path_or_buf='../../data/sp_playlists.csv')

## Iterate through playlists to get tracks

We now iterate through the playlists dataset in order to gather information about tracks.

In [None]:
# Turn the playlists table into a list of per-playlist dicts. The guard makes
# this cell safe to re-run: after the first run `playlists` is already a list
# and no longer has a `to_dict` method.
if isinstance(playlists, pd.DataFrame):
    playlists = playlists.to_dict('records')

# Fetch every page of items of every playlist, tagging each item with the id
# of the playlist it came from so that the relation survives the flattening.
tracks = []
for i, playlist in tqdm(enumerate(playlists), total=len(playlists)):
    playlist_id = playlist['id']
    page = sp.playlist_tracks(playlist_id)
    while page:
        for item in page['items']:
            item['playlist_id'] = playlist_id
        tracks.extend(page['items'])
        # A falsy `next` marks the last page of this playlist.
        page = sp.next(page) if page['next'] else None
    # Pause after every 100 playlists (presumably to go easy on the API).
    if (i + 1) % 100 == 0:
        time.sleep(5)

## Treat tracks database

In [None]:
# Build the tracks table from the raw playlist items, keeping only the fields
# used below ('primary_color' and 'video_thumbnail' are left out).
df = pd.DataFrame(
    data=tracks,
    columns=['added_at', 'added_by', 'is_local', 'track', 'playlist_id'],
)

In [None]:
# Report how many rows contain at least one missing value, then remove those
# rows in place.
print('{} rows were dropped.'.format(len(df) - len(df.dropna())))
df.dropna(inplace=True)

In [None]:
# Parse the `added_at` column into pandas datetimes.
df['added_at'] = pd.to_datetime(df['added_at'])

In [None]:
# Replace the nested `added_by` user dict by the user's id. A per-row lookup
# replaces `apply(pd.Series)`, which builds one Series per row (slow on large
# tables) and exposes no `id` attribute when the table is empty.
df['added_by'] = df['added_by'].map(
    lambda user: user.get('id') if isinstance(user, dict) else None
)

In [None]:
# Expand the nested `track` dict into one column per field of interest.
#
# The frame is built straight from the list of dicts: `apply(pd.Series)`
# builds one Series per row, which is very slow on large tables. Passing
# `columns=` also guarantees that every listed column exists (filled with
# NaN) even when no track carries that key, so the selection cannot raise a
# KeyError. Non-dict values are treated as "no fields".
#
# Track fields deliberately left out: episode, external_ids, external_urls,
# href, is_local, preview_url, track, type, uri, linked_from.
df2 = pd.DataFrame(
    [track if isinstance(track, dict) else {} for track in df['track']],
    index=df.index,  # keep row labels aligned with df for the join below
    columns=[
        'album',
        'artists',
        'available_markets',
        'disc_number',
        'duration_ms',
        'explicit',
        'id',
        'name',
        'popularity',
        'track_number',
    ],
)
df = df.join(df2)
df.drop(columns='track', inplace=True)

In [None]:
# Expand the nested `album` dict into `album_*` columns.
#
# The frame is built straight from the list of dicts: `apply(pd.Series)`
# builds one Series per row, which is very slow on large tables. Passing
# `columns=` guarantees that every listed column exists (filled with NaN)
# even when no album carries that key. Non-dict values are treated as
# "no fields".
#
# Album fields deliberately left out: external_urls, href, images,
# release_date_precision, total_tracks, type, uri.
df2 = pd.DataFrame(
    [album if isinstance(album, dict) else {} for album in df['album']],
    index=df.index,  # keep row labels aligned with df for the join below
    columns=[
        'album_type',
        'artists',
        'available_markets',
        'id',
        'name',
        'release_date',
    ],
).rename(columns={
    # Prefix the names that would otherwise clash with the track columns.
    'artists': 'album_artists',
    'available_markets': 'album_available_markets',
    'id': 'album_id',
    'name': 'album_name',
    'release_date': 'album_release_date',
})
df = df.join(df2)
df.drop(columns='album', inplace=True)

In [None]:
# Split the list of artist dicts into parallel lists of ids and names.
# A value that is not a list (e.g. NaN for a track without that key) becomes
# None instead of raising a TypeError, so the row can be removed by the later
# NaN clean-up rather than aborting the cell.
df['artists_ids'] = df['artists'].map(
    lambda artists: [artist['id'] for artist in artists]
    if isinstance(artists, list) else None
)
df['artists_names'] = df['artists'].map(
    lambda artists: [artist['name'] for artist in artists]
    if isinstance(artists, list) else None
)
df.drop(columns='artists', inplace=True)

In [None]:
# Split the list of album-artist dicts into parallel lists of ids and names.
# A value that is not a list (e.g. NaN for an album without that key) becomes
# None instead of raising a TypeError, so the row can be removed by the later
# NaN clean-up rather than aborting the cell.
df['album_artists_ids'] = df['album_artists'].map(
    lambda artists: [artist['id'] for artist in artists]
    if isinstance(artists, list) else None
)
df['album_artists_names'] = df['album_artists'].map(
    lambda artists: [artist['name'] for artist in artists]
    if isinstance(artists, list) else None
)
df.drop(columns='album_artists', inplace=True)

In [None]:
# Report how many rows contain at least one missing value after the
# expansions above, then remove those rows in place.
print('{} rows were dropped.'.format(len(df) - len(df.dropna())))
df.dropna(inplace=True)

In [None]:
# Renumber the rows 0..n-1 now that some of them have been dropped.
df = df.reset_index(drop=True)

## Write tracks database to file

In [None]:
# Save the flattened tracks table (the row index is written as well).
df.to_csv(path_or_buf='../../data/pd_tracks.csv')

## Iterate through tracks to get their features

Tracks have features, like `danceability`, which is important for future analysis.

In [None]:
# Fetch the audio features of every track, 100 track ids per request.
audio_features = []

# Materialise the id list once instead of rebuilding it for every batch.
track_ids = df['id'].to_list()
for i in tqdm(range(0, len(track_ids), 100)):
    audio_features.extend(sp.audio_features(track_ids[i:i + 100]))
    # Pause every 100 batches (10000 tracks); i == 0 is skipped so that the
    # very first request is not followed by a pointless wait.
    if i > 0 and i % 10000 == 0:
        time.sleep(5)

The list returned by `sp.audio_features` can contain `None` entries, so we check for them.

In [None]:
# Positions in `audio_features` whose entry is None.
none_indices = [
    position
    for position, features in enumerate(audio_features)
    if features is None
]

In [None]:
# Remove the feature-less tracks from both the table and the feature list so
# that the two stay aligned row by row.
df.drop(index=none_indices, inplace=True)
# Delete from the back so that the remaining positions are still valid.
for position in reversed(none_indices):
    audio_features.pop(position)

In [None]:
# Renumber the rows 0..n-1 so that they line up with `audio_features` again.
df = df.reset_index(drop=True)

In [None]:
# Turn the list of feature dicts into a table, keeping only the numeric
# descriptors. Fields left out: type, id, uri, track_href, analysis_url,
# duration_ms.
audio_features = pd.DataFrame(
    data=audio_features,
    columns=[
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'time_signature',
    ],
)

In [None]:
# Attach the audio features to the tracks, matching rows on the index.
df = df.join(other=audio_features, how='left')

## Write final dataset to file

In [None]:
# Save the final dataset: tracks plus their audio features (the row index is
# written as well).
df.to_csv(path_or_buf='../../data/sp_dataset.csv')