# Installing Spotipy

In order to use the Spotify API (SpotiPy) we will have to create an account in Spotify and follow these steps. Once we have done it we will start initializing the API and look at the search method for which we can introduce a "query" q, in this example we will try it with Lady Gaga:

In [1]:
##!conda install -c conda-forge spotipy

## Loading credentials from another config file

In [2]:
import config

## Starting with Spotify API

In [3]:
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

In [4]:
#Initialize SpotiPy with user credentias
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= config.client_id,
                                                           client_secret= config.client_secret))

In [5]:
# The playlists involved are all created by Spotify
username = 'Spotify'

In [6]:
# get playlists
results = sp.user_playlists(username, limit = 30)

In [7]:
playlist_ids = []
for playlist in results['items']:
    playlist_ids.append(playlist['id'])

In [8]:
playlist_ids

['37i9dQZF1DXcBWIGoYBM5M',
 '37i9dQZF1DX0XUsuxWHRQd',
 '37i9dQZF1DX1lVhptIYRda',
 '37i9dQZF1DX10zKzsJ2jva',
 '37i9dQZF1DX4JAvHpjipBk',
 '37i9dQZF1DX4sWSpwq3LiO',
 '37i9dQZF1DX4SBhb3fqCJd',
 '37i9dQZF1DWXRqgorJj26U',
 '37i9dQZF1DX4dyzvuaRJ0n',
 '37i9dQZF1DXcF6B6QPhFDv',
 '37i9dQZF1DWXJfnUiYjUKT',
 '37i9dQZF1DXcRXFNfZr7Tp',
 '37i9dQZF1DX4o1oenSJRJd',
 '37i9dQZF1DXbTxeAdrVG2l',
 '37i9dQZF1DX4UtSsGT1Sbe',
 '37i9dQZF1DWTJ7xPn4vNaz',
 '37i9dQZF1DXaKIA8E7WcJj',
 '37i9dQZF1DWSV3Tk4GO2fq',
 '37i9dQZF1DWTwnEm1IYyoj',
 '37i9dQZF1DX2A29LI7xHn1',
 '37i9dQZF1DX2RxBh64BHjQ',
 '37i9dQZF1DWVA1Gq4XHa6U',
 '37i9dQZF1DWY4xHQp97fN6',
 '37i9dQZF1DWX3387IZmjNa',
 '37i9dQZF1DWYkaDif7Ztbp',
 '37i9dQZF1DX5hR0J49CmXC',
 '37i9dQZF1DXan38dNVDdl4',
 '37i9dQZF1DWSvKsRPPnv5o',
 '37i9dQZF1DWUVpAXiEPK8P',
 '37i9dQZF1DX0Tkc6ltcBfU']

In [9]:
# playlist_genres = ['Pop', 'Hip-Hop', 'Rock', 'Dance', 'Reggaeton', ]
# playlist_names = ['Songs to Sing in the Car', 'RapCaviar', 'Rock Classics', 'Motivation Mix', 'Viva Latino', ]
# playlist_ids = ['37i9dQZF1DWWMOmoXKqHTD', '37i9dQZF1DX0XUsuxWHRQd', '37i9dQZF1DWXRqgorJj26U', '37i9dQZF1DXdxcBWuJkbcy', '37i9dQZF1DX10zKzsJ2jva']

In [10]:
# # get json of a specific playlist
# username = 'maka_97'
# playlist_id = '6mtYuOxzl58vSGnEDtZ9uB' # get from url of the playplist ('https://open.spotify.com/playlist/6mtYuOxzl58vSGnEDtZ9uB')

# playlist = sp.user_playlist_tracks(username, playlist_id, market="GB")

In [11]:
# define a function to extract tracks of a playlist
def get_playlist_tracks(username, playlist_id):
    results = sp.user_playlist_tracks(username,playlist_id,market="GB")
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

In [12]:
# extract tracks from the playlists

In [13]:
# # extract tracks from a specific playlist

# list_of_audio_features = []

# for playlist_id in playlist_ids:
#     tracks = get_playlist_tracks('Spotify', playlist_id)
#     for item in range(0, len(tracks)):
#         audio_features = sp.audio_features(tracks[item]["track"]["id"])
#         if audio_features is not None and len(audio_features) > 0:
#             list_of_audio_features.append(audio_features[0])

In [14]:
# extract tracks from a specific playlist

list_of_audio_features = []

for playlist_id in playlist_ids:
    tracks = get_playlist_tracks('Spotify', playlist_id)
    for item in range(0, len(tracks)):
        if tracks[item] is not None and tracks[item]["track"] is not None:
            audio_features = sp.audio_features(tracks[item]["track"]["id"])
            if audio_features is not None and len(audio_features) > 0:
                list_of_audio_features.append(audio_features[0])

In [15]:
# sp.audio_features(tracks[item]["track"]["id"])[0]

In [16]:
# create a dataframe for audio features
playlists_raw_df = pd.DataFrame(list_of_audio_features)

# Select some features only

playlists_df = playlists_raw_df[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

playlists_df

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms
0,0.707,0.681,-4.325,0.0668,0.06320,0.000005,0.0322,0.646,117.999,0yLdNVWF3Srea0uzk55zFn,200455
1,0.644,0.735,-5.747,0.0391,0.05210,0.144000,0.1610,0.418,88.980,1Qrg8KqiBpW07V7PNxwwwL,153947
2,0.696,0.809,-8.254,0.0500,0.25200,0.000128,0.2480,0.857,132.962,6AQbmUe0Qwf5PZnt4HmTXv,131013
3,0.538,0.742,-5.355,0.1140,0.13800,0.000047,0.0934,0.250,96.107,5Z2MiIZ5I3jJvvmeWMLbOQ,272373
4,0.662,0.398,-6.691,0.0275,0.71600,0.000000,0.1110,0.468,107.001,4sx6NRwL6Ol3V6m9exwGlQ,145850
...,...,...,...,...,...,...,...,...,...,...,...
2895,0.645,0.842,-3.023,0.0983,0.00816,0.000000,0.3960,0.237,150.039,2kMuUWfsueRPYWxyaTSHd8,121600
2896,0.868,0.562,-5.111,0.2510,0.01110,0.000000,0.0826,0.132,89.951,27feJmQBpvsnW4tC4M7pw4,117543
2897,0.422,0.536,-6.605,0.0530,0.40500,0.000000,0.2040,0.719,151.206,1yXCWJQfYF8ff7XcUzwcCy,155760
2898,0.902,0.544,-8.242,0.2420,0.15300,0.000000,0.1110,0.136,131.912,7b4LMdMrjPLNUNwxO46sQR,160000


In [22]:
playlists_df.drop_duplicates(inplace=True)
playlists_df.to_csv('data/playlists.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlists_df.drop_duplicates(inplace=True)


In [17]:
import pprint
# Check where to find artist names
pprint.pprint(tracks[0]['track']['artists'][0]['name'])

'Gucci Mane'


In [19]:
def get_tracks_artists(username, playlist_id):
    tracks = get_playlist_tracks(username, playlist_id)
    if tracks is not None:
        artists = []
        for item in range(0, len(tracks)):
            artists.append(tracks[item]['track']['artists'][0]['name'])
        return artists
    else:
        return []

In [20]:
# Get the artists of tracks from all playlists
artist_list = []

for playlist_id in playlist_ids:
    artist_list.extend(get_tracks_artists('Spotify', playlist_id))

TypeError: 'NoneType' object is not subscriptable

In [None]:
len(artist_list)

In [None]:
playlists_df['artist'] = artist_list

In [None]:
# check where to find track names
pprint.pprint(tracks[0]['track']['name'])

In [None]:
# define a function to get track names
def get_tracks_names(username, playlist_id):
    tracks = get_playlist_tracks(username, playlist_id)
    if tracks is not None:
        track_names = []
        for item in range(0, len(tracks)):
            track_names.append(tracks[item]['track']['name'])
        return track_names
    else:
        return []

In [None]:
# Get the names of tracks from all playlists
name_list = []

for playlist_id in playlist_ids:
    name_list.extend(get_tracks_names('Spotify', playlist_id))

In [None]:
#get get track names and include it to the dataframe
playlists_df['name'] = name_list

In [None]:
# check where to get album names
pprint.pprint(tracks[0]['track']['album']['name'])

In [None]:
# define a function to get album names
def get_tracks_albums(username, playlist_id):
    tracks = get_playlist_tracks(username, playlist_id)
    if tracks is not None:
        albums = []
        for item in range(0, len(tracks)):
            albums.append(tracks[item]['track']['album']['name'])
        return albums
    else:
        return []

In [None]:
# Get albums from all playlists
album_list = []

for playlist_id in playlist_ids:
    album_list.extend(get_tracks_albums('Spotify', playlist_id))

In [None]:
#get get track names and include it to the dataframe
playlists_df['album'] = album_list

In [None]:
# check where to get the popularity of tracks
pprint.pprint(tracks[0]['track']['popularity'])

In [None]:
# define a function to get album names
def get_tracks_popularity(username, playlist_id):
    tracks = get_playlist_tracks(username, playlist_id)
    if tracks is not None:
        tracks_popularity = []
        for item in range(0, len(tracks)):
            tracks_popularity.append(tracks[item]['track']['popularity'])
        return tracks_popularity
    else:
        return []

In [None]:
# define a function to get track names
def get_tracks_popularity(username, playlist_id):
    tracks = get_playlist_tracks(username, playlist_id)
    popularity_li = []
    for item in range(0, len(tracks)):
        popularity_li.append(tracks[item]['track']['popularity'])
    return popularity_li

In [None]:
# Get popularity scores from all playlists
popularity_list = []

for playlist_id in playlist_ids:
    popularity_list.extend(get_tracks_popularity('Spotify', playlist_id))

In [None]:
#include popularity scores to the dataframe
playlists_df['popularity'] = popularity_list

In [None]:
playlists_df

In [None]:
# check where to get the release date of tracks
pprint.pprint(tracks[0]['track']['album']['release_date'])

In [None]:
# define a function to get release dates
def get_tracks_dates(username, playlist_id):
    tracks = get_playlist_tracks(username, playlist_id)
    if tracks is not None:
        dates = []
        for item in range(0, len(tracks)):
            dates.append(tracks[item]['track']['album']['release_date'])
        return dates
    else:
        return []

In [None]:
# Get release dates from all playlists
dates_list = []

for playlist_id in playlist_ids:
    dates_list.extend(get_tracks_dates('Spotify', playlist_id))

In [None]:
#include popularity scores to the dataframe
playlists_df['release_date'] = dates_list

In [None]:
playlists_df

In [None]:
playlists_df.info()

In [None]:
# convert 'release_date' to a datetime column
playlists_df['release_date'] = pd.to_datetime(playlists_df['release_date'], format='%Y-%m-%d')

# convert datetime column to a numerical format
playlists_df['release_date_ordinal'] = playlists_df['release_date'].apply(lambda x: x.toordinal())

# drop original datetime column
playlists_df = playlists_df.drop(columns=['release_date'])

In [None]:
playlists_df.info()

In [None]:
# Resort the columns
new_order = ['id', 'name', 'artist', 'album', 'release_date_ordinal', 'popularity', 'duration_ms',
             'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 
             'liveness', 'valence', 'tempo']

playlists_df = playlists_df.reindex(columns=new_order)


In [None]:
# Remove duplicates

playlists_df.drop_duplicates(inplace=True)

In [None]:
playlists_df.info()