# Capstone

## Set up imports and the Spotify API

In [11]:
# imports
#pip install spotipy --upgrade
import spotipy
from spotipy.oauth2 import SpotifyOAuth 
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd

In [12]:
# Make the Spotify API settings available as environment variables (for testing).
#
# Credentials must never be hardcoded in a notebook: anything saved here is
# visible to everyone who can read the file. The client secret that used to be
# written in this cell should be treated as leaked and rotated.
import os
from getpass import getpass

for _var in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET'):
    # keep a value that is already exported; otherwise prompt without echoing it
    if not os.environ.get(_var):
        os.environ[_var] = getpass(f'{_var}: ')

# the redirect URI is not a secret, so a local default is acceptable;
# setdefault keeps any value that was exported before the notebook started
os.environ.setdefault('SPOTIPY_REDIRECT_URI', 'http://localhost:8888/callback')

In [13]:
# read the three Spotify settings back from the environment (None when unset)
client_id, client_secret, redirect_uri = (
    os.environ.get(name)
    for name in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET', 'SPOTIPY_REDIRECT_URI')
)

In [14]:
# alternative auth manager (client credentials), kept for reference
#sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

# build the client with an OAuth auth manager requesting the scope below
scope = 'user-top-read'
auth_manager = SpotifyOAuth(scope=scope)
sp = spotipy.Spotify(auth_manager=auth_manager)

## Download and clean data

In [22]:
def get_tracklist(playlists):
    """
    get metadata and audio features for each track in one or more spotify playlists

    uses the module-level spotipy client `sp` for every API call.

    parameters:
    - playlists (list or str): a list of playlist URLs or a single playlist URL

    returns:
    - pd.DataFrame: one row per playlist entry with the columns id, title,
      release, length (seconds), explicit, popularity, year and the selected
      audio features. an empty DataFrame with those columns is returned when
      there is nothing to collect.
    - None: if any playlist fails to process (the error is printed first)
    """

    # columns built from the playlist payload, in output order
    base_columns = ['id', 'title', 'release', 'length', 'explicit', 'popularity', 'year']
    # audio features to keep ('id' is the merge key)
    selected_features = ['id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    # the audio-features endpoint accepts at most 100 ids per request
    batch_size = 100

    # check that input is a list, convert if not
    if not isinstance(playlists, list):
        playlists = [playlists]

    tracklist = []

    for playlist in playlists:
        # get playlist ID from URL: drop the query string and any trailing slash
        playlist_id = playlist.split('?')[0].rstrip('/').split('/')[-1]

        try:
            # get info for playlist
            playlist_info = sp.playlist(playlist_id)

            # the playlist payload only holds the first page of items;
            # follow the paging links to collect the rest
            page = playlist_info['tracks']
            items = list(page['items'])
            while page.get('next'):
                page = sp.next(page)
                if not page:
                    break
                items.extend(page['items'])

            # entries without a track object or without an id cannot be
            # looked up for audio features, so they are skipped
            tracks = [item.get('track') for item in items]
            track_info = [{
                'id': track['id'],
                'title': track['name'],
                'release': pd.to_datetime(track['album']['release_date']),
                'length': track['duration_ms'] / 1000,  # convert ms to seconds
                'explicit': track['explicit'],
                'popularity': track['popularity']
            } for track in tracks if track and track.get('id')]

            # nothing usable in this playlist: move on to the next one
            if not track_info:
                continue

            # create dataframe from the track info
            tracks_df = pd.DataFrame(track_info)
            # add column for release year
            tracks_df['year'] = tracks_df['release'].dt.year

            # request features once per distinct id so that a track repeated in
            # the playlist is not multiplied by the merge below
            track_ids = tracks_df['id'].drop_duplicates().tolist()
            track_features = []
            for start in range(0, len(track_ids), batch_size):
                batch = sp.audio_features(track_ids[start:start + batch_size])
                # drop missing (None) feature entries
                track_features.extend(entry for entry in (batch or []) if entry)

            # keep only the selected feature columns
            track_features_df = pd.DataFrame(track_features, columns=selected_features)

            # merge audio features with existing df
            tracks_df = pd.merge(tracks_df, track_features_df, on='id')

            tracklist.append(tracks_df)

        except Exception as e:
            print(f"Error processing playlist: {playlist}. Error: {e}")
            return None

    # pd.concat raises on an empty list, so return an empty frame instead
    if not tracklist:
        return pd.DataFrame(columns=base_columns + selected_features[1:])

    # concatenate all dataframes
    tracklist_df = pd.concat(tracklist, ignore_index=True)

    return tracklist_df

In [23]:
# try the helper on a single playlist URL (a plain string, not a list) and display the result
get_tracklist('https://open.spotify.com/playlist/37i9dQZF1Fa1IIVtEpGUcU?si=baa7cf66e4f54189')

Unnamed: 0,id,title,release,length,explicit,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,5ogBIlFs1oBHuBxOeTmnoH,The Sound of Letting Go,2023-03-17,159.066,False,53,2023,0.692,0.769,1,-2.989,1,0.0295,0.005470,0.000000,0.0458,0.9090,106.026
1,2DIo5VoBB6X1GzgpGbmMIF,Heartbreak Feels So Good,2023-03-24,217.537,False,64,2023,0.514,0.779,8,-5.136,1,0.0417,0.003140,0.000013,0.0637,0.3130,91.089
2,6iF4RgIjDvDqyW13PezSj3,Single Soon,2023-08-25,171.655,False,85,2023,0.610,0.571,2,-5.649,1,0.0782,0.008630,0.000000,0.1310,0.7470,105.010
3,0NapkeC45rszeuSgbvcjx4,Maan Meri Jaan (Afterlife),2023-03-10,186.998,False,64,2023,0.670,0.533,6,-8.040,0,0.0684,0.040300,0.000000,0.0938,0.3390,95.991
4,4inPxqHrj9VmjLx5JCDCV4,Waffle House,2023-05-12,145.440,False,74,2023,0.719,0.756,10,-4.385,1,0.0356,0.027200,0.000000,0.1910,0.8630,126.011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5pzWbZ2c9QEEKDNIsJerXH,Baby Annihilation,2023-03-24,67.804,False,49,2023,0.400,0.471,1,-12.394,0,0.1610,0.888000,0.048000,0.1150,0.0385,84.481
96,43ay9lQZ5rfNcOOHhRF2cM,The Greatest Show,2017-10-27,302.146,False,63,2017,0.417,0.824,11,-7.360,0,0.1050,0.000239,0.054500,0.0725,0.4000,157.920
97,4356PL6jkERzJouVfQQkZo,High Infidelity,2022-10-21,231.475,False,60,2022,0.652,0.536,10,-10.211,1,0.0881,0.712000,0.000000,0.0854,0.7580,87.963
98,0Ryd8975WihbObpp5cPW1t,boyfriend (with Social House),2019-08-02,186.106,True,81,2019,0.400,0.795,10,-3.731,0,0.4610,0.119000,0.000000,0.1590,0.7020,190.097


In [16]:
# playlist URLs to download; passed to get_tracklist as a list
playlist_urls = [
    'https://open.spotify.com/playlist/37i9dQZF1CyWExfjiBGoVh?si=e227bbd1de8b42f0',
    'https://open.spotify.com/playlist/37i9dQZF1E9WKHP4NOmDGL?si=e3ff3539c8ae47a1',
    'https://open.spotify.com/playlist/37i9dQZF1EjgKOpkPK3V4h?si=71409bb3b9cc40a2',
    'https://open.spotify.com/playlist/37i9dQZF1Et8YfkURNRFQQ?si=e1b3f6b940a3403f',
    'https://open.spotify.com/playlist/37i9dQZF1EMgToN6NNFzB2?si=01b2a28d5d54452d',
    'https://open.spotify.com/playlist/37i9dQZF1EUMDoJuT8yJsl?si=86f7eb098f8a4a51',
    'https://open.spotify.com/playlist/37i9dQZF1F0sijgNaJdgit?si=2ee6884b1718473c',
    'https://open.spotify.com/playlist/37i9dQZF1Fa1IIVtEpGUcU?si=baa7cf66e4f54189',
]

In [17]:
# fetch every playlist in playlist_urls into one combined DataFrame
# NOTE(review): get_tracklist returns None when a playlist fails, which would break the cells below
tracklist_df = get_tracklist(playlist_urls)
tracklist_df

Unnamed: 0,id,title,release,length,explicit,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2rizacJSyD9S1IQUxUxnsK,All We Know,2016-09-29,194.080,False,72,2016,0.662,0.586,0,-8.821,1,0.0307,0.097000,0.002720,0.1150,0.2960,90.000
1,0OynBOa42KSFIvVrqnw7DO,Don't Threaten Me with a Good Time,2016-01-01,213.093,False,33,2016,0.559,0.895,1,-4.476,0,0.0832,0.015400,0.000000,0.1750,0.5900,183.825
2,2IY7eOUDjw2ArKYxKa2jXc,Starboy,2016-09-22,230.466,True,0,2016,0.682,0.592,7,-7.033,1,0.2810,0.169000,0.000005,0.1360,0.5000,186.041
3,6hOHH3nj5VI7IskIUic0WQ,Yesterday's Song,2016-09-23,223.971,False,0,2016,0.571,0.903,1,-4.101,1,0.0538,0.000808,0.000000,0.2920,0.7380,128.011
4,3kSXn1osC89W8JcPLozTzs,Stand By You,2016-01-01,219.000,False,66,2016,0.506,0.897,9,-4.632,1,0.2600,0.146000,0.000451,0.0868,0.5250,188.030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,5pzWbZ2c9QEEKDNIsJerXH,Baby Annihilation,2023-03-24,67.804,False,49,2023,0.400,0.471,1,-12.394,0,0.1610,0.888000,0.048000,0.1150,0.0385,84.481
796,43ay9lQZ5rfNcOOHhRF2cM,The Greatest Show,2017-10-27,302.146,False,63,2017,0.417,0.824,11,-7.360,0,0.1050,0.000239,0.054500,0.0725,0.4000,157.920
797,4356PL6jkERzJouVfQQkZo,High Infidelity,2022-10-21,231.475,False,60,2022,0.652,0.536,10,-10.211,1,0.0881,0.712000,0.000000,0.0854,0.7580,87.963
798,0Ryd8975WihbObpp5cPW1t,boyfriend (with Social House),2019-08-02,186.106,True,81,2019,0.400,0.795,10,-3.731,0,0.4610,0.119000,0.000000,0.1590,0.7020,190.097


In [21]:
# flag every row whose track id occurs more than once (keep=False marks all copies)
is_repeated_id = tracklist_df.duplicated(subset='id', keep=False)
duplicated_rows = tracklist_df.loc[is_repeated_id]
duplicated_rows

Unnamed: 0,id,title,release,length,explicit,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2rizacJSyD9S1IQUxUxnsK,All We Know,2016-09-29,194.080,False,72,2016,0.662,0.586,0,-8.821,1,0.0307,0.09700,0.002720,0.1150,0.296,90.000
5,7BKLCZ1jbUBVqRi2FVlTVw,Closer,2016-07-29,244.960,False,87,2016,0.748,0.524,8,-5.599,1,0.0338,0.41400,0.000000,0.1110,0.661,95.010
11,3omXshBamrREltcf24gYDC,First,2014-10-31,200.360,False,73,2014,0.468,0.692,2,-4.015,1,0.0295,0.02020,0.000002,0.5230,0.561,78.009
12,3vv9phIu6Y1vX3jcqaGz5Z,Roses,2015-06-16,226.738,False,77,2015,0.713,0.802,4,-7.055,1,0.0561,0.04350,0.003770,0.3090,0.343,100.001
47,72xvRETEsd5hS1RfduciRi,Let Me Hold You (Turn Me On),2016-04-13,162.425,False,0,2016,0.681,0.774,1,-3.167,0,0.0787,0.09920,0.000000,0.3500,0.267,102.924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,0VE4kBnHJUgtMf0dy6DRmW,Getaway Car,2017-11-10,233.626,False,87,2017,0.562,0.689,2,-6.745,1,0.1270,0.00465,0.000002,0.0888,0.351,172.054
762,5BK0uqwY9DNfZ630STAEaq,gold rush,2020-12-11,185.320,True,74,2020,0.512,0.462,9,-10.491,1,0.0408,0.83000,0.166000,0.1210,0.353,112.050
774,0Jlcvv8IykzHaSmj49uNW8,the 1,2020-07-24,210.251,True,81,2020,0.777,0.357,0,-6.942,1,0.0522,0.75700,0.000007,0.1080,0.172,139.883
776,5lzb11BOouSBDXxhTnTtpv,Call Me Sir (feat. Cam & Travie McCoy),2018-05-24,216.280,False,43,2018,0.522,0.755,8,-4.405,0,0.0461,0.21400,0.000000,0.1020,0.441,83.965


In [18]:
# summarise the combined frame: column dtypes and non-null counts
tracklist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                800 non-null    object        
 1   title             800 non-null    object        
 2   release           800 non-null    datetime64[ns]
 3   length            800 non-null    float64       
 4   explicit          800 non-null    bool          
 5   popularity        800 non-null    int64         
 6   year              800 non-null    int64         
 7   danceability      800 non-null    float64       
 8   energy            800 non-null    float64       
 9   key               800 non-null    int64         
 10  loudness          800 non-null    float64       
 11  mode              800 non-null    int64         
 12  speechiness       800 non-null    float64       
 13  acousticness      800 non-null    float64       
 14  instrumentalness  800 non-