# Data Extension Via Spotify API

In this notebook, we will extend the features of the combined dataset using the Spotify API. This will be done in 2 steps:
1. Develop Spotify data fetch functions

2. Run functions on the combined, clean dataset

## Import Libraries

In [1]:
import os
import random
import time
import pprint
import pandas as pd
import spotipy
import json

from IPython.display import clear_output
from spotipy.oauth2 import SpotifyClientCredentials

# Define the root path
root_path = '../'

# Change the current working directory to the root path
os.chdir(root_path)

from credentials import *
from utils import functions

## Load Data

In [2]:
# Load the dataset
df = pd.read_csv('data/3_combined_clean_dataset.csv', sep=';')

# Create a sample of 20 songs for testing
test_df = df.sample(n=20, random_state=42)

## 1. Spotify Data Fetch & Utility Functions

### Function search_song()

In [3]:
# Create a function to search a given single song in the Spotify API: search_song(title, artist).

def search_song(title: str, artist: str) -> list:
    '''
    Given a song title and artist, this function returns up to 5 posible matches
    from Spotipy, Spotify's API.
    
    Input:
    title: song title as string
    artist: song artist as string
    
    Output:
    List of matches, each match is a dictionary
    
    '''
    
    # Initialize SpotiPy with user credentias
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=Client_ID,
                                                           client_secret=Client_Secret))

    # Create the query string
    query = ("track:"+title if title != "" else "")+" "+("artist:"+artist if artist != "" else "")

    # Search the requested song and artist if the input is not blank
    if len(query) > 0:
        #try:
            raw_results = sp.search(q=query,type="track,artist",limit=5)
            raw_results = raw_results['tracks']['items'] # Parse first two dictionary levels
        #except Exception as e:
            #print("Error:", e)
    else:
        raw_results = []
    
    # Define result variable
    clean_results = []
    
    # Loop through results and store relevant data in clean_results as list of dictionaries
    for index, result in enumerate(raw_results):
        
        # Store each individual result in temporal dictionary             TO-DO: add -> song poopularity, artist nationality...
        clean_result = {'result_index': index,
                        'track_id': result['id'],
                        'href': result['href'],
                        'track_link': result['external_urls']['spotify'],
                        'track_name': result['name'],
                        'artist': result['album']['artists'][0]['name'],
                        'album_name': result['album']['name'],
                        'album_release_year': result['album']['release_date'],
                        'track_duration': round(result['duration_ms']/1000),
                        'album_image': result['album']['images'][1]['url'],
                        'popularity': result['popularity'],
        }
        
        # Append each dictionary to results list
        clean_results.append(clean_result)
    
    return clean_results

In [4]:
# Test function
search_song('Scar Tissue', 'Red Hot')

[{'result_index': 0,
  'track_id': '1G391cbiT3v3Cywg8T7DM1',
  'href': 'https://api.spotify.com/v1/tracks/1G391cbiT3v3Cywg8T7DM1',
  'track_link': 'https://open.spotify.com/track/1G391cbiT3v3Cywg8T7DM1',
  'track_name': 'Scar Tissue',
  'artist': 'Red Hot Chili Peppers',
  'album_name': 'Californication (Deluxe Edition)',
  'album_release_year': '1999-06-08',
  'track_duration': 216,
  'album_image': 'https://i.scdn.co/image/ab67616d00001e0294d08ab63e57b0cae74e8595',
  'popularity': 82},
 {'result_index': 1,
  'track_id': '3Ld1GefhSbw0Iv2VB1C19Y',
  'href': 'https://api.spotify.com/v1/tracks/3Ld1GefhSbw0Iv2VB1C19Y',
  'track_link': 'https://open.spotify.com/track/3Ld1GefhSbw0Iv2VB1C19Y',
  'track_name': 'Scar Tissue',
  'artist': 'Various Artists',
  'album_name': 'Merchant Lake Fishing Songs',
  'album_release_year': '2023-09-16',
  'track_duration': 216,
  'album_image': 'https://i.scdn.co/image/ab67616d00001e024fee9b4468e079206b7ba23b',
  'popularity': 0},
 {'result_index': 2,
  'tr

### Function bulk_song_search()

In [5]:
# Create a function to get all the Spotify song data

def bulk_song_search(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Given a DataFrame with two columns: title and artist,
    this function returns a new df with track data 
    using Spotipy, Spotify's API.
    
    Input:
    df: pandas DataFrame with songs
    
    Output:
    DataFrame with the track data
    
    '''
    
    # Define loop pauses to avoid API server blockage by overflow
    sleep_time = 30 # seconds of sleep time
    loop_count = 50 # tracks per sleep loop
    
    # Define an empty array to append results
    sp_songs = []
    
    loop_counter = 0 # counter to track loop
    counter = 0 # counter to track progress
    fail_counter = 0 # counter to track failed matches
                 
    for index, row in df.iterrows():
        
        counter += 1 # Add one to the global song counter
        
        # Clear last print and print next progress
        clear_output(wait=True)
        print('Downloading song Data...', round(((counter + 1) / df.shape[0])*100), '%')
        
        # Get the current row song title and artist. Fix to maximum 30 chars to avoid API error.
        song_name = str(row['title'])[:30]
        artist_name = str(row['artist'])[:30]
        
        try:
            sp_data = search_song(song_name, artist_name) # Call search_song function to get data
            if len(sp_data) > 0:
                sp_data[0]['is_hot'] = row['is_hot'] # Add is_hot key-value
                sp_songs.append(sp_data[0]) # If there is data, append to global variable
            else:
                fail_counter += 1 # If there is no data, add 1 to fail_counter
        except Exception as e:
            # Log error and add counter
            print("Error:", e)
            fail_counter += 1
        
        # Sleep to avoid API overflow...
        loop_counter += 1
        if loop_counter >= loop_count:
            loop_counter = 0
            print('Sleeping for', sleep_time, 's...') # Print sleep time
            time.sleep(sleep_time) # Sleep -> pause download to simulate human behavior

    # Print download summary
    clear_output(wait=True)
    print("Song data download complete.", "Success rate:", round(((counter-fail_counter)/counter)*100), "%")
    print("Succesful downloads:",counter-fail_counter,"Failed downloads:",fail_counter)
    
    # Convert list of dictionaries to DataFrame
    sp_songs_df = pd.DataFrame(sp_songs)
    
    # Drop result_index column: used for single songs searches in search_song() function
    sp_songs_df.drop(inplace=True, columns='result_index')
    
    return sp_songs_df

In [6]:
# Test function
bulk_song_search_test_df = bulk_song_search(test_df)

Song data download complete. Success rate: 60 %
Succesful downloads: 12 Failed downloads: 8


### Function get_audio_features()

+ [Description of Audio Features](https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features)

In [7]:
# Create a function to search a given single song in the Spotify API: search_song(title, artist).

def get_audio_features(track_ids: list) -> pd.DataFrame:
    '''
    Given a list of track IDs, this function returns 
    track audio features such as danceability, energy or tempo
    using Spotipy, Spotify's API.
    
    Input:
    track_ids: list of Spotipy track_ids
    
    Output:
    DataFrame with the track_ids and audio features
    
    '''
    
    # Define loop pauses to avoid API server blockage by overflow
    sleep_time = 30 # seconds of sleep time
    loop_count = 50 # tracks per sleep loop
    
    # Initialize SpotiPy with user credentias
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=Client_ID,
                                                           client_secret=Client_Secret))

    # Define result variable
    clean_results = []
    
    loop_counter = 0 # counter to track loop
    counter = 0 # counter to track progress
    fail_counter = 0 # counter to track failed matches
    
    # Loop through track_ids to get relevant audio features
    for index, track_id in enumerate(track_ids):
        
        counter += 1 # Add 1 to global counter
        clear_output(wait=True) # Clear print output to update progress
        print('Downloading audio features...', round(((index + 1) / len(track_ids))*100), '%') # Print progress
        
        try:
            # Get audio features from Spotipy
            api_result = sp.audio_features(track_id)[0]

            # Add features to track variable
            track = {'track_id': track_id,
                     'acousticness': api_result['acousticness'],
                     'danceability': api_result['danceability'],
                     'energy': api_result['energy'],
                     'instrumentalness': api_result['instrumentalness'],
                     'key': api_result['key'],
                     'liveness': api_result['liveness'],
                     'loudness': api_result['loudness'],
                     'mode': api_result['mode'],
                     'speechiness': api_result['speechiness'],
                     'tempo': api_result['tempo'],
                     'time_signature': api_result['time_signature'],
                     'valence': api_result['valence'],
            }

            # Append track results to results list
            clean_results.append(track)

        except Exception as e:
            # If there is an error:
            print("Error:", e)
            fail_counter += 1
        
        # Sleep to avoid API overflow...
        loop_counter += 1
        if loop_counter >= loop_count:
            loop_counter = 0
            print('Sleeping for', sleep_time, 's...')
            time.sleep(sleep_time) # Sleep
    
    clear_output(wait=True)
    print("Audio features download complete.", "Success rate:", round(((counter-fail_counter)/counter)*100), "%")
    print("Succesful downloads:",counter-fail_counter,"Failed downloads:",fail_counter)
    
    # Convert list of dictionaries to DataFrame
    clean_results_df = pd.DataFrame(clean_results)

    return clean_results_df



In [8]:
# Create list of track_ids from the Spotify song dataset
track_ids = list(bulk_song_search_test_df['track_id'])

# Test function
audio_features_test_df = get_audio_features(track_ids)

Audio features download complete. Success rate: 100 %
Succesful downloads: 12 Failed downloads: 0


### Function add_audio_features()

In [9]:
# Create function to concatenate dataframes

def add_audio_features(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    '''
    This function concatenates the audio features columns to the base 
    song DataFrame columns and removes any duplicate columns of the 
    concatenates DataFrame.
    
    Input:
    df: base songs DataFrame
    audio_features
    
    '''
    
    # Merge datasets based on track_id column
    merged_df = pd.merge(df1, df2, on='track_id', how='inner')
    
    # Remove duplicate columns
    merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
    
    return merged_df



In [10]:
# Test function
concat_df = add_audio_features(bulk_song_search_test_df, audio_features_test_df)

# Show concat dataset
concat_df.head()

Unnamed: 0,track_id,href,track_link,track_name,artist,album_name,album_release_year,track_duration,album_image,popularity,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,7zFZ0mDaiqPBCRpdDhTqHY,https://api.spotify.com/v1/tracks/7zFZ0mDaiqPB...,https://open.spotify.com/track/7zFZ0mDaiqPBCRp...,Big Balls And Cow Town,Asleep At The Wheel,House Of Blue Lights,2009-12-21,251,https://i.scdn.co/image/ab67616d00001e0283b626...,1,...,0.613,0.000122,0,0.983,-10.06,1,0.0438,126.333,4,0.889
1,3ypDGGBXK5vKZ9JFIR14mW,https://api.spotify.com/v1/tracks/3ypDGGBXK5vK...,https://open.spotify.com/track/3ypDGGBXK5vKZ9J...,Ethiopian Anthem,Ras Michael and The Sons Of Negus,Rally Round,1985-01-01,273,https://i.scdn.co/image/ab67616d00001e02a27a11...,5,...,0.349,0.000646,7,0.103,-15.301,1,0.0431,145.192,4,0.672
2,3AZ5XoHdiFpiKMky6M1Gt9,https://api.spotify.com/v1/tracks/3AZ5XoHdiFpi...,https://open.spotify.com/track/3AZ5XoHdiFpiKMk...,Suicidal Dream,Silverchair,Frogstomp (Deluxe Edition) [Remastered],1995-03-27,193,https://i.scdn.co/image/ab67616d00001e0272c344...,55,...,0.708,1.3e-05,11,0.108,-6.299,0,0.0327,97.532,4,0.388
3,07kBp5VuebubsozpQd3nyK,https://api.spotify.com/v1/tracks/07kBp5Vuebub...,https://open.spotify.com/track/07kBp5Vuebubsoz...,Thriller,Fall Out Boy,Infinity On High,2007-01-01,210,https://i.scdn.co/image/ab67616d00001e02da071a...,52,...,0.901,2e-06,2,0.315,-3.388,1,0.106,178.895,4,0.327
4,0AuhRB6bdvKY7cFpDdCJv9,https://api.spotify.com/v1/tracks/0AuhRB6bdvKY...,https://open.spotify.com/track/0AuhRB6bdvKY7cF...,Realizer,The Crystal Method,Legion of Boom,2004,229,https://i.scdn.co/image/ab67616d00001e02704f60...,35,...,0.769,0.848,10,0.0819,-6.433,0,0.0324,129.989,4,0.448


## 2. Run Functions & Save Data

In [11]:
# Download song data
sp_songs_df = bulk_song_search(df)

# Remove rows with duplicate track_id
sp_songs_df = sp_songs_df[~sp_songs_df.duplicated(subset='track_id', keep='first')]

# Save data as csv
sp_songs_df.to_csv('./data/4_sp_songs.csv',index = False, sep=";")

Song data download complete. Success rate: 57 %
Succesful downloads: 5703 Failed downloads: 4339


In [12]:
# Create list of track_ids from all song data
track_ids = list(sp_songs_df['track_id'])

# Download audio features data
audio_features_df = get_audio_features(track_ids)

# Save data as csv
audio_features_df.to_csv('./data/5_sp_audio_features.csv',index = False, sep=";")

Audio features download complete. Success rate: 100 %
Succesful downloads: 5691 Failed downloads: 0


In [19]:
# Merge song + audio features data
merge_df = add_audio_features(sp_songs_df, audio_features_df)

# Save data as csv
merge_df.to_csv('./data/6_sp_combined_dataset.csv',index = False, sep=";")

# Show dfs shape
print(sp_songs_df.shape)
print(audio_features_df.shape)
print(merge_df.shape)

# Show concat dataset
merge_df.head()

(5691, 11)
(5691, 13)
(5691, 23)


Unnamed: 0,track_id,href,track_link,track_name,artist,album_name,album_release_year,track_duration,album_image,popularity,...,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,3T7XFffEVFl76njTFOKSsL,https://api.spotify.com/v1/tracks/3T7XFffEVFl7...,https://open.spotify.com/track/3T7XFffEVFl76nj...,Martha Served,I Hate Sally,Don't Worry Lady,2007-06-12,139,https://i.scdn.co/image/ab67616d00001e0201a7d2...,1,...,0.987,0.681,1,0.818,-5.382,1,0.111,103.886,4,0.202
1,5VKlSohlApanCSQ1YHA1xq,https://api.spotify.com/v1/tracks/5VKlSohlApan...,https://open.spotify.com/track/5VKlSohlApanCSQ...,Zip-A-Dee-Doo-Dah (Song of the South),101 Strings Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022-10-07,200,https://i.scdn.co/image/ab67616d00001e0234180b...,1,...,0.195,0.546,10,0.122,-16.292,1,0.0373,119.205,4,0.277
2,7d3iTe7NNJzt4sviLqQmgb,https://api.spotify.com/v1/tracks/7d3iTe7NNJzt...,https://open.spotify.com/track/7d3iTe7NNJzt4sv...,Misery Path - From the Privilege of Evil,Amorphis,Karelian Isthmus,1992-11-01,255,https://i.scdn.co/image/ab67616d00001e0255f6d7...,10,...,0.983,0.947,9,0.415,-4.797,1,0.237,79.823,4,0.0397
3,0OsqzEdkrKIz4cMMdFtrvJ,https://api.spotify.com/v1/tracks/0OsqzEdkrKIz...,https://open.spotify.com/track/0OsqzEdkrKIz4cM...,"Nuovi Re, Pt. I I (feat. Tek Money - Lady Tamb...",Inoki,Nobiltà di strada,2007-02-16,259,https://i.scdn.co/image/ab67616d00001e022d85b5...,8,...,0.8,0.0,1,0.623,-4.925,1,0.281,87.954,4,0.755
4,4QL6epCx6rlUbDG4TF3sj9,https://api.spotify.com/v1/tracks/4QL6epCx6rlU...,https://open.spotify.com/track/4QL6epCx6rlUbDG...,Halloween,Dead Kennedys,"Plastic Surgery Disasters/In God We Trust, Inc.",2001,215,https://i.scdn.co/image/ab67616d00001e02af5c4f...,41,...,0.964,2e-06,0,0.125,-4.993,1,0.0935,185.157,4,0.764
