<img src="images/Spotify_Logo_RGB_Green.png" width="400"/>

### Import modules

In [1]:
import pandas as pd

### Import the dataset

In [2]:
# Load the Hot 100 dataset from a local CSV file; later cells rely on its
# 'Title', 'Artist', 'Peak', 'Date', 'Rank' and 'Weeks' columns.
Hot100 = pd.read_csv('Hot100Complete.csv')

### Isolate the unique tracks from January 1, 2000

In [3]:
# Number of distinct (Title, Artist) combinations in the dataset
ut = Hot100.groupby(['Title', 'Artist']).ngroups

print(f'{ut:,} unique tracks')

28,519 unique tracks


In [4]:
# Unique records indexed by the highest position of the track.
# For every (Title, Artist) pair, idxmin() gives the row label at which 'Peak'
# is smallest, so exactly one row per track is kept.
# idxmin must be *called*: without the parentheses `indices` is a bound method,
# and .loc treats any callable as a selector function to invoke on the frame,
# which is not what is intended here.
indices = Hot100.groupby(['Title', 'Artist'])['Peak'].idxmin()

# Restore the original row order, then renumber the rows from 0
Hot100 = Hot100.loc[indices].sort_index().reset_index(drop = True)

Musical preferences and tastes change over the years, so it is important to use only current music data for the classifier. In this case, I have subset the data to records from January 1, 2000 to the current date. Audio features and audio analysis will be taken from the Spotify API. These defining characteristics will become the features for my classifier.

In [5]:
# Keep only rows dated on or after January 1, 2000.
# .copy() makes the result an independent frame, so the later in-place column
# drop and column assignment do not raise SettingWithCopyWarning.
# NOTE(review): 'Date' is compared against a string here; that matches
# chronological order only if the column holds YYYY-MM-DD text — confirm.
Hot100 = Hot100[Hot100['Date'] >= '2000-01-01'].copy()

records = len(Hot100)
print('{:,} total records'.format(records))

8,034 total records


In [6]:
# Discard the chart-position columns in place, then preview the remaining frame
Hot100.drop(columns = ['Date', 'Rank', 'Peak', 'Weeks'], inplace = True)
Hot100.head()

Unnamed: 0,Title,Artist
0,The Box,Roddy Ricch
1,Life Is Good,Future Featuring Drake
2,Godzilla,Eminem Featuring Juice WRLD
3,Circles,Post Malone
4,Memories,Maroon 5


The Spotipy module allows us to look up tracks by artist and title. To improve chances of retrieving songs, I will use regular expressions to format the search terms to avoid returning 'Not Found'. Most, but not all, songs found on the charts will be found on Spotify. Those that aren't will be removed.

In [None]:
import re

In [None]:
# Build cleaned search terms from the chart titles and artists.
# All patterns are raw strings so that the regex escapes (\s, \() reach the
# regex engine unchanged and do not trigger invalid-escape warnings.

# Titles: drop apostrophes and any trailing parenthetical
title_match = [r"'",
               r'\s\(.*$']
title_pattern = re.compile('|'.join(title_match))
title = [title_pattern.sub('', t) for t in Hot100['Title'].tolist()]

# Artists: reduce the credit to the first-named act by cutting everything from
# a collaboration marker onwards, and drop apostrophes
artist_match = [r'\sFeaturing.*$',
                r'\sfeaturing.*$',
                r'\sFeauring.*$',          # misspelling kept on purpose so it still matches
                r'\s\(Featuring.*$',
                r'\s\(With.*$',
                r'\s\(Duet.*$',
                r'\sIntroducing.*$',
                r'\s"Fenderella"',
                r'The West Coast Rap All-Stars',
                r'\sVs.*$',
                r'\sX\s.*$',
                r'\sx\s.*$',
                r'\sWith\s.*$',
                r'\swith\s.*$',
                r'\sPresents.*$',
                r'\s&\s.*$',
                r',.*$',
                r"'"]
artist_pattern = re.compile('|'.join(artist_match))
artist = [artist_pattern.sub('', a) for a in Hot100['Artist'].tolist()]

### Use the Spotipy API to access audio features

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
import os

# Read the credentials from the environment when they are set, so real secrets
# never have to be typed into (and saved with) the notebook. The placeholder
# strings remain as the fallback and must be replaced if the variables are unset.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'yourclientidhere')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'yourclientsecrethere')

# Client-credentials flow: application-level access, no user login involved
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

Retrieve the Spotify URI (track identifier) for every track in the dataframe. The resulting list is added to the dataframe as a new column. Tracks that are not found are removed, as they cannot be used in our modelling.

In [None]:
# Retrieve Spotify URI
# One search is issued per cleaned (title, artist) pair and the URI of the
# first hit is kept; 'Not Found' marks pairs for which no hit is returned.
spotify_uri = []

for t, a in zip(title, artist):
    search = '{} {}'.format(t, a)
    results = sp.search(q = search, limit = 1, type = 'track')

    try:
        spotify_uri.append(results['tracks']['items'][0]['uri'])
    # Only an empty or malformed response counts as a miss; anything else
    # (including a kernel interrupt) is allowed to surface.
    except (KeyError, IndexError, TypeError):
        spotify_uri.append('Not Found')

# Add a column for SpotifyURI (assign returns a new frame, so no value is
# written into a filtered slice)
Hot100 = Hot100.assign(SpotifyURI = spotify_uri)

# Remove 'Not Found'
Hot100 = Hot100[Hot100['SpotifyURI'] != 'Not Found']

Retrieve all audio features and audio analysis from Spotify using the URIs we found in the previous step.

In [None]:
# Add Audio Features/Audio Analysis
#
# Each track needs three Spotify lookups (track, audio features, audio
# analysis). Every track is turned into one complete record before it is
# stored, so all columns stay aligned with their URI even when a track has to
# be skipped.

def _track_record(client, uri):
    """Collect one row of Spotify metadata and audio descriptors for a track.

    Parameters
    ----------
    client
        Spotipy client used for the three lookups.
    uri : str
        Spotify track URI.

    Returns
    -------
    dict or None
        Column name -> value for the track, in the column order of the final
        dataframe, or None when the audio-features lookup yields no entry.
    """
    track_data = client.track(uri)
    metadata = client.audio_features(uri)
    section_data = client.audio_analysis(uri)

    # audio_features answers with a list (one entry per requested track);
    # per the Spotify docs an entry is null when no features are available.
    audio_features = metadata[0] if metadata else None
    if audio_features is None:
        return None

    sections = section_data['sections']

    # Dynamic range = spread between the loudest and the quietest section
    dB = [section['loudness'] for section in sections]
    try:
        dynamic_range = max(dB) - min(dB)
    except (ValueError, TypeError):
        # no sections, or a section without a usable loudness value
        dynamic_range = 'Not Found'

    return {
        'SpotifyURI' : uri,
        'Acousticness' : audio_features['acousticness'],
        'Danceability' : audio_features['danceability'],
        'DurationMS' : audio_features['duration_ms'],
        'DynamicRange' : dynamic_range,
        'Energy' : audio_features['energy'],
        'Explicit' : 1 if track_data['explicit'] else 0,
        'Instrumentalness' : audio_features['instrumentalness'],
        'Key' : audio_features['key'],
        'Liveness' : audio_features['liveness'],
        'Loudness' : audio_features['loudness'],
        'Mode' : audio_features['mode'],
        'Popularity' : track_data['popularity'],
        'ReleaseDate' : track_data['album']['release_date'],
        'Sections' : len(sections),
        'Speechiness' : audio_features['speechiness'],
        'Tempo' : audio_features['tempo'],
        'TimeSignature' : audio_features['time_signature'],
        'Valence' : audio_features['valence']}


spotify_uri = Hot100['SpotifyURI'].tolist()

track_rows = []
for uri in spotify_uri:
    row = _track_record(sp, uri)
    # Tracks without audio features are left out entirely
    if row is not None:
        track_rows.append(row)

Hot = pd.DataFrame(track_rows)

# Add a label for Hit status
Hot['Hot'] = 1

# Export dataframe
Hot.to_csv('Hot.csv', index = False)

In [7]:
# Read back the file written by the export step above
Hot = pd.read_csv('Hot.csv')

In [8]:
# Summary of the Hot frame: column names, non-null counts and dtypes
Hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7937 entries, 0 to 7936
Data columns (total 20 columns):
SpotifyURI          7937 non-null object
Acousticness        7937 non-null float64
Danceability        7937 non-null float64
DurationMS          7937 non-null int64
DynamicRange        7937 non-null float64
Energy              7937 non-null float64
Explicit            7937 non-null int64
Instrumentalness    7937 non-null float64
Key                 7937 non-null int64
Liveness            7937 non-null float64
Loudness            7937 non-null float64
Mode                7937 non-null int64
Popularity          7937 non-null int64
ReleaseDate         7937 non-null object
Sections            7937 non-null int64
Speechiness         7937 non-null float64
Tempo               7937 non-null float64
TimeSignature       7937 non-null int64
Valence             7937 non-null float64
Hot                 7937 non-null int64
dtypes: float64(10), int64(8), object(2)
memory usage: 1.2+ MB


### Retrieve random, non-Hit songs

Now we need to create records for non-hits. We will use the same steps as before but we will use a random search to find the tracks from Spotify.

In [9]:
import string
from random import choice

In [10]:
# Pools that the random search draws from
letters = [*string.ascii_lowercase]               # 'a' through 'z'
numbers = [*range(100)]                           # 0 through 99
years = sorted(range(2000, 2021), reverse = True) # 2020 down to 2000

In [None]:
# Retrieve random spotify uris
# Each iteration searches for a random title letter, artist letter and year,
# and takes the single track found at a random offset into the result list.

# tqdm only draws the progress bar; fall back to a plain loop when it is not
# installed so the cell still runs on a fresh kernel.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        return iterable

uri = []

for i in tqdm(range(50_000)):
    t = choice(letters)
    a = choice(letters)
    y = choice(years)
    o = choice(numbers)
    search = 'title:{} artist:{} year:{}'.format(t, a, y)
    results = sp.search(q = search, limit = 1, type = 'track', offset = o)

    try:
        uri.append(results['tracks']['items'][0]['uri'])
    # Nothing at this offset for this query: skip the draw. Other errors
    # (including a kernel interrupt) are allowed to surface.
    except (KeyError, IndexError, TypeError):
        continue

To ensure that none of the randomly chosen songs are already among the "Hot" songs, we will remove every song that appears in both sets.

In [None]:
# URIs that are already labelled as hits
spotify_uri = Hot['SpotifyURI'].tolist()

# Collapse the random URIs to unique values and discard any that is also a hit
uri = list(set(uri).difference(spotify_uri))

We retrieve the audio features for the "Not Hot" songs in the same way as we did for the Hot tracks.

In [None]:
# Export dataframe
# NOTE(review): `Not` is not created in any cell shown before this one, so on a
# fresh kernel this line raises NameError. Presumably it is built from `uri`
# with the same feature-retrieval steps used for `Hot` and labelled Hot = 0 —
# confirm and restore that cell.
Not.to_csv('Not.csv', index = False)

In [11]:
# Read back the file written by the export step above
Not = pd.read_csv('Not.csv')

In [12]:
# Summary of the Not frame: column names, non-null counts and dtypes
Not.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10227 entries, 0 to 10226
Data columns (total 20 columns):
SpotifyURI          10227 non-null object
Acousticness        10227 non-null float64
Danceability        10227 non-null float64
DurationMS          10227 non-null int64
DynamicRange        10227 non-null float64
Energy              10227 non-null float64
Explicit            10227 non-null int64
Instrumentalness    10227 non-null float64
Key                 10227 non-null int64
Liveness            10227 non-null float64
Loudness            10227 non-null float64
Mode                10227 non-null int64
Popularity          10227 non-null int64
ReleaseDate         10227 non-null object
Sections            10227 non-null int64
Speechiness         10227 non-null float64
Tempo               10227 non-null float64
TimeSignature       10227 non-null int64
Valence             10227 non-null float64
Hot                 10227 non-null int64
dtypes: float64(10), int64(8), object(2)
memory u

### The Final Dataset

Concatenate the two datasets to create our final dataset.

In [None]:
# Stack the hit and non-hit rows into one frame. Both inputs are numbered from
# 0, so ignore_index renumbers the result instead of leaving duplicate row
# labels that would make label-based selection ambiguous.
hotornot = pd.concat([Hot, Not], axis = 0, ignore_index = True)