# Import of Spotify Dataset and Dependencies

In [1]:
# Import base dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session.
# NOTE(review): no category or module is given, so this filter is global and also
# hides deprecation warnings raised by pandas and other libraries — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw Spotify CSV from the data folder into a DataFrame
raw_csv_path = "data/musicData_raw.csv"
music = pd.read_csv(raw_csv_path)

# Report (rows, columns), then display the frame itself as the cell output
print(music.shape)
music

(50005, 18)


Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,58878.0,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


# Pre-Processing of Spotify Dataset

## Check and Remove Missing or Invalid Values

In [3]:
# Count the null entries in every column before cleaning
null_counts = music.isna().sum()
print(null_counts)

# Discard, in place, every row that has at least one null entry
music.dropna(axis=0, how="any", inplace=True)

instance_id         5
artist_name         5
track_name          5
popularity          5
acousticness        5
danceability        5
duration_ms         5
energy              5
instrumentalness    5
key                 5
liveness            5
loudness            5
mode                5
speechiness         5
tempo               5
obtained_date       5
valence             5
music_genre         5
dtype: int64


In [4]:
# Inspect the dtype pandas inferred for every column
column_types = music.dtypes
print(column_types)

instance_id         float64
artist_name          object
track_name           object
popularity          float64
acousticness        float64
danceability        float64
duration_ms         float64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo                object
obtained_date        object
valence             float64
music_genre          object
dtype: object


In [5]:
# Drop artist name, instance ID, track name, and obtained date as irrelevant features
music = music.drop(columns=["artist_name", "track_name", "obtained_date", "instance_id"])

# Unknown tempos are stored as the string "?", which is why the column is read
# as dtype object; turn those placeholders into NaN and convert the column to float
music["tempo"] = music["tempo"].replace("?", np.nan).astype(float)

# Replace all missing values in tempo feature with the median value.
# (The previous code called .mean() here, contradicting this comment, the
# variable name and the write-up; median() skips NaN by default.)
tempo_median = music["tempo"].median()
music["tempo"] = music["tempo"].fillna(tempo_median)

In [6]:
# Confirm that no column has any null entries left after cleaning
remaining_nulls = music.isna().sum()
print(remaining_nulls)

# Renumber the rows from 0 in place, discarding the old index labels
music.reset_index(inplace=True, drop=True)

popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
valence             0
music_genre         0
dtype: int64


I first needed to determine how many missing values were present within the dataset. Displaying the sum of missing values made me realize that there were just **5 songs** within the dataset which contained **no feature information.** Because of this, I simply removed these 5 songs from the dataset.

Some of the features were stored with the wrong type. For example, the "tempo" feature was read in as an **object type** because some songs contained the string *'?'* instead of a numeric tempo. I replaced these strings with **NaN values**, converted the column to a **float type**, and then filled all missing "tempo" values with the **median** of the "tempo" column.

## Convert Categorical Labels into Numeric Representations

In [7]:
# Key names listed in the order of their numeric codes
pitch_classes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
# Look-up table from each key name to its position in that list (C -> 0 ... B -> 11)
key_map = {name: position for position, name in enumerate(pitch_classes)}

# Replace every key name in the DataFrame with its integer code
music["key"] = music["key"].map(key_map)

# Dummy-code the mode column into one indicator column per mode value
music = pd.get_dummies(music, prefix="mode", columns=["mode"])

In [8]:
# Show the distinct genre labels present in the data
print(music["music_genre"].unique())

# Genre names listed in the order of their numeric class labels
genre_names = ['Electronic', 'Anime', 'Jazz', 'Alternative', 'Country',
               'Rap', 'Blues', 'Rock', 'Classical', 'Hip-Hop']
# Look-up table from each genre name to its position in that list
genre_map = dict(zip(genre_names, range(len(genre_names))))

# Swap each genre name in the DataFrame for its numeric class label
music["music_genre"] = music["music_genre"].map(genre_map)

['Electronic' 'Anime' 'Jazz' 'Alternative' 'Country' 'Rap' 'Blues' 'Rock'
 'Classical' 'Hip-Hop']


I first converted the string values within the "key" column into **numerical representations** by using a dictionary that **maps** each key to a number. I then **dummy-coded** the "mode" column, creating one new indicator column for each unique mode in the original dataset, so that each song is flagged as being in either **Mode Major** or **Mode Minor**. Finally, I mapped each genre label in the "music_genre" column to a numeric class label using a second dictionary.

## Train/Test Split for the Dataset

In [9]:
# Import additional dependencies for train/test split
from sklearn.model_selection import train_test_split

# Fixed seed so the same songs land in the test set on every run
SPLIT_SEED = 42
# Number of songs per genre held out for the test set
TEST_SONGS_PER_GENRE = 500

# Collect the per-genre pieces in lists and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
train_parts = []
test_parts = []

# Split each genre separately so that every genre contributes exactly
# TEST_SONGS_PER_GENRE randomly chosen songs to the test set
for genre in music["music_genre"].unique():
    genre_data = music[music["music_genre"] == genre]
    train_genre, test_genre = train_test_split(
        genre_data, test_size=TEST_SONGS_PER_GENRE, random_state=SPLIT_SEED
    )
    train_parts.append(train_genre)
    test_parts.append(test_genre)

# Create DataFrames for both training data and test data
train = pd.concat(train_parts)
test = pd.concat(test_parts)

# Guard against leakage: every song must end up in exactly one of the two sets
assert train.index.intersection(test.index).empty, "train and test sets overlap"
assert len(train) + len(test) == len(music), "split lost or duplicated rows"

In [10]:
# Separate the predictors (every column except the label) from the genre label,
# keeping both the features and the target as DataFrames
X_train = train.drop(columns=["music_genre"])
y_train = train["music_genre"].to_frame()
X_test = test.drop(columns=["music_genre"])
y_test = test["music_genre"].to_frame()

In [11]:
# Write each pre-processed split to its own CSV file in the data folder
exports = {
    "data/x_train.csv": X_train,
    "data/x_test.csv": X_test,
    "data/y_train.csv": y_train,
    "data/y_test.csv": y_test,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path)

To perform a **train/test split** of the dataset, given that there were 5,000 songs for each genre, I **randomly** selected 500 songs from each genre for the test set and placed the other 4,500 songs from each genre into the training set. I then established feature and target datasets for both the training and test sets before exporting them. Finally, I verified that there was **no leakage** within the split, which would occur if any training data also appeared in the test data.