# Spotify Hit Prediction

Given *data about Spotify songs from the 1960s-2010s*, let's try to predict whether a given song will be a **hit** or not.

We will use a variety of classification models to make our predictions.

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
# Decade digit as it appears in each CSV file name ('6' -> dataset-of-60s.csv, ...),
# listed in chronological order: 1960s through 2010s.
decade_digits = ['6', '7', '8', '9', '0', '1']

# One DataFrame per decade file, in the same order as `decade_digits`.
dfs = [pd.read_csv(f'dataset-of-{decade}0s.csv') for decade in decade_digits]

In [18]:
# Tag every per-decade frame with the decade it covers. The order of the
# decades here must match the order in which the frames were loaded into `dfs`.
decade_starts = [1960, 1970, 1980, 1990, 2000, 2010]
for i, decade in enumerate(decade_starts):
    frame = dfs[i]
    frame['decade'] = pd.Series(decade, index=frame.index)

# Stack the decades into one frame, shuffle the rows with a fixed seed so the
# order is reproducible, and renumber the index from zero.
df = (
    pd.concat(dfs, axis=0)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [19]:
# Preview the combined, shuffled frame (pandas abbreviates the display of long frames).
df

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,decade
0,Nachtlager N. Granada - The Lower Castle Yard,Traditional,spotify:track:16BjtTPPOnzv83TQWrnQQf,0.530,0.592,10,-16.461,1,0.0366,0.942000,0.933000,0.3930,0.9160,148.098,83400,4,21.26378,6,0,1970
1,How's It Going To Be,Third Eye Blind,spotify:track:3Uvx1TO0Kg5HgGPk58lHXv,0.562,0.593,5,-9.362,1,0.0263,0.003270,0.001390,0.0967,0.5740,80.289,253413,4,22.58257,8,1,1990
2,Heavenly Perverse,Dimmu Borgir,spotify:track:1J1Z0XIL18hClKmm5T5ytO,0.138,0.981,4,-3.797,1,0.1160,0.000016,0.814000,0.0703,0.0399,97.579,392813,3,19.71308,15,0,2000
3,Era um Garoto Que Como Eu Amava os Beatles e o...,Os Incríveis,spotify:track:0mrP69xWBmlnixuVLFqCSl,0.536,0.713,4,-9.203,1,0.0679,0.251000,0.000000,0.9480,0.7720,128.594,209947,4,66.80246,8,0,1970
4,Clavel Sevillano,Javier Solís,spotify:track:15fxd1I7i8BFsZhKwoKANr,0.296,0.462,0,-9.521,1,0.0362,0.782000,0.000000,0.6320,0.4430,141.942,226000,3,36.35407,8,0,1960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41101,Always Together,Al Martino,spotify:track:146vkvsbGH7xQYRKjAYEhG,0.281,0.272,8,-12.982,1,0.0310,0.735000,0.002450,0.2410,0.3580,105.912,158000,4,19.57412,8,1,1960
41102,Walk Away From Love,David Ruffin,spotify:track:5cAGsX0EzCqKuQr7nui6T7,0.620,0.804,10,-7.606,1,0.0937,0.248000,0.000011,0.2130,0.7760,102.265,328920,4,27.88281,16,1,1970
41103,Doing The Funk,C. Da Afro,spotify:track:0XIkVkScOkBHUH139SjaWz,0.783,0.813,0,-4.538,1,0.0445,0.020800,0.775000,0.1030,0.6660,101.995,381176,4,98.84256,10,0,2010
41104,Long Live Love,Sandie Shaw,spotify:track:7007XzDvqzxH3pl84LkS9z,0.534,0.468,0,-8.760,1,0.0571,0.599000,0.000000,0.2370,0.8980,151.580,162133,4,35.69243,10,1,1960


In [20]:
# Summarize each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41106 entries, 0 to 41105
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             41106 non-null  object 
 1   artist            41106 non-null  object 
 2   uri               41106 non-null  object 
 3   danceability      41106 non-null  float64
 4   energy            41106 non-null  float64
 5   key               41106 non-null  int64  
 6   loudness          41106 non-null  float64
 7   mode              41106 non-null  int64  
 8   speechiness       41106 non-null  float64
 9   acousticness      41106 non-null  float64
 10  instrumentalness  41106 non-null  float64
 11  liveness          41106 non-null  float64
 12  valence           41106 non-null  float64
 13  tempo             41106 non-null  float64
 14  duration_ms       41106 non-null  int64  
 15  time_signature    41106 non-null  int64  
 16  chorus_hit        41106 non-null  float64

In [23]:
# Count the missing values in every column.
df.isna().sum()

track               0
artist              0
uri                 0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
chorus_hit          0
sections            0
target              0
decade              0
dtype: int64

In [21]:
# Count how many songs carry each value of the label to check the class balance.
df["target"].value_counts()

1    20553
0    20553
Name: target, dtype: int64

# Preprocessing

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [25]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Split the song data into standardized train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the identifier columns 'track', 'artist' and 'uri',
        the label column 'target', and the feature columns. Every remaining
        column is passed to ``StandardScaler`` and is therefore expected to
        be numeric.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so that the shuffled split is
        reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features that keep the row index and column names of
        the unscaled split.
    y_train, y_test : pandas.Series
        The matching 'target' values.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Drop high-cardinality categorical columns
    df = df.drop(['track', 'artist', 'uri'], axis=1)

    # Split df into X and y
    y = df['target']
    X = df.drop('target', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X. The scaler is fitted on the training rows only, so no statistics
    # from the test rows influence the features the models are trained on.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [26]:
# Build the scaled train/test split from the combined frame `df` (the name
# `data` is never defined in this notebook, so passing it fails on a fresh kernel).
X_train, X_test, y_train, y_test = preprocess_inputs(df)

In [27]:
# Inspect the standardized training features.
X_train

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,decade
17085,0.302320,0.686659,-1.476879,0.351597,0.662019,0.671378,-0.925209,-0.437039,0.557119,0.712452,0.802450,-0.823274,0.252337,-0.714349,-0.722460,-1.302178
5559,0.144545,0.516236,-1.476879,-0.200313,0.662019,-0.447261,-0.505318,-0.506697,0.029834,1.224613,-0.909156,0.143750,0.252337,-0.447262,-0.099982,-0.156078
32888,-0.047039,1.308901,-1.476879,0.640299,0.662019,0.671378,-1.065666,-0.506991,-0.538012,-1.037120,0.884005,-0.999852,0.252337,1.130341,-1.344938,0.990021
14170,-1.850180,0.290326,-0.911356,1.173837,0.662019,0.053552,-1.072544,-0.508385,2.196919,-2.031535,1.129661,-1.870680,0.252337,-2.112818,-1.759923,0.990021
7958,0.358668,-1.152325,-1.476879,-0.022967,0.662019,0.319170,1.706503,-0.508385,-0.642311,-0.057659,0.191893,-1.062355,-2.104281,-0.549904,-1.137445,-0.729128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,-0.165370,-0.074300,-0.911356,0.062894,0.662019,-0.490556,-0.679780,-0.508231,0.580297,0.686283,-0.633696,-0.554598,0.252337,-0.286273,-0.929953,-0.729128
32511,0.195258,1.645784,0.502454,1.273008,-1.510532,-0.336100,-1.011849,2.269477,-0.126614,-1.067028,0.299202,1.063188,0.252337,-0.482225,0.937481,-0.729128
5192,0.764375,-1.465428,-0.911356,-1.913221,0.662019,-0.510448,1.795212,-0.161565,-0.572779,-1.186657,-0.887618,-1.288577,0.252337,-0.261412,-1.137445,0.990021
12172,0.138910,0.191243,-1.476879,0.621177,0.662019,-0.591187,0.470485,-0.508385,-0.653320,-0.304394,-1.004668,0.090811,0.252337,-0.950627,0.315003,0.416972


In [28]:
# Inspect the training labels that go with X_train.
y_train

17085    0
5559     1
32888    0
14170    0
7958     0
        ..
7813     1
32511    1
5192     0
12172    1
33003    1
Name: target, Length: 28774, dtype: int64

# Training

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Candidate classifiers, keyed by the label used when reporting results.
# The tree-based estimators draw random numbers while fitting, so they are
# given a fixed seed to make the reported scores reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=1),
    "Random Forest": RandomForestClassifier(random_state=1),
}

# Fit every model on the same scaled training split.
for model in models.values():
    model.fit(X_train, y_train)

# Results

In [31]:
# Report each fitted model's score on the held-out test split as a percentage.
for name, model in models.items():
    test_score = model.score(X_test, y_test)
    report_line = name + ": {:.2f}%".format(test_score * 100)
    print(report_line)

Logistic Regression: 74.50%
K-Nearest Neighbors: 75.32%
Decision Tree: 72.01%
Random Forest: 80.54%
