In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

from sklearn.metrics import  classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, \
r2_score, mean_squared_error, mean_absolute_error, median_absolute_error, mean_squared_log_error, explained_variance_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Read and clean dataset

In [2]:
# read
df = pd.read_csv("spotify-dataset.csv")

# lower the attribute
df['Top Genre'] = (df["Top Genre"].str.strip()).str.lower()

# drop genres that have less than 20 instances
to_remove = [genre for genre in df['Top Genre'].unique() if df['Top Genre'].value_counts()[genre] < 20] 
for r in to_remove:
    df = df[df['Top Genre'] != r]

# convert negative values to positive, because the percenption of sound is relative (-N dB == N dB in human ear)
df['Loudness (dB)'] = df['Loudness (dB)'].abs()

# convert duration to int
df['Length (Duration)'] = pd.to_numeric(df['Length (Duration)'].str.replace(',',''))

# drop not used columns
df.drop(columns = ['Index', 'Title', 'Artist', 'Year'], inplace=True)

# encode genres
genres = list(df['Top Genre'].unique())
df['Top Genre'] = df['Top Genre'].apply(lambda x: genres.index(x))

In [3]:
df

Unnamed: 0,Top Genre,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,0,157,30,53,14,11,68,201,94,3,71
1,1,135,79,50,11,17,81,207,17,7,39
3,2,173,96,43,4,3,37,269,0,4,76
4,3,106,82,58,5,10,87,256,1,3,59
6,4,102,71,71,6,13,54,257,6,3,74
...,...,...,...,...,...,...,...,...,...,...,...
1987,0,119,24,75,15,9,43,216,83,12,68
1988,0,168,7,17,21,14,10,298,92,3,66
1989,0,94,21,70,12,11,72,128,84,7,63
1990,0,175,76,36,8,76,95,136,73,6,69


# Perform analysis

The analisis performed, conssits in a classification with random forest, that ....

### Prepare data

In [4]:
targets = ['Top Genre']
features = ['Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)', 'Liveness', 'Valence', 'Length (Duration)', 'Acousticness', 'Speechiness', 'Popularity']
X = df[features]
Y = df[targets]
train_X, val_X, train_y, val_y = train_test_split(X, Y, random_state=1)

### Train model

In [5]:
model = RandomForestClassifier(random_state=1)
model.fit(train_X, train_y)

RandomForestClassifier(random_state=1)

### Perform validation predictions

In [6]:
validation_predictions = model.predict(val_X)

# Plot model statistics

In [7]:
r2 = r2_score(val_y, validation_predictions)
mse = mean_squared_error(val_y, validation_predictions) 
mean_absolute_error = mean_absolute_error(val_y, validation_predictions) 
explained_variance = explained_variance_score(val_y, validation_predictions)
median_absolute_error = median_absolute_error(val_y, validation_predictions)
mean_squared_log_error = mean_squared_log_error(val_y, validation_predictions)

print(f'r2:                     {round(r2,4)}')
print(f'MSE:                    {round(mse,4)}')
print(f'RMSE:                   {round(np.sqrt(mse),4)}')
print(f'MAE:                    {round(mean_absolute_error,4)}')
print(f'explained_variance:     {round(explained_variance,4)}')    
print(f'mean_squared_log_error: {round(mean_squared_log_error,4)}')

r2:                     -0.6434
MSE:                    55.8883
RMSE:                   7.4758
MAE:                    5.049
explained_variance:     -0.3778
mean_squared_log_error: 1.5328


In [8]:
cross_val_score_ = cross_val_score(model, train_X, train_y, cv = 10, scoring = 'accuracy')
print(f'cross validation mean {cross_val_score_.mean()}')
print(f'cross validation std  {cross_val_score_.std()}')

cross validation mean 0.3124437030859049
cross validation std  0.03363924528459963


In [9]:
y_randforest = cross_val_predict(model, train_X, train_y, cv = 10)
recall_score_ = recall_score(train_y, y_randforest, average = "micro")
precission_score_ = precision_score(train_y, y_randforest, average = "micro")
f1_score = 2 * (precission_score_ * recall_score_) / (precission_score_ + recall_score_)
print(f'Recall Score:     {recall_score_}')
print(f'Precision Score:  {precission_score_}')
print(f'F1 Score:         {f1_score}')

Recall Score:     0.3123861566484517
Precision Score:  0.3123861566484517
F1 Score:         0.3123861566484517
