In [49]:
import pandas as pd

# Data preparation

In [66]:
df = pd.read_csv('../raw_data/universal_top_spotify_songs.csv')

In [67]:
df.dropna(inplace=True)

In [68]:
df.reset_index(inplace=True)

In [69]:
def top_what(row):
    if row.daily_rank <= 5:
        cat = 'top5'
    elif row.daily_rank <= 10:
        cat = 'top10'
    elif row.daily_rank <= 20:
        cat = 'top20'
    else:
        cat = 'average'
    return row.country + '_' + cat

In [70]:
df['target']=df.apply(top_what, axis = 1)

In [71]:
df = df[['popularity', 'is_explicit', 'duration_ms', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'target']]

# Preprocessing

In [72]:
X = df[df.columns[:-1]]
y=df[df.columns[-1]]

In [73]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

# Fit it to the target
label_encoder.fit(y)

# Find the encoded classes
print(f"The Label Encoder has encoded the penguin classes into {label_encoder.classes_}")

# Transform the targets
encoded_target = label_encoder.transform(y)

The Label Encoder has encoded the penguin classes into ['AE_average' 'AE_top10' 'AE_top5' 'AR_average' 'AR_top10' 'AR_top5'
 'AT_average' 'AT_top10' 'AT_top5' 'AU_average' 'AU_top10' 'AU_top5'
 'BE_average' 'BE_top10' 'BE_top5' 'BG_average' 'BG_top10' 'BG_top5'
 'BO_average' 'BO_top10' 'BO_top5' 'BR_average' 'BR_top10' 'BR_top5'
 'BY_average' 'BY_top10' 'BY_top5' 'CA_average' 'CA_top10' 'CA_top5'
 'CH_average' 'CH_top10' 'CH_top5' 'CL_average' 'CL_top10' 'CL_top5'
 'CO_average' 'CO_top10' 'CO_top5' 'CR_average' 'CR_top10' 'CR_top5'
 'CZ_average' 'CZ_top10' 'CZ_top5' 'DE_average' 'DE_top10' 'DE_top5'
 'DK_average' 'DK_top10' 'DK_top5' 'DO_average' 'DO_top10' 'DO_top5'
 'EC_average' 'EC_top10' 'EC_top5' 'EE_average' 'EE_top10' 'EE_top5'
 'EG_average' 'EG_top10' 'EG_top5' 'ES_average' 'ES_top10' 'ES_top5'
 'FI_average' 'FI_top10' 'FI_top5' 'FR_average' 'FR_top10' 'FR_top5'
 'GB_average' 'GB_top10' 'GB_top5' 'GR_average' 'GR_top10' 'GR_top5'
 'GT_average' 'GT_top10' 'GT_top5' 'HK_average' 

In [74]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, encoded_target , test_size=0.3, random_state=88)

In [75]:
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
import numpy as np

# Fit model
log_model = LogisticRegression().fit(X, y)

# Performs Permutation
permutation_score = permutation_importance(log_model, X, y, n_repeats=10)

# Unstack results showing the decrease in performance after shuffling features
importance_df = pd.DataFrame(np.vstack((X.columns,
                                        permutation_score.importances_mean)).T)
importance_df.columns=['feature','score decrease']

# Show the important features
importance_df.sort_values(by="score decrease", ascending = False)

Unnamed: 0,feature,score decrease
2,duration_ms,0.017726
0,popularity,0.011868
13,tempo,0.008999
6,loudness,0.000903
12,valence,5e-06
14,time_signature,3e-06
3,danceability,0.0
4,energy,0.0
8,speechiness,0.0
10,instrumentalness,0.0


In [60]:
X_train.dtypes

popularity            int64
is_explicit            bool
duration_ms           int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
dtype: object

In [76]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64', 'int64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

full_pipeline = make_pipeline(preproc_basic, RandomForestClassifier())

full_pipeline

In [77]:
from sklearn.model_selection import cross_val_score

# Cross-validate Pipeline
cross_val_score(full_pipeline, X_train, y_train, cv=5, scoring='accuracy', n_jobs= 100).mean()

0.4411722217006739