## Importando das bibliotecas

In [1]:
import pandas as pd
import numpy as np
import ydata_profiling as prof
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

#Modelos
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix

# métricas de avaliação
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix, auc, roc_curve
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Reamostragem dos dados
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler

# outras
import time
import pickle
import warnings
from scipy import interp
from pprint import pprint
from scipy.stats import norm
from collections import Counter
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline

# mesagens de warning
warnings.filterwarnings("ignore")




## Importando o conjunto de dados

O Pandas cria automaticamente a coluna "Unnamed: 0" quando nenhum cabeçalho é fornecido ou quando há uma coluna adicional sem nome que serve como índice de linha no arquivo CSV. Para impedir que a coluna "Unnamed: 0" apareça durante a leitura de um arquivo CSV com o Pandas, você pode definir qual coluna é o índice do DataFrame usando o parâmetro index_col na função read_csv().

In [2]:
df = pd.read_csv ("dataset 2.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
#Gerando o profiling
profile = ProfileReport(df, title="Profiling Spotify Tracks",html={'style':{'full_width':True}})

In [5]:
#Exportando o profiling
profile.to_file("profiling_spotify_tracks.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
#Verificando o tipo de dado
df.dtypes

track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object

In [7]:
# Descrição dos atributos numéricos
df.describe()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,33.238535,228029.2,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035
std,22.305078,107297.7,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621
min,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0
50%,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0
max,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [8]:
#Tipo do DataFrame
type(df)

pandas.core.frame.DataFrame

## Análise Exploratória dos Dados

In [9]:
#Visualização da dimensão
df.shape

(114000, 20)

In [10]:
#Visualização da quantidade em cada linha
df.count()

track_id            114000
artists             113999
album_name          113999
track_name          113999
popularity          114000
duration_ms         114000
explicit            114000
danceability        114000
energy              114000
key                 114000
loudness            114000
mode                114000
speechiness         114000
acousticness        114000
instrumentalness    114000
liveness            114000
valence             114000
tempo               114000
time_signature      114000
track_genre         114000
dtype: int64

In [11]:
#Visualização dos dados ausentes
df.isnull().sum()

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [12]:
#Os artistas mais citados
df['artists'].value_counts()

The Beatles                                                                 279
George Jones                                                                271
Stevie Wonder                                                               236
Linkin Park                                                                 224
Ella Fitzgerald                                                             222
                                                                           ... 
Automatic Tasty                                                               1
o9                                                                            1
Pyotr Ilyich Tchaikovsky;National Philharmonic Orchestra;Richard Bonynge      1
tstewart;Ólafur Arnalds                                                       1
Jesus Culture                                                                 1
Name: artists, Length: 31437, dtype: int64

In [13]:
#As 50 músicas mais populares
df.sort_values('popularity', ascending=False).head(50)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
20001,3nqQXoyQOWXiESFLlDF1hG,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),100,156943,False,0.714,0.472,2,-7.375,1,0.0864,0.013,5e-06,0.266,0.238,131.121,4,dance
81051,3nqQXoyQOWXiESFLlDF1hG,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),100,156943,False,0.714,0.472,2,-7.375,1,0.0864,0.013,5e-06,0.266,0.238,131.121,4,pop
51664,2tTmW7RDtMQtBk7m2rYeSw,Bizarrap;Quevedo,"Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",99,198937,False,0.621,0.782,2,-5.548,1,0.044,0.0125,0.033,0.23,0.55,128.033,4,hip-hop
81210,4uUG5RXrOk84mYEfFvj3cK,David Guetta;Bebe Rexha,I'm Good (Blue),I'm Good (Blue),98,175238,True,0.561,0.965,7,-3.673,0,0.0343,0.00383,7e-06,0.371,0.304,128.04,4,pop
89411,5ww2BF9slyYgNOk37BlC4u,Manuel Turizo,La Bachata,La Bachata,98,162637,False,0.835,0.679,7,-5.329,0,0.0364,0.583,2e-06,0.218,0.85,124.98,4,reggaeton
68303,5ww2BF9slyYgNOk37BlC4u,Manuel Turizo,La Bachata,La Bachata,98,162637,False,0.835,0.679,7,-5.329,0,0.0364,0.583,2e-06,0.218,0.85,124.98,4,latino
88410,5ww2BF9slyYgNOk37BlC4u,Manuel Turizo,La Bachata,La Bachata,98,162637,False,0.835,0.679,7,-5.329,0,0.0364,0.583,2e-06,0.218,0.85,124.98,4,reggae
30003,4uUG5RXrOk84mYEfFvj3cK,David Guetta;Bebe Rexha,I'm Good (Blue),I'm Good (Blue),98,175238,True,0.561,0.965,7,-3.673,0,0.0343,0.00383,7e-06,0.371,0.304,128.04,4,edm
20008,4uUG5RXrOk84mYEfFvj3cK,David Guetta;Bebe Rexha,I'm Good (Blue),I'm Good (Blue),98,175238,True,0.561,0.965,7,-3.673,0,0.0343,0.00383,7e-06,0.371,0.304,128.04,4,dance
67356,5ww2BF9slyYgNOk37BlC4u,Manuel Turizo,La Bachata,La Bachata,98,162637,False,0.835,0.679,7,-5.329,0,0.0364,0.583,2e-06,0.218,0.85,124.98,4,latin


In [14]:
#Os artistas mais populares
artistas_popularidade = df[["artists","popularity"]]
artistas_popularidade = artistas_popularidade.groupby("artists").max().sort_values(by="popularity", ascending=False).head()

print(artistas_popularidade)

                            popularity
artists                               
Sam Smith;Kim Petras               100
Bizarrap;Quevedo                    99
David Guetta;Bebe Rexha             98
Manuel Turizo                       98
Bad Bunny;Chencho Corleone          97


In [15]:
#As músicas mais longas
musicas_longas = df[["track_name","duration_ms"]].sort_values(by="duration_ms", ascending=False).head(5)
musicas_longas

Unnamed: 0,track_name,duration_ms
73617,Unity (Voyage Mix) Pt. 1,5237295
10984,Crossing Wires 002 - Continuous DJ Mix,4789026
10935,Crossing Wires 002 - Continuous DJ Mix,4789026
24348,The Lab 03 - Continuous DJ Mix Part 1,4730302
73840,Amnesia Ibiza Underground 10 DJ Mix,4563897


In [16]:
#Os gêneros mais populares
pop_genero = df[["track_genre","popularity"]].sort_values(by="popularity", ascending=False).head(5)
pop_genero

Unnamed: 0,track_genre,popularity
20001,dance,100
81051,pop,100
51664,hip-hop,99
81210,pop,98
89411,reggaeton,98


In [17]:
#Músicas mais dançantes
dancante = df[["track_name", "artists", "danceability"]].sort_values(by="danceability", ascending=False).head(5)
dancante

Unnamed: 0,track_name,artists,danceability
111635,Sol Clap,Quantic,0.985
66576,Medicaid Baby,That Girl Lay Lay,0.984
24916,Inspiration,Delano Smith,0.983
73374,Daily Routines,Oliver Schories,0.982
66793,"Featuring Mixx Master Lee, Red Rum & J. Smoov ...",Mixx Master Lee,0.981


In [18]:
#Correlação dos dados

corr = df.corr(numeric_only=True)
plt.figure(figsize=(12,8), dpi=200)
sns.heatmap(corr, annot=True, cmap='viridis')
plt.title("Correlação Heatmap entre as variáveis")
plt.show() 

## Pré-processamento de Dados

In [19]:
#Removendo itens duplicados
df = df.drop_duplicates()
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [20]:
#Removendo valores nulos
df = df.dropna()

In [21]:
#Criando classe de popularidade
def classificacao_popularidade(popularity):
    if popularity > 70:
        return 1
    else:
        return 0
df.insert(5,"popular", df["popularity"].apply(classificacao_popularidade))
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,popular,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,1,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,0,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,0,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,1,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,1,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [22]:
df = df.drop(columns=['popularity'])

In [23]:
df.dtypes

track_id             object
artists              object
album_name           object
track_name           object
popular               int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object

In [24]:
#Mantendo apenas as colunas quantitativas
df_quantitative =df.select_dtypes(include=['int64', 'float64']).copy()
df_quantitative.head()

Unnamed: 0,popular,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1,230666,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4
1,0,149610,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4
2,0,210826,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4
3,1,201933,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3
4,1,198853,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4


In [25]:
#Normalizar os dados
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_quantitative), columns=df_quantitative.columns)
df_normalized.head()

Unnamed: 0,popular,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1.0,0.042473,0.686294,0.461,0.090909,0.791392,0.0,0.148187,0.032329,1e-06,0.358,0.718593,0.361245,0.8
1,0.0,0.026971,0.426396,0.166,0.090909,0.597377,1.0,0.079067,0.927711,6e-06,0.101,0.268342,0.318397,0.8
2,0.0,0.038679,0.44467,0.359,0.0,0.736123,1.0,0.05772,0.210843,0.0,0.117,0.120603,0.313643,0.8
3,1.0,0.036978,0.270051,0.0596,0.0,0.573701,1.0,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758,0.6
4,1.0,0.036389,0.627411,0.443,0.181818,0.737103,1.0,0.054508,0.470884,0.0,0.0829,0.167839,0.492863,0.8


## Dados de treino, validação e teste

In [26]:
#Separação em dados de treino e test
df_train, df_test = train_test_split(df_normalized, test_size=0.2, random_state=14, shuffle=True)

In [27]:
# Visualizando as proporções da variável alvo
df_train.popular.value_counts(normalize=True)

0.0    0.957232
1.0    0.042768
Name: popular, dtype: float64

In [28]:
# Visualizando as proporções da variável alvo
df_test.popular.value_counts(normalize=True)

0.0    0.957816
1.0    0.042184
Name: popular, dtype: float64

In [29]:
# Dividindo X e Y
X = df_train.drop('popular', axis=1)
y = df_train.popular

In [30]:
# Separando os dados mantendo a porcentagem de amostras em cada classe
StratifKfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

In [31]:
# separando os dados em treino e validação usando StratifiedKFold
for train_index, val_index in StratifKfold.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

In [32]:
# checando as proporções da classe 1 na divisão
print(f'Dimensões: {X_train.shape, X_val.shape, y_train.shape, y_val.shape}\n')
print(f"Proporção do df_train para classe=1: {round(len(df_train[df_train.popular==1]) / df_train.shape[0], 4)}\n")
print(f"Proporção de X_train para classe=1:  {round(len(y_train[y_train==1]) / X_train.shape[0], 4)}")
print(f"Proporção de X_val para classe=1:    {round(len(y_val[y_val==1]) / X_val.shape[0], 4)}")

Dimensões: ((72672, 13), (18167, 13), (72672,), (18167,))

Proporção do df_train para classe=1: 0.0428

Proporção de X_train para classe=1:  0.0428
Proporção de X_val para classe=1:    0.0428


## Definição de uma baseline

In [33]:
#Instanciando o modelo
logReg = LogisticRegression(max_iter=1000)

In [34]:
#Treinando o modelo
logReg.fit(X_train, y_train)

In [35]:
#Predição nos dados de treino
y_pred_base_train=logReg.predict(X_train)

In [36]:
#Predição nos dados de validação
y_pred_base_val=logReg.predict(X_val)

In [37]:
#Listando os coeficientes
logReg.coef_.tolist()[0]

[-1.678621266897505,
 1.3356054193133662,
 -1.8093738609607608,
 -0.02807421359885088,
 5.737868804661005,
 -0.13550045536731908,
 -1.525182254675035,
 -1.3920192244425424,
 -2.2467022131190424,
 -1.5245643903364952,
 -0.34094492547216965,
 -0.7958954732613817,
 -0.039795324136916194]

In [38]:
#Criando um dataframe dos coeficientes
df_coef = df_normalized.drop(columns="popular")
coeficientes = pd.DataFrame({
    "Features": df_coef.columns, 
    "Coeficientes": logReg.coef_.tolist()[0]
})
coeficientes

Unnamed: 0,Features,Coeficientes
0,duration_ms,-1.678621
1,danceability,1.335605
2,energy,-1.809374
3,key,-0.028074
4,loudness,5.737869
5,mode,-0.1355
6,speechiness,-1.525182
7,acousticness,-1.392019
8,instrumentalness,-2.246702
9,liveness,-1.524564


## Validação da baseline

In [39]:
print('Nos dados de TREINO:')

print('---' * 10)
print('Modelo:    Regressão Logística (baseline)\n')
print(f"accuracy: {round(accuracy_score(y_train, y_pred_base_train), 4)}")
print(f"precision: {precision_score(y_train, y_pred_base_train)}")
print(f"recall:    {recall_score(y_train, y_pred_base_train)}")
print(f"f1:        {f1_score(y_train, y_pred_base_train)}")
print()
print('---' * 10)
print('---' * 10)
print()
print('Nos dados de VALIDAÇÃO:')
print('---' * 10)
print('Modelo:    Regressão Logística (baseline)\n')
print(f"accuracy:  {round(accuracy_score(y_val, y_pred_base_val), 4)}")
print(f"precision: {precision_score(y_val, y_pred_base_val)}")
print(f"recall:    {recall_score(y_val, y_pred_base_val)}")
print(f"f1:        {f1_score(y_val, y_pred_base_val)}")
print('---' * 10)

Nos dados de TREINO:
------------------------------
Modelo:    Regressão Logística (baseline)

accuracy: 0.9572
precision: 0.0
recall:    0.0
f1:        0.0

------------------------------
------------------------------

Nos dados de VALIDAÇÃO:
------------------------------
Modelo:    Regressão Logística (baseline)

accuracy:  0.9572
precision: 0.0
recall:    0.0
f1:        0.0
------------------------------


## Reamostragem, teste e validação

In [40]:
#Classificadores
classificadores = {
    "LogisticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier()
}

## Distribuição - Random UnderSampling

In [41]:
#Intanciando o método
undersampling=NearMiss()

In [42]:
# transformando o conjunto de dados
X_under, y_under = undersampling.fit_resample(X_train, y_train)

In [43]:
# resumindo a distribuição da nova classe
counter = Counter(y_under)
print(counter)

Counter({0.0: 3108, 1.0: 3108})


Função Counter: retorna um dicionário que mapeia cada valor da variável alvo (classe) aos seus respectivos contadores (frequências).

In [44]:
for nome, modelo in classificadores.items():
    modelo.fit(X_under, y_under)
    y_pred = modelo.predict(X_val)
    
    print(f'Modelo:    {nome}\n')
    print(f"accuracy:  {accuracy_score(y_val, y_pred)}")
    print(f"precision: {precision_score(y_val, y_pred)}")
    print(f"recall:    {recall_score(y_val, y_pred)}")
    print(f"f1:        {f1_score(y_val, y_pred)}")
    print('---' * 20)
    print('---' * 20)

Modelo:    LogisticRegression

accuracy:  0.26526118786811254
precision: 0.033751205400192864
recall:    0.5855855855855856
f1:        0.0638238182073222
------------------------------------------------------------
------------------------------------------------------------
Modelo:    KNearest

accuracy:  0.278141685473661
precision: 0.03616813294232649
recall:    0.6190476190476191
f1:        0.0683432793407218
------------------------------------------------------------
------------------------------------------------------------
Modelo:    DecisionTreeClassifier

accuracy:  0.288930478339847
precision: 0.04909752655425982
recall:    0.8507078507078507
f1:        0.09283707865168539
------------------------------------------------------------
------------------------------------------------------------
Modelo:    RandomForest

accuracy:  0.16463918093245997
precision: 0.04407573934519663
recall:    0.8957528957528957
f1:        0.08401738290680831
-----------------------------------

## Distribuição - Random Over-Sampling

In [45]:
#Instanciando o método
oversampling = RandomOverSampler(sampling_strategy="minority")

In [46]:
#Transformando o conjunto de dados
X_over, y_over = oversampling.fit_resample(X_train,y_train)

In [47]:
# resumindo a distribuição da nova classe
counter = Counter(y_over)
print(counter)

Counter({0.0: 69564, 1.0: 69564})


In [48]:
for nome, modelo in classificadores.items():
    modelo.fit(X_over, y_over)
    y_pred = modelo.predict(X_val)
    
    print(f'Modelo:    {nome}\n')
    print(f"accuracy:  {accuracy_score(y_val, y_pred)}")
    print(f"precision: {precision_score(y_val, y_pred)}")
    print(f"recall:    {recall_score(y_val, y_pred)}")
    print(f"f1:        {f1_score(y_val, y_pred)}")
    print('---' * 20)
    print('---' * 20)

Modelo:    LogisticRegression

accuracy:  0.576925194033137
precision: 0.06683385579937304
recall:    0.685971685971686
f1:        0.12180073126142595
------------------------------------------------------------
------------------------------------------------------------
Modelo:    KNearest

accuracy:  0.9106071448230307
precision: 0.2651136993899057
recall:    0.6151866151866152
f1:        0.3705426356589147
------------------------------------------------------------
------------------------------------------------------------
Modelo:    DecisionTreeClassifier

accuracy:  0.9494137722243629
precision: 0.4336448598130841
recall:    0.5971685971685972
f1:        0.5024363833243097
------------------------------------------------------------
------------------------------------------------------------
Modelo:    RandomForest

accuracy:  0.9690097429404965
precision: 0.6597014925373135
recall:    0.5688545688545689
f1:        0.6109191430545957
------------------------------------------

## SMOTE (Over-Sampling)

In [49]:
#Instanciando o método
oversample_smote = SMOTE(sampling_strategy="minority")

In [50]:
#Transformando o conjunto de dados
X_smote, y_smote = oversample_smote.fit_resample(X_train,y_train)

In [51]:
# resumindo a distribuição da nova classe
counter = Counter(y_smote)
print(counter)

Counter({0.0: 69564, 1.0: 69564})


In [52]:
for nome, modelo in classificadores.items():
    modelo.fit(X_smote, y_smote)
    y_pred = modelo.predict(X_val)
    
    print(f'Modelo:    {nome}\n')
    print(f"accuracy:  {accuracy_score(y_val, y_pred)}")
    print(f"precision: {precision_score(y_val, y_pred)}")
    print(f"recall:    {recall_score(y_val, y_pred)}")
    print(f"f1:        {f1_score(y_val, y_pred)}")
    print('---' * 20)
    print('---' * 20)

Modelo:    LogisticRegression

accuracy:  0.590246050531183
precision: 0.06702169112871802
recall:    0.6640926640926641
f1:        0.1217555450684285
------------------------------------------------------------
------------------------------------------------------------
Modelo:    KNearest

accuracy:  0.8581493917542797
precision: 0.17994310099573257
recall:    0.6512226512226512
f1:        0.2819726943438284
------------------------------------------------------------
------------------------------------------------------------
Modelo:    DecisionTreeClassifier

accuracy:  0.9032861782352617
precision: 0.24505723204994798
recall:    0.6061776061776062
f1:        0.3490181548721749
------------------------------------------------------------
------------------------------------------------------------
Modelo:    RandomForest

accuracy:  0.9568448285352562
precision: 0.4960893854748603
recall:    0.5714285714285714
f1:        0.5311004784688995
----------------------------------------

## Oversampling e Undersampling - Método Híbrido

In [53]:
# Transformando os dados
over = RandomOverSampler (sampling_strategy=0.1)
under = RandomUnderSampler (sampling_strategy = 0.5)

In [54]:
# Instanciando um pipeline com os dois métodos
pipeline = Pipeline (steps=[("o", over), ("u", under)])

In [55]:
# Transformando o conjunto de dados
X_over_under, y_over_under = pipeline.fit_resample(X_train, y_train)

In [56]:
# resumindo a distribuição da nova classe
counter = Counter(y_smote)
print(counter)

Counter({0.0: 69564, 1.0: 69564})


In [57]:
for nome, modelo in classificadores.items():
    modelo.fit(X_over_under, y_over_under)
    y_pred = modelo.predict(X_val)
    
    print(f'Modelo:    {nome}\n')
    print(f"accuracy:  {accuracy_score(y_val, y_pred)}")
    print(f"precision: {precision_score(y_val, y_pred)}")
    print(f"recall:    {recall_score(y_val, y_pred)}")
    print(f"f1:        {f1_score(y_val, y_pred)}")
    print('---' * 20)
    print('---' * 20)

Modelo:    LogisticRegression

accuracy:  0.8737821324379369
precision: 0.10105263157894737
recall:    0.2471042471042471
f1:        0.14344415390362347
------------------------------------------------------------
------------------------------------------------------------
Modelo:    KNearest

accuracy:  0.7769582209500743
precision: 0.11260941566122545
recall:    0.6126126126126126
f1:        0.19024780175859313
------------------------------------------------------------
------------------------------------------------------------
Modelo:    DecisionTreeClassifier

accuracy:  0.8725161006220069
precision: 0.19906140007821665
recall:    0.6550836550836551
f1:        0.3053389322135573
------------------------------------------------------------
------------------------------------------------------------
Modelo:    RandomForest

accuracy:  0.961853910937414
precision: 0.5506024096385542
recall:    0.5881595881595881
f1:        0.5687616677037959
--------------------------------------

## Avaliação dos modelos

O random forest com o método de balanceamento Random Over-Sampling performou melhor

## Ajustando o modelo

## Definição dos hiperparâmetros

In [58]:
# Transformação dos dados
X_train_array = X_train.values
X_val_array = X_val.values
y_train_array = y_train.values
y_val_array = y_val.values

In [59]:
# Hiperparâmetros para Random Forest

# número de árvores
n_estimators = [3,5,10,100]

# número de features para considerar em cada split
max_features = [3,4,5,6,None]

# número máximo de níveis da árvore
max_depth = [3,4,5,6,None]

# pesos para cada classe
class_weight = ['balanced',None]

# colocando os parametros definidos em um dicionário
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'class_weight': class_weight}

## Random Forest + Over-Sampling

In [60]:
# definindo listas vazias para receber os resultados da validação cruzada
accuracy_lst_rf = []
precision_lst_rf = []
recall_lst_rf = []
f1_lst_rf = []
auc_lst_rf = []

# instanciando o RandomizedSearchCV para validação cruzada com o grid de parametros
rf_grid = RandomizedSearchCV(RandomForestClassifier(), random_grid, random_state=14)

# iniciando o tempo e contador para o loop
start = time.time()
count = 0

# looping para a validação cruzada
for train, test in StratifKfold.split(X_train_array, y_train_array):
    
    # instanciando o pipeline com técnica de reamostragem
    pipeline_rf = imbalanced_make_pipeline(RandomOverSampler(sampling_strategy='minority'), rf_grid)
    
    # executando o pipeline para treinar o modelo
    model_rf = pipeline_rf.fit(X_train_array[train], y_train_array[train])
    
    # identificando os melhores parâmetros no treinamento
    best_est_rf = rf_grid.best_estimator_
    
    # realizando as previsões com o resultado do treinamento
    prediction_rf = best_est_rf.predict(X_train_array[test])
    
    # armazenando os resultados
    accuracy_lst_rf.append(pipeline_rf.score(X_train_array[test], y_train_array[test]))
    precision_lst_rf.append(precision_score(y_train_array[test], prediction_rf))
    recall_lst_rf.append(recall_score(y_train_array[test], prediction_rf))
    f1_lst_rf.append(f1_score(y_train_array[test], prediction_rf))
    auc_lst_rf.append(roc_auc_score(y_train_array[test], prediction_rf))
    
    # identificando em qual loop está
    count = count + 1
    print(f"kfold nº: {count}")
    
    # medindo o tempo de cada loop
    intervalo = time.time()
    print(f"tempo intermediário: {(intervalo - start)/60:.1f} minutos")
    print()

print('---' * 20)
print(f"melhores estimadores para RF: {rf_grid.best_params_}")

# imprimindo a média dos resultados da validação cruzada
print('---' * 20)
print('Resultados')
print(f"accuracy:  {np.mean(accuracy_lst_rf)}")
print(f"precision: {np.mean(precision_lst_rf)}")
print(f"recall:    {np.mean(recall_lst_rf)}")
print(f"f1:        {np.mean(f1_lst_rf)}")
print(f"AUC:       {np.mean(auc_lst_rf)}")
print('---' * 20)

# imprimindo o tempo total da validação cruzada
end = time.time()
print(f"Tempo do loop para validação cruzada: {(end - start)/60:.1f} minutos")

kfold nº: 1
tempo intermediário: 4.5 minutos

kfold nº: 2
tempo intermediário: 8.8 minutos

kfold nº: 3
tempo intermediário: 13.5 minutos

kfold nº: 4
tempo intermediário: 18.6 minutos

kfold nº: 5
tempo intermediário: 23.1 minutos

------------------------------------------------------------
melhores estimadores para RF: {'n_estimators': 10, 'max_features': None, 'max_depth': None, 'class_weight': None}
------------------------------------------------------------
Resultados
accuracy:  0.9654887343149776
precision: 0.6181561221975604
recall:    0.5067456803931011
f1:        0.5568671263159294
AUC:       0.7463649133181635
------------------------------------------------------------
Tempo do loop para validação cruzada: 23.1 minutos


In [78]:
def draw_cv_roc_curve(classificadores, cv, X, y, title='ROC Curve'):
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []
    
    for i, (train, test) in enumerate(cv.split(X, y)):
        probas_ = classificadores.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
        fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(title)
    plt.legend(loc="lower right")
    sns.despine()
    plt.show()

draw_cv_roc_curve(best_est_rf, StratifKfold, X_train, y_train, title='ROC Curve')
plt.show()

## Treinamento do modelo Final

In [62]:
# separando novamente os dados
X_all = df_train.drop('popular', axis=1)
y_all = df_train.popular

# instanciando o modelo final com os parametros encontrados na validação cruzada
final_model = rf_grid.best_estimator_

# definindo o pipeline para reamostragem oversampling e o modelo final
pipeline = imbalanced_make_pipeline(RandomOverSampler(sampling_strategy='minority'), final_model)

# treinando o modelo com os parametros ajustados com todos os dados
modelo = pipeline.fit(X_all, y_all)

In [63]:
lista = modelo.steps[1][1].feature_importances_

In [74]:
# lista de nomes
nomes = df_normalized.columns

plt.barh(df_normalized.drop(columns=['popular']).columns.tolist(), lista.tolist())

plt.show()

## Preparação, previsão e avaliação nos dados de teste

In [66]:
# separando os dados de teste
X_test = df_test.drop('popular', axis=1)
y_test = df_test.popular

# realizando as predições nos dados de teste
y_pred_test = modelo.predict(X_test)

# imprimindo os resultados
print(f"accuracy:  {accuracy_score(y_test, y_pred_test)}")
print(f"precision: {precision_score(y_test, y_pred_test)}")
print(f"recall:    {recall_score(y_test, y_pred_test)}")
print(f"f1:        {f1_score(y_test, y_pred_test)}")
     


accuracy:  0.9699691765741963
precision: 0.6393939393939394
recall:    0.6607515657620042
f1:        0.6498973305954826


In [79]:
plt.figure(figsize=(5,5))
sns.heatmap(confusion_matrix(y_test, y_pred_test), annot=True, cmap=plt.cm.copper, fmt='g', cbar=False)
plt.title("Matriz de Confusão", fontsize=16, fontweight='bold')
plt.show()

In [68]:
pd.DataFrame(index=['Baseline', 'Validação', 'Teste'], 
             data = {"accuracy": [accuracy_score(y_val, y_pred_base_val), np.mean(accuracy_lst_rf), accuracy_score(y_test, y_pred_test)], 
                     "precision": [precision_score(y_val, y_pred_base_val), np.mean(precision_lst_rf), precision_score(y_test, y_pred_test)],
                     "recall": [recall_score(y_val, y_pred_base_val), np.mean(recall_lst_rf), recall_score(y_test, y_pred_test)],
                     "f1": [f1_score(y_val, y_pred_base_val), np.mean(f1_lst_rf), f1_score(y_test, y_pred_test)]}
            )

Unnamed: 0,accuracy,precision,recall,f1
Baseline,0.95723,0.0,0.0,0.0
Validação,0.965489,0.618156,0.506746,0.556867
Teste,0.969969,0.639394,0.660752,0.649897


## Previsões

In [81]:
# armazenando as previsões no dataframe de teste
df_test['Predicted'] = y_pred_test

# comparando com real com o previsto no dataframe
df_test['Correct?'] = np.where(df_test['popular'] == df_test['Predicted'], 'Yes', 'No')
df_test

Unnamed: 0,popular,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,Predicted,Correct?
112072,0.0,0.035461,0.919797,0.58200,0.363636,0.750754,0.0,0.233161,0.053112,0.026900,0.2460,0.500503,0.542359,0.8,0.0,Yes
55204,0.0,0.010077,0.618274,0.63100,0.000000,0.772044,1.0,0.039793,0.524096,0.000028,0.1020,0.482412,0.386400,0.8,0.0,Yes
71214,1.0,0.037640,0.395939,0.93900,0.363636,0.845366,0.0,0.081554,0.000051,0.000000,0.0968,0.497487,0.665635,0.8,1.0,Yes
109047,0.0,0.050793,0.568528,0.99600,0.090909,0.818582,1.0,0.061347,0.000140,0.688000,0.1430,0.161809,0.591736,0.8,0.0,Yes
69749,0.0,0.040362,0.616244,0.92600,0.454545,0.848862,1.0,0.238342,0.377510,0.000000,0.2640,0.826131,0.822498,0.8,0.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30099,0.0,0.035909,0.367513,0.85800,0.181818,0.843664,1.0,0.076995,0.002118,0.000001,0.2500,0.283417,0.525496,0.8,0.0,Yes
45716,0.0,0.022743,0.185787,0.00002,0.000000,0.409078,1.0,0.101969,0.942771,0.997000,0.1130,0.000000,0.306075,0.6,0.0,Yes
12524,0.0,0.036255,0.598985,0.31800,0.272727,0.704160,1.0,0.032228,0.694779,0.000000,0.1140,0.223116,0.560290,0.8,0.0,Yes
1038,0.0,0.045888,0.742132,0.57500,0.818182,0.800529,0.0,0.042487,0.705823,0.000000,0.1210,0.707538,0.591629,0.8,0.0,Yes
