### Importando o banco de dados

#### Importando as bibliotecas necessárias

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#### Importando o banco de dados

In [3]:
df = pd.read_csv('dataset.csv')

In [4]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic


#### Limpando o banco de dados

##### Retirando as colunas:'Unnamed: 0, track_id, artists, album_name e track_name'

In [4]:
df.drop(columns=['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name'],inplace=True)
df.head(2)

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic


##### Verificando se existem valores NaN

In [6]:
df.isnull().any()

popularity          False
duration_ms         False
explicit            False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
time_signature      False
track_genre         False
dtype: bool

##### Convertendo a coluna das classes(string) para valores inteiros

In [5]:
class_values =(pd.unique(df['track_genre'])).tolist()
dicionario = dict()

for i, value in enumerate(class_values):
    dicionario[value] = i


df['track_genre'] = df['track_genre'].map(dicionario)

### Separando os dados para treino do modelo

##### Selecionando uma amostra de 10% dos dados

In [6]:
df_sample = df.sample(frac=0.1, random_state=5).reset_index(drop=True)
df_sample.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,151173,False,0.193,0.0371,9,-35.398,0,0.0408,0.984,0.937,0.129,0.146,136.13,3,16
1,39,177960,False,0.717,0.931,0,-6.977,1,0.0314,0.592,4e-06,0.261,0.964,115.993,4,112
2,43,263906,False,0.739,0.74,0,-9.442,1,0.1,0.347,5e-06,0.706,0.791,94.516,4,74
3,20,45139,True,0.677,0.643,3,-10.145,0,0.803,0.726,2e-06,0.934,0.777,62.455,4,18
4,39,261400,False,0.79,0.814,9,-6.358,0,0.0268,0.243,0.0102,0.528,0.914,124.03,4,28


Separando as features das classes

In [7]:
X = df_sample.loc[:, df.columns != 'track_genre']
y = df_sample['track_genre']

##### Criando o codificador para transformar as variáveis categóricas em variáveis dummy

In [8]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [9]:
enc_transform  = enc.fit_transform(df_sample[[ 'explicit', 'key', 'mode', 'time_signature'] ]

In [10]:
# retirando as colunas do dataframe original, visto que serão substituidas pelas suas respectivas variáveis dummy.
X = X.drop(axis=1,columns=['key', 'explicit', 'mode', 'time_signature'])

In [12]:
# unindo o dataframe original com as variáveis dummy
X = pd.concat([X, enc_transform], axis=1)

In [13]:
X.columns

Index(['popularity', 'duration_ms', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'explicit_False', 'explicit_True', 'key_0', 'key_1',
       'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9',
       'key_10', 'key_11', 'mode_0', 'mode_1', 'time_signature_0',
       'time_signature_1', 'time_signature_3', 'time_signature_4',
       'time_signature_5'],
      dtype='object')

Padronizando o valor das features

In [14]:
scaler = StandardScaler()

# essa função serve para aplicar a padronização apenas nas variáveis numéricas quantitativas
ct = ColumnTransformer(
    [
        ('colunas', scaler, 
        ['popularity', 'duration_ms', 'danceability', 'energy',
        'loudness',  'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo'])
    ], remainder='passthrough'
)

# transformando as variáveis quantitativas
X_scaled = ct.fit_transform(X)

Dividir os dados em conjunto de treino e teste 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state = 5)

### Algorítmo KNN

In [14]:
knn = KNeighborsClassifier()

##### Definindo o espaço de parâmetros para busca

In [15]:
param_dist = {
    'n_neighbors': np.arange(1,100),
    'weights': ['uniform', 'distance'],
    'metric': [ 'euclidean', 'manhattan']
}

##### Configurando o RandomizedSearchCV

In [16]:
knn_cv = RandomizedSearchCV(
    knn,
    param_distributions=param_dist,
    n_iter=10,
    cv = 5,
    scoring='accuracy',
    random_state=5,
    verbose=0
)

#### Treinando e validando o modelo

In [17]:
knn_cv.fit(X_train, y_train)

In [18]:
print("Melhores parâmetros: ", knn_cv.best_estimator_)
print("Melhor acurácia na validação cruzada", knn_cv.best_score_)

best_knn = knn_cv.best_estimator_
test_accuracy = best_knn.score(X_test, y_test)
print("Acurácia no conjunto de teste: ", test_accuracy)

Melhores parâmetros:  KNeighborsClassifier(metric='manhattan', n_neighbors=np.int64(99),
                     weights='distance')
Melhor acurácia na validação cruzada 0.18157894736842103
Acurácia no conjunto de teste:  0.1824561403508772


In [19]:
print('Valor do f1_score: ', f1_score(y_test, best_knn.predict(X_test), average='macro') )
print('Precisão do modelo: ', precision_score(y_test, best_knn.predict(X_test), average='macro'))
print("Valor do recall do modelo: ", recall_score(y_test, best_knn.predict(X_test), average='macro'))

Valor do f1_score:  0.1719092794539346
Precisão do modelo:  0.20059519365128098
Valor do recall do modelo:  0.19424864064956945


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Algorítmo LVQ

In [20]:
from sklvq import GLVQ

In [21]:
lvq = GLVQ()

Definindo o espaço de parâmetros para busca

In [23]:
param_dist = {
    'distance_type' : ["euclidean"],
    'prototype_n_per_class' : [1, 2, 3, 4, 5]
}

Configurando o RandomizedSearchCV

In [24]:
lvq_cv = RandomizedSearchCV(
    estimator=lvq,
    param_distributions=param_dist,
    cv = 3,
    random_state=5,
    n_iter = 3
)

In [25]:
lvq_cv.fit(X_train, y_train)



In [26]:
print("Melhores parâmetros: ", lvq_cv.best_estimator_)
print("Melhor acurácia na validação cruzada", lvq_cv.best_score_)

best_lvq = lvq_cv.best_estimator_
test_accuracy = best_lvq.score(X_test, y_test)
print("Acurácia no conjunto de teste: ", test_accuracy)

Melhores parâmetros:  GLVQ(distance_type='euclidean', prototype_n_per_class=2)
Melhor acurácia na validação cruzada 0.17183235867446392
Acurácia no conjunto de teste:  0.1763157894736842




In [27]:
print('Valor do f1_score: ', f1_score(y_test, best_lvq.predict(X_test), average='macro') )
print('Precisão do modelo: ', precision_score(y_test, best_lvq.predict(X_test), average='macro'))
print("Valor do recall do modelo: ", recall_score(y_test, best_lvq.predict(X_test), average='macro'))

Valor do f1_score:  0.15841722107156841
Precisão do modelo:  0.1659440351900613
Valor do recall do modelo:  0.18869747312084775




### Algorítmo SVM

In [25]:
smv = SVC()

In [53]:
param_dist = {
    'C' : [0.01,0.1,1],
    'kernel' : ['poly', 'rbf', 'sigmoid'],
    'gamma' : [0.1,1],
    'degree': [3,5]
}

In [54]:
smv_cv = RandomizedSearchCV(
    estimator=smv,
    param_distributions= param_dist,
    cv=3,
    n_iter=3,
    scoring = 'accuracy',
    random_state=5
)

In [55]:
smv_cv.fit(X_train, y_train)

##### Validando o modelo

In [56]:
print("Melhores parâmetros: ", smv_cv.best_estimator_)
print("Melhor acurácia na validação cruzada", smv_cv.best_score_)

best_smv = smv_cv.best_estimator_
test_accuracy = best_smv.score(X_test, y_test)
print("Acurácia no conjunto de teste: ", test_accuracy)

Melhores parâmetros:  SVC(C=1, degree=5, gamma=0.1)
Melhor acurácia na validação cruzada 0.20672514619883042
Acurácia no conjunto de teste:  0.2219298245614035


In [57]:
print('Valor do f1_score: ', f1_score(y_test, best_smv.predict(X_test), average='macro') )
print('Precisão do modelo: ', precision_score(y_test, best_smv.predict(X_test), average='macro'))
print("Valor do recall do modelo: ", recall_score(y_test, best_smv.predict(X_test), average='macro'))

Valor do f1_score:  0.2076186317229871


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precisão do modelo:  0.237161837959211
Valor do recall do modelo:  0.23482656762455525
