In [143]:
#!pip install category_encoders
#!pip install -U feature-engine

In [144]:
#!pip install -U tensorflow
#!pip install -U keras

In [145]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from keras.models import Sequential, Model
from keras.layers import Input, Dense

from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import category_encoders as ce


### Orientações

O dataset fornecido possui diversas características que foram extraídas dos arquivos de áudio bruto. Ele foi construído para auxiliar no desenvolvimento de um modelo que indique se uma música deve ser recomendada para um grupo específico de pessoas. Esses rótulos estão representados na coluna target, onde 0 representa que a música não deve ser recomendado. Além das características extraídas do áudio, o dataset possui algumas variáveis categóricas, que serão o objeto deste exercício.

Você deverá realizar suas atividades com base nos seguintes pontos:

1. Estabelecer um *baseline* do modelo sem a utilização das variáveis categóricas. Para este caso, busque otimizar o valor do parâmetro k no caso de utilização do classificador KNN;

2. Definir quais variáveis categóricas deverão ser utilizadas e quais serão descartadas;

3. Estabelecer a influência da(s) variável(is) categórica(s) no resultado do modelo, levando em conta as diferentes possibilidades de feature engineering estudadas até o momento;



###Carga do Dataset

In [146]:
url = 'datasets/DadosSpotify.csv'
dataset = pd.read_csv(url, engine='python')
dataset.head()

Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future
1,1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4.0,0.588,1,Redbone,Childish Gambino
2,2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4.0,0.173,1,Xanny Family,Future
3,3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4.0,0.23,1,Master Of None,Beach House
4,4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4.0,0.904,1,Parallel Lines,Junior Boys


In [147]:
dataset.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2017 entries, 0 to 2016
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2017 non-null   int64  
 1   acousticness      2017 non-null   float64
 2   danceability      2017 non-null   float64
 3   duration_ms       2017 non-null   int64  
 4   energy            2017 non-null   float64
 5   instrumentalness  2017 non-null   float64
 6   key               2017 non-null   int64  
 7   liveness          2017 non-null   float64
 8   loudness          2017 non-null   float64
 9   mode              2017 non-null   int64  
 10  speechiness       2017 non-null   float64
 11  tempo             2017 non-null   float64
 12  time_signature    2017 non-null   float64
 13  valence           2017 non-null   float64
 14  target            2017 non-null   int64  
 15  song_title        2017 non-null   object 
 16  artist            2017 non-null   object 
dtype

###Pré-processamento

In [148]:
#Coloque Aqui as fases de pré-processamento em uma ou mais células de código
dataset.info()
dataset.drop(['id', 'song_title'], inplace=True, axis=1)

<class 'pandas.core.frame.DataFrame'>
Index: 2017 entries, 0 to 2016
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2017 non-null   int64  
 1   acousticness      2017 non-null   float64
 2   danceability      2017 non-null   float64
 3   duration_ms       2017 non-null   int64  
 4   energy            2017 non-null   float64
 5   instrumentalness  2017 non-null   float64
 6   key               2017 non-null   int64  
 7   liveness          2017 non-null   float64
 8   loudness          2017 non-null   float64
 9   mode              2017 non-null   int64  
 10  speechiness       2017 non-null   float64
 11  tempo             2017 non-null   float64
 12  time_signature    2017 non-null   float64
 13  valence           2017 non-null   float64
 14  target            2017 non-null   int64  
 15  song_title        2017 non-null   object 
 16  artist            2017 non-null   object 
dtype

In [156]:
#Normalização do Conjunto
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder().fit(dataset['artist'])
dataset['artist_label'] = encoder.transform(dataset['artist'])


from feature_engine.encoding import RareLabelEncoder
encoder = RareLabelEncoder(tol=0.03, n_categories=2,
                                         variables=['artist'],
                                         replace_with='Rare')
# fit the encoder
encoder.fit(X_train)


dataset.drop('artist', inplace=True, axis=1)
#dataset.drop('artist_label', inplace=True, axis=1)
dataset

KeyError: 'artist'

In [150]:
#Separação do Conjunto de Treinamento e Teste
X = dataset.drop('target', axis=1)
y = dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=25)
X_train

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artist_label
936,0.18900,0.555,52006,0.759,0.459000,7,0.248,-6.214,0,0.1420,192.272,4.0,0.230,226
1101,0.12500,0.761,211627,0.820,0.000021,1,0.183,-4.003,1,0.0623,88.997,4.0,0.619,350
1418,0.21400,0.523,291280,0.783,0.000000,6,0.612,-3.755,0,0.1850,117.264,5.0,0.312,960
1155,0.97000,0.549,206933,0.371,0.132000,2,0.639,-9.560,1,0.0378,87.024,3.0,0.117,721
1239,0.03530,0.576,254262,0.686,0.000067,3,0.114,-6.032,1,0.0554,153.901,4.0,0.599,985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1341,0.00115,0.339,219317,0.958,0.000016,9,0.651,-2.909,0,0.1530,128.806,4.0,0.273,26
143,0.02520,0.340,147467,0.805,0.000000,7,0.138,-3.397,1,0.0387,167.922,4.0,0.654,401
474,0.03050,0.874,217783,0.492,0.000000,2,0.117,-4.898,1,0.3280,64.992,4.0,0.644,1233
318,0.41900,0.331,133093,0.957,0.000000,7,0.102,-6.101,0,0.0504,143.299,4.0,0.561,1178


In [151]:
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

###Classificação

In [152]:
#Modelo de Classificador
from sklearn.neighbors import KNeighborsClassifier
Classif_KNN = KNeighborsClassifier(n_neighbors=1)

In [153]:
#Processo de treinamento com 70% dos dados
Classif_KNN.fit(X_train, y_train)

In [154]:
#Processo de teste com 30% dos dados que não foram utilizados no treinamento
y_pred = Classif_KNN.predict(X_test)

In [155]:
#Avaliação do Modelo
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.71      0.69       301
           1       0.69      0.65      0.67       305

    accuracy                           0.68       606
   macro avg       0.68      0.68      0.68       606
weighted avg       0.68      0.68      0.68       606

0.6798679867986799
