# Dia 2 - Entendendo conceitos iniciais de Machine Learning e Pré-Processamento de Dados

In [76]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): three styling calls are stacked below. Each one updates the
# global matplotlib rcParams, so later calls override any overlapping settings
# from earlier ones — confirm that all three are really intended.
mpl.style.use('ggplot')
plt.style.use('fivethirtyeight')
sns.set(context='notebook', palette='dark', color_codes=True)

In [62]:
# Load the pre-cleaned dataset; the first CSV column becomes the row index.
data = pd.read_csv(filepath_or_buffer="data/dataset-tratado.csv", index_col=0)
# Preview the first five rows (the default for head()).
data.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Pré-processamento

In [63]:
# Remove fully duplicated rows, reporting the frame's shape before and after.
# The drop is kept in place so `data` stays the same object for later cells.
print(f"Antes de deletar duplicados {data.shape}")
data.drop_duplicates(inplace=True)
print(f"Depois de deletar duplicados: {data.shape}")

Antes de deletar duplicados (113999, 20)
Depois de deletar duplicados: (113549, 20)


In [64]:
# Turn the popularity column into a binary label: 1 = popular, 0 = not popular.
# A track counts as popular when its original score is 70 or higher.
POPULARITY_THRESHOLD = 70

# Guard against re-running this cell: once binarized, the column only holds
# 0/1, and comparing those values against the threshold again would turn every
# label into 0. The threshold is therefore applied only while the column still
# contains values other than 0 and 1.
if not data["popularity"].isin([0, 1]).all():
    # np.where picks 1 where the condition holds and 0 everywhere else.
    data["popularity"] = np.where(data["popularity"] >= POPULARITY_THRESHOLD, 1, 0)
data.head(5)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,1,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,0,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,0,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,1,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,1,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


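**Obs:** O corte de popularidade deve ser aplicado uma única vez, sobre os valores originais da coluna `popularity`. Se o corte `>= 70` for aplicado novamente sobre a coluna já binarizada (0 ou 1), todos os valores caem para zero.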

In [65]:
# Encode the `explicit` flag as an integer: True becomes 1, anything else 0.
is_explicit = data["explicit"].eq(True)
data["explicit"] = np.where(is_explicit, 1, 0)
data.head(5)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,1,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,0,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,0,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,1,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,1,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Utilizando somente variáveis quantitativas

In [66]:
# Collect the name of every column stored with the generic `object` dtype ...
cols_droped = [name for name, dtype in data.dtypes.items() if dtype == 'object']

# ... and build a new frame without them, leaving `data` untouched.
data_qtde = data.drop(columns=cols_droped)

In [67]:
# Report how many rows and columns remain after dropping the object columns.
print(f"Quantidade de exemplos e colunas que iremos trabalhar: {data_qtde.shape}")

Quantidade de exemplos e colunas que iremos trabalhar: (113549, 15)


In [69]:
# Preview the first five rows (the default for head()) of the reduced frame.
data_qtde.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4
1,0,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4
2,0,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4
3,1,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3
4,1,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4


In [70]:
# Print each remaining column's dtype and non-null count.
data_qtde.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113549 entries, 0 to 113999
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113549 non-null  int32  
 1   duration_ms       113549 non-null  int64  
 2   explicit          113549 non-null  int32  
 3   danceability      113549 non-null  float64
 4   energy            113549 non-null  float64
 5   key               113549 non-null  int64  
 6   loudness          113549 non-null  float64
 7   mode              113549 non-null  int64  
 8   speechiness       113549 non-null  float64
 9   acousticness      113549 non-null  float64
 10  instrumentalness  113549 non-null  float64
 11  liveness          113549 non-null  float64
 12  valence           113549 non-null  float64
 13  tempo             113549 non-null  float64
 14  time_signature    113549 non-null  int64  
dtypes: float64(9), int32(2), int64(4)
memory usage: 13.0 MB


In [71]:
# Min-max normalization: rescale every column to the [0, 1] interval using
# (value - column minimum) / (column maximum - column minimum).
col_min = data_qtde.min()
col_range = data_qtde.max() - col_min

# A constant column has a zero range and would yield NaN (0 / 0) for every row.
# Dividing by 1 instead maps such a column to 0 and leaves all other columns
# unaffected.
col_range = col_range.replace(0, 1)

data_qtde_norm = (data_qtde - col_min) / col_range

In [72]:
# Preview the first five rows (the default for head()) of the normalized frame.
data_qtde_norm.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1.0,0.042473,0.0,0.686294,0.461,0.090909,0.791392,0.0,0.148187,0.032329,1e-06,0.358,0.718593,0.361245,0.8
1,0.0,0.026971,0.0,0.426396,0.166,0.090909,0.597377,1.0,0.079067,0.927711,6e-06,0.101,0.268342,0.318397,0.8
2,0.0,0.038679,0.0,0.44467,0.359,0.0,0.736123,1.0,0.05772,0.210843,0.0,0.117,0.120603,0.313643,0.8
3,1.0,0.036978,0.0,0.270051,0.0596,0.0,0.573701,1.0,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758,0.6
4,1.0,0.036389,0.0,0.627411,0.443,0.181818,0.737103,1.0,0.054508,0.470884,0.0,0.0829,0.167839,0.492863,0.8


## Salvando em um dataset para o próximo dia

In [75]:
# Write the normalized frame to CSV; the row index is written as the first
# column (index=True is the to_csv default, spelled out here).
data_qtde_norm.to_csv(path_or_buf="data/dataset_norm.csv", index=True)