<h1>Aplicação</h1>

In [2]:
# Importando as bibliotecas que serão utilizadas
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import plotly.express as px

In [3]:
# Load the full dataset and discard the columns that are not used in this analysis.
base = (
    pd.read_csv(r'Dados/Dados_totais.csv')
    .drop(columns=['mode', 'explicit', 'key'])
)

In [4]:
# Preview the first rows of the loaded dataset to check the remaining columns.
base.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,liveness,loudness,name,popularity,speechiness,tempo,artists_song
0,0.285,2000,0.00239,Coldplay,0.429,266773,0.661,3AJwUDP919kvQ9QcozQPxg,0.000121,0.234,-7.227,Yellow,84,0.0281,173.372,Coldplay - Yellow
1,0.613,2000,0.143,OutKast,0.843,270507,0.806,0I3q5fE6wg7LIfHGngUTnV,0.0,0.0771,-5.946,Ms. Jackson,80,0.269,94.948,OutKast - Ms. Jackson
2,0.4,2000,0.00958,Linkin Park,0.556,216880,0.864,60a0Rd6pjrkxjPbaKzXjfq,0.0,0.209,-5.87,In the End,84,0.0584,105.143,Linkin Park - In the End
3,0.543,2000,0.00664,3 Doors Down,0.545,233933,0.865,6ZOBP3NvffbU4SZcrnt1k6,1.1e-05,0.168,-5.708,Kryptonite,78,0.0286,99.009,3 Doors Down - Kryptonite
4,0.76,2000,0.0302,Eminem,0.949,284200,0.661,3yfqSUWxFvZELEM4PmlwIR,0.0,0.0454,-4.244,The Real Slim Shady,80,0.0572,104.504,Eminem - The Real Slim Shady


In [5]:
# Random seed shared by the stochastic steps below (passed to np.random.seed).
SEED = 20

In [6]:
# Seed NumPy's global random generator before fitting.
np.random.seed(SEED)

# Column transformer: one-hot encode the artist column and discard the
# text/identifier columns; every remaining column is passed through unchanged.
ohe = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(dtype=int, sparse_output=False), ['artists']),
        ('drop', 'drop', ['artists_song', 'name', 'id']),
    ],
    remainder='passthrough'
)

# Pipeline: encode -> standardize -> PCA. A float n_components asks PCA to keep
# as many components as needed to reach that fraction of explained variance.
pca_pipeline = Pipeline(
    [
        ('ohe', ohe),
        ('scaler', StandardScaler()),
        ('PCA', PCA(n_components=0.7)),
    ]
)

# A later cell stores the cluster labels in base['cluster_pca']. Because of
# remainder='passthrough', that column would be fed into the scaler/PCA if this
# cell were re-run afterwards. Drop it defensively so the cell is idempotent;
# on a fresh kernel the column does not exist and this is a no-op.
pca_input = base.drop(columns=['cluster_pca'], errors='ignore')

pca_data = pca_pipeline.fit_transform(pca_input)
projection = pd.DataFrame(data=pca_data)

In [7]:
# Re-seed NumPy's global random generator (kept so the global RNG state after
# this cell matches what it was before this change).
np.random.seed(SEED)

# K-means with 50 clusters on the PCA-reduced data. random_state is passed
# explicitly so the clustering does not depend on whatever has consumed the
# global NumPy RNG before this cell runs.
kmeans_pca = KMeans(n_clusters=50, random_state=SEED, verbose=False)

# fit_predict fits the model and returns the cluster index of every row in a
# single call (documented as equivalent to fit(X) followed by predict(X)).
cluster_labels = kmeans_pca.fit_predict(pca_data)

# Store the labels in both frames. Rows of `projection` are in the same order
# as rows of `base`, so the labels are assigned positionally rather than
# relying on index alignment between the two frames.
base['cluster_pca'] = cluster_labels
projection['cluster_pca'] = cluster_labels



In [8]:
# Copy the artist and song name from the source data so each projected row
# can be identified (used as hover information and in the exported file).
for label_column in ('artists', 'name'):
    projection[label_column] = base[label_column].copy()

In [9]:
# Preview the projection: PCA component columns plus cluster label, artist and song name.
projection.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,605,606,607,608,609,610,611,cluster_pca,artists,name
0,0.174796,0.731252,2.186797,-0.767192,0.594847,0.315968,-0.412322,-0.479171,-2.700668,2.109956,...,-0.009111,0.005154,-0.034027,-0.032617,-2.060759e-10,2.453309e-10,-4.232451e-11,2,Coldplay,Yellow
1,-1.35842,0.052935,-1.789973,1.938444,0.403606,1.023898,-1.172118,0.821698,-1.716897,0.252498,...,-0.031986,0.029708,-0.016322,-0.030234,-1.529848e-11,1.167974e-10,2.013901e-11,2,OutKast,Ms. Jackson
2,-0.972077,0.658094,0.7575,-0.27735,-0.400393,0.402941,1.29296,1.839192,-1.562236,1.410677,...,0.017097,0.017361,-0.010979,-0.012781,-1.817515e-10,2.316721e-10,-4.43343e-11,2,Linkin Park,In the End
3,-0.926464,1.292091,0.398499,-0.997738,0.202219,0.620859,-0.527689,1.135529,-1.343893,1.020964,...,0.089337,0.164777,-0.064148,-0.040911,-6.271411e-10,1.672884e-09,-2.414475e-10,2,3 Doors Down,Kryptonite
4,-1.710077,-0.383502,-1.258562,1.346428,1.223591,1.744579,0.094311,0.479412,-2.125376,0.389565,...,-0.011937,-0.004081,-0.017478,-0.010033,-5.677863e-11,-2.853156e-11,1.462698e-11,41,Eminem,The Real Slim Shady


In [10]:
# Scatter plot of the clusters on the first two PCA columns. These two are
# chosen because PCA orders its components by decreasing explained variance.
# NOTE(review): in the recorded outputs of this notebook the first two
# components together explain under 1% of the variance, so this 2-D view
# shows only a small part of the structure used by the clustering.
fig = px.scatter(
    projection,
    x=0,
    y=1,
    color='cluster_pca',
    hover_data=[0, 1, 'artists', 'name'],
)

fig.show()

In [11]:
# Fraction of the original variance explained by each PCA component,
# limited to the first 10 entries (i.e. the 10 that explain the most).
pca_pipeline.named_steps['PCA'].explained_variance_ratio_[:10]

array([0.00435434, 0.00288706, 0.00224307, 0.00195745, 0.00165119,
       0.00157155, 0.00146394, 0.00142497, 0.00135635, 0.00128669])

In [12]:
# Total fraction of the original variance explained by all retained PCA components together.
pca_pipeline.named_steps['PCA'].explained_variance_ratio_.sum()

0.7000008463186451

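Ou seja, os componentes retidos pelo PCA explicam, em conjunto, aproximadamente 70% da variância dos dados — como esperado, já que o PCA foi configurado com `n_components=0.7`.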

In [16]:
# Cumulative fraction of variance explained as components are added,
# limited to the first 10 values to keep the output short.
pca_pipeline.named_steps['PCA'].explained_variance_ratio_.cumsum()[:10]

array([0.00435434, 0.0072414 , 0.00948446, 0.01144192, 0.01309311,
       0.01466466, 0.0161286 , 0.01755356, 0.01890992, 0.0201966 ])

In [None]:
# Export the per-song projection (PCA columns, cluster label, artist and name)
# for later use. Note that this writes the data, not the fitted models.
projection.to_csv(
    path_or_buf=r'Dados/cluster_por_musica.csv',
    index=False,
)