In [76]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [77]:
dados_totais = pd.read_csv('Dados/Dados_totais.csv')
dados_totais = dados_totais.drop(columns = ['explicit', 'mode', 'key'])
dados_genero = pd.read_csv('Dados/data_by_genres.csv')
dados_genero = dados_genero.drop(columns = ['mode', 'key'])
dados_ano = pd.read_csv('Dados/data_by_year.csv')
dados_ano = dados_ano.drop(columns = ['mode', 'key'])

In [78]:
dados_totais.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,liveness,loudness,name,popularity,speechiness,tempo,artists_song
0,0.285,2000,0.00239,Coldplay,0.429,266773,0.661,3AJwUDP919kvQ9QcozQPxg,0.000121,0.234,-7.227,Yellow,84,0.0281,173.372,Coldplay - Yellow
1,0.613,2000,0.143,OutKast,0.843,270507,0.806,0I3q5fE6wg7LIfHGngUTnV,0.0,0.0771,-5.946,Ms. Jackson,80,0.269,94.948,OutKast - Ms. Jackson
2,0.4,2000,0.00958,Linkin Park,0.556,216880,0.864,60a0Rd6pjrkxjPbaKzXjfq,0.0,0.209,-5.87,In the End,84,0.0584,105.143,Linkin Park - In the End
3,0.543,2000,0.00664,3 Doors Down,0.545,233933,0.865,6ZOBP3NvffbU4SZcrnt1k6,1.1e-05,0.168,-5.708,Kryptonite,78,0.0286,99.009,3 Doors Down - Kryptonite
4,0.76,2000,0.0302,Eminem,0.949,284200,0.661,3yfqSUWxFvZELEM4PmlwIR,0.0,0.0454,-4.244,The Real Slim Shady,80,0.0572,104.504,Eminem - The Real Slim Shady


In [79]:
dados_genero.head()

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
0,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333
1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5
2,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0
3,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882
4,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071


In [67]:
dados_ano = dados_ano[dados_ano['year'] >= 2000]
dados_ano['year'].unique()
dados_ano.reset_index(inplace=True)

In [68]:
#ANÁLISE GRÁFICA

In [69]:
fig = px.line(dados_ano, x = 'year', y='acousticness', markers=True, title='Variação do acousticness conforme os anos')
fig.show()

In [70]:
fig = px.line(dados_ano, x = 'year', y='loudness', markers=True, title='Variação do loudness conforme os anos')
fig.show()

In [71]:
#CLUSTERIZAÇÃO POR GENERO

dados_genero1 = dados_genero.drop("genres", axis = 1)
# pré-processamento e redução da dimensionalidade dos dados de gênero
SEED = 123
np.random.seed(SEED)

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2, random_state=SEED))])

genre_embedding_pca = pca_pipeline.fit_transform(dados_genero1)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding_pca)

kmeans_pca = KMeans(n_clusters=5, verbose=True, random_state=SEED)

kmeans_pca.fit(projection)

dados_genero['cluster_pca'] = kmeans_pca.predict(projection)
projection['cluster_pca'] = kmeans_pca.predict(projection)

projection['generos'] = dados_genero['genres']

Initialization complete
Iteration 0, inertia 4975.4223581129345.
Iteration 1, inertia 3873.3860314867015.
Iteration 2, inertia 3665.048448329231.
Iteration 3, inertia 3579.9362805656283.
Iteration 4, inertia 3544.318583870793.
Iteration 5, inertia 3531.5029290879193.
Iteration 6, inertia 3527.146191375396.
Iteration 7, inertia 3524.4738481912186.
Iteration 8, inertia 3522.2375572880146.
Iteration 9, inertia 3520.4491824407255.
Iteration 10, inertia 3519.692787083722.
Iteration 11, inertia 3518.7901579631953.
Iteration 12, inertia 3518.038748214686.
Iteration 13, inertia 3517.6329455699965.
Converged at iteration 13: center shift 0.00021803977124841445 within tolerance 0.0002709886847099649.
Initialization complete
Iteration 0, inertia 4236.981147484262.
Iteration 1, inertia 3687.6454352327355.
Iteration 2, inertia 3601.2643607957652.
Iteration 3, inertia 3569.377245630352.
Iteration 4, inertia 3547.1276944827605.
Iteration 5, inertia 3531.967510353079.
Iteration 6, inertia 3522.5098124





In [72]:
print(projection.sort_values(by="cluster_pca", ascending=True))
df_final = projection.sort_values(by="cluster_pca", ascending=True)
df_final.to_csv("dados_finais.csv")

             x         y  cluster_pca                    generos
0     5.910268 -0.011146            0     21st century classical
1895  3.746891 -0.071537            0  native american spiritual
569   2.803014 -0.607450            0     christmas instrumental
1904  3.126929  0.423391            0              neo-classical
565   2.658957  0.513204            0       christian relaxative
...        ...       ...          ...                        ...
1249 -2.094841  1.083858            4                    gymcore
2462 -0.771040  1.067705            4            sheffield indie
2464 -1.293807  0.425780            4                shimmer pop
530  -1.187643  1.931705            4                  chillstep
1989 -1.603296  0.899414            4        northern irish punk

[2973 rows x 4 columns]


In [73]:
import plotly.express as px
fig = px.scatter(
   projection, x='x', y='y', color='cluster_pca', hover_data=['x', 'y', 'generos'])
     
fig.show()

CLUSTER 0: músicas clássicas
CLUSTER 1: rap e hip hop
CLUSTER 2: músicas mais agitadas, pop, funk
CLUSTER 3: músicas folk e tradicionais
CLUSTER 4: eletrônicas

In [96]:
dados_totais = pd.read_csv('Dados/Dados_totais.csv')
dados_totais = dados_totais.drop(columns = ['explicit', 'mode', 'key'])
dados_genero = pd.read_csv('Dados/data_by_genres.csv')
dados_genero = dados_genero.drop(columns = ['mode', 'key'])
dados_ano = pd.read_csv('Dados/data_by_year.csv')
dados_ano = dados_ano.drop(columns = ['mode', 'key'])

In [97]:
#CLUSTERIZAÇÃO POR DANÇABILIDADE

dados_danca = dados_genero.drop(columns = ['genres'])
# pré-processamento e redução da dimensionalidade dos dados de gênero
SEED = 123
np.random.seed(SEED)

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=3, random_state=SEED))])

danca_embedding_pca = pca_pipeline.fit_transform(dados_danca)
projection = pd.DataFrame(columns=['x', 'y', 'z'], data=danca_embedding_pca)

kmeans_pca = KMeans(n_clusters=5, verbose=True, random_state=SEED)

kmeans_pca.fit(projection)

dados_genero['cluster_pca'] = kmeans_pca.predict(projection)
projection['cluster_pca'] = kmeans_pca.predict(projection)

projection['danca'] = dados_genero['danceability']

Initialization complete
Iteration 0, inertia 9180.818733628275.
Iteration 1, inertia 7674.416586493957.
Iteration 2, inertia 7408.863779530154.
Iteration 3, inertia 7297.241112869668.
Iteration 4, inertia 7252.542609438394.
Iteration 5, inertia 7238.039411411558.
Iteration 6, inertia 7229.271302257934.
Iteration 7, inertia 7224.053581787615.
Iteration 8, inertia 7220.072275172366.
Iteration 9, inertia 7215.774381258094.
Iteration 10, inertia 7209.043037859959.
Iteration 11, inertia 7201.791059626862.
Iteration 12, inertia 7191.243361949478.
Iteration 13, inertia 7167.215082691484.
Iteration 14, inertia 7117.516589675601.
Iteration 15, inertia 7059.1363439979195.
Iteration 16, inertia 6977.894317794173.
Iteration 17, inertia 6855.847000020074.
Iteration 18, inertia 6697.253421112542.
Iteration 19, inertia 6552.562683940658.
Iteration 20, inertia 6437.551772513488.
Iteration 21, inertia 6395.238291603736.
Iteration 22, inertia 6378.849002287983.
Iteration 23, inertia 6372.851108337216.
I





In [98]:
import plotly.express as px
fig = px.scatter(
   projection, x='x', y='y', color='cluster_pca', hover_data=['x', 'y', 'danca'])
     
fig.show()