<a href="https://colab.research.google.com/github/maicon-reis/financas_quantitativas/blob/main/K_Means_and_K_Medoids.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Instalando a biblioteca Yfinance
!pip install -q yfinance

In [None]:
# Instalando o pacote scikit-learn-extra
!pip install -q scikit-learn-extra

In [None]:
# Import the libraries used throughout the notebook:
# data handling (pandas/numpy), plotting (matplotlib/seaborn),
# market data download (yfinance) and the scikit-learn estimators.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Apply the "fivethirtyeight" matplotlib style to every figure created below.
plt.style.use("fivethirtyeight")

In [None]:
# Download the Ibovespa (^BVSP) quotes for the year 2020.
# auto_adjust=False is passed explicitly so the "Adj Close" column used below
# is present. NOTE(review): newer yfinance releases reportedly default to
# auto_adjust=True, which omits that column - confirm against the installed version.
data = yf.download('^BVSP', start='2020-1-1', end='2020-12-31', auto_adjust=False)

# Some yfinance versions return (field, ticker) MultiIndex columns even for a
# single ticker; keep only the field level so data["Close"] etc. are Series.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# --- Feature engineering ---
# Log return of the adjusted close.
data["Retorno"] = np.log(data["Adj Close"]) - np.log(data["Adj Close"].shift(1))
# 10-period rolling standard deviation of the adjusted close.
data["std_10"] = data["Adj Close"].rolling(10).std()
# Relative distance of the close from its 10-period moving average.
data["Dist_10"] = (data["Close"] / data["Close"].rolling(10).mean()) - 1
# Direction flag: 1 when the bar closed above its open, 0 otherwise.
data["dir_D"] = np.where(data["Close"] > data["Open"], 1, 0)
# Share of up bars over the last 10 periods.
data["dir_D_mean"] = data["dir_D"].rolling(10).mean()
# Candle body as a proportion of the full range: (Close - Open) / (High - Low).
# The parentheses matter: without them the expression evaluates as
# Close - (Open / High) - Low. A zero range is mapped to NaN so the row is
# dropped below instead of producing +/-inf.
candle_range = (data["High"] - data["Low"]).replace(0, np.nan)
data["prop"] = (data["Close"] - data["Open"]) / candle_range
# Z-score of the log return: centre on the mean and scale by the standard deviation.
RetMean = data["Retorno"].mean()
RetStd = data["Retorno"].std()
data["zscore"] = (data["Retorno"] - RetMean) / RetStd

# Drop the rows with missing values (rolling-window warm-up, first return, zero-range bars).
data.dropna(axis=0, inplace=True)

# Inspect the resulting dataset.
data.head(10)

In [None]:
# Standardise each feature independently (zero mean, unit variance).
# A fresh StandardScaler is fitted per column and the scaled columns are
# collected into a new frame with a default integer index.
vars = ['std_10', 'Dist_10', 'dir_D', 'dir_D_mean', 'prop', 'zscore']
data_scaled = pd.DataFrame({
    feature: StandardScaler()
    .fit(data[feature].values.reshape(-1, 1))
    .transform(data[feature].values.reshape(-1, 1))
    .ravel()
    for feature in vars
})

In [None]:
# Fit K-Means with 4 clusters on the standardised features.
# n_init is pinned to 10: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so leaving it implicit gives different clusterings (and a
# FutureWarning on some releases) depending on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=1)
kmeans.fit(data_scaled)
# Store the cluster assigned to each observation alongside the original data.
data['KMeans_Labels'] = kmeans.predict(data_scaled)

In [None]:
# Fit K-Medoids with 4 clusters on the standardised features
# (fit returns the estimator itself, so construction and fitting are chained).
kmedo = KMedoids(n_clusters=4, random_state=1).fit(data_scaled)

# Store the cluster assigned to each observation alongside the original data.
kmedoids_labels = kmedo.predict(data_scaled)
data['KMedoids_Labels'] = kmedoids_labels

In [None]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
X_reduced_pca = pca.fit_transform(data_scaled)

# Collect the 2-D projection together with both sets of cluster labels.
# Labels are passed as plain arrays so they are assigned positionally, not
# aligned on data's date index.
reduced_data_df_pca = pd.DataFrame(
    X_reduced_pca, columns=["Componente 1", "Componente 2"]
).assign(
    KMeans_Labels=data['KMeans_Labels'].values,
    KMedoids_Labels=data['KMedoids_Labels'].values,
)

In [None]:
# Report how much of the total variance the two retained components explain.
explained_variance = pca.explained_variance_ratio_
explained_pct = sum(explained_variance) * 100

print("Variability's explain: {}%".format(explained_pct))

In [None]:
# Side-by-side scatter plots of the PCA projection, coloured by cluster label:
# K-Means on the left, K-Medoids on the right.
fig = plt.figure(figsize=(11, 4))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Legend styling shared by both panels.
legend_opts = dict(loc='upper right', frameon=True, ncol=2, fancybox=True,
                   framealpha=0.95, shadow=True, borderpad=1)

panels = [
    (ax1, 'KMeans_Labels', 'Clusters K-Means'),
    (ax2, 'KMedoids_Labels', 'Clusters K-Medoids'),
]
for panel_ax, label_col, panel_title in panels:
    sns.scatterplot(y="Componente 1", x="Componente 2", hue=label_col,
                    data=reduced_data_df_pca, palette="coolwarm",
                    legend="brief", ax=panel_ax)
    panel_ax.legend(**legend_opts)
    panel_ax.set_title(panel_title)