<a href="https://colab.research.google.com/github/palis-dev/jupyter-notebooks/blob/main/ml_kmeans_senioridade.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score


In [2]:
# Synthetic dataset: three classes described by three integer-valued features.
class_names = ['Junior', 'Pleno', 'Sênior']
num_classes = 3

X, y = make_classification(
    n_samples=500,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=num_classes,
    n_clusters_per_class=1,
    random_state=42,
)

# Shift every column so its minimum becomes 1, then scale by `escala` and
# round to integers.
escala = 10
X = np.round((X - X.min(axis=0) + 1) * escala).astype(int)

# Push columns 1 and 2 upward for classes 1 and 2 so the classes sit further
# apart along those two features.
offset_pleno = 20
offset_senior = 40
pleno_rows = (y == 1)
senior_rows = (y == 2)
X[pleno_rows, 1:3] += offset_pleno
X[senior_rows, 1:3] += offset_senior

# (low, high) bounds keyed by class label; the next cell rescales column 1
# of each class into these bounds.
intervalos = {0: (1, 7), 1: (8, 14), 2: (15, 20)}

# Working copy that receives the rescaled column without touching X.
X_final = X.copy()


In [3]:
# For each class, min-max normalise column 1 over that class's rows and map
# it linearly into the class's (low, high) bounds from `intervalos`.
for classe in range(num_classes):
    mask = (y == classe)
    low, high = intervalos[classe]
    career = X[mask, 1].astype(float)
    # The small epsilon keeps the division defined when all values are equal.
    span = career.max() - career.min() + 1e-6
    career_norm = (career - career.min()) / span
    career_mapped = low + (high - low) * career_norm
    X_final[mask, 1] = np.rint(career_mapped).astype(int)

# From here on X refers to the data with the rescaled column 1.
X = X_final.copy()

# Hold out 20% of the samples for testing; the seed keeps the split fixed.
X_train_orig, X_test_orig, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the standardisation on the training rows only and apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train_orig)
X_train = scaler.transform(X_train_orig)
X_test = scaler.transform(X_test_orig)


# Cluster the standardised training data into one cluster per class.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (and emitted a FutureWarning in between), so relying on the default
# makes the clustering and the ARI values below depend on the installed
# version.
kmeans_model = KMeans(n_clusters=num_classes, n_init=10, random_state=42)
clusters_train = kmeans_model.fit_predict(X_train)

# The Adjusted Rand Index compares the cluster assignment with the true
# labels and is invariant to how the cluster ids are numbered.
ari_train = adjusted_rand_score(y_train, clusters_train)
print(f"Adjusted Rand Index (Treino): {ari_train:.4f}")

# Assign the held-out samples to the centroids learned on the training data.
clusters_test = kmeans_model.predict(X_test)
ari_test = adjusted_rand_score(y_test, clusters_test)
print(f"Adjusted Rand Index (Teste): {ari_test:.4f}")

# Training samples in their original (unscaled) units, together with the
# cluster assigned by K-means and the true class name.
df_train = pd.DataFrame(X_train_orig, columns=['Commits no Ano', 'Tempo de Carreira', 'Tasks Finalizadas'])
df_train['Cluster'] = clusters_train
df_train['Classe Real'] = [class_names[label] for label in y_train]

# Plotly Express draws a continuous color bar for numeric color columns.
# Cluster ids are categorical, so plot from a derived frame whose 'Cluster'
# column is cast to strings; df_train itself keeps the integer ids.
df_plot = df_train.assign(Cluster=df_train['Cluster'].astype(str))
cluster_order = [str(cluster_id) for cluster_id in np.unique(clusters_train)]

fig = px.scatter_3d(df_plot,
                    x='Commits no Ano',
                    y='Tempo de Carreira',
                    z='Tasks Finalizadas',
                    color='Cluster',
                    symbol='Classe Real',
                    category_orders={'Cluster': cluster_order},
                    title='Clustering com K-means nos Dados de Treino - 3D')
fig.show()

print("\nExemplos de clusters para amostras do teste:")
# Show at most 10 samples; bounding by the test-set size avoids an IndexError
# when the held-out split has fewer than 10 rows.
for i in range(min(10, len(clusters_test))):
    print(f"Amostra {i}: Cluster Predito: {clusters_test[i]}, Classe Real: {class_names[y_test[i]]}")


Adjusted Rand Index (Treino): 0.8350
Adjusted Rand Index (Teste): 0.8349



Exemplos de clusters para amostras do teste:
Amostra 0: Cluster Predito: 2, Classe Real: Sênior
Amostra 1: Cluster Predito: 0, Classe Real: Junior
Amostra 2: Cluster Predito: 1, Classe Real: Pleno
Amostra 3: Cluster Predito: 1, Classe Real: Pleno
Amostra 4: Cluster Predito: 0, Classe Real: Junior
Amostra 5: Cluster Predito: 0, Classe Real: Junior
Amostra 6: Cluster Predito: 0, Classe Real: Junior
Amostra 7: Cluster Predito: 2, Classe Real: Sênior
Amostra 8: Cluster Predito: 2, Classe Real: Sênior
Amostra 9: Cluster Predito: 1, Classe Real: Pleno


# Nova seção