## Imports

In [0]:
pip install hdbscan

In [0]:
# Restart the notebook's Python process (Databricks utility). It sits right after
# the install cell, presumably so the freshly installed package is picked up.
dbutils.library.restartPython()

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import DataFrame,SparkSession

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import hdbscan

from sklearn.feature_selection import VarianceThreshold

## Funções

In [0]:
def vars_baixa_variancia(X_train, threshold=0.0):
    """List the low-variance and the constant columns of a numeric table.

    Per-column population variances (NaNs ignored) are compared against
    ``threshold``. Constant columns are detected through their peak-to-peak
    range rather than through ``variance == 0``, because the floating-point
    variance of a constant column can come out as a tiny non-zero number.

    Unlike ``VarianceThreshold.fit``, this does not raise when *every* column
    is at or below the threshold; it simply reports all of them.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Numeric table; column labels are taken from ``X_train.columns``.
    threshold : float, default 0.0
        Columns whose variance is ``<= threshold`` are reported as low variance.

    Returns
    -------
    tuple[list, list]
        ``(low_variance_features, constant_features)``, both in column order.

    Raises
    ------
    ValueError
        If ``X_train`` is not two-dimensional or has no rows.
    """
    values = np.asarray(X_train, dtype=float)
    if values.ndim != 2 or values.shape[0] == 0:
        raise ValueError("X_train must be a 2-D table with at least one row")

    variances = np.nanvar(values, axis=0)
    # A zero range identifies a constant column exactly, with no rounding noise.
    is_constant = (np.nanmax(values, axis=0) - np.nanmin(values, axis=0)) == 0

    is_low = variances <= threshold
    if threshold >= 0:
        # A constant column has (mathematically) zero variance, so it always
        # qualifies for a non-negative threshold even if rounding says otherwise.
        is_low = is_low | is_constant

    columns = list(X_train.columns)
    low_variance_features = [column for column, flag in zip(columns, is_low) if flag]
    constant_features = [column for column, flag in zip(columns, is_constant) if flag]
    return low_variance_features, constant_features

## Base

In [0]:
# 5% sample of the 201504..201611 cohorts, without the `features` column.
# The explicit seed keeps the sample (and every result derived from it) stable
# across runs, as long as the underlying table and its partitioning do not change.
base_spine = (
    spark.table("sand_riscos_pm_pf.T789778_base_final_dm")
    .filter(F.col('safra').between(201504, 201611))
    .sample(fraction=0.05, seed=0)
    .drop('features')
)
base_spine

In [0]:
base_spine = base_spine.fillna(0)

In [0]:
null_columns = [column for column in base_spine.columns if base_spine.filter(F.col(column).isNull()).count() > 0]
null_columns

In [0]:
base_spine.groupBy('safra').count().display()

## Variáveis

In [0]:
# Identifier and label columns, kept apart from the candidate features.
id_vars = ['msno', 'safra']
target = ['target']

# Everything that is neither an identifier nor the label.
vars_init = [col for col in base_spine.columns if col not in id_vars + target]
# Columns recognised by their `_index` suffix (presumably indexed categoricals).
vars_cat_index = [col for col in vars_init if col.endswith('_index')]

# Columns listed by hand as categorical; they are excluded from the feature list below.
vars_cat = [
    'payment_method_id_1m',
    'city_1m',
    'is_auto_renew_1m',
    'age_group_1m',
    'gender_1m',
    'current_age_1m',
    'account_time_1m',
]

# Remaining columns: the features used for scaling, PCA and clustering.
variaveis = [col for col in vars_init if col not in vars_cat + vars_cat_index]

## Treino - Teste - Validação

In [0]:
# Split by cohort (`safra`) and collect each part to the driver as pandas.
# NOTE(review): 201504 is loaded into base_spine but falls in none of the three
# windows, and 201601 is explicitly left out of train - confirm both are intended.
train_data = (
    base_spine
    .filter(F.col('safra').between(201505, 201604) & ~F.col('safra').isin(201601))
    .select(id_vars + variaveis + target)
    .toPandas()
)
test_data = (
    base_spine
    .filter(F.col('safra').between(201605, 201607))
    .select(id_vars + variaveis + target)
    .toPandas()
)
validation_data = (
    base_spine
    .filter(F.col('safra').between(201608, 201611))
    .select(id_vars + variaveis + target)
    .toPandas()
)

In [0]:
# Stack the three splits back into a single frame. ignore_index=True gives a
# unique 0..n-1 index instead of three overlapping ranges, so row positions and
# index labels agree with frames later built from this data (e.g. the PCA output).
base_final = pd.concat([train_data, test_data, validation_data], ignore_index=True)
base_final

In [0]:
# Feature matrices of the three splits.
X_train, X_test, X_val = (
    split[variaveis] for split in (train_data, test_data, validation_data)
)
# Labels of the three splits; `target` is a list, so each one is a one-column DataFrame.
y_train, y_test, y_val = (
    split[target] for split in (train_data, test_data, validation_data)
)

## Variância e Nulos

In [0]:
# Columns with variance <= 0.1 and columns that are constant, over all splits together.
low_variance, cte_vars = vars_baixa_variancia(X_train=base_final[variaveis], threshold=0.1)
(low_variance, cte_vars)

## Normalização

In [0]:
# Standardise every feature to zero mean / unit variance.
# The scaler is fitted on the concatenation of all three splits.
scaler = StandardScaler().fit(base_final[variaveis])
df_norm = scaler.transform(base_final[variaveis])

# PCA

In [0]:
n_components = 20
# random_state pins the result when scikit-learn picks a randomized SVD solver
# (it may do so automatically for large inputs); 0 matches the seed used for KMeans.
pca = PCA(n_components=n_components, random_state=0)
principal_components = pca.fit_transform(df_norm)
# Labels PC1..PCn are derived from n_components so the two cannot drift apart.
PCA_columns = [f'PC{i}' for i in range(1, n_components + 1)]

df_pca = pd.DataFrame(data=principal_components,
                      columns=PCA_columns)

df_pca

In [0]:
# Share of variance captured by each component, and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance)

plt.figure(figsize=(10, 5))
plt.bar(
    range(1, n_components+1),
    cumulative_explained_variance,
    alpha=0.7,
    align='center',
    color='blue',
)
# Reference line at 90% cumulative explained variance.
plt.axhline(y=0.9, color='r', linestyle='--')
plt.title(f'PCA Variância Explicada Acumulada (Total: {cumulative_explained_variance[-1]:.2f})')
plt.xlabel('Componentes Principais')
plt.ylabel('Variância Explicada Acumulada')
plt.show()

In [0]:
# Components kept for clustering: the first 12 (presumably chosen from the
# cumulative explained-variance chart).
PCA_vig = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12']
# .copy() makes df_final an independent frame, so the columns added to it later
# are not written to a slice of df_pca (which triggers SettingWithCopyWarning).
df_final = df_pca[PCA_vig].copy()
df_final

In [0]:
# Attach identifiers and label by position: the rows of df_final are in the same
# order as base_final, because the PCA input was built from it. Plain arrays are
# used on purpose so pandas does not try to align on index labels.
# `assign` returns a new frame, which keeps the cell idempotent and avoids writing
# into a slice; `to_numpy()` replaces the deprecated `Series.ravel()`.
df_final = df_final.assign(
    msno=base_final['msno'].to_numpy(),
    safra=base_final['safra'].to_numpy(),
    target=base_final['target'].to_numpy(),
)
df_final

# K-Means

## Número de Clusters

In [0]:
elbow = []
for k in range(2, 20):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(df_final[PCA_vig])
    elbow.append(kmeans.inertia_)

In [0]:
plt.figure(figsize=(10,5))
plt.plot([str(k) for k in range(2, 20)], elbow, marker='o', linestyle='--')
plt.xlabel('Número de Clusters')
plt.ylabel('Inercia')
plt.title('Curva de Elbow')
plt.show()

In [0]:
elbow

In [0]:
diffs = sorted([(i, abs(elbow[i] - elbow[i+1])) for i in range(len(elbow)-1)], key=lambda x: x[1], reverse=True)
diffs_indices = [i for i, diff in diffs]
diffs_indices

In [0]:
len(elbow), len(diffs_indices)

## Escolha Cotovelo

In [0]:
k = 8
kmeans =KMeans(n_clusters=k, random_state=0)
df_final['cluster_kmeans'] = kmeans.fit_predict(df_final[PCA_vig])

In [0]:
df_final

In [0]:
cluster_mean = df_final.groupby('cluster_kmeans')['target'].mean().reset_index()
cluster_mean['target'] = (cluster_mean['target']*100).round(2)
cluster_mean

In [0]:
sns.barplot(x='cluster_kmeans', y='target', data=cluster_mean)
plt.title('Inadimplência Média por Cluster')
plt.show()

In [0]:
df_final.groupby('cluster_kmeans').count()['target']

In [0]:
# One scatter plot per pair of principal components, coloured by KMeans cluster.
# A single loop replaces three copy-pasted cells that differed only in the axes.
for x_pc, y_pc in [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC3', 'PC2')]:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_pc, y=y_pc, hue='cluster_kmeans', data=df_final, palette='viridis')
    plt.title(f'{x_pc} vs {y_pc} por Clusterização KMeans')
    plt.show()

# HDBSCAN

In [0]:
dbscan = hdbscan.HDBSCAN(min_cluster_size=500)
df_final['cluster_hdbscan'] = dbscan.fit_predict(df_final[PCA_vig])

In [0]:
df_final

In [0]:
df_final.groupby('cluster_hdbscan').count()

In [0]:
cluster_mean = df_final.groupby('cluster_hdbscan')['target'].mean().reset_index()
cluster_mean['target'] = (cluster_mean['target']*100).round(2)
cluster_mean

In [0]:
sns.barplot(x='cluster_hdbscan', y='target', data=cluster_mean)
plt.title('Inadimplência Média por Cluster')
plt.show()

In [0]:
df_final.groupby('cluster_hdbscan').count()['target']

In [0]:
# One scatter plot per pair of principal components, coloured by HDBSCAN label.
# A single loop replaces three copy-pasted cells that differed only in the axes.
for x_pc, y_pc in [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC3', 'PC2')]:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_pc, y=y_pc, hue='cluster_hdbscan', data=df_final, palette='viridis')
    plt.title(f'{x_pc} vs {y_pc} por Clusterização HDBSCAN')
    plt.show()

## Salvar Base

In [0]:
# Display the clustered frame before converting it to Spark.
df_final

In [0]:
# Turn the pandas frame into a Spark DataFrame.
df_final_spark = spark.createDataFrame(df_final)
# The table write below is disabled; uncomment both lines to persist the result.
# df_final_spark.write.mode('overwrite').saveAsTable('sand_riscos_pm_pf.T789778_base_clustering')
# print('sand_riscos_pm_pf.T789778_base_clustering')

## Trazendo de Volta o problema de Classificação

In [0]:
# Score table of the single ("unico") model; the column `y_pred_lgbm_unico` is read from it later.
base_score = spark.table('sand_riscos_pm_pf.T789778_base_score_unico')
base_score

In [0]:
# Score table of the AR0 model; the column `y_pred_lgbm_ar0` is read from it later.
base_ar0 = spark.table('sand_riscos_pm_pf.T789778_base_score_ar0')
base_ar0

In [0]:
# Score table of the AR1 model; the column `y_pred_log_ar1` is read from it later.
base_ar1 = spark.table('sand_riscos_pm_pf.T789778_base_score_ar1')
base_ar1

In [0]:
# Clustered frame (Spark) that the score tables are joined onto.
df_final_spark

## Avaliação de Clusters

In [0]:
df_clusters = df_final_spark.join(base_score, on=['msno', 'safra', 'target'], how='left')
df_clusters

In [0]:
df_clusters_ar0 = df_final_spark.join(base_ar0, on=['msno', 'safra', 'target'], how='inner')
df_clusters_ar0

In [0]:
df_clusters_ar1 = df_final_spark.join(base_ar1, on=['msno', 'safra', 'target'], how='inner')
df_clusters_ar1

In [0]:
df_clusters_pd = df_clusters.toPandas()
df_clusters_ar0 = df_clusters_ar0.toPandas()
df_clusters_ar1 = df_clusters_ar1.toPandas()

## Modelo único

In [0]:
inadimplencia_kmeans = df_clusters_pd.groupby('cluster_kmeans').agg(
    inadimplencia_real=('target', 'mean'),
    inadimplencia_predita=('y_pred_lgbm_unico', 'mean')
).reset_index()

inadimplencia_kmeans['inadimplencia_real'] *= 100
inadimplencia_kmeans['inadimplencia_predita'] *= 100

inadimplencia_kmeans

In [0]:
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

bars_real = ax.bar(inadimplencia_kmeans['cluster_kmeans'] - width/2, inadimplencia_kmeans['inadimplencia_real'], width, label='Inadimplência Real')
bars_predita = ax.bar(inadimplencia_kmeans['cluster_kmeans'] + width/2, inadimplencia_kmeans['inadimplencia_predita'], width, label='Inadimplência Predita')

ax.set_xlabel('Cluster KMeans')
ax.set_ylabel('Inadimplência (%)')
ax.set_title('Inadimplência Real vs Predita por Cluster KMeans')
ax.legend()

for bar in bars_real:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

for bar in bars_predita:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

plt.show()

In [0]:
inadimplencia_hdbscan = df_clusters_pd.groupby('cluster_hdbscan').agg(
    inadimplencia_real=('target', 'mean'),
    inadimplencia_predita=('y_pred_lgbm_unico', 'mean')
).reset_index()

inadimplencia_hdbscan['inadimplencia_real'] *= 100
inadimplencia_hdbscan['inadimplencia_predita'] *= 100

inadimplencia_hdbscan

In [0]:
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

bars_real = ax.bar(inadimplencia_hdbscan['cluster_hdbscan'] - width/2, inadimplencia_hdbscan['inadimplencia_real'], width, label='Inadimplência Real')
bars_predita = ax.bar(inadimplencia_hdbscan['cluster_hdbscan'] + width/2, inadimplencia_hdbscan['inadimplencia_predita'], width, label='Inadimplência Predita')

ax.set_xlabel('Cluster HDBSCAN')
ax.set_ylabel('Inadimplência (%)')
ax.set_title('Inadimplência Real vs Predita por Cluster HDBSCAN')
ax.legend()

for bar in bars_real:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

for bar in bars_predita:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

plt.show()

## Modelo AR0

In [0]:
inadimplencia_kmeans = df_clusters_ar0.groupby('cluster_kmeans').agg(
    inadimplencia_real=('target', 'mean'),
    inadimplencia_predita=('y_pred_lgbm_ar0', 'mean')
).reset_index()

inadimplencia_kmeans['inadimplencia_real'] *= 100
inadimplencia_kmeans['inadimplencia_predita'] *= 100

inadimplencia_kmeans

In [0]:
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

bars_real = ax.bar(inadimplencia_kmeans['cluster_kmeans'] - width/2, inadimplencia_kmeans['inadimplencia_real'], width, label='Inadimplência Real')
bars_predita = ax.bar(inadimplencia_kmeans['cluster_kmeans'] + width/2, inadimplencia_kmeans['inadimplencia_predita'], width, label='Inadimplência Predita')

ax.set_xlabel('Cluster KMeans')
ax.set_ylabel('Inadimplência (%)')
ax.set_title('Inadimplência Real vs Predita por Cluster KMeans')
ax.legend()

for bar in bars_real:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

for bar in bars_predita:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

plt.show()

In [0]:
inadimplencia_hdbscan = df_clusters_ar0.groupby('cluster_hdbscan').agg(
    inadimplencia_real=('target', 'mean'),
    inadimplencia_predita=('y_pred_lgbm_ar0', 'mean')
).reset_index()

inadimplencia_hdbscan['inadimplencia_real'] *= 100
inadimplencia_hdbscan['inadimplencia_predita'] *= 100

inadimplencia_hdbscan

In [0]:
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

bars_real = ax.bar(inadimplencia_hdbscan['cluster_hdbscan'] - width/2, inadimplencia_hdbscan['inadimplencia_real'], width, label='Inadimplência Real')
bars_predita = ax.bar(inadimplencia_hdbscan['cluster_hdbscan'] + width/2, inadimplencia_hdbscan['inadimplencia_predita'], width, label='Inadimplência Predita')

ax.set_xlabel('Cluster HDBSCAN')
ax.set_ylabel('Inadimplência (%)')
ax.set_title('Inadimplência Real vs Predita por Cluster HDBSCAN')
ax.legend()

for bar in bars_real:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

for bar in bars_predita:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

plt.show()

## Modelo AR1

In [0]:
inadimplencia_kmeans = df_clusters_ar1.groupby('cluster_kmeans').agg(
    inadimplencia_real=('target', 'mean'),
    inadimplencia_predita=('y_pred_log_ar1', 'mean')
).reset_index()

inadimplencia_kmeans['inadimplencia_real'] *= 100
inadimplencia_kmeans['inadimplencia_predita'] *= 100

inadimplencia_kmeans

In [0]:
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

bars_real = ax.bar(inadimplencia_kmeans['cluster_kmeans'] - width/2, inadimplencia_kmeans['inadimplencia_real'], width, label='Inadimplência Real')
bars_predita = ax.bar(inadimplencia_kmeans['cluster_kmeans'] + width/2, inadimplencia_kmeans['inadimplencia_predita'], width, label='Inadimplência Predita')

ax.set_xlabel('Cluster KMeans')
ax.set_ylabel('Inadimplência (%)')
ax.set_title('Inadimplência Real vs Predita por Cluster KMeans')
ax.legend()

for bar in bars_real:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

for bar in bars_predita:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

plt.show()

In [0]:
inadimplencia_hdbscan = df_clusters_ar1.groupby('cluster_hdbscan').agg(
    inadimplencia_real=('target', 'mean'),
    inadimplencia_predita=('y_pred_log_ar1', 'mean')
).reset_index()

inadimplencia_hdbscan['inadimplencia_real'] *= 100
inadimplencia_hdbscan['inadimplencia_predita'] *= 100

inadimplencia_hdbscan

In [0]:
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35

bars_real = ax.bar(inadimplencia_hdbscan['cluster_hdbscan'] - width/2, inadimplencia_hdbscan['inadimplencia_real'], width, label='Inadimplência Real')
bars_predita = ax.bar(inadimplencia_hdbscan['cluster_hdbscan'] + width/2, inadimplencia_hdbscan['inadimplencia_predita'], width, label='Inadimplência Predita')

ax.set_xlabel('Cluster HDBSCAN')
ax.set_ylabel('Inadimplência (%)')
ax.set_title('Inadimplência Real vs Predita por Cluster HDBSCAN')
ax.legend()

for bar in bars_real:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

for bar in bars_predita:
    height = bar.get_height()
    ax.annotate(f'{height:.0f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

plt.show()