#### 00. IMPORT DE PACOTES E FUNÇÕES ÚTEIS

- Módulos criados para o projeto também são carregados

In [None]:
import sys, os
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree
from sklearn import model_selection
from sklearn import ensemble
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from feature_engine.encoding import OneHotEncoder
from feature_engine import discretisation

from lightgbm import LGBMClassifier

# Raise pandas' display limits to 200 columns / 200 rows.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

from pipeline.preprocess import *
from pipeline.utils import *
from pipeline.carregar_dados import *
from pipeline.criar_abt import *

from features.features_clientes import *
from features.features_quantidade import *
from features.features_valor import *
from features.features_tempo import *
from features.features_flags import *

#### 01. CARREGAR DADOS ORIGINAIS COM PRÉ-PROCESSAMENTO MÍNIMO

In [None]:
# Load the raw tables; the result is indexed by table name below.
dados = carregar_dados()
df_clientes, df_inadimplencia, df_transacoes = (
    dados[nome] for nome in ('clientes', 'inadimplencia', 'transacoes')
)

# Run each table through its pre-processing function from the project pipeline.
clientes_prep = preprocessar_clientes(df_clientes)
inad_prep = preprocessar_inadimplencia(df_inadimplencia)
tran_prep = preprocessar_transacoes(df_transacoes)

# Rows with a missing target (`atraso_90d`) are dropped.
sem_target_nulo = inad_prep.dropna(subset=["atraso_90d"])
inad_prep = sem_target_nulo.copy()

for rotulo, tabela in (
    ("Clientes:", clientes_prep),
    ("Inadimplência:", inad_prep),
    ("Transações:", tran_prep),
):
    print(rotulo, tabela.shape)

#### 02. GERAÇÃO DA ABT 

- Merge com as bases de features criadas.

- Cada unidade experimental corresponderá ao par *id_cliente* + *data_referencia*, e as variáveis corresponderão a essa referência.

- A base utilizada durante todo o desenvolvimento teve suas features calculadas com uma janela de observação M-1, garantindo que nenhuma feature utilize dados do mesmo mês a que a coluna *mes_safra* da base de inadimplência se refere.

- Como será observado adiante, a taxa de inadimplência tem valor de 10%, o que pode gerar alguns problemas durante a modelagem devido ao desbalanceamento.

In [None]:
# ABT generated with the M-1 flag switched on.
abt = gerar_abt(clientes_prep, inad_prep, tran_prep, usar_M_1=True)

print("Shape M-1:", abt.shape)

In [None]:
# Same ABT, generated with the M-1 flag switched off, kept for comparison.
abt_M = gerar_abt(clientes_prep, inad_prep, tran_prep, usar_M_1=False)

print("Shape M:", abt_M.shape)

In [None]:
# Preview the first 10 rows of the M-1 ABT.
abt.head(10)

In [None]:
# Preview the first 10 rows of the M ABT.
abt_M.head(10)

##### COMPARAÇÃO ENTRE CONCEITOS M-1 E M COM A JANELA DESLOCADA EM 1 MÊS. 

- NO EXEMPLO, SÃO COMPARADAS AS VARIÁVEIS `vlr_trans_3m` E `qtde_trans_3m`

In [None]:
from IPython.display import display_html

# Row selectors for customer C0001 in each ABT.
sel_abt = abt['id_cliente'] == 'C0001'
sel_abt_M = abt_M['id_cliente'] == 'C0001'

# Customer's transactions ordered by date, rendered as HTML.
df1_html = (
    tran_prep.loc[tran_prep['id_cliente'] == 'C0001']
    .sort_values(by='data_transacao')
    .to_html()
)

# First 20 rows of the 3-month value/quantity features, alternating abt and abt_M,
# placed side by side (columns are concatenated on the row index).
partes = [
    abt.loc[sel_abt, ['id_cliente', 'mes_safra', 'data_referencia', 'vlr_trans_3m']].head(20),
    abt_M.loc[sel_abt_M, ['vlr_trans_3m']].head(20),
    abt.loc[sel_abt, ['qtde_trans_3m']].head(20),
    abt_M.loc[sel_abt_M, ['qtde_trans_3m']].head(20),
]
df2_html = pd.concat(partes, axis=1).to_html()

display_html(df1_html + df2_html, raw=True)


#### CONCENTRAÇÃO DE VALORES NULOS POR VARIÁVEL

In [None]:
# Fraction of missing values per column, highest first (top 20).
abt.isna().mean().sort_values(ascending=False).to_frame().head(20)

#### QUAL A INADIMPLÊNCIA TOTAL DA BASE DE DADOS? E AO LONGO DAS SAFRAS?

In [None]:
# Project plotting helper called on the target column `atraso_90d`;
# presumably draws its category counts — confirm against plot_categ.
plot_categ(abt, 'atraso_90d', titulo='Distribuição do target', xlabel='Default',ylabel='Volumetria')

In [None]:
# Project plotting helper; presumably shows the `atraso_90d` rate per `mes_safra` — confirm against plot_txmau_categ.
plot_txmau_categ(abt, column='mes_safra', column_mau='atraso_90d')

#### QUAL A INADIMPLÊNCIA POR CLIENTE?

In [None]:
# Value of `atraso_90d` that marks a bad (defaulted) observation.
mau = 1

df2 = abt[['id_cliente', 'atraso_90d']].copy()
# 1/0 flag: 1 where the target equals `mau`, 0 otherwise.
df2['mau'] = np.where(df2['atraso_90d'] == mau, 1, 0)

# Per customer: mean of the flag (bad rate) and number of rows, worst rate first.
aux = (
    df2.groupby('id_cliente')['mau']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'tx_mau', 'count': 'N'})
    .sort_values('tx_mau', ascending=False)
)
aux.head(20)

#### QUAL A INADIMPLÊNCIA OBSERVADA EM RELAÇÃO ÀS VARIÁVEIS?

- OBS: PARA VARIÁVEIS NUMÉRICAS - CATEGORIZAÇÃO POR QUANTIS

In [None]:
# Project plotting helper; presumably shows the `atraso_90d` rate per `estado_civil` category — confirm against plot_txmau_categ.
plot_txmau_categ(abt, column='estado_civil', column_mau='atraso_90d')

#### O SCORE INTERNO PRESENTE NA BASE DE CLIENTES ORDENA A INADIMPLÊNCIA?


In [None]:
# Project plotting helper on `score_interno` with bins=10 against the target;
# presumably the default rate per bin — confirm against plot_inad_var.
plot_inad_var(abt, "score_interno", target="atraso_90d", bins=10)

In [None]:
# Same helper applied to `idade` with bins=10.
plot_inad_var(abt, "idade", target="atraso_90d", bins=10)

In [None]:
# Same helper applied to `renda_mensal` with bins=10.
plot_inad_var(abt, "renda_mensal", target="atraso_90d", bins=10)

In [None]:
# Same helper applied to `qtde_produtos` with bins=10.
plot_inad_var(abt, "qtde_produtos", target="atraso_90d", bins=10)

In [None]:
# Same helper applied to `tempo_relacionamento_meses`, here with bins=5.
plot_inad_var(abt, "tempo_relacionamento_meses", target="atraso_90d", bins=5)

In [None]:
# Columns left out of the correlation matrix.
fora_da_correlacao = [
    'id_cliente',
    'data_referencia',
    'data_abertura_conta',
    'mes_abertura_conta',
    'mes_safra',
    'estado_civil',
]
# Pearson correlation between the remaining columns of the ABT.
corr = abt.drop(columns=fora_da_correlacao).corr(method='pearson')


In [None]:
# Heatmap of `corr` on a 10x8 figure at 500 dpi; cell annotations are disabled.
plt.figure(figsize=(10,8), dpi =500)
sns.heatmap(corr,annot=False,fmt=".2f", linewidth=.5)
plt.show()

In [None]:
# Columns passed to the helper as `cols_drop` (also reused later to build the model inputs).
cols_drop = ['id_cliente', 'data_referencia', 'data_abertura_conta', 'mes_abertura_conta','mes_safra']

# Project helper; returns a mapping with the keys read below
# ("com_concentracao", "sem_concentracao", "detalhes").
# NOTE(review): max_vol=0.95 looks like the concentration cut-off — confirm against analisar_concentracao.
resultado_conc = analisar_concentracao(
    abt=abt,
    max_vol=0.95,
    target="atraso_90d",
    cols_drop=cols_drop
)

print("Variáveis COM concentração:", resultado_conc["com_concentracao"])
print("Variáveis SEM concentração:", resultado_conc["sem_concentracao"])
print(resultado_conc["detalhes"].head())


In [None]:
# Show the full "detalhes" entry of the concentration analysis.
resultado_conc["detalhes"]

In [None]:
# Variables reported under "sem_concentracao"; used as candidates in the next step.
var_s_conc = resultado_conc["sem_concentracao"]

In [None]:
# Project helper run on the candidate variables plus the target; its result is
# read below through the keys 'final' and 'removidas_corr'.
# NOTE(review): thresholds presumably filter by information value (0.01) and
# pairwise correlation (0.8) — confirm against remover_vars.
remover = remover_vars(
    abt[var_s_conc + ["atraso_90d"]], 
    target="atraso_90d", 
    iv_threshold=0.01, 
    corr_threshold=0.8
)


In [None]:
# Show the 'final' entry returned by remover_vars (used below as the model's column selection).
remover['final']

In [None]:
# Number of entries under 'removidas_corr'.
len(remover['removidas_corr'])

In [None]:
# Number of entries under 'final'.
len(remover['final'])

In [None]:
# Feature matrix with every ABT column except those in `cols_drop` and the target.
col_retirar = cols_drop + ['atraso_90d']
X_TODAS = abt.drop(columns=col_retirar)

# NOTE(review): no labels are passed to this split. It uses test_size=0.2 and
# random_state=42, so its rows only line up with a separately produced
# y_train/y_test if that split uses the same settings on the same rows.
X_train_TODAS, X_test_TODAS = model_selection.train_test_split(X_TODAS,
                                                 test_size=0.2,
                                                 random_state=42
)

In [None]:
# Model inputs: the columns selected under remover['final']; target: `atraso_90d`.
X = abt[remover['final']]
y = abt['atraso_90d']

In [None]:
# Sanity check: shape of X next to the number of labels.
(X.shape, len(y))

In [None]:
# Pearson correlation restricted to the selected columns in X (overwrites the earlier `corr`).
corr = X.corr(method = 'pearson')

# Heatmap on a 10x8 figure at 500 dpi; cell annotations are disabled.
plt.figure(figsize=(10,8), dpi =500)
sns.heatmap(corr,annot=False,fmt=".2f", linewidth=.5)
plt.show()


In [None]:
# 80/20 train/test split of X and y with a fixed seed (random_state=42).
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,
                                                                    y,
                                                                    test_size=0.2,
                                                                    random_state=42)

### 1. DECISION TREE CLASSIFIER

#### TREINO COM COLUNAS PRÉ-SELECIONADAS A PARTIR DA FUNÇÃO REMOVER_VARS()

In [None]:
# Hyper-parameters of the first decision tree.
parametros_arvore = {
    'random_state': 42,
    'max_depth': 5,
    'min_samples_split': 10,
    'class_weight': 'balanced',
}
arvore = tree.DecisionTreeClassifier(**parametros_arvore)

arvore.fit(X_train, y_train)

In [None]:
# Draw only the top of the fitted tree (max_depth=1), coloured by class.
nomes_classes = list(map(str, arvore.classes_))
tree.plot_tree(
    arvore,
    feature_names=X_train.columns,
    class_names=nomes_classes,
    max_depth=1,
    filled=True,
)

In [None]:
# Tree feature importances, highest first, as a two-column frame.
importancias = pd.Series(arvore.feature_importances_, index=X_train.columns)
feature_importances = importancias.sort_values(ascending=False).reset_index()
# After reset_index the importance values sit in the column labelled 0.
feature_importances['acum.'] = feature_importances[0].cumsum()

# %%
# Rows whose cumulative importance is still below 0.96.
best_features = feature_importances.loc[feature_importances['acum.'].lt(0.96)]

In [None]:
# Show the rows kept in `best_features`.
best_features

In [None]:
# Score the tree on exactly the columns it was fitted on. `arvore` was trained
# on all of X_train, and `best_features` is a DataFrame of importances rather
# than a list of column names, so `X_train[best_features]` does not select
# features and cannot be fed to the model.

# --- Training set ---
y_train_predict = arvore.predict(X_train)
y_train_proba = arvore.predict_proba(X_train)[:, 1]  # probability of class 1

acc_train = metrics.accuracy_score(y_train, y_train_predict)
auc_train = metrics.roc_auc_score(y_train, y_train_proba)
roc_train = metrics.roc_curve(y_train, y_train_proba)

print("Acurácia treino: ", acc_train)
print("AUC treino: ", auc_train)


# --- Test set ---
y_test_predict = arvore.predict(X_test)
y_test_proba = arvore.predict_proba(X_test)[:, 1]  # probability of class 1

acc_test = metrics.accuracy_score(y_test, y_test_predict)
auc_test = metrics.roc_auc_score(y_test, y_test_proba)
roc_test = metrics.roc_curve(y_test, y_test_proba)


print("Acurácia teste: ", acc_test)
print("AUC teste: ", auc_test)

In [None]:
# Project helper called with the test labels and the tree's hard predictions.
get_precisions_recalls(y_test,y_test_predict)

In [None]:
# Project helper called with the test labels and the tree's class-1 probabilities;
# presumably a KS plot — confirm against plotar_ks.
plotar_ks(y_test,y_test_proba)

### 2. CATBOOST


In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix
import numpy as np

#### 2.1 TREINO COM TODAS AS COLUNAS QUE NÃO SEJAM ID OU DATAS

In [None]:
# Columns whose dtype name is "object" or "category" are treated as categorical.
cat_features = [
    nome
    for nome, tipo in X_train_TODAS.dtypes.items()
    if tipo.name in ("object", "category")
]

cat_features

In [None]:
# --- 1. Categorical columns ------------------------------------------
# Columns whose dtype name is "object" or "category".
cat_features = [
    nome
    for nome, tipo in X_train_TODAS.dtypes.items()
    if tipo.name in ("object", "category")
]

print("Variáveis categóricas:", cat_features)

# --- 2. Base model ---------------------------------------------------
# Ratio of class-0 to class-1 rows in the training labels; used as the
# weight of class 1 so that it is penalised more heavily.
weight_minority_class = (y_train == 0).sum() / (y_train == 1).sum()

catboost = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    class_weights=[1, weight_minority_class],
    logging_level='Info',
)

# --- 3. Hyper-parameter space ----------------------------------------
param_dist = dict(
    depth=[4, 6, 8, 10],
    learning_rate=np.linspace(0.01, 0.2, 5),
    l2_leaf_reg=[1, 3, 5, 7, 9],
    iterations=[200, 400, 600, 800],
    border_count=[32, 64, 128],
)

# --- 4. Randomised search --------------------------------------------
# 20 sampled combinations, 3-fold CV, ranked by ROC AUC.
random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Pool bundling features, labels and the categorical columns. The search
# below does not receive it; it gets `cat_features` directly as a fit parameter.
train_pool = Pool(X_train_TODAS, y_train, cat_features=cat_features)

# Run the search.
random_search.fit(X_train_TODAS, y_train, cat_features=cat_features)




In [None]:
# --- 5. Evaluate the best model on the test rows ---------------------
best_model = random_search.best_estimator_
y_proba_cat = best_model.predict_proba(X_test_TODAS)[:, 1]  # probability of class 1
y_pred_cat = best_model.predict(X_test_TODAS)

print("Melhores parâmetros:", random_search.best_params_)
print("AUC:", roc_auc_score(y_test, y_proba_cat))
# The remaining metrics are all computed from the hard predictions.
for rotulo, metrica in (
    ("F1 :", f1_score),
    ("Precisão:", precision_score),
    ("Recall:", recall_score),
    ("Matriz de confusão:\n", confusion_matrix),
):
    print(rotulo, metrica(y_test, y_pred_cat))

In [None]:
# Project helper called with the test labels and the first CatBoost model's class-1 probabilities.
plotar_ks(y_test,y_proba_cat, titulo='KS CATBOOST')

In [None]:
import pandas as pd

# Pair each training column with the importance reported by the best model.
tabela_importancias = pd.DataFrame(
    {
        "feature": X_train_TODAS.columns,
        "importance": best_model.feature_importances_,
    }
)
feat_imp = tabela_importancias.sort_values("importance", ascending=False)
# Top 20 by importance.
print(feat_imp.head(20))


#### 2.2 TREINO COM COLUNAS PRÉ-SELECIONADAS A PARTIR DA FUNÇÃO REMOVER_VARS()

In [None]:
# --- 1. Categorical columns ------------------------------------------
# Columns whose dtype name is "object" or "category".
cat_features = [
    nome
    for nome, tipo in X_train.dtypes.items()
    if tipo.name in ("object", "category")
]

print("Variáveis categóricas:", cat_features)

# --- 2. Base model ---------------------------------------------------
# Ratio of class-0 to class-1 rows in the training labels; used as the
# weight of class 1 so that it is penalised more heavily.
weight_minority_class = (y_train == 0).sum() / (y_train == 1).sum()

catboost = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    class_weights=[1, weight_minority_class],
    logging_level="Silent",
)

# --- 3. Hyper-parameter space ----------------------------------------
param_dist = dict(
    depth=[4, 6, 8, 10],
    learning_rate=np.linspace(0.01, 0.2, 10),
    l2_leaf_reg=[1, 3, 5, 7, 9],
    iterations=[200, 400, 600, 800],
    border_count=[32, 64, 128],
)

# --- 4. Randomised search --------------------------------------------
# 20 sampled combinations, 3-fold CV, ranked by ROC AUC.
random_search2 = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Pool bundling features, labels and the categorical columns. The search
# below does not receive it; it gets `cat_features` directly as a fit parameter.
train_pool = Pool(X_train, y_train, cat_features=cat_features)

# Run the search.
random_search2.fit(X_train, y_train, cat_features=cat_features)




In [None]:
# ===============================
# 5. Evaluate on the test set
# ===============================
best_model2 = random_search2.best_estimator_
y_proba_cat2 = best_model2.predict_proba(X_test)[:, 1]  # probability of class 1
y_pred_cat2 = best_model2.predict(X_test)

# Report the parameters of the search that produced `best_model2`
# (random_search2), not those of the earlier search.
print("Melhores parâmetros:", random_search2.best_params_)
print("AUC:", roc_auc_score(y_test, y_proba_cat2))
print("F1 :", f1_score(y_test, y_pred_cat2))
print("Precisão:", precision_score(y_test, y_pred_cat2))
print("Recall:", recall_score(y_test, y_pred_cat2))
print("Matriz de confusão:\n", confusion_matrix(y_test, y_pred_cat2))

In [None]:
# Project helper called with the test labels and the second CatBoost model's class-1 probabilities.
plotar_ks(y_test,y_proba_cat2, titulo='KS CATBOOST 2')

## COMO ARRUMAR?

#### Proposta 1 - UPWEIGHT THE MINORITY CLASS

In [None]:
# Number of class-0 training rows divided by the number of class-1 training rows.
n_classe_0 = (y_train == 0).sum()
n_classe_1 = (y_train == 1).sum()
weight_minority_class = n_classe_0 / n_classe_1

weight_minority_class

In [None]:
# Tree with explicit class weights: 1 for class 0, the class ratio for class 1.
pesos_classes = {0: 1, 1: weight_minority_class}
arvore2 = tree.DecisionTreeClassifier(
    class_weight=pesos_classes,
    random_state=42,
)
arvore2.fit(X_train, y_train)
preds = arvore2.predict(X_test)

In [None]:
# Project helper called with the test labels and the current `preds`.
get_precisions_recalls(y_test, preds)

In [None]:
# --- Training set: hard predictions and class-1 probabilities ---
y_train_predict2 = arvore2.predict(X_train)
y_train_proba2 = arvore2.predict_proba(X_train)[:, 1]

acc_train2, auc_train2 = (
    metrics.accuracy_score(y_train, y_train_predict2),
    metrics.roc_auc_score(y_train, y_train_proba2),
)
roc_train2 = metrics.roc_curve(y_train, y_train_proba2)

print("Acurácia treino: ", acc_train2)
print("AUC treino: ", auc_train2)

# --- Test set: same metrics ---
y_test_predict2 = arvore2.predict(X_test)
y_test_proba2 = arvore2.predict_proba(X_test)[:, 1]

acc_test2, auc_test2 = (
    metrics.accuracy_score(y_test, y_test_predict2),
    metrics.roc_auc_score(y_test, y_test_proba2),
)
roc_test2 = metrics.roc_curve(y_test, y_test_proba2)

print("Acurácia teste: ", acc_test2)
print("AUC teste: ", auc_test2)

#### Proposta 2 - Oversample Minority Class

In [None]:
#get indices of each class
indices_0 = np.where(y_train == 0)[0]
indices_1 = np.where(y_train == 1)[0]
indices = np.concatenate([indices_0, indices_1])

#get weights for each class
weights = np.empty(indices_0.shape[0] + indices_1.shape[0])
weights[:indices_0.shape[0]] = 1
weights[indices_0.shape[0]:] = weight_minority_class
weights = weights/np.sum(weights)

#sample new indices
sampled_indices = np.random.choice(indices, indices.shape[0], p=weights)

In [None]:
# construir datasets oversampled
X_train_oversampled = X_train.iloc[sampled_indices]
labels_train_oversampled = y_train.iloc[sampled_indices]


In [None]:
# Share of positive labels after resampling, as a percentage rounded to 3 decimals.
pct_positivos = round(100 * np.mean(labels_train_oversampled), 3)
print('Fraction of positive labels in oversampled data:', str(pct_positivos) + '%')

In [None]:
# Same depth/split settings as the first tree, without class weights;
# it is fitted on the oversampled training data.
parametros_arvore3 = {
    'random_state': 42,
    'max_depth': 5,
    'min_samples_split': 10,
}
arvore3 = tree.DecisionTreeClassifier(**parametros_arvore3)

arvore3.fit(X_train_oversampled, labels_train_oversampled)
preds = arvore3.predict(X_test)

In [None]:
# Project helper called with the test labels and the current `preds`.
get_precisions_recalls(y_test, preds)

In [None]:
# --- Oversampled training set: hard predictions and class-1 probabilities ---
y_train_predict3 = arvore3.predict(X_train_oversampled)
y_train_proba3 = arvore3.predict_proba(X_train_oversampled)[:, 1]

acc_train3, auc_train3 = (
    metrics.accuracy_score(labels_train_oversampled, y_train_predict3),
    metrics.roc_auc_score(labels_train_oversampled, y_train_proba3),
)
roc_train3 = metrics.roc_curve(labels_train_oversampled, y_train_proba3)

print("Acurácia treino: ", acc_train3)
print("AUC treino: ", auc_train3)

# --- Original (not resampled) test set ---
y_test_predict3 = arvore3.predict(X_test)
y_test_proba3 = arvore3.predict_proba(X_test)[:, 1]

acc_test3, auc_test3 = (
    metrics.accuracy_score(y_test, y_test_predict3),
    metrics.roc_auc_score(y_test, y_test_proba3),
)
roc_test3 = metrics.roc_curve(y_test, y_test_proba3)

print("Acurácia teste: ", acc_test3)
print("AUC teste: ", auc_test3)

