## Importação de Bibliotecas

In [None]:
import pandas as pd
import numpy as np

## Importação dos Dados

In [None]:
# Training set: every column except the last is a feature, the last one is the target.
df_train = pd.read_csv("data/train.csv")
train_features = df_train.iloc[:, :-1]
train_target = df_train.iloc[:, -1]
X_train = train_features.values
y_train = train_target.values

# Kaggle set: all columns are used as features (no target column is split off).
df_kaggle = pd.read_csv("data/test.csv")
kaggle_features = df_kaggle.iloc[:, :]
X_kaggle = kaggle_features.values

## Utilização de *Imputer* para substituir dados faltantes

In [None]:
from sklearn.impute import SimpleImputer

# Columns 24..29 are the ones whose missing values get replaced by the column mean.
impute_cols = slice(24, 30)

# Learn the column means from the training data only.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X_train[:, impute_cols] = imputer.fit_transform(X_train[:, impute_cols])

# Reuse the training means for the Kaggle set: fitting a second imputer on the
# test data would fill train and test with different values and leak test
# statistics into preprocessing. The `imputer_kaggle` name is kept as an alias
# so any other cell that refers to it keeps working.
imputer_kaggle = imputer
X_kaggle[:, impute_cols] = imputer_kaggle.transform(X_kaggle[:, impute_cols])

## Utilização de *One-hot Encoder* para codificar colunas

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode columns 0, 1 and 2; every other column is passed through unchanged.
# - handle_unknown="ignore": a category present only in the Kaggle set is encoded
#   as all zeros instead of making `ct.transform` raise.
# - sparse_threshold=0: always return a dense array, so np.array(...) below can
#   never end up wrapping a scipy sparse matrix in a 0-d object array.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(handle_unknown="ignore"), [0, 1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
)

# The encoder categories are learned from the training set and reused for the Kaggle set.
X_train = np.array(ct.fit_transform(X_train))
X_kaggle = np.array(ct.transform(X_kaggle))

## Utilização de *Scaler* para padronizar colunas

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns from index 33 onward are standardized; columns before that are left
# as they are (presumably the one-hot columns produced by the encoder - confirm).
scaled_cols = slice(33, None)

# Learn mean and standard deviation from the training data only.
sc = StandardScaler()
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])

# Apply the *training* statistics to the Kaggle set. Fitting a separate scaler
# on the test data would put train and test features on different scales, so
# the model would be evaluated on inputs it was not trained for. The
# `sc_kaggle` name is kept as an alias for any other cell that refers to it.
sc_kaggle = sc
X_kaggle[:, scaled_cols] = sc_kaggle.transform(X_kaggle[:, scaled_cols])

## Utilização de *Gradient Boosting* com a biblioteca *CatBoost* para treinar modelo de classificação

In [None]:
from catboost import CatBoostClassifier

# Gradient-boosting classifier with CatBoost's default hyper-parameters,
# trained on the fully preprocessed training matrix.
classifier = CatBoostClassifier()
classifier.fit(X=X_train, y=y_train)

## Avaliação preliminar do modelo de classificação utilizando *cross validation score*

In [None]:
from sklearn.model_selection import cross_val_score
from datetime import datetime as dt
# Run timestamp: used for the log row below and for the submission file name.
timestamp = dt.today()

# 10-fold cross validation of the classifier on the training data.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy = accuracies.mean()
accuracy_std = accuracies.std()
print(f"Accuracy: {mean_accuracy}")
print(f"StdDev: {accuracy_std}")

# Append the result to a CSV-style log, writing the header only when the file
# is created. Mode "x" fails with FileExistsError if the log already exists;
# only that case falls back to appending, so any other I/O problem (missing
# directory, permissions, ...) is reported instead of being silently retried.
log_row = f"\n{timestamp},{mean_accuracy},{accuracy_std}"
try:
    with open("data/cross_val.log", mode="x") as cvlog:
        cvlog.write("timestamp,accuracy,stddev")
        cvlog.write(log_row)
except FileExistsError:
    with open("data/cross_val.log", mode="a") as cvlog:
        cvlog.write(log_row)

## Predição da classificação do conjunto de teste utilizando o modelo treinado

In [None]:
# Predict a label for every row of the preprocessed Kaggle matrix.
y_kaggle = classifier.predict(data=X_kaggle)

## Exportação do CSV para submissão no Kaggle

In [None]:
# The submission file name carries the run timestamp (day-month-year_hour-minute).
submission_path = f"data/kaggle/kaggle_{timestamp.strftime('%d-%m-%Y_%Hh%M')}.csv"

# Header followed by one "id,prediction" row per prediction; ids are assigned
# sequentially starting at 20001, in the order of the predictions.
submission_rows = ["id_solicitante,inadimplente"]
submission_rows.extend(
    f"{applicant_id},{prediction}"
    for applicant_id, prediction in enumerate(y_kaggle, start=20001)
)

# Rows are newline-separated with no trailing newline at the end of the file.
with open(submission_path, mode="w") as out:
    out.write("\n".join(submission_rows))