<a href="https://colab.research.google.com/github/pedroachagas/case_datarisk/blob/main/eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! git clone https://github.com/pedroachagas/case_datarisk.git

In [None]:
!pip install lazypredict
!pip install --pre pycaret
!pip install autoviz
!pip install pycaret[full]
!pip install -U ydata-profiling

In [None]:
import pandas as pd
import numpy as np
from pycaret.classification import *

In [None]:
# Read the training and test CSVs from the cloned `case_datarisk` checkout.
# The absolute /content prefix presumably matches the Colab working directory — verify
# before running this notebook anywhere else.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/case_datarisk/treino.csv',
        '/content/case_datarisk/teste.csv',
    )
)

In [None]:
# Quick look at the first rows of the training frame (5 is the pandas default).
train.head(n=5)

In [None]:
# Initialise the PyCaret classification experiment on the training frame, predicting
# the `inadimplente` column.
# fix_imbalance=True asks PyCaret to rebalance the target classes while training.
# session_id pins the experiment's random seed; without it PyCaret draws a new seed on
# every run, so the train/hold-out split and the model ranking would not be reproducible.
exp_1 = setup(
    data=train,
    target='inadimplente',
    fix_imbalance=True,
    session_id=123,
)

In [None]:
# Model selection is driven by AUC from start to finish. PyCaret's defaults rank and
# tune on Accuracy, which is a poor guide on an imbalanced target (the experiment is
# set up with fix_imbalance=True) and disagrees with the AUC-based final pick below.
top3 = compare_models(sort='AUC', n_select=3)
tuned_top3 = [tune_model(candidate, optimize='AUC') for candidate in top3]

# The ensembles are not referenced again by name in this cell; they are trained so that
# automl() can consider them alongside the individual models when picking the best AUC.
blender = blend_models(tuned_top3)
stacker = stack_models(tuned_top3)

# Pick the best model of the session by AUC, then finalize it before it is used by the
# cells below.
best_auc_model = automl(optimize='AUC')
model = finalize_model(best_auc_model)

# Persist the experiment so it can be reloaded later.
save_experiment('exp1.pkl')

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split

# Learning curves of the finalized model, one panel per scoring metric.
X = train.drop('inadimplente', axis=1)
y = train.inadimplente
# NOTE(review): this hold-out split is not used by the learning curves below (they
# cross-validate on the full X / y); kept in case other cells rely on these names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)

# scikit-learn scorer names to plot; commented entries are alternatives to toggle on.
metrics = [
    'recall_weighted',
    # 'recall_micro',
    # 'accuracy',
    'f1_weighted',
    'balanced_accuracy',
    'roc_auc',
    #  'f1_micro',
]

# Two panels per row. Ceiling division keeps the grid large enough when the number of
# metrics is odd; truncating (len // 2) would make the last panel index out of range.
n_cols = 2
n_rows = (len(metrics) + n_cols - 1) // n_cols

fig = plt.figure(figsize=(15, 6))
for i, metric in enumerate(metrics):
    # learning_curve accepts a single scorer, so the estimator is refit for every metric
    # (51 training sizes x 10 folds each) — this cell is expensive to run.
    # random_state makes the shuffle, and therefore the curves, reproducible.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        cv=10,
        train_sizes=np.linspace(0.1, 1.0, 51),
        n_jobs=-1,
        scoring=metric,
        shuffle=True,
        random_state=123,
    )

    # Mean and spread across the CV folds for each training size
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the learning curve: solid line for training, dashed for validation,
    # shaded bands of +/- one standard deviation.
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    ax.plot(train_sizes, train_mean, marker='o',
            markersize=5, label=f'Treino - {metric}')
    ax.fill_between(train_sizes, train_mean + train_std,
                    train_mean - train_std, alpha=0.15)
    ax.plot(train_sizes, test_mean, marker='+', markersize=5,
            linestyle='--', label=f'Validação - {metric}')
    ax.fill_between(train_sizes, test_mean + test_std,
                    test_mean - test_std, alpha=0.15)
    ax.set_ylim([0.2, 1])
    ax.set_title(f'{metric}')
    ax.set_xlabel('Tamanho do conjunto de treino')
    ax.set_ylabel(f'{metric}')
    ax.grid(alpha=0.5)
    ax.legend()
fig.suptitle('Curvas de aprendizado', fontsize=20)
fig.tight_layout()
plt.show()

In [None]:
# Open the interactive dashboard for the finalized model. `dashboard` is not defined in
# this notebook; presumably it comes from the `pycaret.classification` star import — confirm.
dashboard(model)

In [None]:
from ydata_profiling import ProfileReport

# Build a profiling report of the training frame and render it inline in the notebook.
profile = ProfileReport(train, title="Profiling Report")
profile.to_notebook_iframe()