## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

import logging
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [2]:
# Load the bank-loan dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../../data/external/bankloan_cleaned.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns and values.
data.head()

Unnamed: 0,age,experience,income,zip_code,family,cc_avg,education,mortgage,personal_loan,securities_account,cd_account,online,credit_card
0,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [4]:
# List the available column names; 'credit_card' is used as the target in the modelling section.
data.columns

Index(['age', 'experience', 'income', 'zip_code', 'family', 'cc_avg',
       'education', 'mortgage', 'personal_loan', 'securities_account',
       'cd_account', 'online', 'credit_card'],
      dtype='object')

## Modelagem

In [5]:
# Separate the target vector ('credit_card') from the feature matrix (all other columns).
y = data['credit_card']
X = data.drop(columns='credit_card')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y -- consider stratify=y if the
# classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Make INFO-level records visible: the search helpers in this notebook report
# their results through logging.info, and the root logger's default WARNING
# threshold would silently drop those messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
# basicConfig does nothing when the root logger already has handlers, so set
# the level explicitly as well.
logging.getLogger().setLevel(logging.INFO)

# Send all runs to the MLflow tracking server listening on localhost:8080.
mlflow.set_tracking_uri('http://127.0.0.1:8080')
# Select the experiment by name; MLflow creates it when it does not exist yet.
mlflow.set_experiment('Modelo Protótipo')
# Enable scikit-learn autologging; silent=True asks MLflow to suppress its own
# autologging messages.
mlflow.sklearn.autolog(silent=True)

2024/11/10 18:00:04 INFO mlflow.tracking.fluent: Experiment with name 'Modelo Protótipo' does not exist. Creating a new experiment.


In [12]:
def rand_search_cv(model, param_grid, random_state=42):
    """Run a randomized hyper-parameter search and record it as an MLflow run.

    The search is fitted on the notebook-level ``X_train``/``y_train`` split and
    the best estimator it finds is scored on ``X_test``/``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune; its class name becomes part of
        the MLflow run name.
    param_grid : dict
        Parameter names mapped to the candidate values to sample from.
    random_state : int or None, default=42
        Seed for the candidate sampling, so repeated executions evaluate the
        same candidates. Pass ``None`` to get unseeded sampling.

    Returns
    -------
    None
        Results are reported through MLflow and ``logging`` only.
    """
    with mlflow.start_run(run_name=f'RandomSearchCV_{model.__class__.__name__}'):
        rand_search = RandomizedSearchCV(
            model,
            param_grid,
            cv=5,
            n_jobs=-1,
            verbose=1,
            random_state=random_state,
        )
        rand_search.fit(X_train, y_train)

        # Score the best estimator found by the search on the held-out split.
        best_model = rand_search.best_estimator_
        predictions = best_model.predict(X_test)
        # accuracy_score is documented as (y_true, y_pred).
        accuracy = accuracy_score(y_test, predictions)

        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {rand_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [14]:
def grid_search_cv(model, param_grid):
    """Run an exhaustive grid search and record it as an MLflow run.

    The search is fitted on the notebook-level ``X_train``/``y_train`` split and
    the best estimator it finds is scored on ``X_test``/``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune; its class name becomes part of
        the MLflow run name.
    param_grid : dict
        Parameter names mapped to the candidate values; every combination is
        evaluated.

    Returns
    -------
    None
        Results are reported through MLflow and ``logging`` only.
    """
    # The run name uses the same '<Strategy>_<Estimator>' pattern as the
    # randomized-search helper so runs sort and filter consistently in MLflow.
    with mlflow.start_run(run_name=f'GridSearchCV_{model.__class__.__name__}'):
        grid_search = GridSearchCV(
            model,
            param_grid,
            cv=5,
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        # Score the best estimator found by the search on the held-out split.
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
        # accuracy_score is documented as (y_true, y_pred).
        accuracy = accuracy_score(y_test, predictions)

        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {grid_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [13]:
def bayesian_search_cv(model, param_grid, random_state=42):
    """Run a Bayesian hyper-parameter search (skopt) and record it as an MLflow run.

    The search is fitted on the notebook-level ``X_train``/``y_train`` split and
    the best estimator it finds is scored on ``X_test``/``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune; its class name becomes part of
        the MLflow run name.
    param_grid : dict
        Parameter names mapped to their search spaces.
        NOTE(review): presumably skopt treats plain lists as categorical
        choices; verify that lists containing ``None`` (e.g. a ``max_depth``
        grid) are accepted by the installed skopt version.
    random_state : int or None, default=42
        Seed for the optimizer, so repeated executions explore the same
        candidates. Pass ``None`` to get unseeded behaviour.

    Returns
    -------
    None
        Results are reported through MLflow and ``logging`` only.
    """
    # The run name uses the same '<Strategy>_<Estimator>' pattern as the
    # randomized-search helper so runs sort and filter consistently in MLflow.
    with mlflow.start_run(run_name=f'BayesSearchCV_{model.__class__.__name__}'):
        bayesian_search = BayesSearchCV(
            model,
            param_grid,
            cv=5,
            n_jobs=-1,
            verbose=1,
            random_state=random_state,
        )
        bayesian_search.fit(X_train, y_train)

        # Score the best estimator found by the search on the held-out split.
        best_model = bayesian_search.best_estimator_
        predictions = best_model.predict(X_test)
        # accuracy_score is documented as (y_true, y_pred).
        accuracy = accuracy_score(y_test, predictions)

        mlflow.log_metric('accuracy', accuracy)
        logging.info(f'Melhores parâmetros: {bayesian_search.best_params_}')
        logging.info(f'Precisão (acurácia): {accuracy}')

In [15]:
# Candidate hyper-parameter values, one grid per estimator family.
# Keys are the estimator constructor argument names.

# Random forest grid.
param_grid_rf = dict(
    n_estimators=[10, 50, 100],
    criterion=['entropy', 'gini'],
    max_depth=[3, 5, 10, None],
)

# Gradient boosting grid.
param_grid_gb = dict(
    n_estimators=[10, 50, 100],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[10, 20, 30],
)

# k-nearest-neighbours grid.
param_grid_knn = dict(n_neighbors=[3, 5, 7, 9])

## Random Search Cross Validation

In [None]:
# Randomized search over the random-forest grid; the run is recorded in MLflow.
rand_search_cv(RandomForestClassifier(), param_grid_rf)

In [None]:
# Randomized search over the gradient-boosting grid; the run is recorded in MLflow.
rand_search_cv(GradientBoostingClassifier(), param_grid_gb)

In [None]:
# Randomized search over the k-nearest-neighbours grid; the run is recorded in MLflow.
rand_search_cv(KNeighborsClassifier(), param_grid_knn)

## Grid Search Cross Validation

In [None]:
# Exhaustive grid search over the random-forest grid; the run is recorded in MLflow.
grid_search_cv(RandomForestClassifier(), param_grid_rf)

In [None]:
# Exhaustive grid search over the gradient-boosting grid; the run is recorded in MLflow.
grid_search_cv(GradientBoostingClassifier(), param_grid_gb)

In [None]:
# Exhaustive grid search over the k-nearest-neighbours grid; the run is recorded in MLflow.
grid_search_cv(KNeighborsClassifier(), param_grid_knn)

## Bayesian Search Cross Validation

In [None]:
# Bayesian search over the random-forest grid; the run is recorded in MLflow.
bayesian_search_cv(RandomForestClassifier(), param_grid_rf)

In [None]:
# Bayesian search over the gradient-boosting grid; the run is recorded in MLflow.
bayesian_search_cv(GradientBoostingClassifier(), param_grid_gb)

In [None]:
# Bayesian search over the k-nearest-neighbours grid; the run is recorded in MLflow.
bayesian_search_cv(KNeighborsClassifier(), param_grid_knn)