In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [9]:
# Load data 
base_folder = Path.cwd()
clean_folder = base_folder.parent / 'data/clean_data'
df_train = pd.read_csv(f'{clean_folder}/df_train_selected.csv')
df_test = pd.read_csv(f'{clean_folder}/df_test_selected.csv')

In [10]:
# Create feature variables and target variables
X = df_train.drop(columns=['ID_CLIENTE', 'SAFRA_REF', 'INADIMPLENTE'])
y = df_train['INADIMPLENTE']

# Split train and validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)

In [11]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 250, 500],
    'criterion': ['entropy', 'log_loss'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None]
}

# Set up the grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=19, n_jobs=16),
    param_grid=param_grid,
    # cv=5,  # 5-fold cross-validation
    scoring='f1',  # Use F1 score as the evaluation metric
    n_jobs=-1,  # Use all available cores for parallel processing
    verbose=2
)

# Fit the model using grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters
print(f"Best parameters found by GridSearchCV: {best_params}")
print(f"Best cross-validated F1 score: {best_score}")

# Evaluate model on validation set using the best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV] END class_weight=balanced, criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.3s
[CV] END class_weight=balanced, criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.5s
[CV] END class_weight=balanced, criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.8s
[CV] END class_weight=balanced, criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.9s
[CV] END class_weight=balanced, criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.9s
[CV] END class_weight=balanced, criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=250; total time=  32.2s
[CV] END class_weight=balanced, criterion=entropy, max_depth=None,

In [12]:
# Compute performance metrics and confusion matrix
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Print results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.9593721741377083
Precision: 0.7220602526724975
Recall: 0.6841620626151013
F1: 0.7026004728132388
Confusion Matrix:
[[14110   286]
 [  343   743]]
