In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Load data 
base_folder = Path.cwd()
clean_folder = base_folder.parent / 'data/clean_data'
df_train = pd.read_csv(f'{clean_folder}/df_train.csv')
df_test = pd.read_csv(f'{clean_folder}/df_test.csv')

In [3]:
df_train.columns

Index(['ID_CLIENTE', 'SAFRA_REF', 'NO_FUNCIONARIOS', 'RENDA_MES_ANTERIOR_LOG',
       'PJ', 'SEGMENTO_INDUSTRIAL', 'PORTE', 'REGIAO',
       'DATA_EMISSAO_DOCUMENTO', 'DATA_PAGAMENTO', 'DATA_VENCIMENTO', 'TAXA',
       'VALOR_A_PAGAR_LOG', 'INADIMPLENTE', 'DATA_EMISSAO_DOCUMENTO_year',
       'DATA_EMISSAO_DOCUMENTO_month', 'DATA_PAGAMENTO_year',
       'DATA_PAGAMENTO_month', 'DATA_VENCIMENTO_year',
       'DATA_VENCIMENTO_month'],
      dtype='object')

In [4]:
df_train = df_train.drop(columns=['DATA_PAGAMENTO', 'DATA_EMISSAO_DOCUMENTO', 'DATA_VENCIMENTO', 'DATA_PAGAMENTO_year', 'DATA_PAGAMENTO_month']).copy()
df_test = df_test.drop(columns=['DATA_EMISSAO_DOCUMENTO', 'DATA_VENCIMENTO']).copy()

In [51]:
df_test.columns, df_train.columns

(Index(['ID_CLIENTE', 'SAFRA_REF', 'NO_FUNCIONARIOS', 'RENDA_MES_ANTERIOR_LOG',
        'PJ', 'SEGMENTO_INDUSTRIAL', 'PORTE', 'REGIAO', 'TAXA',
        'VALOR_A_PAGAR_LOG', 'DATA_EMISSAO_DOCUMENTO_year',
        'DATA_EMISSAO_DOCUMENTO_month', 'DATA_VENCIMENTO_year',
        'DATA_VENCIMENTO_month'],
       dtype='object'),
 Index(['ID_CLIENTE', 'SAFRA_REF', 'NO_FUNCIONARIOS', 'RENDA_MES_ANTERIOR_LOG',
        'PJ', 'SEGMENTO_INDUSTRIAL', 'PORTE', 'REGIAO', 'TAXA',
        'VALOR_A_PAGAR_LOG', 'INADIMPLENTE', 'DATA_EMISSAO_DOCUMENTO_year',
        'DATA_EMISSAO_DOCUMENTO_month', 'DATA_VENCIMENTO_year',
        'DATA_VENCIMENTO_month'],
       dtype='object'))

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77408 entries, 0 to 77407
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID_CLIENTE                    77408 non-null  float64
 1   SAFRA_REF                     77408 non-null  object 
 2   NO_FUNCIONARIOS               77408 non-null  float64
 3   RENDA_MES_ANTERIOR_LOG        77408 non-null  float64
 4   PJ                            77408 non-null  float64
 5   SEGMENTO_INDUSTRIAL           77408 non-null  int64  
 6   PORTE                         77408 non-null  int64  
 7   REGIAO                        77408 non-null  int64  
 8   TAXA                          77408 non-null  float64
 9   VALOR_A_PAGAR_LOG             77408 non-null  float64
 10  INADIMPLENTE                  77408 non-null  float64
 11  DATA_EMISSAO_DOCUMENTO_year   77408 non-null  int64  
 12  DATA_EMISSAO_DOCUMENTO_month  77408 non-null  int64  
 13  D

In [54]:
df_train.drop(columns=['ID_CLIENTE', 'SAFRA_REF']).corr()

Unnamed: 0,NO_FUNCIONARIOS,RENDA_MES_ANTERIOR_LOG,PJ,SEGMENTO_INDUSTRIAL,PORTE,REGIAO,TAXA,VALOR_A_PAGAR_LOG,INADIMPLENTE,DATA_EMISSAO_DOCUMENTO_year,DATA_EMISSAO_DOCUMENTO_month,DATA_VENCIMENTO_year,DATA_VENCIMENTO_month
NO_FUNCIONARIOS,1.0,-0.003017,-0.000182,-0.035745,-0.157987,0.029764,0.012731,-0.000237,-0.008789,0.298558,-0.141595,0.28933,-0.10159
RENDA_MES_ANTERIOR_LOG,-0.003017,1.0,0.140796,-0.026573,-0.002632,0.047571,0.00179,0.043702,-0.12093,0.002131,0.004584,0.000424,0.006972
PJ,-0.000182,0.140796,1.0,-0.044576,-0.012191,0.047481,0.002885,0.024601,-0.025033,-0.004206,0.000992,-0.005303,0.002997
SEGMENTO_INDUSTRIAL,-0.035745,-0.026573,-0.044576,1.0,0.060449,-0.028372,0.005605,-0.081765,0.072889,-0.036793,0.011609,-0.032542,0.010947
PORTE,-0.157987,-0.002632,-0.012191,0.060449,1.0,-0.034705,0.011457,0.002292,0.05201,-0.016132,0.005433,-0.009399,0.003824
REGIAO,0.029764,0.047571,0.047481,-0.028372,-0.034705,1.0,0.00064,0.027886,-0.11497,0.001178,-0.000351,0.00022,-0.002446
TAXA,0.012731,0.00179,0.002885,0.005605,0.011457,0.00064,1.0,-0.016713,0.012032,0.007426,0.004466,0.012189,-0.008566
VALOR_A_PAGAR_LOG,-0.000237,0.043702,0.024601,-0.081765,0.002292,0.027886,-0.016713,1.0,-0.299454,0.038654,-0.014056,0.028723,0.003121
INADIMPLENTE,-0.008789,-0.12093,-0.025033,0.072889,0.05201,-0.11497,0.012032,-0.299454,1.0,0.002633,-0.022199,0.019344,-0.047267
DATA_EMISSAO_DOCUMENTO_year,0.298558,0.002131,-0.004206,-0.036793,-0.016132,0.001178,0.007426,0.038654,0.002633,1.0,-0.455491,0.96036,-0.316201


In [56]:
df_train[['DATA_VENCIMENTO_year', 'DATA_EMISSAO_DOCUMENTO_year', 'DATA_EMISSAO_DOCUMENTO_month', 'DATA_VENCIMENTO_month']].corr()

Unnamed: 0,DATA_VENCIMENTO_year,DATA_EMISSAO_DOCUMENTO_year,DATA_EMISSAO_DOCUMENTO_month,DATA_VENCIMENTO_month
DATA_VENCIMENTO_year,1.0,0.96036,-0.347202,-0.430167
DATA_EMISSAO_DOCUMENTO_year,0.96036,1.0,-0.455491,-0.316201
DATA_EMISSAO_DOCUMENTO_month,-0.347202,-0.455491,1.0,0.654997
DATA_VENCIMENTO_month,-0.430167,-0.316201,0.654997,1.0


In [6]:
# Create feature variables and target variables
X = df_train.drop(columns=['ID_CLIENTE', 'SAFRA_REF', 'INADIMPLENTE'])
y = df_train['INADIMPLENTE']

# Split train and validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)


In [22]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 256, 512],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None]
}

# Set up the grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=19, n_jobs=16),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # Use F1 score as the evaluation metric
    n_jobs=-1,  # Use all available cores for parallel processing
    verbose=2
)

# Fit the model using grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters
print(f"Best parameters found by GridSearchCV: {best_params}")
print(f"Best cross-validated F1 score: {best_score}")

# Evaluate model on validation set using the best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.3s
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.4s
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.7s
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=256; total time=  25.2s
[CV] END class_weight=balanced, criterion=gini, max_depth=None, min_samples_leaf=1, 

Best parameters found by GridSearchCV: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 512}
Best cross-validated F1 score: 0.646219125115989

In [23]:
# Compute performance metrics and confusion matrix
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Print results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.9485854540757008
Precision: 0.6324143692564745
Recall: 0.6801437556154537
F1: 0.6554112554112554
Confusion Matrix:
[[13929   440]
 [  356   757]]


In [10]:
# Split train and validation set
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 512
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)

# Set model parameters and train
model = RandomForestClassifier(criterion='entropy', random_state=19, class_weight='balanced', min_samples_leaf=1, min_samples_split=10, n_estimators=512)
model.fit(X_train, y_train)

# Evaluate model on validation set
y_pred = model.predict(X_val)

# Compute performance metrics and confusion matrix
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Print results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'recall: {recall}')
print(f'F1: {f1}')
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.9523963312233562
Precision: 0.6516072980017377
recall: 0.6906077348066298
F1: 0.6705409029950827
Confusion Matrix:
[[13995   401]
 [  336   750]]


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

# Set model parameters
model = RandomForestClassifier(criterion='entropy', random_state=19, class_weight='balanced', 
                               min_samples_leaf=1, min_samples_split=10, n_estimators=512)

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Perform cross-validation with 5 folds
scores = cross_validate(model, X, y, cv=5, scoring=scoring)

# Print results
print(f"Mean Accuracy: {scores['test_accuracy'].mean()}")
print(f"Mean Precision: {scores['test_precision'].mean()}")
print(f"Mean Recall: {scores['test_recall'].mean()}")
print(f"Mean F1 Score: {scores['test_f1'].mean()}")


In [26]:
# Create feature variables and target variables
X_test = df_test.drop(columns=['ID_CLIENTE', 'SAFRA_REF']).copy()

# Make predictions
y_pred_test = model.predict_proba(X_test)[:, 1]

# Create final data frame and save it
df_predictions = df_test[["ID_CLIENTE", "SAFRA_REF"]].copy()
df_predictions['INADIMPLENTE'] = y_pred_test.copy()
df_predictions
# df_predictions.to_csv(f'{clean_folder}/df_predictions.csv')

Unnamed: 0,ID_CLIENTE,SAFRA_REF,INADIMPLENTE
0,5058298901476893676,2021-07,0.015625
1,5058298901476893676,2021-07,0.019531
2,5058298901476893676,2021-07,0.019531
3,5058298901476893676,2021-07,0.023438
4,274692171162531764,2021-07,0.011719
...,...,...,...
11822,780065359501679358,2021-11,0.027344
11823,121658372387044248,2021-11,0.117188
11824,798425422240919854,2021-11,0.062500
11825,7828430488099134389,2021-11,0.007812


In [27]:
df_predictions[df_predictions['INADIMPLENTE']>=0.5]

Unnamed: 0,ID_CLIENTE,SAFRA_REF,INADIMPLENTE
268,3421433989501830226,2021-07,0.824219
488,8229446069655666941,2021-07,0.617188
491,8229446069655666941,2021-07,0.613281
822,5714540197488862003,2021-07,0.742188
1054,1156053485989336481,2021-07,0.628906
...,...,...,...
10657,4008627434689715639,2021-11,0.585938
10762,8733525963966339207,2021-11,0.769531
11603,683445437387317282,2021-11,0.550781
11658,8646472172649019955,2021-11,0.585938


In [38]:
((df_predictions[df_predictions['INADIMPLENTE']>0.5].shape[0])/df_predictions.shape[0])*100

0.7017840534370507

In [31]:
((df_train[df_train['INADIMPLENTE']==1].shape[0])/df_train.shape[0])*100

7.01607069036792