# SBA Loan Analysis

# Modeling - Part 4 - CatBoost

## 1. Imports

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, matthews_corrcoef, confusion_matrix

from library.preprocessing import processing_pipeline
from library.modeling import (createModel, createClassificationMetrics,
                             runGridSearchAnalysis, createConfusionMatrix, createFeatureImportanceChart)

## 2. Previewing Data

In [3]:
# Show every column when a frame is displayed; the processed table is wide.
pd.set_option('display.max_columns', None)

# Load the processed SBA loan table. The path is relative to the notebook's
# working directory.
sba_loans = pd.read_csv('./../data/processed/sba_national_processed_final.csv')

In [4]:
# Preview the first five rows (five is also the default for head()).
sba_loans.head(n=5)

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,DisbursementGross,GrAppv,SBA_Appv,NAICS_sectors,unemployment_rate,gdp_growth,gdp_annual_change,inflation_rate,inf_rate_annual_chg,NewExist_existing_business,NewExist_new_business,UrbanRural_rural,UrbanRural_urban,isFranchise_not_franchise,RevLineCr_v2_N,RevLineCr_v2_Y,LowDoc_v2_N,LowDoc_v2_Y,MIS_Status_v2_default,state_top10
0,84,4,0,0,60000.0,60000.0,48000.0,45,3.5,4.4472,0.67,2.3377,-0.59,0,1,0,0,1,1,0,0,1,0,0
1,60,2,0,0,40000.0,40000.0,32000.0,72,3.5,4.4472,0.67,2.3377,-0.59,0,1,0,0,1,1,0,0,1,0,0
2,180,7,0,0,287000.0,287000.0,215250.0,62,3.5,4.4472,0.67,2.3377,-0.59,1,0,0,0,1,1,0,1,0,0,0
3,60,2,0,0,35000.0,35000.0,28000.0,0,4.1,4.4472,0.67,2.3377,-0.59,1,0,0,0,1,1,0,0,1,0,0
4,240,14,7,7,229000.0,229000.0,229000.0,0,4.8,4.4472,0.67,2.3377,-0.59,1,0,0,0,1,1,0,1,0,0,1


In [5]:
# Name of the binary label column the models predict.
target = 'MIS_Status_v2_default'

# Every column except the target, as a pandas Index. Index.drop works on the
# column labels directly, so the frame itself is not copied and the target
# name is not repeated as a second literal.
# NOTE(review): this keeps 'Unnamed: 0' (it looks like a row index written out
# with the CSV) in the feature list -- confirm whether processing_pipeline
# removes it or whether it is really fed to the model.
features = sba_loans.columns.drop(target)

## 3. CatBoost Classifier with Standard Scaler

In [6]:
# Build the train/test split for this section through the project helper.
# No `scaler` argument is passed here, so the helper's default is used --
# presumably the standard scaler, given the section title; confirm in
# library.preprocessing.
X_train_ss, X_test_ss, y_train_ss, y_test_ss = processing_pipeline(sba_loans, target)

### A. Simple CatBoost Model

In [14]:
# Baseline CatBoost model: library defaults apart from a fixed seed (for
# reproducibility) and verbose=0 (no per-iteration training log).
cat_ss_mod1 = CatBoostClassifier(random_state=42, verbose=0)
# createModel is a project helper; presumably it fits the model on the training
# split and returns predictions for X_test_ss -- confirm in library.modeling.
y_pred = createModel(cat_ss_mod1, X_train_ss, y_train_ss, X_test_ss)

### B. Evaluation Metrics

In [15]:
# Unpack the four metrics returned by the project helper for the baseline model.
# NOTE(review): predictions are passed first here, while the createConfusionMatrix
# call later in this notebook passes the true labels first -- confirm both helper
# signatures, since precision/recall swap if the order is wrong.
acc_score, cr, f1, mcc = createClassificationMetrics(y_pred, y_test_ss)

**Accuracy Score**

In [16]:
# Accuracy of the baseline model on the standard-scaler test split.
print(acc_score)

0.9498872994264546


**Classification Report**

In [17]:
# Per-class precision/recall/F1 table for the baseline model.
print(cr)

              precision    recall  f1-score   support

        paid       0.97      0.97      0.97    223642
     default       0.84      0.87      0.85     45212

    accuracy                           0.95    268854
   macro avg       0.91      0.92      0.91    268854
weighted avg       0.95      0.95      0.95    268854



**F1 Score**

In [18]:
# F1 score of the baseline model, as returned by createClassificationMetrics.
print(f1)

0.8538957870194653


**Matthews Correlation Coefficient**

In [19]:
# Matthews correlation coefficient of the baseline model.
print(mcc)

0.8238930327835728


### C. Finding Best CatBoost Model with Standard Scaler

In [24]:
# Search space for the CatBoost grid search: two candidate values for each of
# four hyperparameters, i.e. 2 * 2 * 2 * 2 = 16 combinations.
param_grid = dict(
    learning_rate=[0.03, 0.1],
    iterations=[500, 1000],
    l2_leaf_reg=[1.0, 3.0],
    depth=[3, 6],
)

In [None]:
# Fresh, untuned estimator handed to the project grid-search helper together
# with the parameter grid and the standard-scaler split.
cbc = CatBoostClassifier(random_state=42, verbose=0)
# The helper returns the selected parameter set and test-set predictions.
# Note that y_pred is overwritten here: from this cell on it holds the
# grid-search predictions, not the baseline model's.
cbc_ss_best_params, y_pred = runGridSearchAnalysis(cbc, param_grid, X_train_ss, y_train_ss, X_test_ss)

In [None]:
# Parameter set returned by runGridSearchAnalysis for the standard-scaler data.
print(cbc_ss_best_params)

In [None]:
# Metrics for the grid-search predictions on the standard-scaler test split.
acc_score, cr, f1, mcc = createClassificationMetrics(y_pred, y_test_ss)

# (label template, value) pairs, printed in this order.
metric_lines = (
    ('Accuracy Score: {}', acc_score),
    ('Matthew\'s Correlation Coefficient: {}', mcc),
    ('F1 Score: {}', f1),
    ('Classification Report: \n{}', cr),
)
for template, value in metric_lines:
    print(template.format(value))

### D. Confusion Matrix

In [None]:
# Confusion matrix for the grid-search predictions (y_pred was overwritten by
# runGridSearchAnalysis). The returned value is kept for the scaler comparison
# in the final section; presumably it is the false-positive count, given the
# `_fp` suffix -- TODO confirm in library.modeling.
cbc_ss_fp = createConfusionMatrix(y_test_ss, y_pred)

### E. Feature Importance with Best Params

In [None]:
# Passes the CatBoostClassifier class itself (not an instance) together with the
# tuned parameters; presumably the helper builds and fits a fresh model before
# charting importances -- confirm in library.modeling.
createFeatureImportanceChart(CatBoostClassifier, cbc_ss_best_params, features, X_train_ss, y_train_ss)

## 4. CatBoost Classifier with Robust Scaler

In [None]:
# Same project pipeline as in section 3, this time explicitly requesting the
# 'Robust' scaler; results go into the *_rs variables.
X_train_rs, X_test_rs, y_train_rs, y_test_rs = processing_pipeline(sba_loans, target, scaler='Robust')

### A. Simple CatBoost Model

In [None]:
# Baseline CatBoost model for the robust-scaler split: library defaults apart
# from a fixed seed (for reproducibility) and verbose=0 (no training log).
cbc_rs_mod1 = CatBoostClassifier(random_state=42, verbose=0)
# Pass the model that was just created. The original passed `cat_rs_mod1`,
# a name that is never defined, so the cell raised NameError on a fresh kernel.
# createModel is a project helper; presumably it fits on the training split and
# returns predictions for X_test_rs -- confirm in library.modeling.
y_pred = createModel(cbc_rs_mod1, X_train_rs, y_train_rs, X_test_rs)

### B. Evaluation Metrics

In [None]:
# Score the robust-scaler baseline predictions against the labels of the SAME
# split (y_test_rs). The original compared them with y_test_ss, the labels from
# the standard-scaler split, which is only correct if both pipeline runs happen
# to produce an identical split.
# NOTE(review): predictions are passed first here, while createConfusionMatrix
# is called with the true labels first -- confirm both helper signatures.
acc_score, cr, f1, mcc = createClassificationMetrics(y_pred, y_test_rs)

**Accuracy Score**

In [None]:
# Accuracy of the baseline model for the robust-scaler section.
print(acc_score)

**Classification Report**

In [None]:
# Per-class classification report of the baseline model for the robust-scaler section.
print(cr)

**F1 Score**

In [None]:
# F1 score of the baseline model for the robust-scaler section.
print(f1)

**Matthews Correlation Coefficient**

In [None]:
# Matthews correlation coefficient of the baseline model for the robust-scaler section.
print(mcc)

### C. Finding Best CatBoost Model with Robust Scaler

In [None]:
# Search space for the robust-scaler grid search: two candidate values for each
# of four hyperparameters, i.e. 2 * 2 * 2 * 2 = 16 combinations.
param_grid = dict(
    learning_rate=[0.03, 0.1],
    iterations=[500, 1000],
    l2_leaf_reg=[1.0, 3.0],
    depth=[3, 6],
)

In [None]:
# Fresh, untuned estimator handed to the project grid-search helper together
# with the parameter grid and the robust-scaler split.
cbc = CatBoostClassifier(random_state=42, verbose=0)
# The helper returns the selected parameter set and test-set predictions.
# y_pred is overwritten here and now holds the grid-search predictions.
cbc_rs_best_params, y_pred = runGridSearchAnalysis(cbc, param_grid, X_train_rs, y_train_rs, X_test_rs)

In [None]:
# Parameter set returned by runGridSearchAnalysis for the robust-scaler data.
print(cbc_rs_best_params)

In [None]:
# Metrics for the grid-search predictions on the robust-scaler test split.
acc_score, cr, f1, mcc = createClassificationMetrics(y_pred, y_test_rs)

# (label template, value) pairs, printed in this order.
metric_lines = (
    ('Accuracy Score: {}', acc_score),
    ('Matthew\'s Correlation Coefficient: {}', mcc),
    ('F1 Score: {}', f1),
    ('Classification Report: \n{}', cr),
)
for template, value in metric_lines:
    print(template.format(value))

### D. Confusion Matrix

In [None]:
# Confusion matrix for the robust-scaler grid-search predictions. The returned
# value is kept for the scaler comparison in the final section; presumably it
# is the false-positive count, given the `_fp` suffix -- TODO confirm in
# library.modeling.
cbc_rs_fp = createConfusionMatrix(y_test_rs, y_pred)

### E. Feature Importance with Best Params

In [None]:
# Passes the CatBoostClassifier class itself (not an instance) together with the
# tuned robust-scaler parameters; presumably the helper builds and fits a fresh
# model before charting importances -- confirm in library.modeling.
createFeatureImportanceChart(CatBoostClassifier, cbc_rs_best_params, features, X_train_rs, y_train_rs)

## 5. Best CatBoost Classifier Model

In [None]:
# Report the scaler whose tuned model produced the smaller value returned by
# createConfusionMatrix (presumably the false-positive count, given the `_fp`
# suffix -- TODO confirm in library.modeling). A tie goes to the Standard Scaler.
# The printed label now names CatBoost; the original text said
# "Logistic Regression", left over from another notebook.
if cbc_ss_fp > cbc_rs_fp:
    print('Best CatBoost Classifier Model Params: {}'.format(cbc_rs_best_params))
    print('Best Scaler: Robust Scaler')
else:
    print('Best CatBoost Classifier Model Params: {}'.format(cbc_ss_best_params))
    print('Best Scaler: Standard Scaler')