In [1]:
# Import the required modules
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load sba_loans.csv into a pandas DataFrame.
# The path is relative, so the file is looked up in the notebook's working directory.
loans_csv = Path('sba_loans.csv')
loans_df = pd.read_csv(loans_csv)

# Preview the first rows of the DataFrame
loans_df.head()

Unnamed: 0,Year,Month,Amount,Term,Zip,CreateJob,NoEmp,RealEstate,RevLineCr,UrbanRural,Default
0,2001,11,32812,36,92801,0,1,0,1,0,0
1,2001,4,30000,56,90505,0,1,0,1,0,0
2,2001,4,30000,36,92103,0,10,0,1,0,0
3,2003,10,50000,36,92108,0,6,0,1,0,0
4,2006,7,343000,240,91345,3,65,1,0,2,0


In [3]:
# Separate the feature matrix X from the label vector y

# X keeps every column of loans_df other than Default
X = loans_df.drop('Default', axis='columns')

# y is the Default column by itself
y = loans_df['Default']

In [4]:
# Split features and labels into training and testing sets.
# random_state pins the shuffle so the same split is produced on every run.
# NOTE(review): the classes are heavily imbalanced; consider whether the split
# should be stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [5]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted scaler to both the training and the testing features
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [7]:
# Count how many training labels fall into each distinct class of Default,
# which shows how imbalanced the training split is
y_train.value_counts()

0    1063
1      96
Name: Default, dtype: int64

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest (seeded so results are repeatable) and train it
# on the scaled training features in one chained call
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train_scaled, y_train)

# Predict labels for the scaled testing features with the trained forest
rf_predictions = rf_model.predict(X_test_scaled)

In [9]:
# Import BalancedRandomForestClassifier from imblearn
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate the balanced random forest.
# random_state is pinned to the same seed (1) that train_test_split and the
# RandomForestClassifier use, so this model's predictions and the metrics
# computed from them are repeatable across Restart & Run All.
brf = BalancedRandomForestClassifier(random_state=1)

# Fit the model to the scaled training data
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier()

In [10]:
# Predict labels for the scaled testing features with the fitted balanced random forest
brf_predictions = brf.predict(X_test_scaled)

In [11]:
# Balanced accuracy of the balanced random forest's predictions against the test labels
basr = balanced_accuracy_score(y_true=y_test, y_pred=brf_predictions)
print(basr)

0.872789566755084


In [13]:
# Balanced accuracy of the standard random forest's predictions against the test labels
basrs = balanced_accuracy_score(y_true=y_test, y_pred=rf_predictions)
print(basrs)

0.8189655172413792


In [14]:
# Imbalanced-classification report for the balanced random forest's test predictions
print(
    classification_report_imbalanced(y_true=y_test, y_pred=brf_predictions)
)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.90      0.85      0.94      0.87      0.77       348
          1       0.49      0.85      0.90      0.62      0.87      0.76        39

avg / total       0.93      0.89      0.85      0.91      0.87      0.76       387



In [15]:
# Imbalanced-classification report for the standard random forest's test predictions
print(
    classification_report_imbalanced(y_true=y_test, y_pred=rf_predictions)
)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.97      0.67      0.97      0.80      0.67       348
          1       0.72      0.67      0.97      0.69      0.80      0.63        39

avg / total       0.94      0.94      0.70      0.94      0.80      0.66       387

