In [1]:
# Initial imports
import pickle

import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Loans Encoded Data

Load `sba_loans_encoded.csv` into a pandas DataFrame called `df_loans`.

In [2]:
# Read the encoded SBA loans CSV into a DataFrame and preview its first rows
loans_csv_path = "sba_loans_encoded.csv"
df_loans = pd.read_csv(loans_csv_path)
df_loans.head()

Unnamed: 0,Year,Month,Amount,Term,Zip,CreateJob,NoEmp,RealEstate,RevLineCr,UrbanRural,...,City_WILLITS,City_WILMINGTON,City_WINDSOR,City_WINNETKA,City_WOODLAND,City_WOODLAND HILLS,City_WRIGHTWOOD,City_Watsonville,City_YORBA LINDA,City_YUBA CITY
0,2001,11,32812,36,92801,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2001,4,30000,56,90505,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,4,30000,36,92103,0,10,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2003,10,50000,36,92108,0,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2006,7,343000,240,91345,3,65,1,0,2,...,0,0,0,0,0,0,0,0,0,0


Restrict `df_loans` to a small set of columns, then define the features set by copying the `df_loans` DataFrame and dropping the `Default` column.

In [6]:
# To keep the model simple, narrow the frame down to a handful of predictor
# columns plus the `Default` label column.
selected_columns = [
    "Amount",
    "Term",
    "CreateJob",
    "NoEmp",
    "RealEstate",
    "UrbanRural",
    "Default",
]
df_loans = df_loans[selected_columns]

In [7]:
# Define features set: every selected column except the `Default` label.
# `drop` returns a new DataFrame, so `df_loans` itself is left untouched.
X = df_loans.drop(columns=["Default"])
X.head()

Unnamed: 0,Amount,Term,CreateJob,NoEmp,RealEstate,UrbanRural
0,32812,36,0,1,0,0
1,30000,56,0,1,0,0
2,30000,36,0,10,0,0
3,50000,36,0,6,0,0
4,343000,240,3,65,1,2


Create the target vector by assigning the values of the `Default` column from the `df_loans` DataFrame.

In [8]:
# Define target vector: the `Default` labels reshaped into a single-column
# 2-D array (one row per loan)
default_labels = df_loans["Default"].to_numpy()
y = default_labels.reshape(-1, 1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

Split the data into training and testing sets.

In [9]:
# Splitting into Train and Test sets.
# Features and labels are split together so rows stay aligned; the fixed
# random_state keeps the partition the same on every run. No test_size is
# given, so train_test_split's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Use the `StandardScaler` to scale the features data. Remember that only the `X_train` and `X_test` DataFrames should be scaled.

In [10]:
# Create the StandardScaler instance (not yet fitted to any data)
scaler = StandardScaler()

In [11]:
# Fit the Standard Scaler with the training data only, so the test set plays
# no part in the fit. The value returned by `fit` is bound to `X_scaler`,
# which is the name used for transforming and for saving.
X_scaler = scaler.fit(X_train)

In [12]:
# Scale both partitions with the scaler that was fitted on the training data
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Fitting the Decision Tree Model

Once data is scaled, create a decision tree instance and train it with the training data (`X_train_scaled` and `y_train`).

In [13]:
# Create the decision tree classifier instance.
# A fixed random_state makes the fitted tree reproducible across runs:
# scikit-learn randomly permutes the features at each split, so ties between
# equally good splits could otherwise be broken differently from run to run,
# changing the metrics and the saved model.
model = tree.DecisionTreeClassifier(random_state=78)

In [14]:
# Fit the model on the scaled training features and their labels.
# The value returned by `fit` is re-bound to `model`, which the prediction
# and saving cells use afterwards.
model = model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

Validate the trained model by predicting loan defaults using the testing data (`X_test_scaled`).

In [15]:
# Making predictions using the testing data.
# The test features are passed in scaled form (`X_test_scaled`), matching the
# scaled features the model was trained on.
predictions = model.predict(X_test_scaled)

## Model Evaluation

Evaluate the model's results by using `sklearn` to calculate the confusion matrix and the accuracy score, and to generate the classification report.

In [16]:
# Confusion matrix of true labels against predicted labels, wrapped in a
# DataFrame whose rows are labelled as actual classes and whose columns are
# labelled as predicted classes
cm = confusion_matrix(y_test, predictions)
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

# Accuracy score of the predictions on the test labels
acc_score = accuracy_score(y_test, predictions)



In [17]:
# Displaying results: confusion matrix table, accuracy score, and the
# classification report
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,331,13
Actual 1,27,154


Accuracy Score : 0.9238095238095239
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       344
           1       0.92      0.85      0.89       181

    accuracy                           0.92       525
   macro avg       0.92      0.91      0.91       525
weighted avg       0.92      0.92      0.92       525



## Model Saving

To run this model elsewhere, we need to:
1. Save the model to file so it can be loaded
2. Also save the transformer we used to scale our X data, so we can similarly scale new input data

In [18]:
# Persist the fitted scaler and the fitted model so both can be reloaded
# elsewhere: new input data must be scaled with the same scaler before it is
# passed to the model. (`pickle` is imported in the notebook's import cell.)
# NOTE: pickle files should only be loaded from trusted sources, and ideally
# with the same scikit-learn version that produced them.
artifacts = (
    ("scaler.pkl", X_scaler),
    ("model.pkl", model),
)

for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)
