In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


In [2]:
# Load in the generated data sets.
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

#### Predictions
Because there's a lot of data and a good chunk of it is categorical I believe the logsitic regression will be a better fit and will score better.
Our data has a binary goal (low risk loan target or not) and currently a lot of dimensions to it.

I think the random tree classifier will struggle as it has a lot of decisions to make and taking the average of these will not provide as clear of a result?

In [3]:
# Convert categorical data to numeric and separate target feature for training data

# One hot incoding the data to get split out the categorical data, dropping the first to avoid the "dummy trap"
train_df_num = pd.get_dummies(train_df, drop_first= True)
# train_df_num.head()

# Splitting out the target to get X and Y data
X_train = train_df_num.drop('target_low_risk', axis=1)
y_train = train_df_num['target_low_risk']

In [4]:
# Convert categorical data to numeric and separate target feature for testing data
test_df_num = pd.get_dummies(test_df, drop_first= True)

In [5]:
# Add missing dummy variables to testing set
# Pulling the column names from the training set
column_list = list(train_df_num.columns)

# Adding any missing columns to the testing set. Filling the missing values with 0.
test_df_num2 = test_df_num.reindex(columns=column_list,fill_value=0)

X_test = test_df_num2.drop('target_low_risk', axis=1)
y_test = test_df_num2['target_low_risk']

In [6]:
# Train the Logistic Regression model on the unscaled data and print the model score
# Create an instance of the logistic model
classifier = LogisticRegression()

# Training it.
classifier.fit(X_train, y_train)

# Previewing it's scores.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.65311986863711
Testing Data Score: 0.5078689919183327


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)

print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.638664398128456


In [8]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7103448275862069
Testing Data Score: 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Creating a confusion matrix to view the results of this scaled model.
y_true = y_test
y_predict = classifier.predict(X_test)
matrix = confusion_matrix(y_true, y_predict)

matrix


array([[1998,  353],
       [1874,  477]], dtype=int64)

In [11]:
# Pulling the model's precision and sensitivity.
tn, fp, fn, tp = matrix.ravel()

precision = tp/(tp+fp)
sensitivity = tp/(tp+fn)

print(f'The Logistic regression had a Precision of: {precision}')
print(f'The Logistic regression had a Sensitivity of: {sensitivity}')

The Logistic regression had a Precision of: 0.5746987951807229
The Logistic regression had a Sensitivity of: 0.20289238621863037


In [12]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6384517226712038


In [13]:
# Creating a confusion matrix to view the results.
y_true = y_test
y_predict = clf.predict(X_test)
matrix2 = confusion_matrix(y_true, y_predict)

matrix2

array([[1420,  931],
       [2013,  338]], dtype=int64)

In [14]:
# Pulling the model's precision and sensitivity.
tn, fp, fn, tp = matrix2.ravel()

precision = tp/(tp+fp)
sensitivity = tp/(tp+fn)

print(f'The Random Forest Classifier had a Precision of: {precision}')
print(f'The Random Forest Classifier had a Sensitivity of: {sensitivity}')

The Random Forest Classifier had a Precision of: 0.26635145784081954
The Random Forest Classifier had a Sensitivity of: 0.14376860910250958


#### Reflection
Well both models didn't do so great. 

I was particularly shocked at the poor Sensitivity of each model (both were around .20)
This tells me that either one isn't even sure if they are capturing all of the true values consistently.

Looking at the scores, we can see the logistic function did do better with a testing data scores of .76 vs 0.64
The logistic function improved a lot from the scaled data, whereas the random forest classifier did not change.
Because of how tree diagrams make decisions this outcome makes sense.

