In [19]:
#Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np

In [20]:
train_df = pd.read_csv('Resources/2019loans.csv')
test_df = pd.read_csv('Resources/2020Q1loans.csv')

In [21]:
#drop rows with missing data in Training dataset
train_df = train_df.dropna()
train_df = train_df.dropna(axis ='columns')
train_df = train_df.drop(columns=['Unnamed: 0'])


In [22]:
#drop rows with missing data in Testing dataset
test_df = test_df.dropna()
test_df = test_df.dropna(axis ='columns')
test_df = test_df.drop(columns=['Unnamed: 0'])

# Preprocessing: Convert categorical data to numeric

In [5]:
# Separate loan_status feature for training data
X_train = train_df.drop('loan_status',axis=1)
y_train = train_df['loan_status']

In [23]:
# Separate loan_status feature for testing data
X_test = test_df.drop('loan_status',axis=1)
y_test = test_df['loan_status']

In [24]:
# One-hot encoding dataframes
X_train_dummies = pd.get_dummies(X_train)
X_test_dummies = pd.get_dummies(X_test)
print(f"Train: {X_train_dummies.shape}, Test: {X_test_dummies.shape}")

Train: (12180, 93), Test: (4702, 92)


In [26]:
# Convert output labels to 0 and 1
y_train_label = LabelEncoder().fit_transform(train_df['loan_status'])

y_test_label = LabelEncoder().fit_transform(test_df['loan_status'])

In [27]:
# Add missing dummy variables to testing set
for col in X_train_dummies.columns:
    if col not in X_test_dummies.columns:
        X_test_dummies[col]=0

In [10]:
print(f"Train: {X_train_dummies.shape}, Test: {X_test_dummies.shape}")

Train: (12180, 93), Test: (4702, 93)


# Prediction on Unscaled Models

###### In my opinion, the RandomForest Classifier will be more accurate than LogisticRegression model. 

    The RandomForest model have a classified output, which is 1 or 0.The RandomClassifier classifies each data into either 1 or 0 and it undergoes multiple iterations. Since the features of training data is within the range of test data, they are behaved in a simlar fashion which results in more accurate prediction.It would have failed to classify if the test data is outside the taining data.
    Whereas Logistic regression model uses a classification algorithm and provide the probability of a test data to behave as the trained dataset.It would have been more accurate if the test data is outside the range of training data.

# Fit a LogisticRegression model and RandomForestClassifier model

### LogisticRegression model

In [28]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',max_iter=1000)
classifier.fit(X_train_dummies, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=1000)

In [29]:
print(f"Training Data Score: {classifier.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_dummies, y_test)}")

Training Data Score: 0.6852216748768473
Testing Data Score: 0.5561463207145896


### RandomForestClassifier model

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [31]:
# Fit a model, and then print a classification report
clf_forest = RandomForestClassifier(random_state=42).fit(X_train_dummies, y_train)
y_pred = clf_forest.predict(X_test_dummies)

print(f'Training Score: {clf_forest.score(X_train_dummies, y_train)}')
print(f'Testing Score: {clf_forest.score(X_test_dummies, y_test)}')

Training Score: 1.0
Testing Score: 0.6550404083368779


# Compare Predicted Behavior with Actual Results on Unscaled Data

##### As expected, the RandomForest Classifier performed better than the Logistic Regression model with a testing score of 0.65
###### LogisticRegression Unscaled : 
    Training Data Score: 0.6852216748768473
    Testing Data Score: 0.5561463207145896
###### RandomForestClassifier Unscaled : 
    Training Score: 1.0
    Testing Score: 0.6550404083368779        

# Revisit the Preprocessing: Scale the data

In [32]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)

In [33]:
# Transforming the test dataset based on the fit from the training dataset
X_test_scaled = scaler.transform(X_test_dummies)

# Prediction on Scaled Models

###### As per my understanding, the accuracy of the LogisticRegression changes with scaling. 
    Linear Regression gets more impacted with feature scaling as they depend on the range on data-points.
    Tree-based algorithms like RandomForest Classifiers are independent of scaling.

# Re-fitting the LogisticRegression model on the scaled data

In [34]:
# classifier = LogisticRegression(solver='lbfgs',max_iter=1000)
classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.712807881773399
Testing Data Score: 0.7201190982560612


# Re-fitting the RandomForestClassifier model on the scaled data

In [35]:
# Fit a model, and then print a classification report
clf_forest = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf_forest.predict(X_test_scaled)

print(f'Training Score: {clf_forest.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_forest.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6688643130582731


# Compare Predicted Behavior with Actual Results on Scaled Data

##### As expected, the Logistic Regression performed better with scaled data whereas the RandomClassifer didn't change with scaling. The score of logistic regression changed drastically from 0.55 to 0.72, which implies that the models can fit better with scaled data.

###### LogisticRegression Unscaled : 
    Training Data Score: 0.6852216748768473
    Testing Data Score: 0.5561463207145896
###### LogisticRegression Scaled : 
    Training Data Score: 0.712807881773399
    Testing Data Score: 0.7201190982560612


###### RandomForestClassifier Unscaled : 
    Training Score: 1.0
    Testing Score: 0.6671629094002552        
###### RandomForestClassifier Scaled : 
    Training Score: 1.0
    Testing Score: 0.6550404083368779