# Machine Learning Project - Making Predictions

In [32]:
# Third-party dependencies for the whole notebook, kept in a single cell.
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# cross_val_predict comes from sklearn.model_selection only: the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20, so
# importing from it breaks "Restart Kernel -> Run All" on current releases.
from sklearn.model_selection import cross_val_predict

In [2]:
# Load the cleaned loan data from the notebook's working directory.
loans = pd.read_csv(filepath_or_buffer='cleaned_loans_2007.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
loans.shape

(38708, 38)

In [4]:
# Preview the first rows to sanity-check the loaded columns.
loans.head()

Unnamed: 0,loan_amnt,int_rate,installment,emp_length,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
0,5000.0,10.65,162.87,10,24000.0,1,27.65,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2500.0,15.27,59.83,0,30000.0,0,1.0,0.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2400.0,15.96,84.33,10,12252.0,1,8.72,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,10000.0,13.49,339.31,10,49200.0,1,20.0,0.0,1.0,10.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5000.0,7.9,156.46,3,36000.0,1,11.2,0.0,3.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [5]:
# Print each column's dtype and non-null count.
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38708 entries, 0 to 38707
Data columns (total 38 columns):
loan_amnt                              38708 non-null float64
int_rate                               38708 non-null float64
installment                            38708 non-null float64
emp_length                             38708 non-null int64
annual_inc                             38708 non-null float64
loan_status                            38708 non-null int64
dti                                    38708 non-null float64
delinq_2yrs                            38708 non-null float64
inq_last_6mths                         38708 non-null float64
open_acc                               38708 non-null float64
pub_rec                                38708 non-null float64
revol_bal                              38708 non-null float64
revol_util                             38708 non-null float64
total_acc                              38708 non-null float64
home_ownership_MORTGAGE    

## Selecting an error metric and handling class imbalance

In [6]:
# Naive baseline: label every loan as paid off on time (1.0),
# with one prediction per row of `loans`.
predictions = pd.Series(np.ones(len(loans)))

In [7]:
# Number of baseline predictions (should match the row count of `loans`).
len(predictions)

38708

In [8]:
# False positives: predicted 1 (paid off) while the recorded loan_status is 0.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
fp = int(false_positive.sum())
fp

5615

In [9]:
# True positives: predicted 1 (paid off) and the recorded loan_status is 1.
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
tp = int(true_positive.sum())
tp

33093

In [10]:
# False negatives: predicted 0 while the recorded loan_status is 1.
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
fn = int(false_negative.sum())
fn

0

In [11]:
# True negatives: predicted 0 and the recorded loan_status is 0.
true_negative = loans["loan_status"].eq(0) & (predictions == 0)
tn = int(true_negative.sum())
tn

0

In [12]:
# True positive rate: fraction of loans with loan_status == 1 that were predicted 1.
tpr = tp / (fn + tp)
tpr

1.0

In [13]:
# False positive rate: fraction of loans with loan_status == 0 that were predicted 1.
fpr = fp / (tn + fp)
fpr

1.0

## Applying Logistic Regression

In [14]:
# Split the frame into the label column and every other column as predictors.
target = loans["loan_status"]
train_cols = loans.columns.drop("loan_status")
features = loans.loc[:, train_cols]

# Fit on the full dataset and predict on those same rows (no hold-out here).
lr = LogisticRegression()
predictions = lr.fit(features, target).predict(features)

In [15]:
# Boolean masks for the four confusion-matrix cells: each pairs the
# recorded loan_status with the model's prediction for the same row.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
true_negative = loans["loan_status"].eq(0) & (predictions == 0)

# Summing a boolean mask counts its True entries.
fp, tp = int(false_positive.sum()), int(true_positive.sum())
fn, tn = int(false_negative.sum()), int(true_negative.sum())

# True positive rate first, then false positive rate.
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)
print(tpr)
print(fpr)

0.9985495421992566
0.996260017809439


## Performing cross-validation to reduce overfitting

In [16]:
# Default logistic regression, predicted with 3-fold cross-validation and
# wrapped in a Series for the metric calculations that follow.
lr = LogisticRegression()
predictions = pd.Series(cross_val_predict(estimator=lr, X=features, y=target, cv=3))

In [17]:
# Boolean masks for the four confusion-matrix cells: each pairs the
# recorded loan_status with the model's prediction for the same row.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
true_negative = loans["loan_status"].eq(0) & (predictions == 0)

# Summing a boolean mask counts its True entries.
fp, tp = int(false_positive.sum()), int(true_positive.sum())
fn, tn = int(false_negative.sum()), int(true_negative.sum())

# True positive rate first, then false positive rate.
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)
print(tpr)
print(fpr)

0.9989121566494424
0.9967943009795192


## Penalizing the misclassification of the minority class

In [18]:
# Same 3-fold evaluation, but with class_weight="balanced" on the estimator.
lr = LogisticRegression(class_weight="balanced")
predictions = pd.Series(cross_val_predict(estimator=lr, X=features, y=target, cv=3))

In [19]:
# Boolean masks for the four confusion-matrix cells: each pairs the
# recorded loan_status with the model's prediction for the same row.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
true_negative = loans["loan_status"].eq(0) & (predictions == 0)

# Summing a boolean mask counts its True entries.
fp, tp = int(false_positive.sum()), int(true_positive.sum())
fn, tn = int(false_negative.sum()), int(true_negative.sum())

# True positive rate first, then false positive rate.
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)
print(tpr)
print(fpr)

0.6579034841205089
0.38290293855743546


In [20]:
# Manual class weights: class 0 is weighted 10x relative to class 1.
penalty = {0: 10, 1: 1}
lr = LogisticRegression(class_weight=penalty)
predictions = pd.Series(cross_val_predict(estimator=lr, X=features, y=target, cv=3))

In [21]:
# Boolean masks for the four confusion-matrix cells: each pairs the
# recorded loan_status with the model's prediction for the same row.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
true_negative = loans["loan_status"].eq(0) & (predictions == 0)

# Summing a boolean mask counts its True entries.
fp, tp = int(false_positive.sum()), int(true_positive.sum())
fn, tn = int(false_negative.sum()), int(true_negative.sum())

# True positive rate first, then false positive rate.
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)
print(tpr)
print(fpr)

0.24005076602302602
0.09029385574354408


## Applying random forests

In [22]:
# Random forest with balanced class weights and a fixed seed, predicted
# with 3-fold cross-validation.
rf = RandomForestClassifier(random_state=1, class_weight="balanced")
predictions = pd.Series(cross_val_predict(estimator=rf, X=features, y=target, cv=3))

In [23]:
# Boolean masks for the four confusion-matrix cells: each pairs the
# recorded loan_status with the model's prediction for the same row.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
true_negative = loans["loan_status"].eq(0) & (predictions == 0)

# Summing a boolean mask counts its True entries.
fp, tp = int(false_positive.sum()), int(true_positive.sum())
fn, tn = int(false_negative.sum()), int(true_negative.sum())

# True positive rate first, then false positive rate.
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)
print(tpr)
print(fpr)

0.9708699725017376
0.9271593944790739


In [30]:
# Random forest with manual class weights (class 0 weighted 7x relative to
# class 1) and a fixed seed, predicted with 3-fold cross-validation.
penalty = {0: 7, 1: 1}
rf = RandomForestClassifier(random_state=1, class_weight=penalty)
predictions = pd.Series(cross_val_predict(estimator=rf, X=features, y=target, cv=3))

In [31]:
# Boolean masks for the four confusion-matrix cells: each pairs the
# recorded loan_status with the model's prediction for the same row.
false_positive = loans["loan_status"].eq(0) & (predictions == 1)
true_positive = loans["loan_status"].eq(1) & (predictions == 1)
false_negative = loans["loan_status"].eq(1) & (predictions == 0)
true_negative = loans["loan_status"].eq(0) & (predictions == 0)

# Summing a boolean mask counts its True entries.
fp, tp = int(false_positive.sum()), int(true_positive.sum())
fn, tn = int(false_negative.sum()), int(true_negative.sum())

# True positive rate first, then false positive rate.
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)
print(tpr)
print(fpr)

0.9692684253467501
0.9178984861976848
