In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
import sklearn.metrics as metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_squared_error

from sklearn.tree import export_graphviz
from IPython.display import Image
from IPython.display import display
%matplotlib inline

In [2]:
# read in data
raw_loan_data = pd.read_csv('loan.csv')
raw_loan_data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(2260668, 145)

In [3]:
# filter for relevant columns
relevant_cols = ['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'grade', 'annual_inc', 'issue_d', 'dti', 'revol_bal', 'total_pymnt', 'loan_status']
# save filtered data to avoid reading large raw data file again
raw_loan_data[relevant_cols].to_csv('filtered_loan_data.csv', index = False)

In [6]:
# read in filtered data
filtered_loan_data = pd.read_csv('filtered_loan_data.csv')

In [7]:
# sanity check that number of rows has remained constant in filtering
filtered_loan_data.shape

(2260668, 11)

In [5]:
# preview filtered data
filtered_loan_data.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,grade,annual_inc,issue_d,dti,revol_bal,total_pymnt,loan_status
0,2500,2500,36 months,13.56,C,55000.0,Dec-2018,18.24,4341,167.02,Current
1,30000,30000,60 months,18.94,D,90000.0,Dec-2018,26.52,12315,1507.11,Current
2,5000,5000,36 months,17.97,D,59280.0,Dec-2018,10.51,4599,353.89,Current
3,4000,4000,36 months,18.94,D,92000.0,Dec-2018,16.74,5468,286.71,Current
4,30000,30000,60 months,16.14,C,57250.0,Dec-2018,26.35,829,1423.21,Current


# Part 1: Data Exploration and Evaluation

We begin by data exploration and evaluation by cleaning the data before processing
1. Identifying any non-numeric types (either due to formatting errors or being categorical)
2. Coding categorical variables as dummy variables for analysis
3. Checking for missing values
4. Checking for outliers
5. Standardizing the data

In [11]:
# data cleaning
## step 1 - identify non-numeric types
filtered_loan_data.dtypes

loan_amnt        int64
funded_amnt      int64
term            object
int_rate       float64
grade           object
annual_inc     float64
issue_d         object
dti            float64
revol_bal        int64
total_pymnt    float64
loan_status     object
dtype: object

In [10]:
# quick summary statistics
filtered_loan_data.describe().apply(lambda s: s.apply(lambda x: format(x, 'g')))

Unnamed: 0,loan_amnt,funded_amnt,int_rate,annual_inc,dti,revol_bal,total_pymnt
count,2260670.0,2260670.0,2260670.0,2260660.0,2258960.0,2260670.0,2260670.0
mean,15046.9,15041.7,13.0929,77992.4,18.8242,16658.5,11824.0
std,9190.25,9188.41,4.83211,112696.0,14.1833,22948.3,9889.6
min,500.0,500.0,5.31,0.0,-1.0,0.0,0.0
25%,8000.0,8000.0,9.49,46000.0,11.89,5950.0,4272.58
50%,12900.0,12875.0,12.62,65000.0,17.84,11324.0,9060.87
75%,20000.0,20000.0,15.99,93000.0,24.49,20246.0,16708.0
max,40000.0,40000.0,30.99,110000000.0,999.0,2904840.0,63296.9
