# Load data from Lending Club
Lending Club provides information on all past applications (and current loan status)

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)
%matplotlib inline
# 2013-2014 loan data - files are available from deeper history
past_loans = pd.read_csv('LoanStats3c_securev1.csv', header = 1, index_col=0)
past_loans['term'] = past_loans['term'].astype('category')
past_loans['grade'] = past_loans['grade'].astype('category')
past_loans['sub_grade'] = past_loans['sub_grade'].astype('category')
past_loans['home_ownership'] = past_loans['home_ownership'].astype('category')
past_loans['verification_status'] = past_loans['verification_status'].astype('category')

# Making assumption that any currently late loans are bad; may need to revisit
di = {'Current': 'Good', 'Fully Paid': 'Good', 'Charged Off': 'Bad', 'Late (31-120 days)': 'Bad',
       'In Grace Period': 'Bad', 'Late (16-30 days)': 'Bad', 'Default': 'Bad'}
past_loans.replace({'loan_status': di}, inplace=True)
past_loans['loan_status'] = past_loans['loan_status'].astype('category')
past_loans['pymnt_plan'] = past_loans['pymnt_plan'].astype('category')
past_loans['purpose'] = past_loans['purpose'].astype('category')
past_loans['application_type'] = past_loans['application_type'].astype('category')

# convert dates
past_loans['issue_d'] = pd.to_datetime(past_loans['issue_d'])
past_loans['earliest_cr_line'] = pd.to_datetime(past_loans['earliest_cr_line'])
past_loans['last_pymnt_d'] = pd.to_datetime(past_loans['last_pymnt_d'])
past_loans['next_pymnt_d'] = pd.to_datetime(past_loans['next_pymnt_d'])
past_loans['last_credit_pull_d'] = pd.to_datetime(past_loans['last_credit_pull_d'])
past_loans['issue_d'] = pd.to_datetime(past_loans['issue_d'])

# convert floats
past_loans['int_rate'] = pd.to_numeric(past_loans['int_rate'].str.extract('(.*)%'))/100


# drop pointless data
past_loans=past_loans.dropna(axis=1,how='all')
past_loans.drop('application_type',axis = 1) # only contains "individual"
past_loans.head(100)

# Explore different categorical data
What values are inside the categorical fields?

In [54]:
pd.unique(past_loans['term'])

array([' 36 months', ' 60 months', nan], dtype=object)

In [55]:
pd.unique(past_loans['grade'])

array(['A', 'C', 'D', 'B', 'E', 'F', 'G', nan], dtype=object)

In [56]:
pd.unique(past_loans['sub_grade'])

array(['A3', 'C1', 'D4', 'C3', 'D1', 'D5', 'B5', 'B4', 'C4', 'E5', 'D2',
       'B3', 'C5', 'E4', 'E3', 'C2', 'B2', 'A5', 'F1', 'B1', 'D3', 'A4',
       'E1', 'E2', 'G2', 'A1', 'G1', 'A2', 'F3', 'F2', 'G3', 'F4', 'G4',
       'F5', 'G5', nan], dtype=object)

In [57]:
pd.unique(past_loans['home_ownership'])

array(['MORTGAGE', 'RENT', 'OWN', 'ANY', nan], dtype=object)

In [58]:
pd.unique(past_loans['verification_status'])

array(['Not Verified', 'Source Verified', 'Verified', nan], dtype=object)

In [59]:
pd.unique(past_loans['loan_status'])

array(['Good', 'Bad', nan], dtype=object)

In [60]:
pd.unique(past_loans['pymnt_plan'])

array(['n', 'y', nan], dtype=object)

In [61]:
pd.unique(past_loans['purpose'])

array(['credit_card', 'debt_consolidation', 'car', 'house',
       'home_improvement', 'other', 'medical', 'moving', 'major_purchase',
       'vacation', 'small_business', 'renewable_energy', 'wedding', nan], dtype=object)

# Examine field types and amount of data
Is there any missing data?

In [62]:
past_loans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235631 entries, 36805548 to Total amount funded in policy code 2: 873652739
Data columns (total 60 columns):
member_id                      235629 non-null float64
loan_amnt                      235629 non-null float64
funded_amnt                    235629 non-null float64
funded_amnt_inv                235629 non-null float64
term                           235629 non-null category
int_rate                       235629 non-null object
installment                    235629 non-null float64
grade                          235629 non-null category
sub_grade                      235629 non-null category
emp_title                      222393 non-null object
emp_length                     235629 non-null object
home_ownership                 235629 non-null category
annual_inc                     235629 non-null float64
verification_status            235629 non-null category
issue_d                        235629 non-null datetime64[ns]
loan_status 

# Check out summary stats on the numeric fields

In [63]:
past_loans.describe().transpose() #.to_string()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
member_id,235629,24019299.796719,8825162.024128,137225.0,15579724.0,22953173.0,31767065.0,40860827.0
loan_amnt,235629,14870.156793,8438.318193,1000.0,8325.0,13000.0,20000.0,35000.0
funded_amnt,235629,14870.156793,8438.318193,1000.0,8325.0,13000.0,20000.0,35000.0
funded_amnt_inv,235629,14865.334169,8435.524995,950.0,8325.0,13000.0,20000.0,35000.0
installment,235629,442.482374,245.050238,23.36,265.68,384.12,578.71,1409.99
annual_inc,235629,74854.148281,55547.533374,3000.0,45377.0,65000.0,90000.0,7500000.0
dti,235629,18.04077,8.023002,0.0,12.02,17.63,23.76,39.99
delinq_2yrs,235629,0.344512,0.898319,0.0,0.0,0.0,0.0,22.0
fico_range_low,235629,692.497358,29.246641,660.0,670.0,685.0,705.0,845.0
fico_range_high,235629,696.497418,29.246952,664.0,674.0,689.0,709.0,850.0


# Create dependent and independent data


In [64]:
X = past_loans.drop('loan_status', axis = 1)
y = past_loans['loan_status']

# Split into training and test data

In [65]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# 10 cross validation iterations with 20% test / 80% train
from sklearn.cross_validation import ShuffleSplit
cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)

# Standardize the data

In [66]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
# transform our training features
X_train_std = stdsc.fit_transform(X_train)
# transform the testing features in the same way
X_test_std = stdsc.transform(X_test)

ValueError: could not convert string to float: INDIVIDUAL