In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline

In [169]:
def select(query, params=(), db_path='./data/lending-club-loan-data/database2.sqlite'):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. Defaults to no parameters.
    db_path : str, optional
        Path of the SQLite database file. Defaults to the lending-club database
        used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the cursor
        description. A query that matches no rows yields an empty frame that
        still carries the column names.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query, params).fetchall()
        # cursor.description is None for statements that produce no result set.
        column_names = [col[0] for col in (cursor.description or ())]
    finally:
        # Always release the connection, even when the query raises.
        conn.close()

    # Passing the names to the constructor (instead of assigning .columns afterwards)
    # keeps an empty result from failing with a column-length mismatch.
    return pd.DataFrame(rows, columns=column_names)

In [170]:
# Pull every row of LOAN_FINAL and discard its 'index' column in one step.
loans = select('SELECT * FROM LOAN_FINAL').drop(columns='index')

In [171]:
# Separate the data into the two problems at hand: "Completed Loans" vs. "In Progress Loans".
# A loan is treated as completed when its status is 'Charged Off' or 'Fully Paid';
# every other status is treated as in progress.

loans_completed = loans[loans['loan_status'].isin(['Charged Off', 'Fully Paid'])].copy()
features_completed = loans_completed.drop(columns='loan_status')
targets_completed = loans_completed['loan_status'].copy()

loans_in_progress = loans[~loans['loan_status'].isin(['Charged Off', 'Fully Paid'])].copy()
features_in_progress = loans_in_progress.drop(columns='loan_status')
targets_in_progress = loans_in_progress['loan_status'].copy()

In [172]:
# Split the columns of `loans` into numerical and non-numerical (string) features.
# The dtypes are walked as (column, dtype) pairs instead of being looked up with
# `loans.dtypes[i]`: an integer key on a label-indexed Series is deprecated
# positional access in pandas.

numerical = [col for col, dtype in loans.dtypes.items()
             if dtype in ('int64', 'float64')]

# 'loan_status' is split off as the target elsewhere in this notebook,
# so it is excluded from the string features.
strings = [col for col, dtype in loans.dtypes.items()
           if dtype == 'O' and col != 'loan_status']

In [173]:
# Encode the target labels as 0/1.
# The encoding is computed from the untouched `loans_completed['loan_status']` column
# rather than from `targets_completed` itself, so re-running this cell does not fail
# with a KeyError on labels that were already converted to integers.
outputmap = {'Charged Off':0,'Fully Paid':1}
targets_completed = loans_completed['loan_status'].map(outputmap)

In [194]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the completed loans for testing; only the numerical features are used.
# random_state is fixed so the split is reproducible.
(features_completed_train,
 features_completed_test,
 targets_completed_train,
 targets_completed_test) = train_test_split(
    features_completed[numerical],
    targets_completed,
    test_size=0.2,
    random_state=10,
)

In [197]:
from sklearn.linear_model import LogisticRegression

# Classifier for the completed-loans problem, built with the library defaults
# (no hyper-parameters are passed).
completed_lr = LogisticRegression()

In [202]:
# Remove the two 'months since ...' columns from the training features.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns have already been dropped.
features_completed_train = features_completed_train.drop(
    columns=['mths_since_last_delinq', 'mths_since_last_major_derog'],
    errors='ignore',
)

In [205]:
# These two columns contain NaN values; it still needs to be worked out how to
# translate the meaning implied by NaN in them, so they are left out of the
# feature list for now.
# The list is rebuilt with a filter instead of being mutated with .remove(),
# so running the cell a second time does not raise ValueError.

numerical = [col for col in numerical
             if col not in ('mths_since_last_delinq', 'mths_since_last_major_derog')]

In [208]:
# Train the classifier on the columns currently listed in `numerical`.
completed_lr.fit(
    features_completed_train.loc[:, numerical],
    targets_completed_train,
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [209]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test split.
# accuracy_score is documented as (y_true, y_pred), so the true labels go first.
accuracy_score(targets_completed_test, completed_lr.predict(features_completed_test[numerical]))

In [216]:
# Baseline score: the model already performs suspiciously well. Some features probably
# give obvious 'hints' about whether the loan will be charged off (e.g. recoveries,
# collection recovery fee, etc.) and may need to be stripped from the feature set.
# accuracy_score is documented as (y_true, y_pred), so the true labels go first.

accuracy_score(targets_completed_test, completed_lr.predict(features_completed_test[numerical]))

0.98848932285464797