In [1]:
from glob import glob
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
# Function allows me to return the intersection of a list of values and
# the columns of a dataframe, either as a new list of values, or the
# dataframe represeting the selected columns.
def safe_select(df, cols, columns=True):
    try:
        if type(cols)==str:
            cols=[cols]
        sel_col = df.columns[df.columns.isin(cols)]
        
        if columns==False:
            results = df
        else: results = sel_col
    except Exception as e:
        print(e)
        results = None
    finally:
        return(results)

# Allows me to drop a column if a dataframe has it, otherwise
# returning the original dataset.
def safe_drop(df, istr):
    try:
        if type(istr)==str:
            istr = [istr]
        flds = df.columns[df.columns.isin(istr)]
        results = df.drop(flds, axis=1)
    except Exception as e:
        print(e)
        results=df
    finally:
        return(results)

In [4]:
os.chdir(glob('H:/Documents/Training/Thinkful/Lender*')[0])

dfile = max(glob('*.csv'))
d = pd.read_csv(dfile)
d = d.iloc[:-2]

print('Initial Dataset Dimensions')
print('Rows: {0:,}'.format(d.shape[0]))
print('Columns: {0:,}'.format(d.shape[1]))

  interactivity=interactivity, compiler=compiler, result=result)


Initial Dataset Dimensions
Rows: 421,095
Columns: 144


# Data Cleaning
### Dropping payment-related, fields with multiple levels, and null fields
In the interest of convenience, I dropped fields that me the following criteria:
- They captured information that would not have been available during the application process (i.e., anything reflecting the client's payments on the loan);
- The number of records with null values for the field was greater than or equal to 20%; and
- They were categorical with more than 10 levels.

I also excluded records that had any null values.

In [5]:
try:
    del md
except Exception as e:
    print(e)

md = pd.DataFrame()
md['field'] = d.columns
md['d_type'] = d.dtypes.values
md['nulls'] = d.apply(lambda x: sum(x.isna())).values
md['null_pct'] = md.nulls/d.shape[0]
md['levels'] = d.apply(lambda x: len(x.drop_duplicates())).values

name 'md' is not defined


In [6]:
# Drop null columns
md.loc[md.null_pct>=0.2, 'drop_fld'] = 1

# Drop payment columns
pay_flds = ['acc_now_delinq', 'acc_open_past_24mths', 'all_util', 'avg_cur_bal', 'bc_open_to_buy', 'chargeoff_within_12_mths', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'debt_settlement_flag', 'debt_settlement_flag_date', 'deferral_term', 'delinq_2yrs', 'delinq_amnt', 'funded_amnt_inv', 'hardship_amount', 'hardship_dpd', 'hardship_end_date', 'hardship_flag', 'hardship_last_payment_amount', 'hardship_length', 'hardship_loan_status', 'hardship_payoff_balance_amount', 'hardship_reason', 'hardship_start_date', 'hardship_status', 'hardship_type', 'il_util', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'last_pymnt_amnt', 'last_pymnt_d', 'max_bal_bc', 'mths_since_last_delinq', 'mths_since_last_major_derog', 'mths_since_recent_revol_delinq', 'num_actv_rev_tl', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'open_rv_12m', 'open_rv_24m', 'orig_projected_additional_accrued_interest', 'out_prncp', 'out_prncp_inv', 'payment_plan_start_date', 'pct_tl_nvr_dlq', 'pymnt_plan', 'recoveries', 'revol_bal', 'revol_bal_joint', 'revol_util', 'sec_app_revol_util', 'settlement_amount', 'settlement_date', 'settlement_percentage', 'settlement_status', 'settlement_term', 'term', 'title', 'tot_cur_bal', 'total_bal_ex_mort', 'total_bal_il', 'total_cu_tl', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'total_rec_late_fee', 'total_rec_prncp']
md.loc[md.field.str.contains('pay|settl|charg|hardship'), 'drop_fld'] = 1
md.loc[md.field.isin(pay_flds), 'drop_fld'] = 1
md.loc[(md.levels>10) & (md.d_type=='object'), 'drop_fld'] = 1

In [7]:
md.sort_values('field')

Unnamed: 0,field,d_type,nulls,null_pct,levels,drop_fld
56,acc_now_delinq,float64,0,0.0,8,1.0
74,acc_open_past_24mths,float64,0,0.0,47,1.0
23,addr_state,object,0,0.0,49,1.0
69,all_util,float64,399723,0.949247,140,1.0
13,annual_inc,float64,0,0.0,26903,
53,annual_inc_joint,float64,420584,0.998786,309,1.0
52,application_type,object,0,0.0,2,
75,avg_cur_bal,float64,0,0.0,56003,1.0
76,bc_open_to_buy,float64,3963,0.009411,51128,1.0
77,bc_util,float64,4227,0.010038,1321,


In [8]:
d = safe_drop(d, md.loc[md.drop_fld==1, 'field'])
d = d.dropna()

X = d[md.loc[(md.field != 'loan_status') & (md.drop_fld.isna()), 'field']]
X = pd.get_dummies(X)
y = d.loan_status

# Simple Tree Model

In [9]:
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.model_selection import cross_val_score
from time import perf_counter

cv_ = 10
start = perf_counter()
# print(start)
tree = dtc()
scores = cross_val_score(tree, X, y, cv=cv_)
end = perf_counter()
print('Accuracy of single tree model for each of the {0} trials:'.format(cv_))
print(scores)
print('Average accuracy: {0:.2f}'.format(scores.mean()))
print('Time elapsed: {0:.2f} seconds'.format(end-start))

Accuracy of single tree model for each of the 10 trials:
[0.6266423  0.62773363 0.62724056 0.6209247  0.62329224 0.61936532
 0.61955733 0.61245032 0.61455674 0.60631817]
Average accuracy: 0.62
Time elapsed: 232.14 seconds


As per the assignment instructions I put no restrictions on the complexity of the tree, which resulted in an average accuracy of 0.62 and requiring approximately three minutes forty seconds to run. Let's see how the random forest model performs.

# Random Forest Model: 50 trees
As specified by the assignment, I tried to make my random forest as simple as possible using the following hyperparameters:
- Number of trees: 50
- Maximum depth: 3
- Minimum samples in each split: 100

In [11]:
from sklearn.ensemble import RandomForestClassifier as rfc

start = perf_counter()
# print(start)
forest = rfc(n_estimators=50, max_depth=3, min_samples_split=100)
scores = cross_val_score(forest, X, y, cv=cv_)
end = perf_counter()
print('Accuracy of random forest model for each of the {0} trials:'.format(cv_))
print(scores)
print('Average accuracy: {0:.2f}'.format(scores.mean()))
print('Time elapsed: {0:.2f} seconds'.format(end-start))

Accuracy of random forest model for each of the 10 trials:
[0.70843122 0.70845177 0.70845177 0.70849286 0.70848441 0.70850496
 0.70852551 0.70852551 0.70854607 0.70854607]
Average accuracy: 0.71
Time elapsed: 188.55 seconds


# Random Forest: 1,000 trees
Keeping all other hyperparamters the same as the previous trial, what happens when instead of specifying 50 trees we specify 1,000?

In [13]:
from sklearn.ensemble import RandomForestClassifier as rfc

cv_ = 10
start = perf_counter()
# print(start)
forest = rfc(n_estimators=1000, max_depth=3, min_samples_split=100)
scores = cross_val_score(forest, X, y, cv=cv_)
end = perf_counter()
print('Accuracy of random forest model for each of the {0} trials:'.format(cv_))
print(scores)
print('Average accuracy: {0:.2f}'.format(scores.mean()))
print('Time elapsed: {0:.2f} seconds'.format(end-start))

Accuracy of random forest model for each of the 10 trials:
[0.70843122 0.70845177 0.70845177 0.70849286 0.70848441 0.70850496
 0.70852551 0.70852551 0.70854607 0.70854607]
Average accuracy: 0.71
Time elapsed: 3314.19 seconds


# Conclusions
Doing the absolute bare minimum of work still produced an average accuracy of 0.71--a 17% improvement over the single complex tree. Also, a 1,000 tree forest did not perform noticeably better than the 50 tree forest. It did, however, require 17 times more time--55 min compared to 2 min 40 seconds. However, I did not parallelize the forest which is my error.