In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.preprocessing import LabelEncoder

### Read in the Pre-Cleaned Data

In [2]:
data = pd.read_csv("cleaned_data.csv")

In [3]:
data.head()

Unnamed: 0,acc_open_past_24mths,addr_state,annual_inc,avg_cur_bal,bc_util,collection_recovery_fee,delinq_2yrs,dti,earliest_cr_line,emp_length,...,total_acc,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,total_pymnt,total_rec_int,total_rec_late_fee,total_rec_prncp,total_rev_hi_lim,verification_status
0,3.0,AZ,24000.0,7516.0,61.644117,0.0,0.0,27.65,1985-01-01,10+ years,...,9.0,36748.0,14500.0,30791.0,5863.16,863.16,0.0,5000.0,23400.0,Verified
1,4.0,GA,30000.0,7516.0,61.644117,1.11,0.0,1.0,1999-04-01,< 1 year,...,4.0,36748.0,14500.0,30791.0,1014.53,435.17,0.0,456.46,23400.0,Source Verified
2,4.0,IL,12252.0,7516.0,61.644117,0.0,0.0,8.72,2001-11-01,10+ years,...,10.0,36748.0,14500.0,30791.0,3005.67,605.67,0.0,2400.0,23400.0,Not Verified
3,4.0,CA,49200.0,7516.0,61.644117,0.0,0.0,20.0,1996-02-01,10+ years,...,37.0,36748.0,14500.0,30791.0,12231.89,2214.92,16.97,10000.0,23400.0,Source Verified
4,4.0,OR,80000.0,7516.0,61.644117,0.0,0.0,17.94,1996-01-01,1 year,...,38.0,36748.0,14500.0,30791.0,4066.91,1066.91,0.0,3000.0,23400.0,Source Verified


In [7]:
data.shape

(747488, 56)

# Clean the data and get train test split

In [5]:
emp_dict = {'1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7, 
            '8 years': 8, '9 years': 9, '< 1 year': 0, '10+ years': 10, 'n/a': None}
data['emp_length'] = data['emp_length'].map(emp_dict)
# data = data.dropna() # this does nothing, as far as I can tell
y = data['loan_status']
x = data.drop('loan_status', axis = 1)
x = pd.get_dummies(x, columns = ['verification_status', 'initial_list_status', 'grade', 'home_ownership'])
x['earliest_cr_line'] = pd.to_datetime(x['earliest_cr_line']).map(dt.datetime.toordinal)
# x['earliest_cr_line'] = x['earliest_cr_line']
x['issue_d'] = pd.to_datetime(x['issue_d']).map(dt.datetime.toordinal)
# x['issue_d'] = x['issue_d'].map(dt.datetime.toordinal)
x['last_credit_pull_d'] = pd.to_datetime(x['last_credit_pull_d']).map(dt.datetime.toordinal)
# x['last_credit_pull_d'] = x['last_credit_pull_d'].map(dt.datetime.toordinal)
x['last_pymnt_d'] = pd.to_datetime(x['last_pymnt_d']).map(dt.datetime.toordinal)
# x['last_pymnt_d'] = x['last_pymnt_d'].map(dt.datetime.toordinal)
le = LabelEncoder()
x['addr_state'] = le.fit_transform(x['addr_state'])
x['purpose'] = le.fit_transform(x['purpose'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [6]:
x_train.shape, x_test.shape

((523241, 69), (224247, 69))

# Predict on a Decision Tree

In [79]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train, y_train)

In [81]:
y_pred = clf.predict(x_test)
sum(y_pred == y_test)/len(y_pred)

0.9965707456510009

In [82]:
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
n_nodes, children_left, children_right, feature, threshold

(2569,
 array([ 1,  2,  3, ..., -1, -1, -1], dtype=int64),
 array([2568,  209,  208, ...,   -1,   -1,   -1], dtype=int64),
 array([36, 49, 10, ..., -2, -2, -2], dtype=int64),
 array([  4.99999989e-03,   2.99997510e+03,   2.98750000e+03, ...,
         -2.00000000e+00,  -2.00000000e+00,  -2.00000000e+00]))

In [84]:
clf.decision_path(x_test)

<224247x2569 sparse matrix of type '<class 'numpy.int64'>'
	with 3125366 stored elements in Compressed Sparse Row format>

In [88]:
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold

node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

node_indicator = clf.decision_path(x_test)
leave_id = clf.apply(x_test)

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] != node_id:
        continue

    if (x_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (x_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             x_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))

The binary tree structure has 2569 nodes and has the following tree structure:
node=0 test node: go to node 1 if X[:, 36] <= 0.00499999988824 else to node 2568.
	node=1 test node: go to node 2 if X[:, 49] <= 2999.97509766 else to node 209.
		node=2 test node: go to node 3 if X[:, 10] <= 2987.5 else to node 208.
			node=3 test node: go to node 4 if X[:, 49] <= 999.929992676 else to node 7.
				node=4 test node: go to node 5 if X[:, 19] <= 975.0 else to node 6.
					node=5 leaf node.
					node=6 leaf node.
				node=7 test node: go to node 8 if X[:, 17] <= 238.644989014 else to node 157.
					node=8 test node: go to node 9 if X[:, 15] <= 735553.0 else to node 106.
						node=9 test node: go to node 10 if X[:, 15] <= 735218.5 else to node 39.
							node=10 test node: go to node 11 if X[:, 25] <= 19.5 else to node 36.
								node=11 test node: go to node 12 if X[:, 24] <= 26.5 else to node 33.
									node=12 test node: go to node 13 if X[:, 54] <= 0.5 else to node 28.
										node=13 

KeyError: (0, -2)

In [92]:
x.columns[36]

'recoveries'

In [94]:
x['recoveries'].head()

0      0.0
1    122.9
2      0.0
3      0.0
4      0.0
Name: recoveries, dtype: float64

In [95]:
x.columns[49]

'total_rec_prncp'

In [96]:
x['total_rec_prncp']

0          5000.00
1           456.46
2          2400.00
3         10000.00
4          3000.00
5          5000.00
6          7000.00
7          3000.00
8           162.02
9           673.48
10         6500.00
11        12000.00
12         1256.14
13         3000.00
14         5433.47
15         1000.00
16        10000.00
17         3600.00
18         6000.00
19         9200.00
20        20250.00
21        10694.96
22        10000.00
23        10000.00
24         1305.58
25        15000.00
26            0.00
27          629.05
28         4000.00
29         8500.00
            ...   
787876    32875.00
787878    15600.00
787879    15000.00
787880    18500.00
787881    11200.00
787882    27000.00
787883    20000.00
787884    14200.00
787885    15000.00
787886    10000.00
787887    26000.00
787888    22200.00
787889    10000.00
787890    15000.00
787891    40000.00
787892    24000.00
787893    24775.00
787894    35000.00
787896    12800.00
787897    36000.00
787899    30000.00
787900    25