Decision Tree Example
=====================

* TODO: Research feature threshold splitting
* TODO: learn more about working with imbalanced data classes
* TODO: Research Backtracking to prevent getting stuck in a local optima

In [1]:
import io
import os
import pydot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy import random
from struct import unpack

from sklearn import metrics
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.cross_validation import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from graphviz import Digraph

%matplotlib inline

Import & Inspect raw data
=========================

In [2]:
loans = pd.read_csv(r'../data/Trees/lending-club-data.csv', low_memory= False)

In [3]:
len(loans)

122607

In [4]:
loans.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

In [5]:
loans.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1,1,1,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1,1,1,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1,1,1,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1,1,1,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1,1,1,0,5.21533,20141201T000000,1,1,1


### Set target attribute logic

In [6]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.drop('bad_loans', axis= 1)

In [7]:
len(loans['safe_loans'][loans['safe_loans'] == 1]) / len(loans)

0.8111853319957262

### Set desired features reduce df

In [8]:
features = ['grade', 
            'sub_grade', 
            'short_emp', 
            'emp_length_num', 
            'home_ownership', 
            'dti', 
            'purpose', 
            'term', 
            'last_delinq_none', 
            'last_major_derog_none',
            'revol_util',
            'total_rec_late_fee',
           ]

target = 'safe_loans'

In [9]:
loans = loans[features + [target]]

In [10]:
loans.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.7,0.0,1
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.5,0.0,1
3,C,C1,0,11,RENT,20.0,other,36 months,0,1,21.0,16.97,1
4,A,A4,0,4,RENT,11.2,wedding,36 months,1,1,28.3,0.0,1


### Manage target class imbalance

In [11]:
safe_loans_raw = loans[loans['safe_loans'] == +1]
risky_loans_raw = loans[loans['safe_loans'] == -1]

print('Number of Safe loans is', len(safe_loans_raw))
print('Number of risky loans is', len(risky_loans_raw))

Number of Safe loans is 99457
Number of risky loans is 23150


In [12]:
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
percentage

0.2327639080205516

### Make dataset roughly 50/50

In [13]:
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(frac= percentage, random_state= 1)

print('Number of Safe loans is', len(safe_loans))
print('Number of risky loans is', len(risky_loans))

Number of Safe loans is 23150
Number of risky loans is 23150


In [14]:
loans_data = risky_loans.append(safe_loans)
len(loans_data)

46300

### Inspect Balanced Output

In [15]:
loans_data.to_csv(r'../data/Trees/loans_data.csv')

### One-hot encoding (TESTING)

In [16]:
categorical_variables = []

for feat_name, feat_type in zip(loans_data.columns, loans_data.dtypes):
    #print('name',feat_name)
    #print('type',feat_type)
    if feat_type == object:
        categorical_variables.append(feat_name)
        print(feat_name)
        
for feature in categorical_variables:
    print(feature)
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    print(type(loans_data_one_hot_encoded))
    print(loans_data_one_hot_encoded)
    #loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix= feature)
    '''
    for column in loans_data_unpacked.columns:
        loan_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
        
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
    '''

grade
sub_grade
home_ownership
purpose
term
grade
<class 'pandas.core.series.Series'>
1         {'C': 1}
6         {'F': 1}
7         {'B': 1}
10        {'C': 1}
12        {'B': 1}
18        {'B': 1}
21        {'B': 1}
23        {'C': 1}
24        {'D': 1}
41        {'A': 1}
45        {'B': 1}
48        {'C': 1}
50        {'E': 1}
58        {'D': 1}
60        {'F': 1}
63        {'D': 1}
87        {'D': 1}
89        {'B': 1}
93        {'D': 1}
102       {'D': 1}
108       {'B': 1}
111       {'E': 1}
118       {'C': 1}
124       {'A': 1}
132       {'B': 1}
136       {'B': 1}
138       {'C': 1}
140       {'C': 1}
151       {'A': 1}
158       {'C': 1}
            ...   
55027     {'B': 1}
100438    {'C': 1}
122519    {'D': 1}
4161      {'B': 1}
13520     {'C': 1}
51558     {'C': 1}
110837    {'A': 1}
83788     {'C': 1}
6556      {'D': 1}
106731    {'A': 1}
111533    {'B': 1}
115626    {'C': 1}
121848    {'C': 1}
108846    {'C': 1}
111652    {'C': 1}
74640     {'A': 1}
90254     {'B': 1}
73

### Use Pandas for vectorizing categorical data

In [17]:
loans_data_d = pd.get_dummies(loans_data)

loans_data_d.to_csv(r'../data/Trees/loans_data_d.csv')

In [18]:
le = LabelEncoder()

for feature in features:
    print(feature)
    loans_data[feature] = le.fit_transform(loans_data[feature])

grade
sub_grade
short_emp
emp_length_num
home_ownership
dti
purpose
term
last_delinq_none
last_major_derog_none
revol_util
total_rec_late_fee


In [19]:
loans_data.head(20)

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,2,13,1,1,3,95,0,1,1,1,107,0,-1
6,5,26,0,5,2,550,9,1,1,1,347,0,-1
7,1,9,1,1,3,1803,8,1,1,1,387,0,-1
10,2,10,1,1,3,1003,2,0,1,1,948,0,-1
12,1,6,0,4,3,701,8,0,1,1,583,0,-1
18,1,8,0,11,3,1317,2,0,1,1,934,0,-1
21,1,7,0,2,3,235,5,0,1,1,316,0,-1
23,2,11,0,10,3,1517,2,0,1,1,606,0,-1
24,3,16,0,3,3,1392,8,1,0,1,625,0,-1
41,0,4,0,11,0,1628,2,0,1,1,651,0,-1


### Split training & test sets

In [20]:
# Divide data into training and test sets
train_data, validation_data = train_test_split(loans_data_d, test_size= 0.2, random_state= 1)  #random_state==seed

print('Number of training examples is', len(train_data))
print('Number of validation examples', len(validation_data))

Number of training examples is 37040
Number of validation examples 9260


### Function to compare classification models and their accuracies

In [21]:
def classification_model(model, data, predictors, outcome):
    #Fit the model:
    model.fit(data[predictors],data[outcome])

    #Make predictions on training set:
    predictions = model.predict(data[predictors])

    #Print accuracy
    accuracy = metrics.accuracy_score(predictions,data[outcome])
    print('Accuracy:', accuracy)

    #Perform k-fold cross-validation with 5 folds
    kf = KFold(data.shape[0], n_folds=5)
    error = []
    
    for train, test in kf:
        # Filter training data
        train_predictors = (data[predictors].iloc[train,:])

        # The target we're using to train the algorithm.
        train_target = data[outcome].iloc[train]

        # Training the algorithm using the predictors and target.
        model.fit(train_predictors, train_target)

        #Record error from each cross-validation run
        error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))

    print('Cross-Validation Score:', np.mean(error))

    #Fit the model again so that it can be refered outside the function:
    mdl_output = model.fit(data[predictors],data[outcome])
    
    return mdl_output

### Logistic Regression Test

In [22]:
target = 'safe_loans'
predictor_var = features


model = LogisticRegression()

classification_model(model, loans_data, predictor_var, target)

Accuracy: 0.638315334773
Cross-Validation Score: 0.381771058315


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Decision Tree Test

In [23]:
model = DecisionTreeClassifier(max_depth= 6)
small_model = DecisionTreeClassifier(max_depth= 2)

classification_model(model, loans_data, predictor_var, target)

classification_model(small_model, loans_data, predictor_var, target)

#export_graphviz()

Accuracy: 0.636630669546
Cross-Validation Score: 0.357710583153
Accuracy: 0.612397408207
Cross-Validation Score: 0.359244060475


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

### Random Forest Test

In [24]:
model = RandomForestClassifier(n_estimators=100)

classification_model(model, loans_data, predictor_var, target)

Accuracy: 1.0
Cross-Validation Score: 0.414406047516


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Order most important variables

In [25]:
featimp = pd.Series(model.feature_importances_, index= predictor_var).sort_values(ascending= False)

featimp

dti                      0.279787
revol_util               0.270053
sub_grade                0.113627
emp_length_num           0.106010
purpose                  0.074341
grade                    0.040982
home_ownership           0.030675
total_rec_late_fee       0.026811
last_delinq_none         0.018938
term                     0.017973
last_major_derog_none    0.012730
short_emp                0.008074
dtype: float64

### Grab top n vars

In [26]:
new_features = ['sub_grade', 
                'emp_length_num',  
                'dti', 
                'revol_util',
               ]

### Logistic Regression re-test

In [27]:
target = 'safe_loans'
predictor_var = new_features

model = LogisticRegression()

classification_model(model, loans_data, predictor_var, target)

Accuracy: 0.622915766739
Cross-Validation Score: 0.355982721382


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Decision Tree re-test

In [28]:
model = DecisionTreeClassifier()

tree_output = classification_model(model, loans_data, predictor_var, target)
tree_output

Accuracy: 0.999827213823
Cross-Validation Score: 0.45252699784


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

### Random Forest re-test

In [29]:
model = RandomForestClassifier(n_estimators=100)

classification_model(model, loans_data, predictor_var, target)

Accuracy: 0.999827213823
Cross-Validation Score: 0.398963282937


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### New Feature by importance

In [30]:
featimp = pd.Series(model.feature_importances_, index= predictor_var).sort_values(ascending= False)

featimp

dti               0.375977
revol_util        0.349962
sub_grade         0.157938
emp_length_num    0.116123
dtype: float64

## Graphviz Testing - won't work until GraphViz is installed

In [31]:
#export_graphviz(tree_output, out_file= 'tree.dot')

In [32]:
#os.getcwd()

In [33]:
#dotfile = io.StringIO()
#export_graphviz(tree_output, out_file=dotfile)
#pydot.graph_from_dot_data(dotfile.getvalue()).write_png('tree_viz.png')

In [34]:
#(graph, ) = pydot.graph_from_dot_file('tree.dot')
#graph.write_png('tree_viz.png')