In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [11]:
# Load the LendingClub loan records; the relative path means the CSV must sit in
# the notebook's working directory.
data = pd.read_csv('lending-club-data.csv')
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


## create target column

In [12]:
# Recode the label: bad_loans == 0 becomes +1 (safe), every other value becomes -1 (risky).
data['safe_loans'] = data['bad_loans'].map(lambda bad_flag: +1 if bad_flag == 0 else -1)
# The original flag is now redundant, so remove it from the frame in place.
del data['bad_loans']

In [13]:
# Label column created from bad_loans: +1 marks a safe loan, -1 a risky one.
target = 'safe_loans'

# Input columns used to grow the trees.
features = [
    'grade',            # grade of the loan
    'term',             # the term of the loan
    'home_ownership',   # home_ownership status: own, mortgage or rent
    'emp_length',       # number of years of employment
]

# Narrow the frame to the model inputs followed by the label.
data = data.loc[:, [*features, target]]

In [14]:
# Load the predefined row positions for the training split.
with open('module-6-assignment-train-idx.json','r') as f:
    train_idx = json.load(f)
# Load the row positions of the validation split (kept under the name `test_idx`).
with open('module-6-assignment-validation-idx.json','r') as f:
    test_idx = json.load(f)

In [15]:
# Select rows by position; note that `test` holds the validation split.
train = data.iloc[train_idx]
test = data.iloc[test_idx]
# Report (rows, columns) of each split as a quick sanity check.
print(train.shape)
print(test.shape)

(37224, 5)
(9284, 5)


## One hot encoding

In [20]:
# One-hot encode the categorical columns into one indicator column per category.
df = pd.get_dummies(data)
train_ = pd.get_dummies(train)
# The validation split is encoded on its own, so a category that is absent from
# those rows would leave an indicator column missing and break any later lookup
# of that feature. Align the columns to the training layout: missing indicators
# are filled with 0 and columns unknown to the training data are dropped.
test_ = pd.get_dummies(test).reindex(columns=train_.columns, fill_value=0)

In [21]:
# Shape of the one-hot encoded training frame (indicator columns plus the target).
train_.shape

(37224, 25)

## Early stopping methods
    -> Max depth
    -> Min node size
    -> Gain in error reduction

In [22]:
## Condition 2
def reached_minimum_node_size(data, min_node_size):
    """Early-stopping check: report whether a node is too small to split.

    Parameters
    ----------
    data : sized collection
        Data points that reached the node; only ``len(data)`` is used.
    min_node_size : int
        Largest node size that is still considered too small to split.

    Returns
    -------
    bool
        True when the node holds ``min_node_size`` points or fewer.
    """
    node_size = len(data)
    return True if node_size <= min_node_size else False

In [23]:
## Condition 3
def error_reduction(error_before_split, error_after_split):
    """Return how much a split lowers the classification error.

    The result is ``error_before_split - error_after_split``; a positive value
    means the split reduced the error.
    """
    return error_before_split - error_after_split

In [24]:
## find mistakes
def intermediate_node_num_mistakes(label):
    """Count the points a majority-class prediction would misclassify.

    Parameters
    ----------
    label : array-like
        Labels (+1 / -1) of the data points in a node. A pandas Series, a numpy
        array, or a plain list/tuple are all accepted.

    Returns
    -------
    int
        Size of the minority class, i.e. the number of mistakes made when the
        node predicts its majority class. An empty node yields 0.
    """
    if len(label) == 0:
        return 0

    # np.asarray makes the element-wise comparison work for plain sequences too.
    labels = np.asarray(label)
    pos = (labels == +1).sum()
    neg = (labels == -1).sum()

    # Predicting the majority class gets exactly the minority-class points wrong.
    return min(pos, neg)

In [25]:
def best_splitting_feature(data, features, target):
    """Pick the binary feature whose split gives the lowest classification error.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reached the current node.
    features : iterable of str
        Candidate column names; rows with value 0 go left and rows with value 1
        go right.
    target : str
        Name of the label column (+1 / -1).

    Returns
    -------
    str or None
        The first feature that attains the smallest error, or None when there
        are no candidate features or no rows.
    """
    num_data_points = float(len(data))
    if num_data_points == 0:
        # No rows: every split error would be 0/0, so there is nothing to choose.
        return None

    target_values = data[target]
    best_feature = None
    best_error = float('inf')  # any real error (at most 1.0) beats this sentinel

    for feature in features:
        feature_values = data[feature]

        # Only the labels are needed to count mistakes, so mask the label
        # column rather than copying the whole frame for each side.
        left_mistakes = intermediate_node_num_mistakes(target_values[feature_values == 0])
        right_mistakes = intermediate_node_num_mistakes(target_values[feature_values == 1])

        error = (left_mistakes + right_mistakes) / num_data_points

        # Strict '<' keeps the earliest feature when several tie.
        if error < best_error:
            best_error = error
            best_feature = feature

    return best_feature

Each node of the tree is stored as a dictionary with the following keys:

    { 
       'is_leaf'            : True/False.
       'prediction'         : Prediction at the leaf node.
       'left'               : (dictionary corresponding to the left tree).
       'right'              : (dictionary corresponding to the right tree).
       'splitting_feature'  : The feature that this node splits on
    }

In [26]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of ``target_values``.

    Parameters
    ----------
    target_values : pandas.Series or numpy.ndarray
        Labels (+1 / -1) of the data points that reached the leaf.

    Returns
    -------
    dict
        A node dictionary with ``is_leaf`` set to True and no children.
        ``prediction`` is 1 only when +1 labels strictly outnumber -1 labels;
        ties and empty input give -1.
    """
    num_positive = int((target_values == +1).sum())
    num_negative = int((target_values == -1).sum())

    majority_class = 1 if num_positive > num_negative else -1

    return {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True,
            'prediction': majority_class}

In [38]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10, min_node_size=1, min_error_reduction=0.0, verbose=True):
    """Recursively grow a binary decision tree with early stopping.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reached the current node.
    features : iterable of str
        Candidate binary (0/1) feature columns. The argument is copied, so the
        caller's collection is never modified.
    target : str
        Name of the label column (+1 / -1).
    current_depth : int
        Depth of the node being built (0 for the root).
    max_depth : int
        Nodes at this depth or deeper become leaves.
    min_node_size : int
        Nodes holding this many points or fewer become leaves.
    min_error_reduction : float
        A split is rejected when it lowers the classification error by this
        amount or less.
    verbose : bool
        When True (the default) progress messages are printed for every node;
        pass False to grow the tree silently.

    Returns
    -------
    dict
        A node dictionary with keys 'is_leaf', 'prediction',
        'splitting_feature', 'left' and 'right'.
    """
    # Route all progress messages through one switch so callers can silence them.
    log = print if verbose else (lambda *args, **kwargs: None)

    # Private copy: features are removed from it as the tree grows.
    remaining_features = list(features)

    target_values = data[target]

    log("--------------------------------------------------------------------")
    log("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

    # Stopping condition 1: the node is pure (this also covers an empty node).
    mistakes = intermediate_node_num_mistakes(target_values)
    if mistakes == 0:
        log("Stopping condition 1 reached.")
        return create_leaf(target_values)

    # Stopping condition 2: no features left to split on.
    if not remaining_features:
        log("Stopping condition 2 reached.")
        return create_leaf(target_values)

    # Early stopping condition 1: depth limit.
    if current_depth >= max_depth:
        log("Reached maximum depth. Stopping for now.")
        return create_leaf(target_values)

    # Early stopping condition 2: node too small.
    if reached_minimum_node_size(data, min_node_size):
        log("Early stopping condition 2 reached. Reached minimum node size.")
        return create_leaf(target_values)

    # Find the best splitting feature and split on it.
    best_feature = best_splitting_feature(data, remaining_features, target)

    left_split = data[data[best_feature] == 0]
    right_split = data[data[best_feature] == 1]

    # Early stopping condition 3: the split must reduce the error enough.
    # The node's own mistake count was already computed above, so reuse it.
    error_before_split = mistakes / float(len(data))

    left_mistakes = intermediate_node_num_mistakes(left_split[target])
    right_mistakes = intermediate_node_num_mistakes(right_split[target])
    error_after_split = (left_mistakes + right_mistakes) / float(len(data))

    if error_reduction(error_before_split, error_after_split) <= min_error_reduction:
        log("Early stopping condition 3 reached. Minimum error reduction.")
        return create_leaf(target_values)

    remaining_features.remove(best_feature)

    log("Split on feature %s. (%s, %s)" % (best_feature, len(left_split), len(right_split)))

    # NOTE(review): with a negative min_error_reduction a split may leave one
    # side empty; that side then becomes a leaf through stopping condition 1,
    # and create_leaf predicts -1 for an empty node.
    left_tree = decision_tree_create(left_split, remaining_features, target,
                                     current_depth + 1, max_depth, min_node_size,
                                     min_error_reduction, verbose)
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth + 1, max_depth, min_node_size,
                                      min_error_reduction, verbose)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': best_feature,
            'left'             : left_tree,
            'right'            : right_tree}

## Build a tree

In [39]:
# Hyper-parameters for the early-stopped tree built in the next cell.
curr_depth = 0
max_depth = 6
min_node_size = 100
min_error_reduction = 0.0
# From here on `features` names the one-hot encoded columns (every column of
# train_ except the target), replacing the raw column names assigned earlier.
features = list(train_.columns)
features.remove(target)
print(len(features))

24


In [41]:
# Tree grown with all three early-stopping rules active (values set in the previous cell).
my_tree_new = decision_tree_create(train_, features, target,curr_depth , max_depth, min_node_size, min_error_reduction)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_< 1 year. (90, 11)
--------------------------------------------------------------------
Subtree, depth = 3 (90 data points).
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth = 3 (11 data points).
Early stopping condition 2 reached. Reached minimum node size.
------------------------------------

In [44]:
# Baseline tree: min_node_size=0 and min_error_reduction=-1 effectively switch
# off early-stopping rules 2 and 3, leaving only the depth limit.
my_tree_old = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

--------------------------------------------------------------------
Subtree, depth = 6 (2 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 3 (1276 data points).
Split on feature grade_A. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (1276 data points).
Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached.
--------------

## Predictions

In [46]:
def classify(tree, x, annotate = False):
    """Predict the class of one data point by walking the tree to a leaf.

    Parameters
    ----------
    tree : dict
        Root node of a tree (a node dictionary with 'is_leaf', 'prediction',
        'splitting_feature', 'left' and 'right').
    x : mapping
        The data point; indexed by feature name (e.g. a DataFrame row or dict).
    annotate : bool
        When True, print every split taken and the final prediction.

    Returns
    -------
    The 'prediction' stored in the leaf that ``x`` ends up in.
    """
    node = tree

    # Descend until a leaf is reached: value 0 goes left, anything else right.
    while not node['is_leaf']:
        feature_name = node['splitting_feature']
        feature_value = x[feature_name]

        if annotate:
            print("Split on %s = %s" % (feature_name, feature_value))

        node = node['left'] if feature_value == 0 else node['right']

    if annotate:
        print("At leaf, predicting %s" % node['prediction'])
    return node['prediction']

In [49]:
# Compare the true label of the first validation row with the tree's prediction.
print(test_.iloc[0][target])
print('Predicted class: %s ' % classify(my_tree_new, test_.iloc[0]))

-1
Predicted class: -1 


In [50]:
# Trace the path the first validation row takes through the early-stopped tree.
classify(my_tree_new, test_.iloc[0], annotate=True)

Split on term_ 36 months = 0
Split on grade_A = 0
At leaf, predicting -1


-1

In [51]:
# Same row through the baseline tree, for comparison of the path taken.
classify(my_tree_old, test_.iloc[0], annotate=True)

Split on term_ 36 months = 0
Split on grade_A = 0
Split on grade_B = 0
Split on grade_C = 0
Split on grade_D = 1
Split on grade_E = 0
At leaf, predicting -1


-1

## Evaluate decision tree

In [58]:
def evaluate_classification_error(tree, data, target_column=None):
    """Return the fraction of rows in ``data`` that ``tree`` misclassifies.

    Parameters
    ----------
    tree : dict
        Root node of a tree as produced by ``decision_tree_create``.
    data : pandas.DataFrame
        Rows to evaluate; must contain the feature columns the tree splits on
        and the label column.
    target_column : str, optional
        Name of the label column. When omitted, the notebook-level ``target``
        variable is used, which keeps existing two-argument calls working.

    Returns
    -------
    float
        Number of wrong predictions divided by the number of rows.
    """
    if target_column is None:
        # Backward-compatible default: fall back to the global label name.
        target_column = target

    # Classify plain dict records; this avoids building a Series per row as
    # data.iloc[i] does, while classify() still indexes by feature name.
    predictions = [classify(tree, row) for row in data.to_dict('records')]

    mistakes = (np.asarray(predictions) != data[target_column].to_numpy()).sum()
    error = mistakes / float(len(data))

    return error

In [59]:
# Validation error of the early-stopped tree.
evaluate_classification_error(my_tree_new, test_)

0.37774666092201636

In [60]:
# Validation error of the baseline tree.
evaluate_classification_error(my_tree_old, test_)

0.37774666092201636

## Effect of max depth

In [53]:
# Vary only max_depth (2, 6, 14); node-size and error-reduction stopping are
# effectively disabled via min_node_size=0 and min_error_reduction=-1.
model1 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 2, min_node_size = 0, min_error_reduction=-1)
model2 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 0, min_error_reduction=-1)
model3 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 14, min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data po

Split on feature grade_A. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (347 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (11 data points).
Split on feature home_ownership_OWN. (9, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (9 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (2 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 3 (1276 data points).
Split on feature grade_A. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (1276 data points).

Split on feature home_ownership_RENT. (5, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (5 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 8 (1353 data points).
Split on feature grade_G. (1353, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (1353 data points).
Split on feature term_ 60 months. (0, 1353)
--------------------------------------------------------------------
Subtree, depth = 10 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 10 (1353 data poin

Split on feature grade_F. (2058, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (2058 data points).
Split on feature grade_G. (2058, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (2058 data points).
Split on feature term_ 60 months. (0, 2058)
--------------------------------------------------------------------
Subtree, depth = 9 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 9 (2058 data points).
Split on feature home_ownership_MORTGAGE. (923, 1135)
--------------------------------------------------------------------
Subtree, depth = 10 (923 data points).
Split on feature home_ownership_OTHER. (922, 1)
--------------------------------------------------------------------
Subtree, depth = 11 (922 data points).
Split on feature home_ownership_OWN. (762, 160)
-----------------------------------------------------------

Split on feature home_ownership_OWN. (1313, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (1313 data points).
Split on feature home_ownership_RENT. (1313, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (1313 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 11 (74 data points).
Split on feature home_ownership_OTHER. (74, 0)
--------------------------------------------------------------------
Subtree, depth 

Split on feature grade_F. (34, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (34 data points).
Split on feature grade_G. (34, 0)
--------------------------------------------------------------------
Subtree, depth = 10 (34 data points).
Split on feature term_ 60 months. (0, 34)
--------------------------------------------------------------------
Subtree, depth = 11 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 11 (34 data points).
Split on feature home_ownership_OTHER. (34, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (34 data points).
Split on feature home_ownership_OWN. (25, 9)
--------------------------------------------------------------------
Subtree, depth = 13 (25 data points).
Split on feature home_ownership_RENT. (0, 25)
--------------------------------------------------------------------
Subtree, dept

Split on feature grade_G. (11, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (11 data points).
Split on feature term_ 60 months. (0, 11)
--------------------------------------------------------------------
Subtree, depth = 10 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 10 (11 data points).
Split on feature home_ownership_MORTGAGE. (8, 3)
--------------------------------------------------------------------
Subtree, depth = 11 (8 data points).
Split on feature home_ownership_OTHER. (8, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (8 data points).
Split on feature home_ownership_OWN. (6, 2)
--------------------------------------------------------------------
Subtree, depth = 13 (6 data points).
Split on feature home_ownership_RENT. (0, 6)
--------------------------------------------------------------------
Subt

Split on feature emp_length_< 1 year. (13, 1)
--------------------------------------------------------------------
Subtree, depth = 12 (13 data points).
Split on feature grade_B. (0, 13)
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (13 data points).
Split on feature term_ 60 months. (13, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (13 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (1 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 11 (1 data points).
St

Split on feature home_ownership_OWN. (5, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (5 data points).
Split on feature home_ownership_RENT. (5, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (5 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 9 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 8 (0 data points).
Stopping c

Split on feature grade_G. (11, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (11 data points).
Split on feature term_ 60 months. (11, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (11 data points).
Split on feature home_ownership_OWN. (11, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (11 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 11 (0 data points).
Sto

Split on feature term_ 60 months. (230, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (230 data points).
Split on feature home_ownership_OTHER. (230, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (230 data points).
Split on feature home_ownership_OWN. (230, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (230 data points).
Split on feature home_ownership_RENT. (230, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (230 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Su

Split on feature grade_A. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (1276 data points).
Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Split on feature grade_F. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (1276 data points).
Split on feature grade_G. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (1276 data points).
Split on feature term_ 60 months. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (1276 data points).
Split on feature home_ownership_MORTGAGE. (855, 421)
--------------------------------------------------------------------
Subtre

Split on feature home_ownership_RENT. (404, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (404 data points).
Split on feature emp_length_1 year. (374, 30)
--------------------------------------------------------------------
Subtree, depth = 14 (374 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 14 (30 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 11 (10 data points).
Split on feature home_ownership_OWN. (10, 0)
--------------------------------------------------------------------
Subtree, depth = 12 (10 data points).
Split on feature home_ownership_RENT. (10, 0)
-------------------------------------------------------------

In [55]:
# Training error of the three depth variants.
print("Training data, classification error (model 1):", evaluate_classification_error(model1, train_) )
print("Training data, classification error (model 2):", evaluate_classification_error(model2, train_) )
print("Training data, classification error (model 3):", evaluate_classification_error(model3, train_) )

Training data, classification error (model 1): 0.40003761014399314
Training data, classification error (model 2): 0.3804266064904363
Training data, classification error (model 3): 0.3772566086395874


In [56]:
# Validation error of the same three models, to compare against the training error.
print("Validation data, classification error (model 1):", evaluate_classification_error(model1, test_) )
print("Validation data, classification error (model 2):", evaluate_classification_error(model2, test_) )
print("Validation data, classification error (model 3):", evaluate_classification_error(model3, test_) )

Validation data, classification error (model 1): 0.3981042654028436
Validation data, classification error (model 2): 0.37774666092201636
Validation data, classification error (model 3): 0.38140887548470487


## Complexity of tree

In [61]:
##   complexity(T) = number of leaves in the tree T

In [62]:
def count_leaves(tree):
    """Return the number of leaf nodes in ``tree`` (its complexity measure).

    ``tree`` is a node dictionary; internal nodes carry their children under
    the 'left' and 'right' keys and leaves have 'is_leaf' set to a true value.
    """
    leaf_total = 0
    pending = [tree]  # nodes still to be visited

    while pending:
        node = pending.pop()
        if node['is_leaf']:
            leaf_total += 1
        else:
            pending.append(node['left'])
            pending.append(node['right'])

    return leaf_total

In [63]:
# Complexity (leaf count) of the three depth variants.
print( count_leaves(model1) )
print( count_leaves(model2) )
print( count_leaves(model3) )

4
39
341


## Effect of min_error

In [68]:
# Vary only min_error_reduction (-1, 0, 5) at fixed max_depth=6 and min_node_size=0.
# The reduction is a difference of two error fractions, so a threshold of 5 can
# never be exceeded and model6 cannot split at all.
model4 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 0, min_error_reduction=-1)
model5 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 0, min_error_reduction=0)
model6 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 0, min_error_reduction=5)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

Split on feature home_ownership_OWN. (9, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (9 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (2 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 3 (1276 data points).
Split on feature grade_A. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (1276 data points).
Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 dat

In [69]:
# Validation error of the min_error_reduction variants.
print("Validation data, classification error (model 4):", evaluate_classification_error(model4, test_) )
print("Validation data, classification error (model 5):", evaluate_classification_error(model5, test_) )
print("Validation data, classification error (model 6):", evaluate_classification_error(model6, test_) )

Validation data, classification error (model 4): 0.37774666092201636
Validation data, classification error (model 5): 0.37774666092201636
Validation data, classification error (model 6): 0.503446790176648


In [70]:
# Complexity (leaf count) of the min_error_reduction variants.
print( count_leaves(model4) )
print( count_leaves(model5) )
print( count_leaves(model6) )

39
12
1


## Effect of min_node_size

In [71]:
# Vary only min_node_size (0, 2000, 50000) at fixed max_depth=6 and
# min_error_reduction=-1. 50000 exceeds the 37224 training rows, so model9
# stops at the root.
model7 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 0, min_error_reduction=-1)
model8 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 2000, min_error_reduction=-1)
model9 = decision_tree_create(train_,features,target,current_depth=0, max_depth = 6, min_node_size = 50000, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on feature grade_A. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data points).
Split on featu

In [72]:
# Validation error of the min_node_size variants.
print("Validation data, classification error (model 7):", evaluate_classification_error(model7, test_) )
print("Validation data, classification error (model 8):", evaluate_classification_error(model8, test_) )
print("Validation data, classification error (model 9):", evaluate_classification_error(model9, test_) )

Validation data, classification error (model 7): 0.37774666092201636
Validation data, classification error (model 8): 0.3774235243429556
Validation data, classification error (model 9): 0.503446790176648


In [73]:
# Complexity (leaf count) of the min_node_size variants.
print( count_leaves(model7) )
print( count_leaves(model8) )
print( count_leaves(model9) )

39
20
1
