In [1]:
import graphlab                            # see below for install instruction
import matplotlib.pyplot as plt          # plotting
import numpy as np                       # dense matrices
from scipy.sparse import csr_matrix      # sparse matrices
%matplotlib inline

In [2]:
loans = graphlab.SFrame('lending-club-data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to oaabde01@louisville.edu and will expire on September 22, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\OMARAB~1\AppData\Local\Temp\graphlab_server_1476584327.log.0


In [3]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')

In [4]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

In [7]:
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# Since there are less risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed = 1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508


## One hot encoding for categorical variables

In [8]:
#loans_data = risky_loans.append(safe_loans)


for feature in features:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)

    # Change None's to 0's
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)

    #loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)

In [9]:
train_data, validation_data = loans_data.random_split(.8, seed=1)

## Testing early stopping condition 2

In [10]:
def reached_minimum_node_size(data, min_node_size):
    # Return True if the number of data points is less than or equal to the minimum node size.
    if (len(data)<=min_node_size):
        return True
    else:
        return False

## Early stopping condition 3

In [11]:
def error_reduction(error_before_split, error_after_split):
    # Return the error before the split minus the error after the split.
    return abs(error_before_split - error_after_split)

In [12]:
def intermediate_node_num_mistakes(labels_in_node):
    # Corner case: If labels_in_node is empty, return 0
    if len(labels_in_node) == 0:
        return 0    
    # Count the number of 1's (safe loans)
    Numberofsafeloans = 0
    Numberofriskyloans = 0
    for i in range (0,len(labels_in_node)):
        if labels_in_node[i]==+1:
            Numberofsafeloans = Numberofsafeloans + 1 
        
       
    # Count the number of -1's (risky loans)
    for j in range (0,len(labels_in_node)):
        if labels_in_node[j]==-1:
            Numberofriskyloans = Numberofriskyloans + 1
        
    # Return the number of mistakes that the majority classifier makes.
    if(Numberofsafeloans>=Numberofriskyloans):
        return Numberofriskyloans
    else:
        return Numberofsafeloans

In [13]:
def best_splitting_feature(data, features, target):
    
    target_values = data[target]
    best_feature = None # Keep track of the best feature 
    best_error = 10     # Keep track of the best error so far 
    # Note: Since error is always <= 1, we should intialize it with something larger than 1.

    # Convert to float to make sure error gets computed correctly.
    num_data_points = float(len(data))  
    
    # Loop through each feature to consider splitting on that feature
    for feature in features:
        
        # The left split will have all data points where the feature value is 0
        left_split = data[data[feature] == 0]
        
        # The right split will have all data points where the feature value is 1
        ## YOUR CODE HERE
        right_split = data[data[feature] == 1]
            
        # Calculate the number of misclassified examples in the left split.
        # Remember that we implemented a function for this! (It was called intermediate_node_num_mistakes)
        # YOUR CODE HERE
        left_mistakes = intermediate_node_num_mistakes(left_split[target])            

        # Calculate the number of misclassified examples in the right split.
        ## YOUR CODE HERE
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
            
        # Compute the classification error of this split.
        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        ## YOUR CODE HERE
        error = (left_mistakes + right_mistakes) / num_data_points

        # If this is the best error we have found so far, store the feature as best_feature and the error as best_error
        ## YOUR CODE HERE
        if error < best_error:
            best_error = error
            best_feature = feature
        
    
    return best_feature # Return the best feature we found

In [14]:
def create_leaf(target_values):    
    # Create a leaf node
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf' : None,
            'prediction' : None}   ## YOUR CODE HERE 
   
    # Count the number of data points that are +1 and -1 in this node.
    num_ones = len(target_values[target_values == +1])
    num_minus_ones = len(target_values[target_values == -1])    

    # For the leaf node, set the prediction to be the majority class.
    # Store the predicted class (1 or -1) in leaf['prediction']
    if num_ones > num_minus_ones:
        leaf['prediction'] = 1          ## YOUR CODE HERE
        leaf['is_leaf'] = True
        #leaf['left'] = None
        #leaf['right'] = None
        #leaf['splitting_feature'] = num_ones
    else:
        leaf['prediction'] = -1          ## YOUR CODE HERE  
        leaf['is_leaf'] = True
        #leaf['right'] = None
        #leaf['left'] = None
        #leaf['splitting_feature'] = num_minus_ones

    # Return the leaf node
    return leaf 

In [30]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10,min_node_size=1, min_error_reduction=0.0):
    remaining_features = features[:] # Make a copy of the features.
    
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))
    

    # Stopping condition 1
    # (Check if there are mistakes at current node.
    # Recall you wrote a function intermediate_node_num_mistakes to compute this.)
    if intermediate_node_num_mistakes(target_values) == 0:  ## YOUR CODE HERE
        print "Stopping condition 1 reached."     
        # If not mistakes at current node, make current node a leaf node
        return create_leaf(target_values)
    
    # Stopping condition 2 (check if there are remaining features to consider splitting on)
    if remaining_features == 0 :   ## YOUR CODE HERE
        print "Stopping condition 2 reached."    
        # If there are no remaining features to consider, make current node a leaf node
        return create_leaf(target_values)    
    
    # Additional stopping condition (limit tree depth)
    if current_depth >= max_depth:  ## YOUR CODE HERE
        print "Reached maximum depth. Stopping for now."
        # If the max tree depth has been reached, make current node a leaf node
        return create_leaf(target_values)

    # Find the best splitting feature (recall the function best_splitting_feature implemented above)
    ## YOUR CODE HERE
    splitting_feature = best_splitting_feature(data,features,target)
    
    # Split on the best feature that we found. 
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]      ## YOUR CODE HERE
    error_before_split = intermediate_node_num_mistakes(target_values) / float(len(data))
    left_mistakes =    intermediate_node_num_mistakes(left_split[target])
    right_mistakes =   intermediate_node_num_mistakes(right_split[target]) 
    error_after_split = (left_mistakes + right_mistakes) / float(len(data))
    remaining_features.remove(splitting_feature)
    print "Split on feature %s. (%s, %s)" % (\
                      splitting_feature, len(left_split), len(right_split))
    
    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print "Creating leaf node."
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print "Creating leaf node."
        ## YOUR CODE HERE
        return create_leaf(right_split[target])
    # Create a leaf node if reached_minimum_node_size is reached
    if reached_minimum_node_size(data,min_node_size):
        print "Early stopping condition 2 reached. Reached minimum node size."
        return create_leaf(target_values)
    if abs(error_before_split - error_after_split) <= min_error_reduction:
        print "Early stopping condition 3 reached. Minimum error reduction."
        return create_leaf(target_values)
    
    # Repeat (recurse) on left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth,min_node_size, min_error_reduction)        
    ## YOUR CODE HERE
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth,min_node_size, min_error_reduction)

    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [31]:
my_decision_tree_new = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 100, min_error_reduction=0.0)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length.n/a. (96, 5)
--------------------------------------------------------------------
Subtree, depth = 3 (96 data points).
Split on feature emp_length.< 1 year. (85, 11)
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth = 3 (5 data points).
Split on feature gra

In [19]:
features = [ 'grade.A',
 'grade.B',
 'grade.C',
 'grade.D',
 'grade.E',
 'grade.F',
 'grade.G',
 'term. 36 months',
 'term. 60 months',
 'home_ownership.MORTGAGE',
 'home_ownership.OTHER',
 'home_ownership.OWN',
 'home_ownership.RENT',
 'emp_length.1 year',
 'emp_length.10+ years',
 'emp_length.2 years',
 'emp_length.3 years',
 'emp_length.4 years',
 'emp_length.5 years',
 'emp_length.6 years',
 'emp_length.7 years',
 'emp_length.8 years',
 'emp_length.9 years',
 'emp_length.< 1 year',
 'emp_length.n/a']


In [32]:
my_decision_tree_old = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6,                                 min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [33]:
def classify(tree, x, annotate = False):
    
       # if the node is a leaf node.
    if tree['is_leaf']:
        if annotate:
             print "At leaf, predicting %s" % tree['prediction']
        return tree['prediction']
    
    else:
    # split on feature.
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
             print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)

In [34]:
print validation_data[0]
print 'Predicted class: %s ' % classify(my_decision_tree_new, validation_data[0])

{'inq_last_6mths': 0L, 'emp_length': '2 years', 'emp_title': 'Frito Lay', 'last_delinq_none': 0L, 'home_ownership.OTHER': 0L, 'safe_loans': -1L, 'pub_rec': 0L, 'title': 'Other', 'emp_length.10+ years': 0L, 'earliest_cr_line': '20031001T000000', 'pymnt_plan': 'n', 'mths_since_last_major_derog': '', 'desc': '  Borrower added on 12/16/11 > I have a stable job where I will never get laid off.<br><br> Borrower added on 12/16/11 > I need the loan to pay for some recent medical expenses and I need to get my car fixed.<br>', 'term': ' 60 months', 'emp_length.4 years': 0L, 'installment': 123.65, 'total_rec_int': 719.11, 'sub_grade_num': 0.4, 'total_rec_late_fee': 0.0, 'home_ownership': 'RENT', 'delinq_2yrs': 3L, 'mths_since_last_delinq': 20L, 'emp_length.9 years': 0L, 'final_d': '20161201T000000', 'revol_util': 59.5, 'last_major_derog_none': 1L, 'zip_code': '150xx', 'total_pymnt_inv': 1609.12, 'last_pymnt_d': '20121201T000000', 'emp_length.7 years': 0L, 'recoveries': 260.96, 'grade_num': 3L, 'c

In [35]:
classify(my_decision_tree_new, validation_data[0], annotate = True)

Split on term. 36 months = 0
Split on grade.A = 0
At leaf, predicting -1


-1

In [36]:
classify(my_decision_tree_old, validation_data[0], annotate = True)

Split on term. 36 months = 0
Split on grade.A = 0
Split on grade.B = 0
Split on grade.C = 0
Split on grade.D = 1
At leaf, predicting -1


-1

In [38]:
print validation_data[0]
print 'Predicted class: %s ' % classify(my_decision_tree_new, validation_data[0])

{'inq_last_6mths': 0L, 'emp_length': '2 years', 'emp_title': 'Frito Lay', 'last_delinq_none': 0L, 'home_ownership.OTHER': 0L, 'safe_loans': -1L, 'pub_rec': 0L, 'title': 'Other', 'emp_length.10+ years': 0L, 'earliest_cr_line': '20031001T000000', 'pymnt_plan': 'n', 'mths_since_last_major_derog': '', 'desc': '  Borrower added on 12/16/11 > I have a stable job where I will never get laid off.<br><br> Borrower added on 12/16/11 > I need the loan to pay for some recent medical expenses and I need to get my car fixed.<br>', 'term': ' 60 months', 'emp_length.4 years': 0L, 'installment': 123.65, 'total_rec_int': 719.11, 'sub_grade_num': 0.4, 'total_rec_late_fee': 0.0, 'home_ownership': 'RENT', 'delinq_2yrs': 3L, 'mths_since_last_delinq': 20L, 'emp_length.9 years': 0L, 'final_d': '20161201T000000', 'revol_util': 59.5, 'last_major_derog_none': 1L, 'zip_code': '150xx', 'total_pymnt_inv': 1609.12, 'last_pymnt_d': '20121201T000000', 'emp_length.7 years': 0L, 'recoveries': 260.96, 'grade_num': 3L, 'c

In [40]:
classify(my_decision_tree_new, validation_data[0], annotate = True)

Split on term. 36 months = 0
Split on grade.A = 0
At leaf, predicting -1


-1

In [41]:
classify(my_decision_tree_old, validation_data[0], annotate = True)

Split on term. 36 months = 0
Split on grade.A = 0
Split on grade.B = 0
Split on grade.C = 0
Split on grade.D = 1
At leaf, predicting -1


-1

In [42]:
def evaluate_classification_error(tree, data):
    # Apply the classify(tree, x) to each row in your data
    prediction = data.apply(lambda x: classify(tree, x))
    mistakes=0
    valid_output = data[target]
    # Once you've made the predictions, calculate the classification error and return it
    #comparison = numpy.equal(prediction,data[target])
    for i in range (0,len(valid_output)):
        if prediction[i] != valid_output[i]:
            mistakes = mistakes + 1
            #print "Iam here"
        
    accuracy = mistakes / len(valid_output)
    print mistakes
    print len(valid_output)
    print accuracy
    return accuracy

In [43]:
evaluate_classification_error(my_decision_tree_new, validation_data)

3562
9284
0


0

In [44]:
evaluate_classification_error(my_decision_tree_old, validation_data)

3563
9284
0


0

In [45]:
model_1 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 2, 
                                min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade.D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data po

In [46]:
model_2 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [47]:
model_3 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 14, 
                                min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
S

## Evaluating model_1,model_2, and model_3

In [48]:
print "Training data, classification error (model 1):", evaluate_classification_error(model_1, train_data)
print "Training data, classification error (model 2):", evaluate_classification_error(model_2, train_data)
print "Training data, classification error (model 3):", evaluate_classification_error(model_3, train_data)

Training data, classification error (model 1): 14891
37224
0
0
Training data, classification error (model 2): 14214
37224
0
0
Training data, classification error (model 3): 14003
37224
0
0


In [49]:
print "Training data, classification error (model 1):", evaluate_classification_error(model_1, validation_data)
print "Training data, classification error (model 2):", evaluate_classification_error(model_2, validation_data)
print "Training data, classification error (model 3):", evaluate_classification_error(model_3, validation_data)

Training data, classification error (model 1): 3696
9284
0
0
Training data, classification error (model 2): 3563
9284
0
0
Training data, classification error (model 3): 3503
9284
0
0


In [50]:
def count_leaves(tree):
    if tree['is_leaf']:
        return 1
    return count_leaves(tree['left']) + count_leaves(tree['right'])

In [51]:
count_leaves(model_1)

4

In [52]:
count_leaves(model_2)

19

In [53]:
count_leaves(model_3)

41

In [54]:
model_4 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [55]:
model_5 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 0, min_error_reduction=0)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length.n/a. (96, 5)
--------------------------------------------------------------------
Subtree, depth = 3 (96 data points).
Split on feature emp_length.< 1 year. (85, 11)
--------------------------------------------------------------------
Subtree, depth = 4 (85 data points).
Split on feature grade.B. (85, 0)
Creating leaf node.
----------------------------

In [56]:
model_6 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 0, min_error_reduction=5)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
Early stopping condition 3 reached. Minimum error reduction.


In [57]:
print "Validation data, classification error (model 4):", evaluate_classification_error(model_4, validation_data)
print "Validation data, classification error (model 5):", evaluate_classification_error(model_5, validation_data)
print "Validation data, classification error (model 6):", evaluate_classification_error(model_6, validation_data)

Validation data, classification error (model 4): 3563
9284
0
0
Validation data, classification error (model 5): 3563
9284
0
0
Validation data, classification error (model 6): 4674
9284
0
0


In [58]:
count_leaves(model_4)

19

In [59]:
count_leaves(model_5)

13

In [60]:
count_leaves(model_6)

1

In [61]:
model_7 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [62]:
model_8 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 2000, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade.A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade.B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade.C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade.D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade.E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [63]:
model_9 = decision_tree_create(train_data, features, 'safe_loans', max_depth = 6, 
                                min_node_size = 50000, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term. 36 months. (9223, 28001)
Early stopping condition 2 reached. Reached minimum node size.


In [64]:
print "Validation data, classification error (model 7):", evaluate_classification_error(model_4, validation_data)
print "Validation data, classification error (model 8):", evaluate_classification_error(model_8, validation_data)
print "Validation data, classification error (model 9):", evaluate_classification_error(model_9, validation_data)

Validation data, classification error (model 7): 3563
9284
0
0
Validation data, classification error (model 8): 3570
9284
0
0
Validation data, classification error (model 9): 4674
9284
0
0


In [65]:
count_leaves(model_7)

19

In [66]:
count_leaves(model_8)

12

In [67]:
count_leaves(model_9)

1