# Decision Trees in Practice

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw loans table from disk.
loans = pd.read_csv('data/lending-club-data.csv', low_memory=False)
# Recode the label column: bad_loans == 0 becomes +1, every other value becomes -1.
loans['safe_loans'] = loans['bad_loans'].map(lambda flag: -1 if flag != 0 else +1)
# The original label column is no longer needed once 'safe_loans' exists.
loans = loans.drop(columns=['bad_loans'])

In [None]:
# Name of the label column created in the loading cell.
target = 'safe_loans'
# Columns used as model inputs.
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]
# Keep only the input columns followed by the label column.
loans = loans[[*features, target]]

# Encoding categorical variables

In [None]:
# Identify the object-dtype columns; these are the ones that get one-hot encoded.
categorical_variables = loans.select_dtypes(include=['object']).columns.tolist()
print(f"Variables categóricas identificadas: {categorical_variables}")

from sklearn.preprocessing import OneHotEncoder
# Dense output, one indicator column per category (no category is dropped).
encoder = OneHotEncoder(sparse_output=False, drop=None)
loans_encoded_data = encoder.fit_transform(loans[categorical_variables])

# Names of the new columns, in the form "<feature>_<category>".
# Kept as a plain Python list because decision_tree_create copies it with
# features[:] and then calls .remove() on the copy.
feature_names = encoder.get_feature_names_out(categorical_variables).tolist()

# Build the encoded frame on the same index as `loans` so the concat below
# aligns row by row.
loans_encoded_df = pd.DataFrame(loans_encoded_data, columns=feature_names, index=loans.index)

# Drop the original categorical columns and append the encoded indicator columns.
loans_numeric_data = loans.drop(columns=categorical_variables)
loans_enc_data = pd.concat([loans_numeric_data, loans_encoded_df], axis=1)

Variables categóricas identificadas: ['grade', 'term', 'home_ownership', 'emp_length']


In [None]:
# Row selections for the two splits are read from JSON files on disk.
train_idx = pd.read_json('data/module-6-assignment-train-idx.json')
validation_idx = pd.read_json('data/module-6-assignment-validation-idx.json')
# Column 0 of each frame holds the row positions, which are applied with iloc.
train_data = loans_enc_data.iloc[train_idx[0].to_numpy()]
validation_data = loans_enc_data.iloc[validation_idx[0].to_numpy()]

# Early stopping methods for decision trees

## Early stopping condition 1: Maximum depth

## Early stopping condition 2: Minimum node size

In [None]:
def reached_minimum_node_size(data, min_node_size):
    """Tell whether a node holds too few data points to be split further.

    Args:
        data: Any sized collection holding the data points of the node.
        min_node_size: Threshold on the number of data points.

    Returns:
        True when ``len(data)`` is less than or equal to ``min_node_size``,
        False otherwise.
    """
    too_small = len(data) <= min_node_size
    return bool(too_small)

## Early stopping condition 3: Minimum gain in error reduction

In [None]:
def error_reduction(error_before_split, error_after_split):
    """Compute how much a split lowers the error.

    Args:
        error_before_split: Error measured before splitting.
        error_after_split: Error measured after splitting.

    Returns:
        ``error_before_split - error_after_split``; negative when the split
        increases the error.
    """
    gain = error_before_split - error_after_split
    return gain

## Grabbing binary decision tree helper functions from past assignment

In [None]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the mistakes a majority-class prediction makes on a node.

    Args:
        labels_in_node: Array-like of labels that supports boolean-mask
            indexing (e.g. a numpy array or pandas Series); entries equal to
            +1 and -1 are counted.

    Returns:
        0 for an empty node, otherwise the smaller of the +1 count and the
        -1 count (the size of the minority class).
    """
    # An empty node cannot contain any mistakes.
    if len(labels_in_node) == 0:
        return 0
    # Number of +1 labels followed by number of -1 labels.
    class_counts = [len(labels_in_node[labels_in_node == label]) for label in (1, -1)]
    # Predicting the majority class misclassifies exactly the minority class.
    return np.min(class_counts)

In [None]:
def best_splitting_feature(data, features, target):
    """Pick the feature whose 0/1 split gives the lowest classification error.

    Args:
        data: Table indexable by column name and by boolean mask (used here as
            ``data[feature]`` and ``data[mask]``).
        features: Iterable of candidate column names; each is split on the
            values 0 and 1.
        target: Name of the label column.

    Returns:
        The first feature achieving the lowest error, or None when
        ``features`` is empty.
    """
    # Float denominator so the error is computed with true division.
    total_points = float(len(data))

    # The error is a fraction, so 10 is larger than any error a split can have.
    winner, lowest_error = None, 10

    for candidate in features:
        column = data[candidate]

        # Mistakes of a majority vote in the 0-branch plus those in the 1-branch.
        mistakes = sum(
            intermediate_node_num_mistakes(data[column == value][target])
            for value in (0, 1)
        )

        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        candidate_error = mistakes / total_points

        # Strict comparison: on ties the earlier feature is kept.
        if candidate_error < lowest_error:
            winner, lowest_error = candidate, candidate_error

    return winner

In [None]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority class of its labels.

    Args:
        target_values: Array-like of labels that supports boolean-mask
            indexing; entries equal to +1 and -1 are counted.

    Returns:
        A dict with keys 'splitting_feature', 'left', 'right' (all None),
        'is_leaf' (True) and 'prediction' (1 when +1 labels strictly
        outnumber -1 labels, otherwise -1).
    """
    positives = len(target_values[target_values == 1])
    negatives = len(target_values[target_values == -1])

    # Ties (including an empty node) fall back to predicting -1.
    majority = 1 if positives > negatives else -1

    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority}

In [None]:
def decision_tree_create(data, features, target, current_depth = 0,
                         max_depth = 10, min_node_size=1,
                         min_error_reduction=0.0):
    """Recursively grow a binary decision tree with early stopping.

    Progress is printed at every node visited.

    Args:
        data: Table indexable by column name and boolean mask, holding the
            data points that reach this node.
        features: List of candidate column names; each is split on the values
            0 (left branch) and 1 (right branch). The list is copied, so the
            caller's list is not modified.
        target: Name of the label column.
        current_depth: Depth of the node being built (0 for the root).
        max_depth: A node at this depth or deeper becomes a leaf.
        min_node_size: A node with at most this many data points becomes a leaf.
        min_error_reduction: A node becomes a leaf when its best split reduces
            the classification error by this amount or less.

    Returns:
        A nested dict. Leaves come from ``create_leaf``; internal nodes have
        'is_leaf' False, 'prediction' None, the 'splitting_feature' name and
        the 'left' / 'right' subtrees.
    """
    # Work on a copy so removing a feature below does not affect the caller.
    candidate_features = features[:]

    labels = data[target]
    print("--------------------------------------------------------------------")
    print("Subtree, depth = %s (%s data points)." % (current_depth, len(labels)))

    # Stopping condition 1: a majority vote makes no mistakes at this node.
    node_mistakes = intermediate_node_num_mistakes(labels)
    if node_mistakes == 0:
        print("Stopping condition 1 reached.")
        return create_leaf(labels)

    # Stopping condition 2: no features left to split on.
    if candidate_features == []:
        print("Stopping condition 2 reached.")
        return create_leaf(labels)

    # Early stopping condition 1: depth limit reached.
    if current_depth >= max_depth:
        print("Reached maximum depth. Stopping for now.")
        return create_leaf(labels)

    # Early stopping condition 2: node holds too few data points.
    if reached_minimum_node_size(data, min_node_size):
        print("Early stopping condition 2 reached. Reached minimum node size.")
        return create_leaf(labels)

    # Choose the feature to split on and partition the data on its 0/1 values.
    chosen_feature = best_splitting_feature(data, candidate_features, target)
    zero_branch = data[data[chosen_feature] == 0]
    one_branch = data[data[chosen_feature] == 1]

    # Early stopping condition 3: the split must reduce the error by more
    # than min_error_reduction. Both errors are fractions of this node's size.
    error_before_split = node_mistakes / float(len(data))
    mistakes_after_split = (intermediate_node_num_mistakes(zero_branch[target])
                            + intermediate_node_num_mistakes(one_branch[target]))
    error_after_split = mistakes_after_split / float(len(data))

    if error_reduction(error_before_split, error_after_split) <= min_error_reduction:
        print("Early stopping condition 3 reached. Minimum error reduction.")
        return create_leaf(labels)

    # The chosen feature is not offered to the subtrees.
    candidate_features.remove(chosen_feature)
    print("Split on feature %s. (%s, %s)" % (
        chosen_feature, len(zero_branch), len(one_branch)))

    # If every data point landed on one side, nothing was separated: make a leaf.
    for branch in (zero_branch, one_branch):
        if len(branch) == len(data):
            print("Creating leaf node.")
            return create_leaf(branch[target])

    # Recurse on the left (0) branch first, then on the right (1) branch.
    left_tree, right_tree = [
        decision_tree_create(branch, candidate_features, target,
                             current_depth + 1, max_depth, min_node_size,
                             min_error_reduction)
        for branch in (zero_branch, one_branch)
    ]

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': chosen_feature,
            'left'             : left_tree,
            'right'            : right_tree}

In [None]:
def count_nodes(tree):
    """Count every node (internal nodes and leaves) of a decision tree.

    Args:
        tree: Nested dict with an 'is_leaf' flag and, for internal nodes,
            'left' and 'right' subtrees.

    Returns:
        Total number of nodes in the tree.
    """
    total = 0
    pending = [tree]
    # Explicit stack instead of recursion; every visited node counts once.
    while pending:
        node = pending.pop()
        total += 1
        if not node['is_leaf']:
            pending.extend((node['left'], node['right']))
    return total

In [None]:
# Sanity check: with max_depth = 2 the tree is expected to contain 7 nodes.
small_decision_tree = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=2,
    min_node_size=10,
    min_error_reduction=0.0,
)
if count_nodes(small_decision_tree) != 7:
    print('Test failed... try again!')
    print('Number of nodes found                :', count_nodes(small_decision_tree))
    print('Number of nodes that should be there : 7' )
else:
    print('Test passed!')

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).


Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Reached maximum depth. Stopping for now.
Test passed!


## Build a tree!

Now that your code is working, we will train a tree model on the **train_data** with
* `max_depth = 6`
* `min_node_size = 100`, 
* `min_error_reduction = 0.0`

**Warning**: This code block may take a minute to run.

In [None]:
# Tree trained with all three early stopping conditions active.
my_decision_tree_new = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=100,
    min_error_reduction=0.0,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_nan. (96, 5)
--------------------------------------------------------------------
Subtree, depth = 3 (96 data points).
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth = 3 (5 data points).
Early stopping condition 2 reached. Reached minimum node size.
-------------------------------------------

In [None]:
# Same depth limit as above, but with min_node_size = 0 and
# min_error_reduction = -1, for comparison with my_decision_tree_new.
my_decision_tree_old = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

## Making predictions

In [None]:
def classify(tree, x, annotate = False):
    """Predict the class of a single data point by walking the tree.

    Args:
        tree: Nested dict as returned by ``decision_tree_create``: leaves carry
            'is_leaf' True and a 'prediction'; internal nodes carry a
            'splitting_feature' plus 'left' and 'right' subtrees.
        x: One data point, indexable by feature name. May be a one-row
            DataFrame (e.g. ``data[0:1]``), a single row as a Series, or a
            plain mapping of feature name to value.
        annotate: When True, print each split taken and the final prediction.

    Returns:
        The 'prediction' stored in the leaf that ``x`` reaches.

    Raises:
        ValueError: If a feature lookup on ``x`` yields anything other than
            exactly one value (e.g. a multi-row DataFrame was passed).
    """
    # if the node is a leaf node.
    if tree['is_leaf']:
        if annotate:
            print("At leaf, predicting %s" % tree['prediction'])
        return tree['prediction']

    # split on feature.
    split_feature_value = x[tree['splitting_feature']]
    if annotate:
        print("Split on %s = %s" % (tree['splitting_feature'], split_feature_value))

    # Collapse the comparison to one Python bool. This works for a scalar as
    # well as for a one-element Series; .item() raises ValueError otherwise.
    goes_left = np.asarray(split_feature_value == 0).item()

    # A value of 0 goes left; any other value goes right.
    if goes_left:
        return classify(tree['left'], x, annotate)
    return classify(tree['right'], x, annotate)

In [None]:
# Classify the first validation row (passed as a one-row slice) with the early-stopping tree.
print('Predicted class: %s ' % classify(my_decision_tree_new, validation_data[0:1]))

Predicted class: -1 


In [None]:
# Same prediction as above, with annotate=True so every split taken is printed.
classify(my_decision_tree_new, validation_data[0:1], annotate = True)

Split on term_ 36 months = 24    0.0
Name: term_ 36 months, dtype: float64
Split on grade_A = 24    0.0
Name: grade_A, dtype: float64
At leaf, predicting -1


-1

In [None]:
# Path taken by the same validation row through my_decision_tree_old.
classify(my_decision_tree_old, validation_data[0:1], annotate = True)

Split on term_ 36 months = 24    0.0
Name: term_ 36 months, dtype: float64
Split on grade_A = 24    0.0
Name: grade_A, dtype: float64
Split on grade_B = 24    0.0
Name: grade_B, dtype: float64
Split on grade_C = 24    0.0
Name: grade_C, dtype: float64
Split on grade_D = 24    1.0
Name: grade_D, dtype: float64
At leaf, predicting -1


-1

## Evaluating the model

In [None]:
def evaluate_classification_error(tree, data, target):
    """Compute the fraction of rows in ``data`` that ``tree`` misclassifies.

    Args:
        tree: Decision tree (nested dict) accepted by ``classify``.
        data: DataFrame holding the feature columns used by the tree and the
            label column.
        target: Name of the label column in ``data``.

    Returns:
        Number of mismatching predictions divided by the number of rows,
        as a float.

    Raises:
        ZeroDivisionError: If ``data`` has no rows.
    """
    # Classify every row; each row is passed as a one-row slice of the frame.
    predictions = np.array([classify(tree, data[i:i+1]) for i in range(len(data))])

    # Count mismatches directly, so the result does not depend on the labels
    # being encoded as +1 / -1.
    labels = data[target].to_numpy()
    num_mistakes = int(np.sum(predictions != labels))
    return float(num_mistakes)/len(data)

In [None]:
# Validation error of the early-stopping tree; `target` is the label column name set earlier in the notebook.
evaluate_classification_error(my_decision_tree_new, validation_data, target)

0.38367083153813014

In [None]:
# Validation error of my_decision_tree_old, for comparison with the value above.
evaluate_classification_error(my_decision_tree_old, validation_data, target)

0.3837785437311504

# Exploring the effect of max_depth

We will compare three models trained with different values of the stopping criterion. We intentionally picked models at the extreme ends (**too small**, **just right**, and **too large**).

Train three models with these parameters:

1. **model_1**: max_depth = 2 (too small)
2. **model_2**: max_depth = 6 (just right)
3. **model_3**: max_depth = 14 (may be too large)

For each of these three, we set `min_node_size = 0` and `min_error_reduction = -1`.

**Note:** Each tree can take up to a few minutes to train. In particular, `model_3` will probably take the longest to train.

In [None]:
# model_1: max_depth = 2.
model_1 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=2,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data po

In [None]:
# model_2: max_depth = 6.
model_2 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [None]:
# model_3: max_depth = 14.
model_3 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=14,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
S

In [None]:
# Classification error of each max_depth model on the training split.
print("Training data, classification error (model 1):", evaluate_classification_error(model_1, train_data, target))
print("Training data, classification error (model 2):", evaluate_classification_error(model_2, train_data, target))
print("Training data, classification error (model 3):", evaluate_classification_error(model_3, train_data, target))

Training data, classification error (model 1): 0.40003761014399314
Training data, classification error (model 2): 0.38185041908446166
Training data, classification error (model 3): 0.3761820330969267


In [None]:
# Classification error of each max_depth model on the validation split.
# The labels say "Validation data" because validation_data is what is evaluated here.
print("Validation data, classification error (model 1):", evaluate_classification_error(model_1, validation_data, target))
print("Validation data, classification error (model 2):", evaluate_classification_error(model_2, validation_data, target))
print("Validation data, classification error (model 3):", evaluate_classification_error(model_3, validation_data, target))

Training data, classification error (model 1): 0.3981042654028436
Training data, classification error (model 2): 0.3837785437311504
Training data, classification error (model 3): 0.37731581214993537


### Measuring the complexity of the tree

Recall in the lecture that we talked about deeper trees being more complex. We will measure the complexity of the tree as

```
  complexity(T) = number of leaves in the tree T
```

Here, we provide a function `count_leaves` that counts the number of leaves in a tree. Using this implementation, compute the number of leaves in `model_1`, `model_2`, and `model_3`. 

In [None]:
def count_leaves(tree):
    """Count the leaf nodes of a decision tree.

    Args:
        tree: Nested dict with an 'is_leaf' flag and, for internal nodes,
            'left' and 'right' subtrees.

    Returns:
        Number of leaves in the tree (1 when ``tree`` is itself a leaf).
    """
    if not tree['is_leaf']:
        # An internal node contributes only the leaves of its two subtrees.
        return sum(count_leaves(tree[side]) for side in ('left', 'right'))
    return 1

In [None]:
# Complexity of each max_depth model, measured as its number of leaves.
# count_leaves counts leaves only, so the labels say "leaves" rather than "nodes".
print("Number of leaves (model 1):", count_leaves(model_1))
print("Number of leaves (model 2):", count_leaves(model_2))
print("Number of leaves (model 3):", count_leaves(model_3))

Number of nodes (model 1): 4
Number of nodes (model 2): 19
Number of nodes (model 3): 41


# Exploring the effect of min_error_reduction

We will compare three models trained with different values of the stopping criterion. We intentionally picked models at the extreme ends (**negative**, **just right**, and **too positive**).

Train three models with these parameters:
1. **model_4**: `min_error_reduction = -1` (ignoring this early stopping condition)
2. **model_5**: `min_error_reduction = 0` (just right)
3. **model_6**: `min_error_reduction = 5` (too positive)

For each of these three, we set `max_depth = 6`, and `min_node_size = 0`.

**Note:** Each tree can take up to 30 seconds to train.

In [None]:
# model_4: min_error_reduction = -1.
model_4 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [None]:
# model_5: min_error_reduction = 0.
model_5 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=0,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 3 reached. Minimum error reduction.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Split on feature emp_length_nan. (96, 5)
--------------------------------------------------------------------
Subtree, depth = 3 (96 data points).
Split on feature emp_length_< 1 year. (85, 11)
--------------------------------------------------------------------
Subtree, depth = 4 (85 data points).
Early stopping condition 3 reached. Minimum error reduction.
------------------------------------------------------------

In [None]:
# model_6: min_error_reduction = 5.
model_6 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=5,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Early stopping condition 3 reached. Minimum error reduction.


In [None]:
# Classification error of each min_error_reduction model on the validation split.
print("Validation data, classification error (model 4):", evaluate_classification_error(model_4, validation_data, target))
print("Validation data, classification error (model 5):", evaluate_classification_error(model_5, validation_data, target))
print("Validation data, classification error (model 6):", evaluate_classification_error(model_6, validation_data, target))

Validation data, classification error (model 4): 0.3837785437311504
Validation data, classification error (model 5): 0.3837785437311504
Validation data, classification error (model 6): 0.503446790176648


In [None]:
# Complexity of each min_error_reduction model, measured as its number of leaves.
# count_leaves counts leaves only, so the labels say "leaves" rather than "nodes".
print("Number of leaves (model 4):", count_leaves(model_4))
print("Number of leaves (model 5):", count_leaves(model_5))
print("Number of leaves (model 6):", count_leaves(model_6))

Number of nodes (model 1): 19
Number of nodes (model 2): 13
Number of nodes (model 3): 1


# Exploring the effect of min_node_size

We will compare three models trained with different values of the stopping criterion. Again, we intentionally picked models at the extreme ends (**too small**, **just right**, and **too large**).

Train three models with these parameters:
1. **model_7**: min_node_size = 0 (too small)
2. **model_8**: min_node_size = 2000 (just right)
3. **model_9**: min_node_size = 50000 (too large)

For each of these three, we set `max_depth = 6`, and `min_error_reduction = -1`.

**Note:** Each tree can take up to 30 seconds to train.

In [None]:
# model_7: min_node_size = 0.
model_7 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=0,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [None]:
# model_8: min_node_size = 2000.
model_8 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=2000,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
R

In [None]:
# NOTE: this cell repeats the model_8 training from the previous cell. It used
# to pass min_node_size = 0, which on a top-to-bottom run silently replaced
# model_8 with a tree built from the same parameters as model_7. It now uses
# the documented value of 2000, so re-running it leaves model_8 unchanged.
model_8 = decision_tree_create(train_data, feature_names, 'safe_loans',max_depth =6, min_node_size = 2000, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).


Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature grade_E. (1693, 2133)
--------------------------------------------------------------------
Subtree, depth = 6 (1693 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------


In [None]:
# model_9: min_node_size = 50000.
model_9 = decision_tree_create(
    train_data,
    feature_names,
    'safe_loans',
    max_depth=6,
    min_node_size=50000,
    min_error_reduction=-1,
)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Early stopping condition 2 reached. Reached minimum node size.


In [None]:
# Classification error of each min_node_size model on the validation split.
print("Validation data, classification error (model 7):", evaluate_classification_error(model_7, validation_data, target))
print("Validation data, classification error (model 8):", evaluate_classification_error(model_8, validation_data, target))
print("Validation data, classification error (model 9):", evaluate_classification_error(model_9, validation_data, target))

Validation data, classification error (model 7): 0.3837785437311504
Validation data, classification error (model 8): 0.38453252908229213
Validation data, classification error (model 9): 0.503446790176648


In [None]:
# Complexity of each min_node_size model, measured as its number of leaves.
# count_leaves counts leaves only, so the labels say "leaves" rather than "nodes".
print("Number of leaves (model 7):", count_leaves(model_7))
print("Number of leaves (model 8):", count_leaves(model_8))
print("Number of leaves (model 9):", count_leaves(model_9))

Number of nodes (model 1): 19
Number of nodes (model 2): 12
Number of nodes (model 3): 1
