In [67]:

# Python modules
import numpy as np
import pandas as pd
import random
import sys
import time
import math
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import sys

random.seed(1234)

In [68]:
######################### Data helper functions #############################
def load_data(filename):
    ''' Returns a dataframe (df) containing the data in filename. You should
        specify the full path plus the name of the file in filename, relative
        to where you are running code
    '''
    df = pd.read_csv(filename)
    return df

def sigmoid(z):
    ''' Sigmoid function '''
    return 1 / (1 + np.exp(-z))

def sigmoid_grad(z):
    ''' Derivative of sigmoid function '''
    sig_z = sigmoid(z)
    return sig_z * (1-sig_z)

def split_data(df, train_proportion):
    ''' Inputs
            * df: dataframe containing data
            * train_proportion: proportion of data in df that will be used for
                training. 1-train_proportion is proportion of data to be used
                for testing
        Output
            * train_df: dataframe containing training data
            * test_df: dataframe containing testing data
    '''
    # Make sure there are row numbers
    df = df.reset_index(drop=True)

    # Reorder examples and split data according to train proportion
    train = df.sample(frac=train_proportion, axis=0)
    test = df.drop(index=train.index)
    return train, test

def divide_k_folds(df, num_folds):
    ''' Inputs
            * df: dataframe containing data
            * num_folds: number of folds
        Output
            * folds: lists of folds, each fold is subset of df dataframe
    '''
    folds = []
    for subset in np.array_split(df, num_folds):
        folds.append(subset)

    return folds

def to_numpy(df):
    a = df.to_numpy()
    return a.T

def get_X_y_data(df, features, target):
    ''' Split dataframe into X and y numpy arrays '''
    X_df = df.loc[:, df.columns != target]
    Y_df = df[target]
    # X = to_numpy(X_df)
    # Y = to_numpy(Y_df)
    return X_df, Y_df

In [69]:
class Node:
    ''' Class for nodes in decision tree '''
    def __init__(self, feature_name='', feature_value=None):
        self.feature_name = feature_name # head if dummy head node, otherwise is feature or label value
        self.feature_value = feature_value # head if dummy head node, otherwise is feature or label value
        self.children = None # list of nodes

def print_tree(node, i):
    ''' Helper function that prints decision tree.
        Inputs:
            * node: root node of decision tree to print
            * i: number of tabs to offset each layer of decision tree
                 when printing.
    '''
    print("hi")
    if node.children is None:
        print("two")
        tabs = i * '\t'
        print(tabs + str(node.feature_value))
    else:
        print("one")
        tabs = i * '\t'
        print(tabs + str(node.feature_name) + ':' + str(node.feature_value))
        for child in node.children:
            print_tree(child, i + 1)

"""
HELPER FUNCTIONS
"""

def get_children_from_fvals(df, f):
    ''' Inputs:
            * df: dataframe containing data
            * f: name of current feature being considered
        Output:
            * List of child nodes from feature values for f
    '''
    fvals = df[f].unique()
    return [Node(f, v) for v in fvals]

def get_df_num_rows(df):
    ''' Returns number of rows in dataframe '''
    return len(df)

def get_df_subset(df, f, fval):
    ''' Inputs:
            * df: dataframe containing data
            * f: name of current feature being considered
            * fval: one value that f can take on
        Output:
            * Dataframe comprising rows of df for which f's value is fval
    '''
    df_fval = df[ df[f] == fval ]
    num_fval = get_df_num_rows(df_fval)
    return df_fval, num_fval

def get_df_num_labels(df, label):
    ''' Inputs:
            * df: dataframe containing data
            * label: column name in df to use as label
        Output:
            * Dictionary, where keys are label value in df_val and values the
            number of rows in df with that label
    '''
    num_label = df[label].value_counts().to_dict()
    return num_label

def information_gain(df, features, label):
    ''' Inputs:
            * df: dataframe containing data
            * features: current features to consider
            * label: column name in df to use as label
        Output:
            * Feature name on which to split, i.e., feature with the maximum
              information gain
    '''

    # How many rows in dataframe are there? Each row is an instance
    num_instances = get_df_num_rows(df)
    min_entropy = None
    split_on = None

    # Iterate through each feature/category to determine
    # which gives maximum information gain
    for f in features:
        sum_entropy = 0

        # Get all values of the current feature to split the dataset
        for fval in df[f].unique():

            # Get rows of dataframe for which f's value is fval
            df_fval, num_fval = get_df_subset(df, f, fval)

            # Get counts for each possible label value in df_val
            num_label_fval = get_df_num_labels(df_fval, label)

            # Calculate entropy
            entropy = 0
            for _, num_label in num_label_fval.items():
                prob = num_label/num_fval
                entropy += - (prob)*np.log2(prob)
            sum_entropy += (num_fval/num_instances) * entropy

        # Get feature with minimum entropy sum
        # because this feaure has the maximum info gain
        if min_entropy is None or sum_entropy < min_entropy:
            min_entropy = sum_entropy
            split_on = f

    return split_on

"""

ID3 Algorithm

"""

def get_ID3_num_correct(df, parent, label):
    ''' Inputs:
            * df: dataframe containing data
            * node: root node in decision tree
        Output
            * num_correct: number of correct predictions made on data in df by tree
    '''
    if parent.children is None:
        return len(df)

    num_correct = 0
    for child in parent.children:
        subset = df.copy()
        subset = subset[subset[child.feature_name] == child.feature_value]
        num_correct += get_ID3_num_correct(subset, child, label)
    return num_correct

def get_ID3_accuracy(df, dtree, features, label):
    num_correct = get_ID3_num_correct(df, dtree, label)
    return num_correct / len(df)

root_node = Node('root')

def ID3_build_tree(df, features, label, parent, max_depth):
    ''' Inputs
            * df: dataframe containing data
            * features: current features to consider
            * label: column name in df to use as label
            * parent: of type Node
        Output
            * a decision tree
    '''

    #####
    # Todo: update this function to stop once max depth hit
    #####
    depth_count = max_depth
    # if all features have been used, return most popular label
    # if there is only one label, also return
    if len(features) == 0 or len( df[label].unique() ) == 1 or depth_count == 0:
        # Get most frequent label using mode
        leaf = df[label].mode()[0]

        # Set children to be leaf node containing most frequent label
        parent.children = [Node(label, leaf)]

    # Otherwise, continue to recurse down the tree
    else:

        # Get feature with the most info gain
        feature_to_split_on = information_gain(df, features, label)

        # Values from that feature become the children of the previous node
        parent.children = get_children_from_fvals(df, feature_to_split_on)

        # Remove the feature we just split on so we don't try to split on it again
        new_features = features.copy()
        new_features.remove(feature_to_split_on)

        # Recursively call build_tree on each child node
        for child in parent.children:
            ID3_build_tree(df[ df[feature_to_split_on] == child.feature_value ],
                    new_features, label, child,max_depth-1)
        depth_count -= 1

def ID3_decision_tree(df, features, label, max_depth=5, random_subspace = None):
    ''' Inputs
            * df: dataframe containing data
            * features: list of current features to consider
            * label: column name in df to use as label
            * max_depth: max depth of tree
            * random_subspace: number of random features to be chosen
        Output
            * dtree: root node of trained decision tree
    '''
    # get the number of columns of the dataframe
    _, n_col = df.shape
    # get the indeces of the columns excluding target column
    n_col_indeces = list(range(n_col-1)) 
    # check if random
    if random_subspace != None:
      n_col_indeces = random.sample(population = n_col_indeces, k = random_subspace)
    # initialize empty list of random features
    random_features = []
    # append randomly chosen features to the random_features list
    for index in n_col_indeces:
      random_features.append(features[index])
    # initialize root node
    dtree = Node('root', '')
    # build tree using random features
    ID3_build_tree(df, random_features, label, dtree, max_depth)
    return dtree

def ID3_regular(df, features, label, max_depth): #ID3 decision tree algo from class
    ''' Inputs
            * df: dataframe containing data
            * features: current features to consider
            * label: column name in df to use as label
        Output
            * dtree: root node of trained decision tree
    '''
    dtree = Node('root', '')
    ID3_build_tree(df, features, label, dtree, max_depth)
    #print_tree(dtree, 0)
    return dtree

def ID3_cross_validation(df, num_folds, features, label, tree_depths = 100): #cross validation from class
    ''' Inputs
            * df: dataframe containing data
            * num_folds: number of folds (k) for cross-validation
            * features: featurs to use (list of names)
            * label: column name in df to use as label
        Output
            * train_accuracy: dataframe containing training data
            * test_accuracy: dataframe containing testing data
    '''

    #####
    # Todo: Implement cross-validation and train tree for different depths
    ####

    train, test = split_data(df,0.70)

    folds = divide_k_folds(train, num_folds)

    best_accuracy = -9999999
    best_depth = None

    depth_acc = []

    for depth in tree_depths:

        accuracies= []
        for i in range(0,num_folds):
            #removes validation subset from df
            validation = folds[i]
            exc_val = train.merge(validation, how='left', indicator=True)
            exc_val = exc_val[exc_val['_merge'] == 'left_only']

            # model is trained with remaining
            dtree = ID3_regular(exc_val, features, label, depth)

            # gets accuracy of the validation set
            accuracy = get_ID3_accuracy(validation, dtree, features, label)
            accuracies.append(accuracy)

        mean_accuracy = sum(accuracies)/len(accuracies)
        if mean_accuracy > best_accuracy:
           best_accuracy = mean_accuracy
           best_depth = depth

        depth_acc.append([depth, mean_accuracy])

        # print('depth ' + str(depth) + ' has ' + str(mean_accuracy) + " accuracy")
        # print('best depth so far is ' + str(best_depth) + " with accuracy of " + str(best_accuracy))

    depth_accuracy_df = pd.DataFrame(depth_acc, columns = ['Depth', 'Mean Accuracy'])

    dtree = ID3_decision_tree(train, features, label, best_depth)
    test_accuracy = get_ID3_accuracy(test, dtree, features, label)

    return best_depth, depth_accuracy_df, test_accuracy



In [70]:
# CODE FOR FINDING ONE PREDICTION
def ID3_decision_tree_prediction(tree, example):
  if tree.children is None:
    return tree.feature_value
  else:
    for child in tree.children:
      fval = example[child.feature_name]
      if child.feature_value == fval:
        return ID3_decision_tree_prediction(child,example)
      elif child.children is None:
        return ID3_decision_tree_prediction(child,example)

# CODE FOR FINING ALL PREDICTIONS FOR ONE TREE
def ID3_decision_tree_all(tree, df):
  # get number of examples
  num_examples = df.shape[0]
  # initialize empty list of predictions
  predictions = []
  # loop through all of the examples
  for example in range(num_examples):
    # get prediction for one example
    pred = ID3_decision_tree_prediction(tree,df.iloc[example])
    # append to list of predictions
    predictions.append(pred)
  return predictions

def get_random_forest_predictions(df, forest):
  # initalize dictionary for predictions
  random_forest_predictions = {}
  # loop through every forest
  for i in range(len(forest)):
    # make the column names
    col_name = "dtree_{}".format(i)
    # get predictions for each example for one decision tree
    pred = ID3_decision_tree_all(forest[i],df)
    # add the prediction to the dictionary
    random_forest_predictions[col_name] = pred
    # make dictionary a dataframe
    pred_df = pd.DataFrame(random_forest_predictions)
    # get the mode of each row
    prediction = pred_df.mode(axis=1,dropna=True)[0]
    # replace nan values with 0 (this is a precaution because it caused serious problems)
    prediction = prediction.replace(np.nan,0)
  return prediction

In [71]:
def bootstrapping(df, num_df):
    """
    gets random samples with replacement and from a dataframe and creates a 
    new sub dataframe
    """
    for i in range(num_df):
        indices = np.random.randint(0, len(df), size=num_df)
        df_bootstrapped = df.iloc[indices]
    return df_bootstrapped

def random_forest_algorithm(train_df, features, label, n_trees, n_bootstrap, n_features, max_depth):
    """ 
    Takes a train data frame and returns a list of trees generated from bootstrapped data.
    The length of the list depends on the number of classifiers (n_trees)
    Outputs:
    * a list of decision trees (forest)
    """
    forest = []
    for tree in range(n_trees):
        df_bootstrapped = bootstrapping(train_df, n_bootstrap)
        tree_id3 = ID3_decision_tree(df_bootstrapped, features, label, max_depth,n_features)
        forest.append(tree_id3)
    return forest

In [72]:

def get_best_hyperparameters(train_data, test_data, features, label, n_trees, n_bootstrap, n_features, max_depth):
 
  best_depth = None
  best_accuracy = -999999999

  for depth in max_depth:
      # get the forest
      forest = random_forest_algorithm(train_data, features, label, n_trees, n_bootstrap, n_features, depth)
      # get final predictions for train
      predictions_train = get_random_forest_predictions(train_data, forest)
      # get final predictions for test
      predictions_test = get_random_forest_predictions(test_data, forest)
      #get accuracy 
      accuracy_train = accuracy_score(train_data[label],predictions_train)
      accuracy_test = accuracy_score(test_data[label],predictions_test)
      print('max_depth:', depth, ', \t train_accuracy: ', accuracy_train,  ', \t test_accuracy:', accuracy_test)

      if accuracy_train >= best_accuracy:
          best_accuracy = accuracy_train
          best_depth = depth

  return best_depth




In [1]:
################## TESTING ID3 RANDOM FOREST ON SONAR DATA SET ##################

#NOTE: If keep getting error in "get hyper parameters", run this again

# Load Data and drop NAs (only 1 NA)
sonar = pd.read_csv("/content/sonar.csv")
sonar = sonar.dropna()

###### BIN STRATEGY 1 (MEAN) ######

## Data Preprocessing 

# get the mean of each column and turn to list
col_mean = sonar.mean(axis=0).tolist()

# initalize empty df
sonar_bin1 = pd.DataFrame()

# initilaize empty list for features
sonar_features = []

# add new labels to empty df based on their values relative to the column mean
for i in range(len(col_mean)):
  col_name = "attribute_{}".format(i+1)
  # extract features
  sonar_features.append(col_name)
  sonar_bin1[col_name] = pd.cut(sonar.iloc[:,i],
                                      bins = [0,col_mean[i],1],
                                      labels = ["lower","higher"])
  
# add target column to new df and replace
sonar_bin1['class'] = sonar['Class']
sonar_bin1['class'] = sonar_bin1['class'].replace(['Rock','Mine'],[0,1])

NameError: ignored

In [74]:
## Training and Testing Sonar Data Set with Bin Strategy 1 for ID3 Random Forest

# initialize parameters
sonar_boot = int(sonar_bin1.shape[0]*0.7)
sonar_n_features = int(np.log2(len(sonar_features)+1)) #used fewer features due to overfitting (log2 vs. to square root)
sonar_label = 'class'
sonar_n_trees = 50

# split dataset into train and test
train_sonar1, test_sonar1 = split_data(sonar_bin1, 0.7)
# omit any na's just in case
train_sonar1 = train_sonar1.dropna()
test_sonar1 = test_sonar1.dropna()


In [75]:
print(sonar_n_features)

5


In [77]:
# Get hyper parameters 
best_sb1_depth = get_best_hyperparameters(train_sonar1,test_sonar1,sonar_features,sonar_label,
                                                    sonar_n_trees, sonar_boot, sonar_n_features,[3,4,5])
print("The best depth: " + str(best_sb1_depth))

max_depth: 3 , 	 train_accuracy:  0.8413793103448276 , 	 test_accuracy: 0.7096774193548387
max_depth: 4 , 	 train_accuracy:  0.9172413793103448 , 	 test_accuracy: 0.7741935483870968
max_depth: 5 , 	 train_accuracy:  0.9586206896551724 , 	 test_accuracy: 0.8225806451612904
The best depth: 5


In [78]:
# Get Average Accuracies for 5 tests for Bin Strategy 1

# initialize empty lists
train_accuracies1 = []
test_accuracies1 = []
# loop 5 times
for i in range(5):
  # generate forest
  sonar_forest1 = random_forest_algorithm(train_sonar1,sonar_features,sonar_label,
                                       sonar_n_trees,sonar_boot,sonar_n_features,
                                       best_sb1_depth)
  # get train predictions
  s1_train_predictions = get_random_forest_predictions(train_sonar1, sonar_forest1)
  # get train score
  s1_train_score = accuracy_score(train_sonar1[sonar_label],s1_train_predictions)
  # append score
  train_accuracies1.append(s1_train_score)
  # get test predictions 
  s1_test_predictions = get_random_forest_predictions(test_sonar1, sonar_forest1)
  # get test score
  s1_test_score = accuracy_score(test_sonar1[sonar_label],s1_test_predictions)
  # append score
  test_accuracies1.append(s1_test_score)

# caculate average
avg_train_accuracy1 = sum(train_accuracies1)/len(train_accuracies1)
abg_test_accuracy1 = sum(test_accuracies1)/len(test_accuracies1)
print("Average Train Accuracy: " + str(avg_train_accuracy1))
print("Average Test Accuracy " + str(abg_test_accuracy1))
print(train_accuracies1)
print(test_accuracies1)

Average Train Accuracy: 0.9227586206896552
Average Test Accuracy 0.7387096774193548
[0.9172413793103448, 0.9448275862068966, 0.9448275862068966, 0.8827586206896552, 0.9241379310344827]
[0.7419354838709677, 0.7419354838709677, 0.7096774193548387, 0.7258064516129032, 0.7741935483870968]


In [79]:
###### BIN STRATEGY 2 (Median) ######

## Data Preprocessing for Bin Strategy 2

# initalize empty df
sonar_bin2 = pd.DataFrame()

# initilaize empty list for features
sonar_features = []

# add new labels to empty df based on their values relative to the column median
for i in range(sonar.shape[1]-1): # minus 1 to not include target data frame
  col_name = "attribute_{}".format(i+1)
  # extract features
  sonar_features.append(col_name)

  sonar_bin2[col_name] = pd.qcut(sonar.iloc[:,i],
                                      q = 2,
                                      labels = ["one","two"])
  
# add target column to new df and replace
sonar_bin2['class'] = sonar['Class']
sonar_bin2['class'] = sonar_bin2['class'].replace(['Rock','Mine'],[0,1])


In [80]:
## Training and Testing Sonar Data Set with Bin Strategy 2 for ID3 Random Forest
train_sonar2, test_sonar2 = split_data(sonar_bin2, 0.7)

train_sonar2 = train_sonar2.dropna()
test_sonar2 = test_sonar2.dropna()

In [82]:
# Get hyper parameters 
best_sb2_depth = get_best_hyperparameters(train_sonar2,test_sonar2,sonar_features,sonar_label,
                                                    sonar_n_trees, sonar_boot, sonar_n_features,[3,4,5])
print("The best depth: " + str(best_sb2_depth))

max_depth: 3 , 	 train_accuracy:  0.8698630136986302 , 	 test_accuracy: 0.7580645161290323
max_depth: 4 , 	 train_accuracy:  0.9041095890410958 , 	 test_accuracy: 0.7096774193548387
max_depth: 5 , 	 train_accuracy:  0.952054794520548 , 	 test_accuracy: 0.8225806451612904
The best depth: 5


In [83]:
# Get Average Accuracies for 5 tests for Bin Strategy 2

# initialize empty lists
train_accuracies2 = []
test_accuracies2 = []
# loop 5 times (to save time)
for i in range(5):
  # generate forest
  sonar_forest2 = random_forest_algorithm(train_sonar2,sonar_features,sonar_label,
                                       sonar_n_trees,sonar_boot,sonar_n_features,
                                       best_sb2_depth)
  # get train predictions
  s2_train_predictions = get_random_forest_predictions(train_sonar2, sonar_forest2)
  # get train score
  s2_train_score = accuracy_score(train_sonar2[sonar_label],s2_train_predictions)
  # append score
  train_accuracies2.append(s2_train_score)
  # get test predictions
  s2_test_predictions = get_random_forest_predictions(test_sonar2, sonar_forest2)
  # get test score
  s2_test_score = accuracy_score(test_sonar2[sonar_label],s2_test_predictions)
  # append score
  test_accuracies2.append(s2_test_score)
# calculate averages
avg_train_accuracy2 = sum(train_accuracies2)/len(train_accuracies2)
abg_test_accuracy2 = sum(test_accuracies2)/len(test_accuracies2)
print("Average Train Accuracy: " + str(avg_train_accuracy2))
print("Average Test Accuracy " + str(abg_test_accuracy2))
print(train_accuracies2)
print(test_accuracies2)

Average Train Accuracy: 0.941095890410959
Average Test Accuracy 0.7258064516129032
[0.9383561643835616, 0.9794520547945206, 0.9383561643835616, 0.9246575342465754, 0.9246575342465754]
[0.7580645161290323, 0.7096774193548387, 0.7419354838709677, 0.7419354838709677, 0.6774193548387096]


In [84]:
################## TESTING ID3 Algorithm ON SONAR DATA SET ##################
# get best depth with cross validation
bestDepth1, depthAccuracy_df1, testAccuracy1 = ID3_cross_validation(train_sonar1,10,sonar_features,sonar_label,[3,4,5])
# generate tree with regular ID3 decision tree algorithm from homework
bestTree1 = ID3_regular(train_sonar1,sonar_features,sonar_label,bestDepth1) 
# get train predcitions
regTrain_predictions1 = ID3_decision_tree_all(bestTree1,train_sonar1)
# get train score
regTrain_score1 = accuracy_score(train_sonar1[sonar_label],regTrain_predictions1)
# get test predictions
regTest_predictions1 = ID3_decision_tree_all(bestTree1,test_sonar1)
# get test score
regTest_score1 = accuracy_score(test_sonar1[sonar_label],regTest_predictions1)

print("Train Accuracy for sonar_bin1: " + str(regTrain_score1))
print("Test Accuracy for sonar_bin1" + str(regTest_score1))
print("----------------------------------")

# get best depth with cross validation
bestDepth2, depthAccuracy_df2, testAccuracy2 = ID3_cross_validation(train_sonar2,10,sonar_features,sonar_label,[3,4,5,6])
# generate tree with regular ID3 decision tree algorithm from homework
bestTree2 = ID3_regular(train_sonar2,sonar_features,sonar_label,bestDepth2) 
# get train predcitions
regTrain_predictions2 = ID3_decision_tree_all(bestTree2,train_sonar2)
# get train score
regTrain_score2 = accuracy_score(train_sonar2[sonar_label],regTrain_predictions2)
# get test predictions
regTest_predictions2 = ID3_decision_tree_all(bestTree2,test_sonar2)
# get test score
regTest_score2 = accuracy_score(test_sonar2[sonar_label],regTest_predictions2)

print("Train Accuracy for sonar_bin1: " + str(regTrain_score2))
print("Test Accuracy for sonar_bin1" + str(regTest_score2))

Train Accuracy for sonar_bin1: 0.8551724137931035
Test Accuracy for sonar_bin10.6774193548387096
----------------------------------
Train Accuracy for sonar_bin1: 0.9041095890410958
Test Accuracy for sonar_bin10.7096774193548387


In [93]:
################## TESTING ID3 RANDOM FOREST ON BANKNOTES DATA SET ##################

# Load Data and drop NAs (only 1 NA)
banknotes = pd.read_csv("/content/data_banknote_authentication-1.csv")
banknotes = banknotes.dropna()


###### BIN STRATEGY 1 (MEAN) ######

## Data Preprocessing 

# get the mean of each column and turn to list
col_mean2 = banknotes.mean(axis=0).tolist()
col_mean2 = col_mean2[:-1]

# initalize empty df
bank_bin1 = pd.DataFrame()

# bank notes features
bank_features = ['variance','skewness','curtosis','entropy']

# add new labels to empty df based on their values relative to the column mean
for i in range(len(col_mean2)):
  col_name = bank_features[i]
  # add new columns with mutated values
  bank_bin1[col_name] = pd.cut(banknotes.iloc[:,i],
                                      bins = [-100,col_mean2[i],100],
                                      labels = ["lower","higher"])
  
# add target column to new df and replace
bank_bin1['class'] = banknotes['class']

## Training and Testing Banknotes Data Set with Bin Strategy 1 for ID3 Random Forest

# initialize parameters
bank_boot = int(bank_bin1.shape[0]*0.75) #added more training data due to over fitting
bank_n_features = int(math.sqrt(len(bank_features)))
bank_label = 'class'
bank_trees = 50
# split data into train and test
train_bank1, test_bank1 = split_data(bank_bin1, 0.7) 
# omit na's just in case
train_bank1 = train_bank1.dropna()
test_bank1 = test_bank1.dropna()

In [94]:
# Get hyperparameters 
best_bn1_depth= get_best_hyperparameters(train_bank1,test_bank1,bank_features,bank_label,
                                                    bank_trees, bank_boot, bank_n_features,[2,3,4]) 
print("The best depth: " + str(best_bn1_depth))

max_depth: 2 , 	 train_accuracy:  0.846875 , 	 test_accuracy: 0.8422330097087378
max_depth: 3 , 	 train_accuracy:  0.8010416666666667 , 	 test_accuracy: 0.779126213592233
max_depth: 4 , 	 train_accuracy:  0.8010416666666667 , 	 test_accuracy: 0.779126213592233
The best depth: 2


In [95]:
# Get Average Accuracies for 10 tests for Bin Strategy 1
# initialize empty lists
trainAccuracies1 = []
testAccuracies1 = []
# loop 10 times
for i in range(10):
  # generate random forest
  bank_forest1 = random_forest_algorithm(train_bank1,bank_features,bank_label,
                                       bank_trees,bank_boot,bank_n_features,
                                       best_bn1_depth)
  # get train predictions
  b1_train_predictions = get_random_forest_predictions(train_bank1, bank_forest1)
  # get train score
  b1_train_score = accuracy_score(train_bank1[bank_label],b1_train_predictions)
  # append score
  trainAccuracies1.append(b1_train_score)
  # get test predictions
  b1_test_predictions = get_random_forest_predictions(test_bank1, bank_forest1)
  # get test score
  b1_test_score = accuracy_score(test_bank1[bank_label],b1_test_predictions)
  # append score
  testAccuracies1.append(b1_test_score)
# calculate averages
avg_trainAccuracy1 = sum(trainAccuracies1)/len(trainAccuracies1)
avg_testAccuracy1 = sum(testAccuracies1)/len(testAccuracies1)
print("Average Train Accuracy: " + str(avg_trainAccuracy1))
print("Average Test Accuracy " + str(avg_testAccuracy1))
print(trainAccuracies1)
print(testAccuracies1)

Average Train Accuracy: 0.8188541666666665
Average Test Accuracy 0.8036407766990292
[0.7958333333333333, 0.846875, 0.846875, 0.8010416666666667, 0.846875, 0.8010416666666667, 0.8010416666666667, 0.8010416666666667, 0.8010416666666667, 0.846875]
[0.7718446601941747, 0.8422330097087378, 0.8422330097087378, 0.779126213592233, 0.8422330097087378, 0.779126213592233, 0.779126213592233, 0.779126213592233, 0.779126213592233, 0.8422330097087378]


In [96]:
###### BIN STRATEGY 2 (Median) ######

## Data Preprocessing 

# initalize empty df
bank_bin2 = pd.DataFrame()

# add new labels to empty df based on their values relative to the column mean
for i in range(banknotes.shape[1]-1):
  col_name = bank_features[i]
  # add new columns with mutated values
  bank_bin2[col_name] = pd.qcut(banknotes.iloc[:,i],
                                      q = 2,
                                      labels = ["lower","higher"])
  
# add target column to new df and replace
bank_bin2['class'] = banknotes['class']

## Training and Testing Banknotes Data Set with Bin Strategy 1 for ID3 Random Forest

# splits data into train and test
train_bank2, test_bank2 = split_data(bank_bin2, 0.7)
# omit na's just in case
train_bank2 = train_bank1.dropna()
test_bank2 = test_bank1.dropna()

In [97]:
# Get hyper parameters 
best_bn2_depth = get_best_hyperparameters(train_bank2,test_bank2,bank_features,bank_label,
                                                    bank_trees, bank_boot, bank_n_features,[2,3,4])
print("The best depth: " + str(best_bn2_depth))

max_depth: 2 , 	 train_accuracy:  0.8010416666666667 , 	 test_accuracy: 0.779126213592233
max_depth: 3 , 	 train_accuracy:  0.846875 , 	 test_accuracy: 0.8422330097087378
max_depth: 4 , 	 train_accuracy:  0.846875 , 	 test_accuracy: 0.8422330097087378
The best depth: 4


In [98]:
# Get Average Accuracies for 10 tests for Bin Strategy 2
trainAccuracies2 = []
testAccuracies2 = []
for i in range(10):
  bank_forest2 = random_forest_algorithm(train_bank2,bank_features,bank_label,
                                       bank_trees,bank_boot,bank_n_features,
                                       best_bn2_depth)
  b2_train_predictions = get_random_forest_predictions(train_bank2, bank_forest1)
  b2_train_score = accuracy_score(train_bank2[bank_label],b2_train_predictions)
  trainAccuracies2.append(b2_train_score)
  b2_test_predictions = get_random_forest_predictions(test_bank2, bank_forest2)
  b2_test_score = accuracy_score(test_bank2[bank_label],b2_test_predictions)
  testAccuracies2.append(b2_test_score)

avg_trainAccuracy2 = sum(trainAccuracies2)/len(trainAccuracies2)
avg_testAccuracy2 = sum(testAccuracies2)/len(testAccuracies2)
print("Average Train Accuracy: " + str(avg_trainAccuracy2))
print("Average Test Accuracy " + str(avg_testAccuracy2))
print(trainAccuracies2)
print(testAccuracies2)

Average Train Accuracy: 0.846875
Average Test Accuracy 0.81626213592233
[0.846875, 0.846875, 0.846875, 0.846875, 0.846875, 0.846875, 0.846875, 0.846875, 0.846875, 0.846875]
[0.8422330097087378, 0.8422330097087378, 0.779126213592233, 0.8422330097087378, 0.8422330097087378, 0.779126213592233, 0.8422330097087378, 0.7718446601941747, 0.779126213592233, 0.8422330097087378]


In [99]:
################## TESTING ID3 Algorithm ON BANKNOTES DATA SET ##################
# get best depth with cross validation from class
best_depth1, depth_accuracy_df1, test_accuracy1 = ID3_cross_validation(train_bank1,10,bank_features,bank_label,[1,2,3,4,5,6,7,8,9,10])
# get tree with regular ID3 algorithm from homework
best_tree1 = ID3_regular(train_bank1,bank_features,bank_label,best_depth1) 
# get train predictions
reg_train_predictions1 = ID3_decision_tree_all(best_tree1,train_bank1)
#get train score
reg_train_score1 = accuracy_score(train_bank1[bank_label],reg_train_predictions1)
# get test predicitions
reg_test_predictions1 = ID3_decision_tree_all(best_tree1,test_bank1)
#get test score
reg_test_score1 = accuracy_score(test_bank1[bank_label],reg_test_predictions1)

print("Train Accuracy for bank_bin1: " + str(reg_train_score1))
print("Test Accuracy for bank_bin1" + str(reg_test_score1))
print("----------------------------------")
# get best depth with cross validation from class
best_depth2, depth_accuracy_df2, test_accuracy2 = ID3_cross_validation(train_bank2,10,bank_features,bank_label,[1,2,3,4,5,6,7,8,9,10])
# get tree with regular ID3 algorithm from homework
best_tree2 = ID3_regular(train_bank2,bank_features,bank_label,best_depth2) 
# get train predictions
reg_train_predictions2 = ID3_decision_tree_all(best_tree2,train_bank2)
#get train score
reg_train_score2 = accuracy_score(train_bank2[bank_label],reg_train_predictions2)
# get test predicitions
reg_test_predictions2 = ID3_decision_tree_all(best_tree2,test_bank2)
#get test score
reg_test_score2 = accuracy_score(test_bank2[bank_label],reg_test_predictions2)

print("Train Accuracy for bank_bin2: " + str(reg_train_score2))
print("Test Accuracy for bank_bin2" + str(reg_test_score2))



Train Accuracy for bank_bin1: 0.846875
Test Accuracy for bank_bin10.8422330097087378
----------------------------------
Train Accuracy for bank_bin2: 0.846875
Test Accuracy for bank_bin20.8422330097087378
