# SENG 474
# Assignment 1 - Problem 6
# Nolan Kurylo
# V00893175
References:

1) https://pandas.pydata.org/pandas-docs/stable/reference/index.html

2) https://www.python-course.eu/Regression_Trees.php

3) https://machinelearningmastery.com/implement-resampling-methods-scratch-python/


Please Note: very similar code to that from Problem 2, the only difference is that the current node to be split on in tree is decided by the feature with the minimum variance (functions: min_variance() and weighted_variance()) as compared to Problem 2 that splits the current node on the feature with the largest information gain.

In [1]:
# 6.1 Regress
import pandas as pd
import numpy as np
import pprint
from matplotlib import pyplot as plt

np.random.seed(1337)

df = pd.read_csv('elections_clean.csv') # create dataframe of csv

label_vector = df.pop('PovertyLevel') # this is the target/label vector

feature_list = ['Education','Religion','EthnicMale','EthnicFemale', 'PerCapitaInc'] # these will be the features in the tree
df = df[feature_list]

df['PovertyLevel'] = label_vector # place label vector at the end of df


def weighted_variance(dataset, col):
    """ Finds the weighted variance of the subset dataset for the input column col
    :param dataset: subset of original dataframe
    :param col: col to be calculated on
    :return: weighted variance for the column
    """
    values, num_values = np.unique(col, return_counts=True) 
    weighted_var = 0
    for i in range(len(values)):
        matching_rows = dataset.loc[col==values[i]] # In the column col, find all rows with matching value values[i] 
        label_vector_subset = matching_rows['PovertyLevel'] # return a subset of the values in the label vector
        weight = len(label_vector_subset) / len(dataset) # weight is the fraction of the entire dataset that the subset consumes
        weighted_var += (weight * label_vector_subset.var())
    return weighted_var

 
def min_variance(dataset, feature_list): 
    """ Finds the feature still in the feature_list with the minimum variance (feature to be split on)
    :param dataset: subset of original dataframe
    :param feature_list: subset of features from the features in the dataset 
    :return: feature with the lowest variance from the feature_list
    """
    min_variance = {}

    for feature in feature_list: # add variances to dictionary for each feature still in thje feature_list
        feature_variance = weighted_variance(dataset, dataset[feature])
        min_variance[feature] = feature_variance
    
    return min(min_variance, key=min_variance.get)


def RegressionID3(dataset, feature_list, parent=None): 
    """ Recursively builds a dictionary-structured Decision Tree Regressor based on ID3 algorithm
    :param dataset: training set
    :param feature_list: list of all features from the dataset
    :param parent: most common element in the current label vector subset
    :return: Decision Tree structured as a dictionary
    """

    if len(np.unique(dataset["PovertyLevel"])) == 1: # if the label vector is "pure" (only one value in label vector)
        return dataset["PovertyLevel"][0] # return the pure value and go back up tree

    if(len(feature_list) == 0): # reached the end of the branch
        return parent # return parent value and go back up tree

    if(len(dataset) == 0): # if dirty split, return the original mean of the label vector
        return df['PovertyLevel'].mean()

   
    curr_node = min_variance(dataset, feature_list) # find the feature with the min variance and make it the curr_node in the tree

    curr_node_values, num_values = np.unique(dataset[curr_node], return_counts=True) # get the values and their counts associated with the curr_node
    
    parent = dataset['PovertyLevel'].mode()[0] # make parent the most common element in the current label vector subset
    

    DecisionTree = {curr_node:{}}

    feature_list = list(filter(lambda feature: feature != curr_node, feature_list)) # remove the curr_node (.remove() was causing issues)
    

    for value in curr_node_values: # for each value associated with the curr_node feature
        
        sub_dataset = dataset[dataset[curr_node] == value].reset_index(drop=True) # split data into a subset that excludes all rows of the current 'value' of the curr_node in the loop
        DecisionTree[curr_node][value] = RegressionID3(sub_dataset, feature_list, parent) # recursively build the tree on the new subset as the dataset
           
    return DecisionTree


def split_training_validation_sets(df):
    """ Find 70% of the original dataset as the training set, 30% as the validaiton set
    :param df: dataframe to be split
    :return: training and validation splits as dataframes
    """
    shuffled_dataset = df.sample(frac=1).reset_index(drop=True) # shuffle the dataset
    
    split_70_30 = int(df.shape[0] * 0.7) # find the index for the 70 / 30 split to split on

    
    training_set = shuffled_dataset.iloc[:split_70_30].reset_index(drop=True) # 70% of dataset
    validation_set = shuffled_dataset.iloc[split_70_30:].reset_index(drop=True) # 30% of dataset
    return training_set,validation_set

def predict(validation_sample, DecisionTree): 
    """ Recursively runs through DecisionTree and returns where the input validation_sample evaluates a continous value
    :param validation_sample: sample for the Decision tree to predict
    :param DecisionTree: dictionary structured model
    :return: the binary prediction - 0 or 1
    """
    
    for feature in list(validation_sample.keys()): # for each feature in the sample
        
        if feature not in list(DecisionTree.keys()): #ignore missing features
            continue
        try:
            prediction = DecisionTree[feature][validation_sample[feature]] # see if node in tree is a leaf( 0 or 1) or a dict (sub tree)
            
            
            if(type(prediction) == float): # node is a leaf, return the label value
                
                return prediction
            else: # node is a subtree, recursively go down to next node in subtree
                
                return predict(validation_sample, prediction)

        except:
            return df['PovertyLevel'].mean()
        


def find_rmse(validation_set, DecisionTree): 
    """ Finds the Root Mean Squared Error (RMSE) of the DecsionTree given an input validaetion_set and the DecisionTree
    :param validation_set: samples for the Decision tree to predict
    :param DecisionTree: dictionary structured model
    return: root mean squared error (RMSE)
    """
    validation_samples = validation_set.iloc[:,:-1].to_dict(orient='records') # convert dataset to dictionary
    
    actual_output = validation_set.iloc[:,-1] # remove the label vector so we can see how how well the tree predicts the samples
    validation_set['predicted_output'] = actual_output # create empty column for predictions
    predicted_output = np.array([]) # array for predictions 
    num_correct = 0 # store the number of samples in the validation_set that are actually correct
    for i in range(len(validation_samples)):
        prediction = predict(validation_samples[i],DecisionTree)
        predicted_output = np.append(predicted_output, prediction) #update prediction array
    
    sum_arg = ((actual_output-predicted_output)**2)/len(actual_output) # argument of summation
    mse = sum_arg.sum() # mean sqaured error
    rmse = np.sqrt(mse) # root of mean squared error
    return rmse 



training_set, validation_set = split_training_validation_sets(df) # split dataset into %70 training 30% validation sets
 
print("Training RegressionTree . . . (this will take ~10 seconds)")
DecisionTree = RegressionID3(training_set, feature_list) # train the tree

training_rmse = find_rmse(training_set, DecisionTree)  # find the root mean squared error
validation_rmse = find_rmse(validation_set, DecisionTree)  # find the root mean squared error
print("The Training root mean squared error (RMSE) is: " + str(training_rmse))
print("The Validation root mean squared error (RMSE) is: " + str(validation_rmse))



Training RegressionTree . . . (this will take ~10 seconds)
The Training root mean squared error (RMSE) is: 0.03398957810128662
The Validation root mean squared error (RMSE) is: 0.032298402433508705


In [2]:
# 6.2 Cross-Validate
import random 

def cross_validate(data):
    """ Creates list of 5 folds (datasets)
    :param data: original dataframe
    :return: list of the 5 folds
    """
    folds = []
    dataset = data
    num_folds = 5 
    fold_len = int(len(dataset) / num_folds)
    
    for i in range(num_folds):
        fold = [] # list to store each of the 5 fold datasets
        while len(fold) < fold_len:
            index = random.randrange(len(dataset)) # choose a randomized index
            row = dataset.iloc[index] #save the row
            dataset = dataset.drop(dataset.index[index]).reset_index(drop=True) # remove chosen index from dataset
            fold.append(row) # add the chosen index (row) to the current fold dataset 
        folds.append(fold)
    return folds 

def findBestFold(folds, feature_list):
    """ Finds the fold with the minimum rmse value and the fold (iteration) number
    :param folds: list of 5 folds
    :param feature_list: list of all features from the dataset
    :return: fold with min rmse and the fold number
    """
    for i in range(len(folds)):
        folds[i] = pd.DataFrame(folds[i]) # convert to dataframe

    iterations = {} # to store the rmse of each iteration
    for i in range(len(folds)):  

        validation_fold = folds.pop(i) # store validation fold for current iteration
        training_folds = pd.concat(folds) # create training fold of all other folds
        folds.insert(0, validation_fold) # put validation fold back in so it can be used as a training fold later

        print("Training RegressionTree with Validation Fold " + str(i+1) + " . . . (this will take ~10 seconds)")
        DecisionTreei = RegressionID3(training_folds, feature_list)
        rmse = find_rmse(validation_fold, DecisionTreei) 
        iterations["Validation Fold " + str(i+1)] = rmse
        print("RegressionTree using Validation Fold " + str(i+1) + " RMSE: " + str(rmse))
        
    
    min_iteration = min(iterations, key=iterations.get) # find the iteration with the tree that produces the (best) minimzed error
    return min_iteration, iterations[min_iteration]


folds = cross_validate(df) # get a list of 5 folds to be used for finding the best regression tree

bestFold, bestFoldRMSE = findBestFold(folds, feature_list) # algorithm from scratch to find the fold that produces a regression tree with the best (minimzed) error
print()
print("After 5-Fold cross-validation, the best RegressionTree comes from using " + bestFold + " giving an RMSE of " + str(bestFoldRMSE))


Training RegressionTree with Validation Fold 1 . . . (this will take ~10 seconds)
RegressionTree using Validation Fold 1 RMSE: 0.033110431956701984
Training RegressionTree with Validation Fold 2 . . . (this will take ~10 seconds)
RegressionTree using Validation Fold 2 RMSE: 0.03354747717124353
Training RegressionTree with Validation Fold 3 . . . (this will take ~10 seconds)
RegressionTree using Validation Fold 3 RMSE: 0.033558751396049406
Training RegressionTree with Validation Fold 4 . . . (this will take ~10 seconds)
RegressionTree using Validation Fold 4 RMSE: 0.03353583320056681
Training RegressionTree with Validation Fold 5 . . . (this will take ~10 seconds)
RegressionTree using Validation Fold 5 RMSE: 0.033699187258665886

After 5-Fold cross-validation, the best RegressionTree comes from using Validation Fold 1 giving an RMSE of 0.033110431956701984
