In [1]:
import numpy as np
import pandas as pd
import random
from pprint import pprint

In [3]:
def train_test_split(df, test_size, random_state=None):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are picked and dropped by index label, so the
        index labels are assumed to be unique.
    test_size : int or float
        Number of rows to put into the test frame. A float is interpreted as
        a fraction of ``len(df)`` and rounded to the nearest integer.
    random_state : int, optional
        Seed for a private random generator, making the split reproducible.
        With the default ``None`` the module-level ``random`` state is used,
        which is what this function did before the parameter existed.

    Returns
    -------
    tuple
        ``(train_df, test_df)``.

    Raises
    ------
    ValueError
        If the resolved ``test_size`` is negative or exceeds ``len(df)``.
    """
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    # random.sample would also reject these values; fail with a clearer message.
    if not 0 <= test_size <= len(df):
        raise ValueError(
            "test_size must be between 0 and {} rows, got {}".format(len(df), test_size)
        )

    # A dedicated generator keeps a seeded split independent of global state.
    sampler = random if random_state is None else random.Random(random_state)

    indices = df.index.tolist()
    test_indices = sampler.sample(indices, test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)

    return train_df, test_df

In [5]:
def classify_data(data):
    """Return the most frequent label found in the last column of ``data``.

    ``data`` is a 2-D array. When several labels share the highest count, the
    one that comes first in ``np.unique``'s sorted order is returned.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]

In [6]:
def get_potential_splits(data):
    """Collect candidate split values for every column of ``data`` but the last.

    Returns a dict keyed by column index. The per-column type code is read
    from the module-level ``FEATURE_TYPES``:

    * ``"r"`` (continuous): a list with the midpoint between each pair of
      consecutive sorted unique values.
    * anything else (nominal): the array of unique values itself.
    """
    _, n_columns = data.shape
    candidates = {}

    # The last column is skipped: it is not a feature to split on.
    for col in range(n_columns - 1):
        distinct = np.unique(data[:, col])

        if FEATURE_TYPES[col] == "r":
            # Continuous feature: thresholds sit halfway between neighbours.
            candidates[col] = [
                (upper + lower) / 2
                for lower, upper in zip(distinct[:-1], distinct[1:])
            ]
        else:
            # Nominal feature: every observed category is a candidate.
            candidates[col] = distinct

    return candidates


In [7]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one column.

    For a continuous column (type code ``"r"`` in the module-level
    ``FEATURE_TYPES``) the first part holds rows with ``value <= split_value``
    and the second part rows with ``value > split_value``. For any other type
    code the first part holds rows equal to ``split_value`` and the second
    part the rows that differ.

    Returns ``(data_below, data_above)``.
    """
    column = data[:, split_column]

    if FEATURE_TYPES[split_column] == "r":
        # Continuous feature: threshold comparison.
        below_mask = column <= split_value
        above_mask = column > split_value
    else:
        # Nominal feature: membership in a single category.
        below_mask = column == split_value
        above_mask = column != split_value

    return data[below_mask], data[above_mask]

In [8]:
def gini(data):
    """Gini impurity of the labels stored in the last column of ``data``.

    Computed as ``1 - sum(p_i ** 2)`` where ``p_i`` is the share of rows
    carrying label ``i``.
    """
    _, class_counts = np.unique(data[:, -1], return_counts=True)
    class_shares = class_counts / class_counts.sum()

    return 1 - sum(class_shares ** 2)

In [9]:
def information_gain(data_below, data_above):
    """Size-weighted Gini impurity of a two-way split.

    Despite the name, the returned value is the weighted average of the two
    partitions' Gini impurities, so a lower value means a purer split.
    """
    size_below = len(data_below)
    size_above = len(data_above)
    total = size_below + size_above

    weighted_below = size_below / total * gini(data_below)
    weighted_above = size_above / total * gini(data_above)

    return weighted_below + weighted_above

In [10]:
def determine_best_split(data, potential_splits):
    """Pick the (column, value) pair whose split has the lowest weighted impurity.

    Parameters
    ----------
    data : 2-D array
        Rows to be split; passed through to ``split_data``.
    potential_splits : dict
        Maps a column index to an iterable of candidate split values.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If ``potential_splits`` contains no candidate value at all.
    """
    lowest_score = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_score = information_gain(data_below, data_above)

            # "<=" keeps the original tie-breaking: the last of several
            # equally good candidates wins.
            if current_score <= lowest_score:
                lowest_score = current_score
                best_split_column = column_index
                best_split_value = value

    # Column index 0 is a valid result, so test against None explicitly.
    if best_split_column is None:
        raise ValueError("no candidate split available for the given data")

    return best_split_column, best_split_value

In [11]:
def decision_tree_algorithm(df, column_name, datatype, current_depth=0, min_samples=2, max_depth=5):
    """Recursively build a binary decision tree.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array
        At ``current_depth == 0`` a DataFrame (its ``.values`` are used); in
        recursive calls the already-extracted array partition.
    column_name : sequence
        Column names, indexed by column position, used to label the nodes.
    datatype : sequence
        Per-column type codes, indexed by column position. ``"r"`` marks a
        continuous feature; any other code is treated as nominal.
    current_depth : int
        Depth of the node being built; callers normally leave it at 0.
    min_samples : int
        Nodes with fewer rows than this become leaves.
    max_depth : int
        Nodes at this depth become leaves.

    Returns
    -------
    dict or label
        Either a leaf label, or ``{question: [subtree_if_true, subtree_if_false]}``
        where ``question`` is ``"<feature> <= <value>"`` for continuous and
        ``"<feature> = <value>"`` for nominal features.

    Raises
    ------
    ValueError
        If called with an empty dataset.

    Notes
    -----
    The top-level call stores ``column_name`` and ``datatype`` in the
    module-level ``COLUMN_HEADERS`` and ``FEATURE_TYPES``; the helper
    functions read ``FEATURE_TYPES`` from there.
    """
    global COLUMN_HEADERS, FEATURE_TYPES

    if current_depth == 0:
        COLUMN_HEADERS = column_name
        FEATURE_TYPES = datatype
        data = df.values
    else:
        data = df

    if len(data) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # Stop when the node is pure, too small, or the depth limit is reached.
    # A pure node could only ever yield identical leaves, so splitting it
    # further is wasted work (and may hit a column with nothing to split on).
    is_pure = len(np.unique(data[:, -1])) == 1
    if is_pure or (len(data) < min_samples) or (current_depth == max_depth):
        return classify_data(data)

    potential_splits = get_potential_splits(data)

    # Every feature has a single distinct value: nothing left to split on.
    if not any(len(values) > 0 for values in potential_splits.values()):
        return classify_data(data)

    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that leaves one side empty separates nothing; make a leaf
    # rather than recursing into an empty partition.
    if len(data_below) == 0 or len(data_above) == 0:
        return classify_data(data)

    # Build the question string for this node.
    feature_name = COLUMN_HEADERS[split_column]
    if FEATURE_TYPES[split_column] == "r":
        node = "{} <= {}".format(feature_name, split_value)
    else:
        node = "{} = {}".format(feature_name, split_value)

    # Recurse. column_name/datatype are passed through unchanged (they are
    # ignored below depth 0) instead of relying on the interactive `_` name.
    next_depth = current_depth + 1
    left_leave = decision_tree_algorithm(data_below, column_name, datatype, next_depth, min_samples, max_depth)
    right_leave = decision_tree_algorithm(data_above, column_name, datatype, next_depth, min_samples, max_depth)

    # Two identical answers make the question redundant: collapse the node.
    if left_leave == right_leave:
        return left_leave

    return {node: [left_leave, right_leave]}

In [13]:
def _split_node_label(node):
    """Break a node label into ``(feature_name, operator, value)``.

    Labels have the form ``"<feature> <= <number>"`` or ``"<feature> = <value>"``.
    Splitting on the separator (instead of on every space) lets feature names
    and nominal values contain spaces. A label is read as a threshold test
    only when the text after the last ``" <= "`` parses as a number;
    otherwise the first ``" = "`` separates feature and value.
    """
    feature_name, separator, value = node.rpartition(" <= ")
    if separator:
        try:
            float(value)
        except ValueError:
            pass
        else:
            return feature_name, "<=", value

    feature_name, separator, value = node.partition(" = ")
    if not separator:
        raise ValueError("unrecognised tree node label: {!r}".format(node))
    return feature_name, "=", value


def classify_example(example, tree):
    """Classify one example by walking down a decision tree.

    Parameters
    ----------
    example : mapping
        Anything indexable by feature name (a DataFrame row, a dict, ...).
    tree : dict or label
        Tree of the form ``{question: [subtree_if_true, subtree_if_false]}``.
        A bare label (a tree that collapsed to a single leaf) is returned
        as-is.

    Returns
    -------
    The label of the leaf that the example ends up in.

    Raises
    ------
    ValueError
        If a node label matches neither supported question format.
    """
    # Walk iteratively; anything that is not a dict is a leaf label.
    while isinstance(tree, dict):
        node = next(iter(tree))
        feature_name, comparison_operator, value = _split_node_label(node)

        if comparison_operator == "<=":
            # Continuous feature: numeric threshold test.
            answer_is_yes = example[feature_name] <= float(value)
        else:
            # Nominal feature: compare string forms, since the value was
            # rendered into the label as text.
            answer_is_yes = str(example[feature_name]) == value

        tree = tree[node][0] if answer_is_yes else tree[node][1]

    return tree

In [14]:
def calculate_accuracy(df, tree, label_column="class"):
    """Return the fraction of rows in ``df`` that ``tree`` classifies correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to classify; must contain the feature columns the tree asks
        about plus ``label_column``.
    tree : dict or label
        Decision tree as accepted by ``classify_example``.
    label_column : str
        Name of the column holding the true labels. Defaults to ``"class"``,
        the name that was previously hard-coded.

    Returns
    -------
    float
        Mean of the per-row "prediction equals label" flags.

    Notes
    -----
    Side effect: two columns are written into ``df`` itself --
    ``"classification"`` (the prediction) and ``"classification_correct"``.
    """
    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df[label_column]

    return df["classification_correct"].mean()

In [20]:
# --- Experiment 1: iris ------------------------------------------------------
col_names_1 = [
    "sepal-length",
    "sepal-width",
    "petal-length",
    "petal-width",
    "class",
]

# The data rows start after the first two lines of the file. The line at
# position 1 is read separately and used as the per-column feature-type row
# (presumably line 0 holds column names -- verify against the file).
data_1 = pd.read_csv("iris.tmls", skiprows=2, header=None, names=col_names_1)
# The last entry of the type row is dropped, leaving one code per feature.
datatype_1 = pd.read_csv("iris.tmls", header=None).iloc[1].tolist()[:-1]

# NOTE(review): no random seed is set in the visible cells, so the split --
# and therefore the tree and accuracy -- can differ between runs.
train_df_1, test_df_1 = train_test_split(data_1, test_size=0.2)

tree_1 = decision_tree_algorithm(
    train_df_1, column_name=col_names_1, datatype=datatype_1, max_depth=3
)
pprint(tree_1)

accuracy_1 = calculate_accuracy(test_df_1, tree_1)
print("accuracy: " + str(accuracy_1))

{'petal-width <= 0.8': ['Iris-setosa',
                        {'petal-width <= 1.75': [{'petal-length <= 4.95': ['Iris-versicolor',
                                                                           'Iris-virginica']},
                                                 'Iris-virginica']}]}
accuracy: 0.9666666666666667


In [22]:
# --- Experiment 2: wine ------------------------------------------------------
col_names_2 = [
    "alcohol",
    "malic-acid",
    "ash",
    "alcalinity-of-ash",
    "magnesium",
    "total-phenols",
    "flavanoids",
    "nonflavanoid-phenols",
    "proanthocyanins",
    "color-intensity",
    "hue",
    "OD280/OD315-of-dilutued-wines",
    "proline",
    "class",
]

# Data rows start after the first two lines; the line at position 1 is used
# as the per-column feature-type row (kept as a Series, label entry included).
data_2 = pd.read_csv("wine.tmls", skiprows=2, header=None, names=col_names_2)
datatype_2 = pd.read_csv("wine.tmls", header=None).iloc[1]

# NOTE(review): no random seed is set in the visible cells, so the split --
# and therefore the tree and accuracy -- can differ between runs.
train_df_2, test_df_2 = train_test_split(data_2, test_size=0.2)

tree_2 = decision_tree_algorithm(
    train_df_2, column_name=col_names_2, datatype=datatype_2, max_depth=3
)
pprint(tree_2)

accuracy_2 = calculate_accuracy(test_df_2, tree_2)
print("accuracy: " + str(accuracy_2))

{'color-intensity <= 3.915': [{'proline <= 1002.5': [{'OD280/OD315-of-dilutued-wines <= 3.73': [2.0,
                                                                                                1.0]},
                                                     1.0]},
                              {'flavanoids <= 1.58': [{'hue <= 0.97': [3.0,
                                                                       2.0]},
                                                      {'proline <= 679.0': [2.0,
                                                                            1.0]}]}]}
accuracy: 0.9444444444444444


In [25]:
# --- Experiment 3: weight ----------------------------------------------------
col_names_3 = [
    "sex",
    "height",
    "weight",
    "class",
]

# Data rows start after the first two lines; the line at position 1 is used
# as the per-column feature-type row (kept as a Series, label entry included).
data_3 = pd.read_csv("weight.tmls", skiprows=2, header=None, names=col_names_3)
datatype_3 = pd.read_csv("weight.tmls", header=None).iloc[1]

# NOTE(review): no random seed is set in the visible cells, so the split --
# and therefore the tree and accuracy -- can differ between runs.
train_df_3, test_df_3 = train_test_split(data_3, test_size=0.2)

tree_3 = decision_tree_algorithm(
    train_df_3, column_name=col_names_3, datatype=datatype_3, max_depth=4
)
pprint(tree_3)

accuracy_3 = calculate_accuracy(test_df_3, tree_3)
print("accuracy: " + str(accuracy_3))

{'weight <= 122.5': [{'weight <= 83.5': [{'height <= 172.5': [{'weight <= 65.0': [2,
                                                                                  4]},
                                                              {'weight <= 69.5': [1,
                                                                                  2]}]},
                                         {'height <= 164.0': [{'weight <= 97.5': [4,
                                                                                  5]},
                                                              {'height <= 181.5': [4,
                                                                                   3]}]}]},
                     {'height <= 186.5': [{'weight <= 132.5': [{'height <= 178.5': [5,
                                                                                    4]},
                                                               5]},
                                          {'weight <= 145.5'

In [35]:
# --- Experiment 4: diabetes --------------------------------------------------
# Disabled alternative input (car dataset), kept for reference:
#col_names_3 = ["buying-price", "maintenance-cost", "number-of-doors", "number-of-persons",
#               "lug_boot", "safety", "class"]
#data_3 = pd.read_csv("car.tmls", skiprows=2, header=None, names=col_names_3)
#datatype_3 = pd.read_csv("car.tmls", header=None).iloc[1]

# NOTE(review): this cell reuses the *_3 names of the weight experiment and
# overwrites its frames, tree and accuracy.
col_names_3 = [
    "Age",
    "Gender",
    "Polyuria",
    "Polydipsia",
    "sudden-weight-loss",
    "weakness",
    "Polyphagia",
    "Genital-thrush",
    "visual-blurring",
    "Itching",
    "Irritability",
    "delayed-healing",
    "partial-paresis",
    "muscle-stiffness",
    "Alopecia",
    "Obesity",
    "class",
]

# Data rows start after the first two lines; the line at position 1 is used
# as the per-column feature-type row (kept as a Series, label entry included).
data_3 = pd.read_csv("diabetes.tmls", skiprows=2, header=None, names=col_names_3)
datatype_3 = pd.read_csv("diabetes.tmls", header=None).iloc[1]

# NOTE(review): no random seed is set in the visible cells, so the split --
# and therefore the tree and accuracy -- can differ between runs.
train_df_3, test_df_3 = train_test_split(data_3, test_size=0.2)

tree_3 = decision_tree_algorithm(
    train_df_3, column_name=col_names_3, datatype=datatype_3, max_depth=3
)
pprint(tree_3)

accuracy_3 = calculate_accuracy(test_df_3, tree_3)
print("accuracy: " + str(accuracy_3))

{'Polyuria = Yes': [{'Age <= 69.5': ['Positive',
                                     {'weakness = Yes': ['Positive',
                                                         'Negative']}]},
                    {'Gender = Male': [{'Polydipsia = Yes': ['Positive',
                                                             'Negative']},
                                       {'Alopecia = Yes': ['Negative',
                                                           'Positive']}]}]}
accuracy: 0.8942307692307693
