In [1]:
import math
import re

import pandas as pd

In [2]:
class Node:
    """Base class for the nodes of a decision tree.

    The subclasses in this notebook (Leaf and Split) override classify() and
    dump(); the versions defined here are placeholders that return None.
    """
    def __init__(self, names):
        # names: sequence of variable names; Split.dump() indexes it by
        # variable position when printing the tree.
        self.names = names

    def classify(self, x):
        """Return the predicted class for example x.

        Handled by the subclasses; the base implementation returns None.
        """
        return None

    def dump(self, indent):
        """Print the subtree rooted at this node at nesting depth `indent`.

        Handled by the subclasses; the base implementation prints nothing
        and returns None.
        """
        return None


class Leaf(Node):
    """Terminal node of the tree: always predicts one fixed value."""

    def __init__(self, names, value):
        super().__init__(names)
        # The prediction returned for every example that reaches this leaf.
        self.value = value

    def classify(self, x):
        """Return the stored prediction; the example x is not inspected."""
        return self.value

    def dump(self, indent):
        """Print the prediction on the current output line; indent is unused."""
        label = ' %d' % self.value
        print(label)


class Split(Node):
    """Internal node that routes an example on the value of one variable."""

    def __init__(self, names, var, left, right):
        super().__init__(names)
        # var: index of the tested variable, both in an example and in names.
        self.var = var
        # left handles examples where the variable equals 0, right all others.
        self.left = left
        self.right = right

    def classify(self, x):
        """Delegate to the left child when x[var] == 0, else to the right child."""
        branch = self.left if x[self.var] == 0 else self.right
        return branch.classify(x)

    def dump(self, indent):
        """Print this subtree, prefixing each line with `indent` copies of '| '."""
        # A nested split starts on a fresh line, because the parent printed
        # its own label without a trailing newline.
        if indent > 0:
            print('')
        bars = '| ' * indent

        print(bars, end='')
        print('%s = 0 :' % self.names[self.var], end='')
        self.left.dump(indent + 1)

        print(bars, end='')
        print('%s = 1 :' % self.names[self.var], end='')
        self.right.dump(indent + 1)


Helper function computes entropy of Bernoulli distribution with parameter p

In [3]:
def entropy(p):
    """Return the entropy, in bits, of a Bernoulli distribution.

    Parameters
    ----------
    p : float
        Probability of the outcome 1.

    Returns
    -------
    float
        -p*log2(p) - (1-p)*log2(1-p). The result is 0.0 when p <= 0 or
        p >= 1: a certain outcome carries no information (0*log 0 is taken
        as 0), and values pushed slightly out of range by rounding are
        treated the same way.
    """
    if p <= 0 or p >= 1:
        return 0.0
    q = 1 - p
    return -(p * math.log2(p) + q * math.log2(q))


Compute information gain for a particular split, given the counts 

py_pxi : number of occurrences of y=1 with x_i=1 for all i=1 to n

pxi : number of occurrences of x_i=1

py : number of occurrences of y=1


In [4]:
def infogain(py_pxi, pxi, py, total):
    """Return the information gain of splitting on one binary variable x_i.

    Parameters
    ----------
    py_pxi : int
        Number of examples with y=1 and x_i=1.
    pxi : int
        Number of examples with x_i=1.
    py : int
        Number of examples with y=1.
    total : int
        Total number of examples being split.

    Returns
    -------
    float
        H(Y) - H(Y | X_i), using entropy() for each term. Returns 0 when
        total is not positive.
    """
    if total <= 0:
        return 0

    # Entropy of the class label before splitting.
    gain = entropy(py / total)

    # Subtract the weighted entropy of each side of the split. An empty
    # side contributes nothing, and skipping it avoids dividing by zero.
    if pxi > 0:
        gain -= (pxi / total) * entropy(py_pxi / pxi)
    rest = total - pxi
    if rest > 0:
        gain -= (rest / total) * entropy((py - py_pxi) / rest)
    return gain


OTHER SUGGESTED HELPER FUNCTIONS:

-collect counts for each variable value with each class label

-find the best variable to split on, according to mutual information

-partition data based on a given variable	
	


In [5]:
# Load data from a file
def read_data(filename):
    """Load a comma-separated file of integers that starts with a header row.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    (data, varnames) : tuple
        data is a list with one list of ints per data row; varnames is the
        list of column names taken from the first line of the file.

    Raises
    ------
    ValueError
        If a field in a data row cannot be parsed as an integer.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename, 'r') as f:
        varnames = f.readline().strip().split(',')
        data = []
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing on int('').
            if not line:
                continue
            data.append([int(field) for field in line.split(',')])
    return (data, varnames)


Build tree in a top-down manner, selecting splits until we hit a pure leaf or all splits look bad.


In [6]:
def build_tree(data, varnames):
    """Return the root node of a decision tree.

    NOTE: this is still a placeholder. It ignores `data` and returns a fixed
    example tree that splits on variable 0 and then on variable 2, printing
    the tree as a side effect.

    Parameters
    ----------
    data : list
        Training examples (currently unused).
    varnames : list
        Variable names, passed to every node for printing.
    """
    # >>>> YOUR CODE GOES HERE <<<<
    # For now, always return the fixed example tree built below.

    ## remove this part, it's here only as an example
    # The leaf predicting 0 is shared by both splits.
    zero_leaf = Leaf(varnames, 0)
    one_leaf = Leaf(varnames, 1)
    inner = Split(varnames, 2, zero_leaf, one_leaf)
    root = Split(varnames, 0, inner, zero_leaf)
    root.dump(0)
    ####

    return root


Here we load data.
Each example is a list of attribute values, where the last element in the list is the class value.


In [7]:
# Each dataset is a list of file names, read below in the order
# [training, validation, test].
# Note that agaricus lists the same file for validation and test.
agaricus = [
    "agaricuslepiotatrain1.csv",
    "agaricuslepiotatest1.csv",
    "agaricuslepiotatest1.csv",
]

dataset1 = [
    "data_sets1/training_set.csv",
    "data_sets1/validation_set.csv",
    "data_sets1/test_set.csv",
]

dataset2 = [
    "data_sets2/training_set.csv",
    "data_sets2/validation_set.csv",
    "data_sets2/test_set.csv",
]

# pick the dataset you want to use this time
dataset = agaricus

# Each call returns the parsed rows and the column names from the header.
train, varnames = read_data(dataset[0])
validation, validationvarnames = read_data(dataset[1])
test, testvarnames = read_data(dataset[2])

In [8]:
# Preview the first rows of the training data as a DataFrame.
pd.DataFrame(train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,113,114,115,116,117,118,119,120,121,122
0,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [9]:
# Show the column names read from the header row of the training file.
varnames

['cap-shape-bell',
 'cap-shape-conical',
 'cap-shape-convex',
 'cap-shape-flat',
 'cap-shape-knobbed',
 'cap-shape-sunken',
 'cap-surface-fibrous',
 'cap-surface-grooves',
 'cap-surface-scaly',
 'cap-surface-smooth',
 'cap-color-brown',
 'cap-color-buff',
 'cap-color-cinnamon',
 'cap-color-gray',
 'cap-color-green',
 'cap-color-pink',
 'cap-color-purple',
 'cap-color-red',
 'cap-color-white',
 'cap-color-yellow',
 'bruises?-bruises',
 'odor-almond',
 'odor-anise',
 'odor-creosote',
 'odor-fishy',
 'odor-foul',
 'odor-musty',
 'odor-none',
 'odor-pungent',
 'odor-spicy',
 'gill-attachment-attached',
 'gill-attachment-descending',
 'gill-attachment-free',
 'gill-attachment-notched',
 'gill-spacing-close',
 'gill-spacing-crowded',
 'gill-spacing-distant',
 'gill-size-broad',
 'gill-color-black',
 'gill-color-brown',
 'gill-color-buff',
 'gill-color-chocolate',
 'gill-color-gray',
 'gill-color-green',
 'gill-color-orange',
 'gill-color-pink',
 'gill-color-purple',
 'gill-color-red',
 'gill

Build the decision tree

In [10]:
# `root` is read as a global by accuracy() further down.
root = build_tree(train, varnames)

cap-shape-bell = 0 :
| cap-shape-convex = 0 : 0
| cap-shape-convex = 1 : 1
cap-shape-bell = 1 : 0


In [11]:
# Print the tree again, starting at nesting depth 0.
root.dump(0)



cap-shape-bell = 0 :
| cap-shape-convex = 0 : 0
| cap-shape-convex = 1 : 1
cap-shape-bell = 1 : 0


Calculating the accuracy

In [12]:
def accuracy(data, tree=None):
    """Return the fraction of examples in `data` that the tree classifies correctly.

    Parameters
    ----------
    data : list
        Examples; each is a list of attribute values whose last element is
        the class label. The label position is taken from the first example.
    tree : optional
        Any object with a classify(x) method. Defaults to the notebook-level
        `root`, which keeps existing accuracy(data) calls working.

    Returns
    -------
    float
        Number of correct predictions divided by len(data), or 0.0 when
        `data` is empty.
    """
    if tree is None:
        # Fall back to the global tree built earlier in the notebook.
        tree = root
    if len(data) == 0:
        # Nothing to score; also avoids indexing data[0] and dividing by zero.
        return 0.0

    # The position of the class label is the last element in the list.
    yi = len(data[0]) - 1
    # Classification is done recursively by the node class.
    correct = sum(1 for x in data if tree.classify(x) == x[yi])
    return correct / len(data)
   

In [13]:
 # Fraction of training examples that the tree in `root` classifies correctly.
 print("Train Accuracy: {}".format(accuracy(train)))

Train Accuracy: 0.532


In [14]:
 # Fraction of test examples that the tree in `root` classifies correctly.
 print("Test Accuracy: {}".format(accuracy(test)))

Test Accuracy: 0.36517647058823527
