# CS429 - Lab 6: Decision tree

In [1]:
from pyspark import SparkConf, SparkContext
import operator
import math

# Run Spark locally on all available cores under the application name "Lab 6".
conf = SparkConf().setMaster("local[*]").setAppName("Lab 6")
# getOrCreate reuses an already-running SparkContext, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# Evaluate the SparkContext as the cell's last expression so the notebook shows it
sc

In [3]:
# Position of every attribute inside a training example; the class label is
# stored after the four descriptive attributes.
attribute_indx = {
    name: position
    for position, name in enumerate(
        ['Outlook', 'Temperature', 'Humidity', 'Wind', 'class']
    )
}

# Values that each descriptive attribute may take (the class label is not listed)
attribute_values = dict([
    ('Outlook', ['Sunny', 'Overcast', 'Rain']),
    ('Temperature', ['Hot', 'Mild', 'Cool']),
    ('Humidity', ['High', 'Normal']),
    ('Wind', ['Weak', 'Strong']),
])

In [36]:
def class_counts(examples):
    """Compute the relative frequency of every class label among the examples.
        @params
            examples (PythonRDD): training examples; the class label of each example
                is read from position attribute_indx['class']
        @return
            class_cnts (list(tuple)): one (class_label, fraction_of_examples) pair per
                class that occurs in examples; an empty list when there are no examples
    """
    class_idx = attribute_indx['class']
    count_by_class = examples.map(lambda x: (x[class_idx], 1)).reduceByKey(operator.add).collect()
    # The per-class counts already add up to the number of examples, so a
    # separate examples.count() Spark job is not needed.
    count_all = sum(cnt for _, cnt in count_by_class)
    # float() keeps the result a true fraction even where `/` on ints truncates.
    class_cnts = [(label, float(cnt) / count_all) for label, cnt in count_by_class]
    return class_cnts
   
class Leaf:
    """A leaf node in decision tree. Leaf nodes contain the final prediction.
        @attributes
            self.node_name (str): unique name for each node ('leaf' + running id)
            self.label (str): label for printing, e.g. '67% Yes, 33% No', or 'N/A'
                when the leaf was built from no examples
            self.leading_branch (str): value of parent node that leads to this node
            self.predictions (list(tuple)): (class_label, fraction) pairs as returned
                by class_counts; empty when the leaf was built from no examples
    """
    # Running counter shared by all leaves; used to give every leaf a unique name.
    nodeid = 0

    def __init__(self, examples, leading_branch):
        """Create a leaf summarising the class distribution of examples.
            @params
                examples (PythonRDD): training examples that reached this leaf
                leading_branch (str): value of parent node that leads to this node
        """
        # class_counts yields an empty list exactly when there are no examples,
        # so it doubles as the emptiness check and predictions is always set.
        self.predictions = class_counts(examples)
        self.node_name = 'leaf' + str(Leaf.nodeid)
        Leaf.nodeid += 1
        self.leading_branch = leading_branch
        if self.predictions:
            # Human-readable class distribution, e.g. '67% Yes, 33% No'
            self.label = ', '.join(['{0:.0f}% {1}'.format(p*100, c) for c, p in self.predictions])
        else:
            self.label = 'N/A'

    def isLeaf(self):
        """Return True: this node is a leaf."""
        return True

    def predict(self, x):
        """Return this leaf's label; the example x is not inspected."""
        return self.label
    

class DecisionNode:
    """A decision node in decision tree.
        @attributes
            self.node_name (str) : unique name for each node ('node' + running id)
            self.label (str): name of the attribute this node splits on; also used as label for printing
            self.leading_branch (str): value of parent node that leads to this node
            self.children (dict()): children nodes, each is a dict entry of {'attribute_value':node}, where node is either DecisionNode or Leaf
    """
    # Running counter shared by all decision nodes; used to give each a unique name.
    nodeid = 0

    def __init__(self, label, leading_branch, children=None):
        """Create a decision node.
            @params
                label (str): name of the attribute this node splits on
                leading_branch (str): value of parent node that leads to this node
                children (dict()): {'attribute_value': node}; a new empty dict is
                    created when omitted
        """
        self.node_name = 'node' + str(DecisionNode.nodeid)
        DecisionNode.nodeid += 1
        self.label = label
        self.leading_branch = leading_branch
        # A `dict()` default would be created once and shared by every node built
        # without explicit children, so each node gets its own dict here.
        self.children = children if children is not None else {}

    def isLeaf(self):
        """Return False: this node is not a leaf."""
        return False

    def predict(self, x):
        """Predict for example x by following the branch that matches its value
        of this node's split attribute.
            @params
                x (list): attribute values positioned according to attribute_indx
            @return
                whatever the selected child's predict returns
            @raises
                KeyError: if x's value for the split attribute has no child branch
        """
        split_attr_value = x[attribute_indx[self.label]]
        return self.children[split_attr_value].predict(x)


In [38]:
def _entropy(counts):
    """Return the entropy (in bits) of a distribution given as raw counts."""
    counts = list(counts)
    total = float(sum(counts))
    if total == 0:
        return 0.0
    # Zero counts are skipped: they contribute nothing and log(0) is undefined.
    return -sum((c / total) * math.log(c / total, 2) for c in counts if c)


def findBestGainAttr(examples, attributes):
    """Find the best attribute to split using Information Gain.

    Gain(S, A) = Entropy(S) - sum over values v of A of |S_v|/|S| * Entropy(S_v),
    where S_v are the examples whose attribute A equals v.
        @params
            examples (PythonRDD): training examples
            attributes: (dict()): a dict of remaining attributes and their values, where each entry is (attribute_name, list(attribute_values))
        @return:
            best_attribute (str): name of best attribute to split; ties go to the
                attribute that comes first in attributes; None when attributes is empty
    """
    if not attributes:
        return None

    class_idx = attribute_indx['class']
    positions = [(name, attribute_indx[name]) for name in attributes]

    # One Spark job for all candidates: count every (attribute, value, class) combination.
    triple_counts = examples.flatMap(
        lambda x: [(name, x[pos], x[class_idx]) for name, pos in positions]
    ).countByValue()

    # Regroup on the driver as attribute -> value -> class -> count.
    per_attribute = dict((name, {}) for name in attributes)
    for (name, value, label), cnt in triple_counts.items():
        per_attribute[name].setdefault(value, {})[label] = cnt

    best_attribute, best_gain = None, None
    for name in attributes:
        by_value = per_attribute[name]

        # Class totals over the whole example set, rebuilt from this attribute's counts.
        class_totals = {}
        for label_counts in by_value.values():
            for label, cnt in label_counts.items():
                class_totals[label] = class_totals.get(label, 0) + cnt
        n_examples = float(sum(class_totals.values()))

        # Expected entropy after the split, each subset weighted by its share of examples.
        remainder = 0.0
        if n_examples:
            remainder = sum(
                (sum(label_counts.values()) / n_examples) * _entropy(label_counts.values())
                for label_counts in by_value.values()
            )

        gain = _entropy(class_totals.values()) - remainder
        # Strict '>' keeps the earliest attribute on ties.
        if best_gain is None or gain > best_gain:
            best_attribute, best_gain = name, gain

    return best_attribute
    
def build_tree(examples, leading_branch, attributes):
    """Recursively build decision tree, one node at a time.
        @params
            examples (PythonRDD): training examples
            leading_branch (str): value of parent node that leads to this node
            attributes: (dict()): a dict of remaining attributes and their values, where each entry is (attribute_name, list(attribute_values))
        @return
            a tree root: a DecisionNode, or a Leaf when no further split is possible
    """
    # base cases: no attribute left to split on, or the examples are empty or
    # all belong to one class (class_counts then yields at most one entry)
    if not attributes or len(class_counts(examples)) <= 1:
        return Leaf(examples, leading_branch)

    # select best attribute for splitting
    best_attribute = findBestGainAttr(examples, attributes)
    best_idx = attribute_indx[best_attribute]

    # an attribute is used at most once along any root-to-leaf path
    remaining = dict((name, values) for name, values in attributes.items()
                     if name != best_attribute)

    # build tree recursively: one child per value in the attribute's domain, so
    # values with no matching examples still get a child
    children = dict()
    for value in attributes[best_attribute]:
        # value=value binds the current value to the lambda instead of the loop variable
        subset = examples.filter(lambda x, value=value: x[best_idx] == value)
        children[value] = build_tree(subset, value, remaining)

    return DecisionNode(best_attribute, leading_branch, children)
    

In [42]:
# Build decision tree; the root has no parent, hence the empty leading branch ''.
# NOTE(review): `data` is not defined in any cell shown here; build_tree's docstring
# expects a PythonRDD of training examples — confirm it is created in an earlier cell.
tree = build_tree(data, '', attribute_values)

### Decision Tree Visualization
To visualize the decision tree using `graphviz`, you need to install the `xdg-utils` package on your virtual machine:
```bash
sudo apt-get install --reinstall xdg-utils
```


In [39]:
from graphviz import Digraph

In [40]:
def build_tree_graph(root_node, tree_):
    """Add the decision tree rooted at root_node to a graphviz graph.

    Decision nodes are drawn as diamonds and leaves as ovals; each edge is
    labelled with the child's leading_branch.
        @params
            root_node (DecisionNode or Leaf): root of the (sub)tree to add
            tree_ (graphviz.Digraph): the graph to fill; modified in place
    """
    if root_node.isLeaf():
        tree_.node(root_node.node_name, label=root_node.label, shape='oval')
        return

    tree_.node(root_node.node_name, label=root_node.label, shape='diamond')
    for child in root_node.children.values():
        tree_.edge(root_node.node_name, child.node_name, label=child.leading_branch)
        # Recurse into this function; the original called the undefined print_tree.
        build_tree_graph(child, tree_)

In [None]:
# Render the decision tree: fill a fresh graphviz Digraph from `tree`, then leave
# the graph as the cell's last expression so the notebook displays it
tree_view = Digraph()
build_tree_graph(tree,tree_view)
tree_view

In [45]:
# Predicting
# x holds attribute values in attribute_indx order (Outlook, Temperature, Humidity, Wind);
# no class label is included because predict only looks up the split attributes.
x = ['Sunny', 'Cool', 'Normal', 'Weak']
tree.predict(x)

'67% Yes, 33% No'