# Decision Tree

In [23]:
from __future__ import division
from functools import reduce,partial
from collections import *
import math

In [12]:
def entropy(class_probabilities):
    """Return the base-2 entropy of a collection of class probabilities.

    Falsy entries (e.g. a probability of 0) are skipped, so they contribute
    nothing to the total and never reach ``math.log``.
    """
    nonzero = (p for p in class_probabilities if p)
    return sum(-p * math.log(p, 2) for p in nonzero)

In [13]:
def class_probabilities(labels):
    """Return the fraction of ``labels`` taken up by each distinct label.

    The fractions are listed in the order in which ``Counter`` yields the
    distinct labels. An empty ``labels`` gives an empty list.
    """
    total_count = len(labels)
    tallies = Counter(labels)
    return [tallies[label] / total_count for label in tallies]
def data_entropy(labellled_data):
    """Return the entropy of the labels in ``labellled_data``.

    Each item must unpack into exactly two values; the second one is used
    as the label and the first one is ignored.
    """
    labels = []
    for _, label in labellled_data:
        labels.append(label)
    return entropy(class_probabilities(labels))

def partition_entropy(subsets):
    """Return the size-weighted sum of the entropies of ``subsets``.

    Every subset contributes its ``data_entropy`` multiplied by its share of
    the total number of items. ``subsets`` is traversed twice: once to count
    the items and once to accumulate the weighted entropies.
    """
    total_count = sum(map(len, subsets))

    def weighted_entropy(subset):
        # Multiply before dividing, matching the evaluation order callers
        # have always seen in the floating-point result.
        return data_entropy(subset) * len(subset) / total_count

    return sum(map(weighted_entropy, subsets))

In [35]:
# Labelled examples: each entry is an (attributes, label) pair, where
# attributes is a dict keyed by "level", "lang", "tweets" and "phd",
# and label is a boolean.
inputs = [
    ({"level": "Senior", "lang": "Java", "tweets": "no", "phd": "no"}, False),
    ({"level": "Senior", "lang": "Java", "tweets": "no", "phd": "yes"}, False),
    ({"level": "Mid", "lang": "Python", "tweets": "no", "phd": "no"}, True),
    ({"level": "Junior", "lang": "Python", "tweets": "no", "phd": "no"}, True),
    ({"level": "Junior", "lang": "R", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Junior", "lang": "R", "tweets": "yes", "phd": "yes"}, False),
    ({"level": "Mid", "lang": "R", "tweets": "yes", "phd": "yes"}, True),
    ({"level": "Senior", "lang": "Python", "tweets": "no", "phd": "no"}, False),
    ({"level": "Senior", "lang": "R", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Junior", "lang": "Python", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Senior", "lang": "Python", "tweets": "yes", "phd": "yes"}, True),
    ({"level": "Mid", "lang": "Python", "tweets": "no", "phd": "yes"}, True),
    ({"level": "Mid", "lang": "Java", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Junior", "lang": "Python", "tweets": "no", "phd": "yes"}, False),
]

In [42]:
# print(inputs)
def partition_by(inputs,attribute):
    """Group ``inputs`` by the value each item holds for ``attribute``.

    The value is read from ``item[0][attribute]``. Returns a
    ``defaultdict(list)`` mapping each value to the items that carry it,
    with items kept in their original order.
    """
    buckets = defaultdict(list)
    for row in inputs:
        buckets[row[0][attribute]].append(row)
    return buckets

def partition_entropy_by(inputs,attribute):
    """Return the entropy of the partition of ``inputs`` made by ``attribute``.

    The groups come from ``partition_by`` and are scored with
    ``partition_entropy``.
    """
    return partition_entropy(partition_by(inputs, attribute).values())

# partition_entropy_by(inputs,'level')

In [44]:
# Print the partition entropy obtained when splitting on each attribute.
for key in ('level', 'lang', 'tweets', 'phd'):
    print(key, "=>", partition_entropy_by(inputs, key))

level => 0.6935361388961919
lang => 0.8601317128547441
tweets => 0.7884504573082896
phd => 0.8921589282623617


### Let's generalize
We will try to build a tree like this one:

```
('level',{
    'Junior': ('phd', {'no': True, 'yes': False}),
    'Mid': True,
    'Senior': ('tweets', {'no': False, 'yes': True})
})
```

In [45]:
def classify(tree,input):
    """Walk ``tree`` using the attribute values in ``input``; return the leaf.

    A leaf is anything equal to ``True`` or ``False``. Any other node is
    unpacked as an ``(attribute, branches)`` pair. When ``input`` has no
    value for the attribute, or the value has no branch, the branch stored
    under the ``None`` key is followed instead.
    """
    node = tree
    while node not in (True, False):
        attribute, branches = node
        value = input.get(attribute)
        # Unseen or missing values fall back to the None branch.
        node = branches[value if value in branches else None]
    return node
    

In [60]:
# FORMATION of the TREE
def build_tree_id3(inputs,split_candidates = None):
    """Recursively build a decision tree from labelled ``inputs``.

    ``inputs`` is a sequence of ``(attributes, label)`` pairs. When
    ``split_candidates`` is None, the keys of the first item's attributes
    are used as candidates.

    Returns a boolean leaf when every label agrees or no candidates remain.
    With no candidates left the leaf is the majority label, and ties go to
    True. Otherwise returns an ``(attribute, subtrees)`` tuple, where
    ``subtrees`` maps each observed attribute value to a subtree and maps
    ``None`` to the majority label of ``inputs``.
    """
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    num_trues = sum(1 for _, label in inputs if label)
    num_false = len(inputs) - num_trues

    # Pure nodes become leaves right away.
    if not num_trues:
        return False
    if not num_false:
        return True

    majority = num_trues >= num_false
    if not split_candidates:
        return majority

    # Split on the candidate whose partition has the lowest entropy.
    best_attribute = min(
        split_candidates,
        key=lambda candidate: partition_entropy_by(inputs, candidate),
    )
    remaining = [c for c in split_candidates if c != best_attribute]

    sub_trees = {}
    for attribute_value, subset in partition_by(inputs, best_attribute).items():
        sub_trees[attribute_value] = build_tree_id3(subset, remaining)
    # Branch followed for attribute values with no subtree of their own.
    sub_trees[None] = majority
    return (best_attribute, sub_trees)

In [62]:
# Build the tree from the full data set, then classify one new example.
tree = build_tree_id3(inputs)
classify(
    tree,
    {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "no"},
)

True