In [1]:
# Toy training set. Format: each row is one example.
# The first three columns are features, in the order given by `header`:
# Home Owner, Marital Status, Annual Income.
# The last column is the label (Default Borrower).
training_data = [
    ['Yes', 'Single', 125000, 'No'],
    ['No', 'Married', 100000, 'No'],
    ['No', 'Single', 70000, 'No'],
    ['Yes', 'Married', 120000, 'No'],
    ['No', 'Divorced', 95000, 'Yes'],
    ['No', 'Married', 60000, 'No'],
    ['Yes', 'Divorced', 220000, 'No'],
    ['No', 'Single', 85000, 'Yes'],
    ['No', 'Married', 75000, 'No'],
    ['No', 'Single', 90000, 'Yes']
]

In [2]:
# Column labels, in the same order as the columns of each data row.
# They are only used for display: Question.__repr__ looks up the column
# name here, which is what appears when the tree is printed.
header = ["Home Owner", "Marital Status", "Annual Income", "Default Borrower"]

In [3]:
"""Find the unique values for a column in a dataset."""
def unique_vals(rows, col):
    """Return the set of distinct values found in column `col` of `rows`."""
    return {record[col] for record in rows}

In [51]:
# Demo: distinct values in column 1 (Marital Status).
# Uncomment the next line to look at column 0 (Home Owner) as well.
#unique_vals(training_data, 0)
unique_vals(training_data, 1)

{'Divorced', 'Married', 'Single'}

In [5]:
"""Counts the number of each type of example in a dataset."""
def class_counts(rows):
    """Tally how many rows carry each label.

    The label of a row is read from its last column.

    Returns:
        A dict mapping label -> number of rows with that label. Keys are
        inserted in order of first appearance.
    """
    tally = {}
    for example in rows:
        label = example[-1]  # the label always sits in the final column
        tally[label] = tally.get(label, 0) + 1
    return tally

In [6]:
# Demo: tally the labels (last column) of the training set.
class_counts(training_data)

{'No': 7, 'Yes': 3}

In [7]:
"""Test if a value is numeric."""
def is_numeric(value):
    """Return True when `value` is an int or a float.

    bool is a subclass of int, so True/False also count as numeric.
    """
    return isinstance(value, (int, float))

In [8]:
# Demo: an int is numeric, so this displays True.
is_numeric(7)
# A string is not numeric; uncomment to try it.
# is_numeric("Single")

True

In [9]:
""" A Question is used to partition a dataset.

    This class just records a 'column number' (e.g., 1 for Marital Status) and a
    'column value' (e.g., Married). The 'match' method is used to compare
    the feature value in an example to the feature value stored in the
    question. See the demo below.
"""
class Question:
    """A yes/no test on a single column, used to split a dataset.

    Stores a column index and a reference value; `match` answers the
    question for one row and `__repr__` renders it using `header`.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, row):
        """Return whether `row` satisfies this question.

        A numeric feature value is tested with `>=` against the stored
        value; any other feature value is tested with `==`.
        """
        feature = row[self.column]
        # The comparison mode is chosen from the row's value, not the stored one.
        return (feature >= self.value) if is_numeric(feature) else (feature == self.value)

    def __repr__(self):
        """Render the question as text, e.g. 'Is <column name> >= <value>?'."""
        # Here the operator shown depends on the stored value's type.
        operator = ">=" if is_numeric(self.value) else "=="
        column_name = header[self.column]
        return "Is %s %s %s?" % (column_name, operator, str(self.value))

In [11]:
# categorical attribute: the question is an equality test (==)
Question(1, 'Married')

Is Marital Status == Married?

In [53]:
# numeric attribute: the question is a threshold test (>=)
q = Question(2, 120000)
q

Is Annual Income >= 120000?

In [14]:
# Let's pick an example from the training set...
example = training_data[0]
# ... and see if it matches the question
q.match(example) # True: the first example's Annual Income is 125000, which is >= 120000.

True

In [15]:
""" Partitions a dataset.
    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
"""
def partition(rows, question):
    """Split `rows` into those that match `question` and those that do not.

    Returns:
        (true_rows, false_rows): two lists that keep the original row order.
    """
    matched, unmatched = [], []
    for example in rows:
        # question.match is evaluated exactly once per row.
        bucket = matched if question.match(example) else unmatched
        bucket.append(example)
    return matched, unmatched

In [17]:
# Let's partition the training data on the Home Owner column (column 0).
# Question(0, 'No') matches rows whose Home Owner value is 'No', so true_rows
# holds those rows and false_rows holds the rest.
true_rows, false_rows = partition(training_data, Question(0, 'No'))
true_rows

[['No', 'Married', 100000, 'No'],
 ['No', 'Single', 70000, 'No'],
 ['No', 'Divorced', 95000, 'Yes'],
 ['No', 'Married', 60000, 'No'],
 ['No', 'Single', 85000, 'Yes'],
 ['No', 'Married', 75000, 'No'],
 ['No', 'Single', 90000, 'Yes']]

In [18]:
# This will contain everything else: the rows whose Home Owner value is not 'No'.
false_rows

[['Yes', 'Single', 125000, 'No'],
 ['Yes', 'Married', 120000, 'No'],
 ['Yes', 'Divorced', 220000, 'No']]

In [54]:
def gini(rows):
    """Calculate the Gini Impurity for a list of rows.

    The impurity is 1 minus the sum of the squared label probabilities,
    where labels are tallied by `class_counts` (last column of each row).

    Returns:
        0.0 when every row carries the same label, growing as labels mix.
        For an empty `rows` no probability is subtracted, so the initial
        value 1 is returned.
    """
    counts = class_counts(rows)
    total = float(len(rows))  # loop-invariant denominator
    impurity = 1
    # Subtract one label at a time so the floating-point result is the
    # same as a straightforward running subtraction.
    for lbl in counts:
        prob_of_lbl = counts[lbl] / total
        impurity -= prob_of_lbl**2
    return impurity

In [55]:
#######
# Demo:
# Let's look at some examples to understand how Gini Impurity works.
#
# First, we'll look at a dataset with no mixing: every row has the same label.
no_mixing = [['Apple'],
              ['Apple']]
# this will return 0
gini(no_mixing)

{'Apple': 2}


0.0

In [56]:
# Now, we'll look at a dataset with a 50:50 apples:oranges ratio
some_mixing = [['Apple'],
               ['Orange']]
# this will return 0.5 - meaning, there's a 50% chance of misclassifying
# a random example we draw from the dataset.
gini(some_mixing)

{'Apple': 1, 'Orange': 1}


0.5

In [57]:
# Now, we'll look at a dataset with many different labels
lots_of_mixing = [['Apple'],
                  ['Orange'],
                  ['Grape'],
                  ['Grapefruit'],
                  ['Blueberry']]
# This will return approximately 0.8 (floating-point rounding shows it
# as 0.7999999999999998).
gini(lots_of_mixing)
#######

{'Apple': 1, 'Orange': 1, 'Grape': 1, 'Grapefruit': 1, 'Blueberry': 1}


0.7999999999999998

In [23]:
def info_gain(left, right, current_uncertainty):
    """Information gain of splitting a node into `left` and `right`.

    Computed as the parent's uncertainty minus the Gini impurity of each
    child weighted by that child's share of the rows.
    """
    n_left = len(left)
    weight_left = float(n_left) / (n_left + len(right))
    impurity_left = gini(left)
    impurity_right = gini(right)
    # Subtract the children one after the other (left first) so the
    # floating-point result matches a single left-to-right expression.
    after_left = current_uncertainty - weight_left * impurity_left
    return after_left - (1 - weight_left) * impurity_right

In [58]:
# Calculate the uncertainty (Gini impurity) of our training data.
current_uncertainty = gini(training_data)
current_uncertainty

{'No': 7, 'Yes': 3}


0.42000000000000004

In [32]:
"""
    Find the best question to ask by iterating over every feature / value
    and calculating the information gain.
"""
def find_best_split(rows):
    """Find the best question to ask about `rows`.

    Every (feature column, observed value) pair is tried as a Question;
    the one whose partition yields the highest information gain wins.

    Returns:
        (best_gain, best_question). `best_question` is None when `rows`
        is empty or when no candidate divides `rows` into two non-empty
        parts; `best_gain` is 0 in that case.
    """
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep track of the feature / value that produced it

    # Nothing to split: bail out before indexing rows[0] below.
    if len(rows) == 0:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # every column except the trailing label

    for col in range(n_features):  # for each feature
        for val in unique_vals(rows, col):  # for each distinct value in the column
            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # '>=' (rather than '>') lets a later candidate replace an
            # earlier one with equal gain; this was chosen so the tree
            # looks a certain way for the toy dataset.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [33]:
# Find the best question to ask first for our toy dataset.
best_gain, best_question = find_best_split(training_data)
# End the cell on the expression so the chosen question is displayed.
best_question

Is Annual Income >= 100000?

In [34]:
""" A Leaf node classifies data.

    This holds a dictionary of class (e.g., "Apple") -> number of times
    it appears in the rows from the training data that reach this leaf.
"""
class Leaf:
    """Terminal node: stores the label tally of the rows that reach it."""

    def __init__(self, rows):
        # label -> count, as computed by class_counts
        label_tally = class_counts(rows)
        self.predictions = label_tally

In [35]:
""" A Decision Node asks a question.

    This holds a reference to the question, and to the two child nodes.
"""
class Decision_Node:
    """Internal node: a question plus the subtree for each answer."""

    def __init__(self, question, true_branch, false_branch):
        self.question = question
        # Subtrees followed when the question matches / does not match.
        self.true_branch, self.false_branch = true_branch, false_branch

In [36]:
def build_tree(rows):
    """Recursively grow a decision tree from `rows`.

    Returns a Leaf when the best available split has zero information
    gain; otherwise a Decision_Node holding the best question and the
    subtrees built from the matching and non-matching rows.
    """
    # Ask find_best_split for the question with the highest gain.
    best_gain, best_question = find_best_split(rows)

    # Base case: no question improves purity any further, so stop here.
    if best_gain == 0:
        return Leaf(rows)

    # A useful feature / value was found: split on it.
    matched, unmatched = partition(rows, best_question)

    # Arguments are evaluated left to right, so the true branch is built
    # before the false branch.
    return Decision_Node(
        best_question,
        build_tree(matched),
        build_tree(unmatched),
    )

In [37]:
def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by two spaces.

    `spacing` is the indentation prefix for the current level.
    """
    # A leaf just reports its label counts.
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return

    deeper = spacing + "  "

    # The question asked at this node.
    print(spacing + str(node.question))

    # Subtree followed when the question matches.
    print(spacing + '--> True:')
    print_tree(node.true_branch, deeper)

    # Subtree followed when it does not.
    print(spacing + '--> False:')
    print_tree(node.false_branch, deeper)

In [38]:
# Grow the decision tree from the full training set.
my_tree = build_tree(training_data)

In [39]:
# Show the learned questions and the label counts at each leaf.
print_tree(my_tree)

Is Annual Income >= 100000?
--> True:
  Predict {'No': 4}
--> False:
  Is Annual Income >= 85000?
  --> True:
    Predict {'Yes': 3}
  --> False:
    Predict {'No': 3}


In [40]:
def classify(row, node):
    """Walk the tree from `node` and return the predictions of the leaf `row` reaches.

    At every decision node the row is tested against the node's question
    and the true or false branch is followed accordingly.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    # Reached a leaf: hand back its label -> count dictionary.
    return current.predictions

In [41]:
# The tree predicts the 1st row of our training data is 'No' with
# confidence 1: the leaf it reaches contains only 'No' examples.
classify(training_data[0], my_tree)

{'No': 4}

In [60]:
def print_leaf(counts):
    """Convert a leaf's label counts into percentage strings.

    Despite the name, nothing is printed; the formatted dict is returned.

    Args:
        counts: dict mapping label -> count, as stored in a leaf.

    Returns:
        dict mapping each label to its share of the total as a string
        such as '67%'. Shares are rounded to the nearest whole percent,
        so they are not guaranteed to add up to exactly 100.
        An empty `counts` yields an empty dict.
    """
    total = float(sum(counts.values()))
    probs = {}
    for lbl, count in counts.items():
        # Multiply before dividing and round rather than truncate:
        # int(29 / 100.0 * 100) is 28 because of floating-point error.
        probs[lbl] = str(int(round(count * 100.0 / total))) + "%"
    return probs

In [61]:
# Printing that a bit nicer: show the leaf's counts as percentages.
print_leaf(classify(training_data[0], my_tree))

{'No': '100%'}

In [62]:
# Evaluate: a handful of examples in the same format as the training rows
# (three feature columns followed by the label).
testing_data = [
    ['No', 'Married', 55000, 'No'],
    ['Yes', 'Divorced', 80000, 'No'],
    ['Yes', 'Single', 110000, 'No'],
    ['No', 'Single', 95000, 'Yes'],
    ['No', "Married", 67000, 'No'],
]

In [63]:
# Compare each test example's true label with what the tree predicts.
for row in testing_data:
    actual = row[-1]
    predicted = print_leaf(classify(row, my_tree))
    print("Actual: %s. Predicted: %s" % (actual, predicted))

Actual: No. Predicted: {'No': '100%'}
Actual: No. Predicted: {'No': '100%'}
Actual: No. Predicted: {'No': '100%'}
Actual: Yes. Predicted: {'Yes': '100%'}
Actual: No. Predicted: {'No': '100%'}
