In [4]:
import numpy as np
import pandas as pd

In [3]:
headers = ["color", "diameter", "label"]

In [5]:
classes = ["apple", "grape", "lemon"]

In [83]:
# Toy training set: one row per fruit, columns are color and diameter.
# The trailing comment on each row names the fruit it describes.
# dtype=object stops NumPy from coercing the mixed str/int rows to a single
# string dtype, so each diameter stays a Python int.
X = np.array([
    ['green', 3], # apple
    ['yellow', 3], # apple
    ['red', 1], # grape
    ['red', 1], # grape
    ['yellow', 3], # lemon
], dtype=object)

In [61]:
Y = np.array([0, 0, 1, 1, 2])

In [62]:
print(X.shape)
print(Y.shape)

(5, 2)
(5,)


In [63]:
# how to use np unique
print(np.unique(X[:, 1]))

[1 3]


In [64]:
# how to count how many of each class
print(pd.Series(Y).value_counts())
print()
print(pd.Series(Y).value_counts()[1]) # this is like a dict key
print()
counts = pd.Series(Y).value_counts()
for index, value in counts.items():
    print(f"Class : {index}, Count : {value}")

0    2
1    2
2    1
dtype: int64

2

Class : 0, Count : 2
Class : 1, Count : 2
Class : 2, Count : 1


In [65]:
def class_count(Y):
    """Tally how many times each label occurs in ``Y``.

    Parameters
    ----------
    Y : iterable of hashable labels

    Returns
    -------
    dict
        Maps every distinct label to its number of occurrences; keys appear
        in order of first occurrence.
    """
    tally = {}
    for label in Y:
        tally[label] = tally.get(label, 0) + 1
    return tally

In [31]:
def is_numeric(value):
    """Return True when ``value`` is an integer or floating-point number.

    Besides the built-in ``int`` and ``float``, NumPy scalar types such as
    ``np.int64`` or ``np.float32`` are recognised as well. Indexing a numeric
    NumPy array yields such scalars, and NumPy integers are not instances of
    the built-in ``int``; without this check they would be reported as
    non-numeric and compared with ``==`` instead of ``>=``.

    Parameters
    ----------
    value : any object

    Returns
    -------
    bool
    """
    return isinstance(value, (int, float, np.integer, np.floating))

In [32]:
print(is_numeric(6))
print(is_numeric(6.34))
print(is_numeric("green"))

True
True
False


In [33]:
class Question:
    """A yes/no test on a single feature column, used to split a dataset.

    The constructor stores its three arguments unchanged: ``col`` is the
    index of the feature to test, ``val`` is the value to compare against,
    and ``labels`` is the sequence of column names, used only when the
    question is rendered as text.
    """

    def __init__(self, col, val, labels):
        self.col, self.val, self.labels = col, val, labels

    def match(self, example):
        """Return whether ``example`` answers this question with "yes".

        The comparison is picked from the example's own feature value:
        numeric values are tested with ``>=``, anything else with ``==``.
        """
        feature = example[self.col]
        return feature >= self.val if is_numeric(feature) else feature == self.val

    def __str__(self):
        """Render the question as text, e.g. ``Is diameter >= 3?``."""
        if is_numeric(self.val):
            operator = ">="
        else:
            operator = "=="
        return f"Is {self.labels[self.col]} {operator} {self.val}?"

In [91]:
print(str(Question(1, 3, headers)))
print(str(Question(0, "green", headers)))

Is diameter >= 3?
Is color == green?


In [40]:
# Let's pick an example from the training set...
example = X[0]
print(example)

# ... and see if it matches the question
q = Question(0, "green", headers)
q.match(example) # this will be true, since the first example is Green.

['green' 3]


True

In [88]:
def partition(X, Y, question):
    """Split a dataset into the rows that match ``question`` and the rest.

    Parameters
    ----------
    X : array-like
        One row per example.
    Y : array-like
        Label for each row of ``X`` (same length as ``X``).
    question : object
        Anything exposing ``match(example)`` that returns a truthy value for
        rows belonging to the "true" side.

    Returns
    -------
    tuple of np.ndarray
        ``(X_t, Y_t, X_f, Y_f)``: matching rows and labels, then the
        non-matching rows and labels, each in their original order. A side
        that receives no rows keeps the column dimension and dtype of the
        input (e.g. shape ``(0, n_features)``) instead of collapsing to a
        1-D float array, so 2-D operations on the result remain valid.
    """
    # Ask the question on the caller's own rows so the values keep their
    # original Python types even when X is a plain list.
    mask = np.array(
        [bool(question.match(X[i])) for i in range(len(X))],
        dtype=bool,
    )
    X_arr = np.asarray(X)
    Y_arr = np.asarray(Y)
    # Boolean indexing returns new arrays and preserves row order.
    return X_arr[mask], Y_arr[mask], X_arr[~mask], Y_arr[~mask]

In [105]:
X_t, Y_t, X_f, Y_f = partition(X, Y, Question(0, 'red', headers))
print(X_t)
print([classes[i] for i in Y_t])
print()
print(X_f)
print([classes[i] for i in Y_f])

[['red' 1]
 ['red' 1]]
['grape', 'grape']

[['green' 3]
 ['yellow' 3]
 ['yellow' 3]]
['apple', 'apple', 'lemon']


In [95]:
def gini(Y):
    """Gini impurity of the label array ``Y``.

    Starts from 1 and subtracts, class by class, the squared share of rows
    carrying that label (as tallied by ``class_count``). A set holding a
    single class therefore yields 0. ``Y`` must expose ``.shape``.
    """
    per_class = class_count(Y)
    total = Y.shape[0]
    remaining = 1
    for n_in_class in per_class.values():
        share = n_in_class / total
        remaining -= share ** 2
    return remaining

In [96]:
# First, we'll look at a dataset with no mixing.

# classes = ["apple", "grape", "lemon"]

no_mixing = np.array([0,0])
# this will return 0
gini(no_mixing)

0.0

In [97]:
# Now, we'll look at a dataset with a 50:50 apple:grape ratio
some_mixing = np.array([0,1])
# this will return 0.5 - meaning, there's a 50% chance of misclassifying
# a random example we draw from the dataset.
gini(some_mixing)

0.5

In [98]:
# Now, we'll look at a dataset with many different labels
lots_of_mixing = np.array([0,1,2,3,4])
# This will return 0.8 (up to floating-point rounding)
gini(lots_of_mixing)

0.7999999999999998

In [115]:
def info_gain(X_t, Y_t, X_f, Y_f, curr_uncertainty):
    """Impurity reduction achieved by a split.

    Returns ``curr_uncertainty`` minus the Gini impurity of the two children,
    each weighted by its share of the rows. ``X_t`` and ``X_f`` are used only
    for their row counts; ``Y_t`` and ``Y_f`` supply the labels.
    """
    n_true = X_t.shape[0]
    n_false = X_f.shape[0]
    # Share of rows that went to the true branch.
    p = n_true / (n_true + n_false)
    return curr_uncertainty - p * gini(Y_t) - (1 - p) * gini(Y_f)

In [116]:
# Calculate the uncertainty of our training data.
curr_uncertainty = gini(Y)
curr_uncertainty

0.6399999999999999

In [118]:
# How much information do we gain by partitioning on 'green'?
X_t, Y_t, X_f, Y_f = partition(X, Y, Question(0, 'green', headers))
info_gain(X_t, Y_t, X_f, Y_f, curr_uncertainty)

0.1399999999999999

In [119]:
# What if we partitioned on 'red' instead?
X_t, Y_t, X_f, Y_f = partition(X, Y, Question(0, 'red', headers))
info_gain(X_t, Y_t, X_f, Y_f, curr_uncertainty)

0.37333333333333324

In [126]:
# partition on red

# only grape in true rows and 2 types in false row
X_t, Y_t, X_f, Y_f = partition(X, Y, Question(0, 'red', headers))
Y_t_labels = np.array([classes[i] for i in Y_t])
print(np.concatenate((X_t, Y_t_labels[:, np.newaxis]), axis=1))
print()
Y_f_labels = np.array([classes[i] for i in Y_f])
print(np.concatenate((X_f, Y_f_labels[:, np.newaxis]), axis=1))

[['red' 1 'grape']
 ['red' 1 'grape']]

[['green' 3 'apple']
 ['yellow' 3 'apple']
 ['yellow' 3 'lemon']]


In [127]:
# partition on green

# true singles out to only apple
# but the false side is NOT GOOD: it still mixes apple, grape and lemon
X_t, Y_t, X_f, Y_f = partition(X, Y, Question(0, 'green', headers))
Y_t_labels = np.array([classes[i] for i in Y_t])
print(np.concatenate((X_t, Y_t_labels[:, np.newaxis]), axis=1))
print()
Y_f_labels = np.array([classes[i] for i in Y_f])
print(np.concatenate((X_f, Y_f_labels[:, np.newaxis]), axis=1))

[['green' 3 'apple']]

[['yellow' 3 'apple']
 ['red' 1 'grape']
 ['red' 1 'grape']
 ['yellow' 3 'lemon']]


In [137]:
def opt_partition(X, Y):
    """Find the question with the highest information gain for ``(X, Y)``.

    Every distinct value of every feature column is tried as a question.
    Questions that do not actually split the data (one side receives no
    rows) are ignored.

    Parameters
    ----------
    X : np.ndarray
        2-D feature array, one row per example.
    Y : np.ndarray
        Label for each row of ``X``.

    Returns
    -------
    tuple
        ``(gain, question)`` for the best split found. ``question`` is
        ``None`` (with a gain of 0) when no question separates the rows.

    Notes
    -----
    Column names for the generated questions come from the module-level
    ``headers`` list.
    """
    opt_info_gain = 0
    opt_question = None
    curr_uncertainty = gini(Y)
    M = X.shape[1]

    for m in range(M):
        for val in np.unique(X[:, m]):
            q = Question(m, val, headers)

            # split dataset
            X_t, Y_t, X_f, Y_f = partition(X, Y, q)

            # Skip questions that leave either branch empty: they do not
            # divide the data, so there is nothing to gain from asking them.
            if X_t.shape[0] == 0 or X_f.shape[0] == 0:
                continue

            ig = info_gain(X_t, Y_t, X_f, Y_f, curr_uncertainty)

            # '>=' rather than '>' means that among equally good questions
            # the one evaluated last wins.
            if ig >= opt_info_gain:
                opt_info_gain, opt_question = ig, q

    return opt_info_gain, opt_question

In [138]:
# Find the best question to ask first for our toy dataset.
opt_info_gain, opt_question = opt_partition(X, Y)
print(opt_question)
# FYI: 'Is color == red?' yields the same gain. opt_partition compares with
# '>=', so among tied questions the one evaluated last is kept.

Is diameter >= 3?


In [145]:
class LeafNode:
    """Terminal tree node holding the label counts of the rows that reach it."""

    def __init__(self, Y):
        # Dict mapping each label in Y to how many times it occurs.
        label_tally = class_count(Y)
        self.predictions = label_tally

In [150]:
class DecisionNode:
    """Internal tree node: a question plus the subtree for each answer.

    ``true_child`` is followed when the question matches an example,
    ``false_child`` otherwise.
    """

    def __init__(self, question, true_child, false_child):
        self.question, self.true_child, self.false_child = (
            question,
            true_child,
            false_child,
        )

In [147]:
def make_tree(X, Y):
    """Recursively build a decision tree for features ``X`` and labels ``Y``.

    The best question is looked up with ``opt_partition``. When it brings no
    information gain the rows become a ``LeafNode``; otherwise the data is
    partitioned on that question and a ``DecisionNode`` is returned whose
    children are built the same way from each side.
    """
    gain, best_question = opt_partition(X, Y)

    if gain != 0:
        # The question separates the rows: grow one subtree per answer,
        # true branch first.
        X_t, Y_t, X_f, Y_f = partition(X, Y, best_question)
        true_subtree = make_tree(X_t, Y_t)
        false_subtree = make_tree(X_f, Y_f)
        return DecisionNode(best_question, true_subtree, false_subtree)

    # No question improves on the current node, so stop here.
    return LeafNode(Y)

In [148]:
def print_tree(node, spacing=""):
    """Print the tree rooted at ``node``, indenting each level by two spaces.

    Leaves are printed as ``Predict <counts>``; decision nodes print their
    question followed by the true branch and then the false branch.
    ``spacing`` is the indentation prefix for the current level.
    """
    if isinstance(node, LeafNode):
        # Leaf: show the stored label counts and stop descending.
        print(spacing + "Predict", node.predictions)
        return

    print(spacing + str(node.question))

    deeper = spacing + "  "
    branches = (
        ('--> True:', node.true_child),
        ('--> False:', node.false_child),
    )
    for heading, child in branches:
        print(spacing + heading)
        print_tree(child, deeper)

In [151]:
my_tree = make_tree(X, Y)

In [152]:
print_tree(my_tree)

Is diameter >= 3?
--> True:
  Is color == yellow?
  --> True:
    Predict {0: 1, 2: 1}
  --> False:
    Predict {0: 1}
--> False:
  Predict {1: 2}


In [153]:
def classify(X_n, node):
    """Route the example ``X_n`` down the tree and return the leaf's predictions.

    Starting at ``node``, each decision node's question is asked of the
    example: a match follows ``true_child``, otherwise ``false_child``. The
    walk ends at the first ``LeafNode``, whose ``predictions`` are returned.
    """
    current = node
    while not isinstance(current, LeafNode):
        if current.question.match(X_n):
            current = current.true_child
        else:
            current = current.false_child
    return current.predictions

In [155]:
print(X[0])
print(Y[0])
classify(X[0], my_tree)

['green' 3]
0


{0: 1}

In [None]:
X_test = np.array([
    ['Green', 3],
    ['Yellow', 4],
    ['Red', 2],
    ['Red', 1],
    ['Yellow'],
], dtype=object)
Y_test=np.array(