## Recap

In [8]:
import pandas
import numpy as np
import math

In [6]:
# Load the census income data; index_col=False keeps pandas from using the
# first column as the row index.
income = pandas.read_csv("income.csv", index_col=False)

# Columns that are re-encoded as integer category codes below.
cat_columns = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'high_income',
]

# Overwrite each listed column with the integer codes of its categories.
for cat_column in cat_columns:
    income[cat_column] = pandas.Categorical(income[cat_column]).codes

In [2]:
def calc_entropy(column):
    """
    Calculate the entropy (in bits) of a pandas series, list, or numpy array.

    Each distinct value in ``column`` is treated as one class. The result is
    ``-sum(p * log2(p))`` over the observed class probabilities.

    Parameters:
        column: one-dimensional collection of class values. Values may be any
            type that ``np.unique`` can sort (integers, including negative
            codes, or strings).

    Returns:
        The entropy as a number; 0 when ``column`` is empty.
    """
    total = len(column)
    if total == 0:
        # No observations means no uncertainty to measure.
        return 0

    # np.unique counts arbitrary values. np.bincount only accepts
    # non-negative integers and therefore fails on negative codes.
    _, counts = np.unique(np.asarray(column), return_counts=True)
    # Divide by the total column length to get a probability per class
    probabilities = counts / total

    # Every counted value occurs at least once, so each probability is > 0
    # and the logarithm is always defined.
    entropy = 0
    for prob in probabilities:
        entropy -= prob * math.log(prob, 2)

    return entropy

In [3]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate the information gain from splitting ``data`` at the median of
    the ``split_name`` column, measured against the ``target_name`` column.

    The gain is the entropy of the full target column minus the row-weighted
    entropies of the two subsets (values <= median, values > median).
    """
    baseline_entropy = calc_entropy(data[target_name])

    # Split point: the median of the candidate column
    split_values = data[split_name]
    threshold = split_values.median()

    # Both comparisons are evaluated explicitly (rather than negating one)
    # so that rows matching neither comparison stay out of both subsets.
    subsets = (data[split_values <= threshold], data[split_values > threshold])

    # Accumulate each subset's entropy weighted by its share of the rows
    total_rows = data.shape[0]
    weighted_entropy = 0
    for subset in subsets:
        weighted_entropy += (subset.shape[0] / total_rows) * calc_entropy(subset[target_name])

    return baseline_entropy - weighted_entropy

## Determining the Column to Split On

In [24]:
def find_best_column(data, target_name, columns):
    """
    Return the entry of ``columns`` whose median split gives the highest
    information gain.

    data is a dataframe
    target_name is the name of the target variable
    columns is a list of potential columns to split on

    When several columns tie for the highest gain, the earliest one wins.
    """
    gains = [calc_information_gain(data, name, target_name) for name in columns]
    # max() keeps the first of equally large items, so ties resolve to the
    # lowest position.
    best_position = max(range(len(gains)), key=gains.__getitem__)
    return columns[best_position]

In [25]:
# Candidate columns to consider when choosing a split
columns = [
    "age",
    "workclass",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

In [26]:
# Pick the candidate column with the highest information gain against 'high_income'
income_split = find_best_column(income, 'high_income', columns)

In [27]:
# Show which column was chosen for the split
income_split

'marital_status'

## Creating a Simple Recursive Algorithm

def id3(data, target, columns)
*    1 Create a node for the tree
*    2 If all values of the target attribute are 1, add 1 to counter_1
*    3 If all values of the target attribute are 0, add 1 to counter_0
*    4 Using information gain, find A, the column that splits the data best
*    5 Find the median value in column A
*    6 Split A into values below or equal to the median (0), and values above the median (1)
*    7 For each possible value (0 or 1), vi, of A,
*    8 Add a new tree branch below Root that corresponds to rows of data where A = vi
*    9 Let Examples(vi) be the subset of examples that have the value vi for A
*    10 Below this new branch, add the subtree id3(data[A==vi], target, columns)
*    11 Return Root

In [30]:
# We'll use lists to store our labels for nodes (when we find them).
# A function can append to a module-level list without any extra declaration.
# A module-level integer can be read the same way, but rebinding it
# (e.g. counter += 1) would need a `global` statement inside the function.
label_1s = []
label_0s = []

def id3(data, target, columns):
    """
    Recursively split ``data`` at column medians and count the pure nodes.

    Every subset whose ``target`` values are all 1 appends 1 to the
    module-level list ``label_1s``; every subset whose values are all 0
    appends 0 to ``label_0s``. Nothing is returned.

    Parameters:
        data: dataframe holding the target column and the candidate columns.
        target: name of the target column.
        columns: list of column names that may be used for splitting.

    Recursion stops at pure subsets, at empty subsets, and at subsets that
    the chosen median split cannot separate.
    """
    # An empty subset has nothing to count, and the information-gain
    # calculation would divide by its row count of zero.
    if data.shape[0] == 0:
        return

    # The pandas.unique method will return an array of all the unique values in a series
    unique_targets = pandas.unique(data[target])

    if len(unique_targets) == 1:
        # Pure node: record it under the matching label
        if unique_targets[0] == 1:
            label_1s.append(1)
        elif unique_targets[0] == 0:
            label_0s.append(0)

        # Returning here ends this branch of the recursion
        return

    # Find the best column to split on in our data
    best_column = find_best_column(data, target, columns)
    # Find the median of the column
    column_median = data[best_column].median()

    # Create the two splits
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    # If one side is empty the split did not separate the rows. Recursing
    # would revisit the same rows forever (or hit the empty subset), so the
    # branch ends here without counting this impure node.
    if left_split.shape[0] == 0 or right_split.shape[0] == 0:
        return

    # Process each branch recursively
    for split in (left_split, right_split):
        id3(split, target, columns)
    
# Small example data set: one row per person, with the column names supplied
# at construction time.
data = pandas.DataFrame(
    [
        [0, 20, 0],
        [0, 60, 2],
        [0, 40, 1],
        [1, 25, 1],
        [1, 35, 2],
        [1, 55, 1],
    ],
    columns=["high_income", "age", "marital_status"],
)

# Run id3 on the example data; it fills label_1s and label_0s as a side effect
id3(data, "high_income", ["age", "marital_status"])

In [31]:
# One entry per pure node labelled 1
label_1s

[1, 1, 1]

In [32]:
# One entry per pure node labelled 0
label_0s

[0, 0, 0]

## Storing the Tree

In [33]:
# Create a dictionary to hold the tree.
# It is created outside the function and passed in, so the filled-in tree
# is still available after the call returns.
tree = {}

# This list lets us number the nodes: id3 appends the next number to it
# each time it visits a node.
# A list is used because the function can append to it in place; an integer
# counter would need a `global` statement to be updated from inside.
nodes = []

def _to_builtin(value):
    """Return NumPy scalars as plain Python scalars; pass other values through."""
    return value.item() if hasattr(value, "item") else value


def id3(data, target, columns, tree):
    """
    Recursively build a median-split decision tree into the dict ``tree``.

    Each visited node receives a "number" key (taken from the module-level
    list ``nodes``). A leaf additionally receives "label"; an internal node
    receives "column", "median", "left" and "right", where the last two are
    the child node dictionaries.

    Parameters:
        data: dataframe holding the target column and the candidate columns.
        target: name of the target column.
        columns: list of column names that may be used for splitting.
        tree: dictionary to fill in for the current node (modified in place).

    Raises:
        ValueError: if ``data`` has no rows.

    A node becomes a leaf when all of its target values agree, or when the
    chosen median split cannot separate its rows. In the second case the
    leaf gets the most frequent target value as its label.
    """
    # Checked before anything is mutated, so a failed call leaves
    # ``nodes`` and ``tree`` untouched.
    if data.shape[0] == 0:
        raise ValueError("cannot build a decision tree from an empty data set")

    unique_targets = pandas.unique(data[target])

    # Assign the number key to the node dictionary
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        # Pure node: label it with the single target value, whatever it is,
        # so every leaf carries a "label" key.
        tree["label"] = _to_builtin(unique_targets[0])
        return

    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    # If one side is empty the split did not separate the rows, and
    # recursing would revisit the same rows forever. Close the node as a
    # leaf with the majority label instead (ties go to the smallest value,
    # because mode() returns its results sorted).
    if left_split.shape[0] == 0 or right_split.shape[0] == 0:
        tree["label"] = _to_builtin(data[target].mode().iloc[0])
        return

    # Record how this internal node splits its rows
    tree["column"] = best_column
    tree["median"] = column_median

    for name, split in (("left", left_split), ("right", right_split)):
        tree[name] = {}
        id3(split, target, columns, tree[name])

# Build the tree for the example data; id3 fills in the `tree` dictionary in place
id3(data, "high_income", ["age", "marital_status"], tree)

In [35]:
# Only the last expression of a cell is rendered, so the bare `tree` line
# produces no output here; the keys of the root node are what gets shown.
tree
tree.keys()

dict_keys(['number', 'column', 'median', 'left', 'right'])

## Printing Labels for a More Attractive Tree

In [36]:
def print_with_depth(string, depth):
    """Print ``string`` preceded by four spaces for each level of ``depth``."""
    indent = "    " * depth
    line = "{0}{1}".format(indent, string)
    print(line)
    
    
def print_node(tree, depth):
    """
    Print the node dictionary ``tree`` and everything below it, indenting
    each level one step further than its parent.

    A node holding "label" is printed as a leaf. Any other node is printed
    as "<column> > <median>", followed by its left and then its right branch.
    """
    is_leaf = "label" in tree
    if not is_leaf:
        # Describe the split made at this node
        print_with_depth("{0} > {1}".format(tree["column"], tree["median"]), depth)

        # Look up both children before descending, then visit left before right
        children = (tree["left"], tree["right"])
        for child in children:
            print_node(child, depth + 1)
    else:
        # Leaves end the recursion
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)

# Print the whole tree, starting at the root with no indentation
print_node(tree, 0)

age > 37.5
    age > 25.0
        age > 22.5
            Leaf: Label 0
            Leaf: Label 1
        Leaf: Label 1
    age > 55.0
        age > 47.5
            Leaf: Label 0
            Leaf: Label 1
        Leaf: Label 0


## Making Predictions Automatically

def predict(tree, row):
* 1 Check for the presence of "label" in the tree dictionary
* 2    If found, return tree["label"]
* 3 Extract tree["column"] and tree["median"]
* 4 Check whether row[tree["column"]] is less than or equal to tree["median"]
* 5    If it's less than or equal, call predict(tree["left"], row) and return the result
* 6    If it's greater, call predict(tree["right"], row) and return the result

In [40]:
def predict(tree, row):
    """
    Return the label that the decision tree ``tree`` assigns to ``row``.

    Starting at the root, the walk moves to the "right" child when the row's
    value in the node's column is greater than the node's median, and to the
    "left" child otherwise, until it reaches a node holding "label".
    """
    node = tree
    while "label" not in node:
        split_column = node["column"]
        threshold = node["median"]
        # Anything that is not strictly greater goes left. This includes
        # values for which the comparison is False, such as NaN.
        node = node["right"] if row[split_column] > threshold else node["left"]
    return node["label"]

# Print the prediction for the first row in our data
# (data.iloc[0] selects that row by position)
print(predict(tree, data.iloc[0]))

0


## Making Multiple Predictions

In [42]:
# Rows to predict on: only the two feature columns, no target column
new_data = pandas.DataFrame(
    [
        [40, 0],
        [20, 2],
        [80, 1],
        [15, 1],
        [27, 2],
        [38, 1],
    ],
    columns=["age", "marital_status"],
)

def batch_predict(tree, df):
    """Apply ``predict`` with ``tree`` to every row of ``df`` and return the results."""
    def predict_row(row):
        return predict(tree, row)

    # axis=1 hands the rows (not the columns) to predict_row one at a time
    return df.apply(predict_row, axis=1)

# Predict a label for every row of new_data using the tree built earlier
predictions = batch_predict(tree, new_data)

In [43]:
# Show the predicted label for each row of new_data
predictions

0    0
1    0
2    0
3    0
4    1
5    0
dtype: int64