In [1]:
import pandas as pd

In [2]:
# Read the income data set from CSV into a DataFrame, then preview the first rows.
income = pd.read_csv('income.csv', index_col=False)
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# (rows, columns) of the loaded frame
income.shape

(32561, 15)

In [4]:
# Replace every categorical (string) column, including the target, with its
# integer category codes so the columns can be used numerically.

cols = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'high_income',
]
for col in cols:
    column = income[col].astype('category')
    income[col] = column.cat.codes

In [5]:
# Preview the frame again after the categorical columns were converted to codes
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [6]:
# Partition the rows on workclass code 4 (the code shown for 'Private' in the
# previews above); every other workclass code lands in public_incomes.
is_private = income['workclass'] == 4
private_incomes = income[is_private]
public_incomes = income[~is_private]

In [7]:
# Report the (rows, columns) of each partition, private first
for subset in (private_incomes, public_incomes):
    print(subset.shape)

(22696, 15)
(9865, 15)


In [30]:
import numpy as np

def calc_entropy(column):
    """Calculate the Shannon entropy (in bits) of a collection of labels.

    Parameters
    ----------
    column : pandas Series, list, or numpy array
        Class labels. Anything ``np.unique`` can count is accepted, including
        negative integers (such as the ``-1`` code ``pd.Categorical`` assigns
        to missing values) and strings.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the observed label frequencies; 0 for an
        empty or single-valued input.
    """
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 is always defined. np.bincount, used
    # previously, raises on negative or non-integer labels.
    _, counts = np.unique(column, return_counts=True)
    if counts.size == 0:
        return 0

    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

def calc_information_gain(data, split_name, target_name):
    """
    Compute the information gain of a median split on one column.

    Rows are divided into those whose ``split_name`` value is at or below the
    column median and those above it. The gain is the entropy of
    ``target_name`` over all rows minus the size-weighted entropy of the two
    groups.
    """
    # Entropy before splitting
    baseline = calc_entropy(data[target_name])

    # Partition the rows around the median of the split column
    split_values = data[split_name]
    threshold = split_values.median()
    partitions = (data[split_values <= threshold], data[split_values > threshold])

    # Weight each partition's entropy by its share of the rows
    total_rows = data.shape[0]
    weighted_entropy = 0
    for part in partitions:
        weighted_entropy += (part.shape[0] / total_rows) * calc_entropy(part[target_name])

    return baseline - weighted_entropy

def entropy(df, target):
    """Calculate the Shannon entropy (in bits) of one column of a DataFrame.

    Parameters
    ----------
    df : pandas DataFrame
    target : column label in ``df`` whose value distribution is measured

    Returns
    -------
    float
        ``-sum(p * log2(p))`` where ``p`` is each distinct value's share of
        the rows of ``df``.
    """
    counts = df[target].value_counts().values
    # Drop zero counts (possible for unused categories of a categorical
    # column) so log2 is never evaluated at 0.
    counts = counts[counts > 0]
    probabilities = counts / len(df)

    return (-1) * np.sum(probabilities * np.log2(probabilities))

def information_gain(df, split_col, target_col):
    """Return the information gain from splitting ``df`` on every distinct
    value of ``split_col``, measured against ``target_col``.

    The gain is the entropy of ``target_col`` over all rows minus the
    size-weighted entropy of ``target_col`` within each group of rows that
    share a ``split_col`` value.
    """
    # entropy() takes (DataFrame, column name). calc_entropy() accepts only a
    # single column of labels, so calling it with two arguments raised
    # TypeError.
    total_entropy = entropy(df, target_col)
    row_count = len(df)

    weighted_subset_entropy = 0
    for value in df[split_col].unique():
        subset = df[df[split_col] == value]
        weighted_subset_entropy += (len(subset) / row_count) * entropy(subset, target_col)

    return total_entropy - weighted_subset_entropy

In [9]:
# Entropy of the target column. calc_entropy() accepts a single column of
# labels, so the (DataFrame, column name) form goes through entropy().
income_entropy = entropy(income, 'high_income')
income_entropy

0.7963839552022132

In [10]:
# Information gain on the target when the rows are grouped by each distinct age value
age_information_gain = information_gain(income, 'age', 'high_income')
age_information_gain

0.09921780194414098

In [11]:
# Candidate feature columns; `columns` is reused by later cells as the model's feature list.
columns = ["age", "workclass", "education_num", "marital_status", 
           "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

# Maps each feature name to its information gain against 'high_income'
information_gains = {}

for col in columns:
    info_gain = information_gain(income, col, 'high_income')
    information_gains[col] = info_gain

# Name of the feature with the largest gain (max over the dict's keys, ranked by value)
highest_gain = max(information_gains,key=information_gains.get)

In [12]:
# Feature with the highest information gain
highest_gain

'relationship'

In [13]:
# Information gain of every candidate feature
information_gains

{'age': 0.09921780194414098,
 'education_num': 0.09359084108861437,
 'hours_per_week': 0.061001939253475856,
 'marital_status': 0.15652786512566186,
 'native_country': 0.008695342020861085,
 'occupation': 0.09292248395497071,
 'race': 0.008377946328510633,
 'relationship': 0.1653657579852149,
 'sex': 0.03717138743832116,
 'workclass': 0.021571590198122825}

In [25]:
def find_best_column(data, target_name, columns):
    """Pick the column whose median split yields the largest information gain.

    Each name in ``columns`` is scored with ``calc_information_gain`` against
    ``target_name``; when several columns tie, the earliest one wins.
    """
    return max(
        columns,
        key=lambda candidate: calc_information_gain(data, candidate, target_name),
    )

In [26]:
# Create a dictionary to hold the tree and a list to number the nodes.
# Both must be outside of the function so we can access them later.
tree = {}
nodes = []

def id3(data, target, columns, tree):
    """Recursively grow a binary decision tree, writing it into ``tree`` in place.

    Parameters
    ----------
    data : pandas DataFrame holding the rows that reach this node
    target : name of the label column
    columns : list of feature column names considered for splitting
    tree : dict for this node; it receives ``"number"`` plus either
        ``"label"`` (leaf) or ``"column"``, ``"median"``, ``"left"`` and
        ``"right"`` (internal node)

    Returns
    -------
    None. The result is the mutated ``tree``; node numbers are drawn from the
    module-level ``nodes`` list.
    """
    unique_targets = pd.unique(data[target])

    # Assign the number key to the node dictionary
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    # Pure node: every row carries the same target value, so this is a leaf
    if len(unique_targets) == 1:
        tree['label'] = 1 if unique_targets[0] == 1 else 0
        return

    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()

    # Create the two splits
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    # If the chosen column cannot separate the rows, one side is empty and the
    # other is the unchanged input. Recursing would never terminate on the
    # full side and would divide by zero on the empty side, so stop here with
    # a majority-vote leaf (ties resolve to 0).
    if left_split.empty or right_split.empty:
        tree['label'] = 1 if (data[target] == 1).mean() > 0.5 else 0
        return

    tree['column'] = best_column
    tree['median'] = column_median

    # Loop through the splits and call id3 recursively
    for name, split in (("left", left_split), ("right", right_split)):
        tree[name] = {}
        id3(split, target, columns, tree[name])
        
        
def print_with_depth(string, depth):
    """Print ``string`` indented by four spaces per level of ``depth``."""
    indent = depth * "    "
    print(indent + "{0}".format(string))
    
    
def print_node(tree, depth):
    """Print the subtree rooted at ``tree``, one node per line, indented by depth.

    A leaf (a node that has a "label" key) prints its label. Any other node
    prints its split column and median, followed by its left and then right
    branch one level deeper.
    """
    is_leaf = "label" in tree
    if is_leaf:
        # Leaves end the recursion
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)
        return

    # Describe the split made at this node
    print_with_depth("{0} > {1}".format(tree["column"], tree["median"]), depth)

    # Both branches are looked up before descending into either of them
    for child in (tree["left"], tree["right"]):
        print_node(child, depth + 1)

In [31]:
# Create the data set that we used in the example on the last screen
data = pd.DataFrame(
    [
        [0, 20, 0],
        [0, 60, 2],
        [0, 40, 1],
        [1, 25, 1],
        [1, 35, 2],
        [1, 55, 1],
    ],
    columns=["high_income", "age", "marital_status"],
)

# Grow the tree in place in the module-level `tree` dict, then print it from the root (depth 0)
id3(data, "high_income", ["age", "marital_status"], tree)
print_node(tree, 0)

age > 37.5
    age > 25.0
        age > 22.5
            Leaf: Label 0
            Leaf: Label 1
        Leaf: Label 1
    age > 55.0
        age > 47.5
            Leaf: Label 0
            Leaf: Label 1
        Leaf: Label 0


In [34]:
def predict(tree, row):
    """Return the leaf label reached by walking ``row`` down ``tree``.

    At each internal node the row's value in the node's "column" is compared
    with the node's "median": values less than or equal to it follow the
    "left" branch, greater values follow the "right" branch. The walk stops
    at the first node that has a "label" key.
    """
    node = tree
    while "label" not in node:
        column = node["column"]
        median = node["median"]
        node = node["left"] if row[column] <= median else node["right"]
    return node["label"]
    
# Print the predictions for our data (rows 0 through 5, in order)
for position in range(6):
    print(predict(tree, data.iloc[position]))

0
0
0
1
1
1


In [37]:
# Unlabelled rows to classify, with the column names assigned at construction
new_data = pd.DataFrame(
    [
        [40, 0],
        [20, 2],
        [80, 1],
        [15, 1],
        [27, 2],
        [38, 1],
    ],
    columns=["age", "marital_status"],
)

def batch_predict(tree, df):
    """Run ``predict`` on every row of ``df`` and return the row-wise results."""
    def predict_row(row):
        return predict(tree, row)

    return df.apply(predict_row, axis=1)

# Predict a label for every row of new_data with the tree built above
predictions = batch_predict(tree, new_data)
predictions

0    0
1    0
2    0
3    0
4    1
5    0
dtype: int64

In [53]:
# Use DecisionTreeClassifier for classification and DecisionTreeRegressor for regression

import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Shuffle the rows with a fixed seed, then use the first 80% for training and the rest for testing.
np.random.seed(1)
# NOTE(review): this rebinds `income`, so re-running the cell shuffles the
# already-shuffled frame and produces a different train/test split.
income = income.reindex(np.random.permutation(income.index))
train_max_row = math.floor(income.shape[0] * 0.8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# Fit an unconstrained decision tree on the feature list in `columns`
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(test[columns])
# NOTE(review): despite their names, test_error / train_error hold ROC AUC
# scores (higher is better). They are computed from hard class predictions
# (clf.predict), not from predicted probabilities.
test_error = roc_auc_score(test['high_income'], predictions)
train_error = roc_auc_score(train['high_income'], clf.predict(train[columns]))

print('AUC train:', train_error)
print('AUC test:', test_error)

AUC train: 0.9479263579758092
AUC test: 0.6964447084320571


In [54]:
# Retrain using some non-default parameters

# min_samples_split=13 is the only change from the default tree
clf = DecisionTreeClassifier(random_state=1, min_samples_split=13)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(test[columns])
# NOTE(review): test_error / train_error hold ROC AUC scores computed on hard class predictions
test_error = roc_auc_score(test['high_income'], predictions)
train_error = roc_auc_score(train['high_income'], clf.predict(train[columns]))

print('AUC train:', train_error)
print('AUC test:', test_error)

AUC train: 0.8398021634354487
AUC test: 0.7168766976453667


In [55]:
# Same tree with min_samples_split=13, now also limited to max_depth=7
clf = DecisionTreeClassifier(random_state=1, min_samples_split=13, max_depth=7)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(test[columns])
# NOTE(review): test_error / train_error hold ROC AUC scores computed on hard class predictions
test_error = roc_auc_score(test['high_income'], predictions)
train_error = roc_auc_score(train['high_income'], clf.predict(train[columns]))

print('AUC train:', train_error)
print('AUC test:', test_error)

AUC train: 0.7504900249627168
AUC test: 0.7452613933764609


In [56]:
# A far more constrained tree: min_samples_split=100 and max_depth=2
clf = DecisionTreeClassifier(random_state=1, min_samples_split=100, max_depth=2)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(test[columns])
# NOTE(review): test_error / train_error hold ROC AUC scores computed on hard class predictions
test_error = roc_auc_score(test['high_income'], predictions)
train_error = roc_auc_score(train['high_income'], clf.predict(train[columns]))

print('AUC train:', train_error)
print('AUC test:', test_error)

AUC train: 0.6600959918961614
AUC test: 0.6644604503234572


In [58]:
# Demonstrate overfitting by introducing random noise into the data
# The noise column holds random integers in [0, 4), drawn independently of the target.
# NOTE(review): the RNG is not re-seeded in this cell, so the values depend on
# how many random draws happened earlier in the session. The column is also
# added to `income` in place.
income["noise"] = np.random.randint(4, size=income.shape[0])

# Adjust "columns" to include the noise column
columns = ["noise", "age", "workclass", "education_num", "marital_status", 
           "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

# Make new train and test sets
# (same positional 80/20 cut as before, now including the noise column)
train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# Initialize the classifier
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(test[columns])
# NOTE(review): test_error / train_error hold ROC AUC scores computed on hard class predictions
test_error = roc_auc_score(test['high_income'], predictions)
train_error = roc_auc_score(train['high_income'], clf.predict(train[columns]))

print('AUC train:', train_error)
print('AUC test:', test_error)

AUC train: 0.9762782800246012
AUC test: 0.7054946007868337


# Random Forests

In [59]:
# Reset the feature list to the original ten columns (dropping "noise")
columns = ["age", "workclass", "education_num", "marital_status", 
           "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

# Tree 1: constrained by min_samples_leaf=2
clf1 = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
clf1.fit(train[columns], train['high_income'])
predictions1 = clf1.predict(test[columns])
# NOTE(review): the *_error variables below hold ROC AUC scores computed on hard class predictions
test_error1 = roc_auc_score(test['high_income'], predictions1)
train_error1 = roc_auc_score(train['high_income'], clf1.predict(train[columns]))

# Tree 2: constrained by max_depth=5
clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(train[columns], train["high_income"])
predictions2 = clf2.predict(test[columns])
test_error2 = roc_auc_score(test['high_income'], predictions2)
train_error2 = roc_auc_score(train['high_income'], clf2.predict(train[columns]))

print('AUC train1:', train_error1)
print('AUC test1:', test_error1)
print('AUC train2:', train_error2)
print('AUC test2:', test_error2)

AUC train1: 0.8545670596653147
AUC test1: 0.6999046775794102
AUC train2: 0.6813647249880411
AUC test2: 0.6791875035045009


In [61]:
# Take each tree's predicted probability for class 1 (second column of predict_proba)
predictions1 = clf1.predict_proba(test[columns])[:, 1]
predictions2 = clf2.predict_proba(test[columns])[:, 1]

# Average the two probabilities and round to a hard 0/1 prediction.
# NOTE(review): np.round rounds halves to the nearest even value, so an
# average of exactly 0.5 becomes 0. AUC is then computed on the rounded
# labels rather than on the averaged probabilities.
mean = np.round((predictions1 + predictions2) / 2)
print('AUC combined:', roc_auc_score(test['high_income'], mean))

AUC combined: 0.7248783041737138


In [64]:
# Random forest with 10 trees and bagging

tree_count = 10
bag_proportion = 0.6  # each bag has 60% of the data

# One array of class-1 probabilities on the test set per tree
predictions = []
for i in range(tree_count):
    # Sample the bag with replacement; seeding with the loop index gives each tree a different, repeatable bag
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag['high_income'])
    predictions.append(clf.predict_proba(test[columns])[:, 1])

# Average the trees' probabilities element-wise, then round to hard 0/1 labels before scoring
mean = np.round(sum(predictions) / tree_count)
print('AUC forest:', roc_auc_score(test['high_income'], mean))

AUC forest: 0.7359508313653795


In [65]:
# Random forest with 10 trees, bagging, and random subsets

tree_count = 10
bag_proportion = 0.6  # each bag has 60% of the data

# One array of class-1 probabilities on the test set per tree
predictions = []
for i in range(tree_count):
    # Sample the bag with replacement; seeding with the loop index gives each tree a different, repeatable bag
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    # max_features='sqrt' is what 'auto' meant for DecisionTreeClassifier.
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises an error, so the explicit value keeps the cell running on current releases.
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter='random', max_features='sqrt')
    clf.fit(bag[columns], bag['high_income'])
    predictions.append(clf.predict_proba(test[columns])[:, 1])

# Average the trees' probabilities element-wise, then round to hard 0/1 labels before scoring
mean = np.round(sum(predictions) / tree_count)
print('AUC forest:', roc_auc_score(test['high_income'], mean))

AUC forest: 0.7361342390049956


In [66]:
# Using the RandomForestClassifier (parameter for bagging is 'bootstrap', which defaults to True)

from sklearn.ensemble import RandomForestClassifier

# A small forest: 5 trees, each leaf holding at least 2 samples
clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])

# ROC AUC on the test set, computed from hard class predictions
print(roc_auc_score(test["high_income"], predictions))

0.7452169269668562


In [67]:
# Same settings with 150 trees instead of 5
clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])

# ROC AUC on the test set, computed from hard class predictions
print(roc_auc_score(test["high_income"], predictions))

0.746077893152825


In [68]:
# Compare and contrast train/test error for a single decision tree vs. a random forest
# NOTE(review): the printed labels say "error", but every value below is a ROC
# AUC score from roc_auc_score, so higher is better. The scores are computed
# from hard class predictions (clf.predict).

# Decision tree
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=5)
clf.fit(train[columns], train["high_income"])

# Training and test error
predictions = clf.predict(train[columns])
print('Train error (tree):', roc_auc_score(train["high_income"], predictions))

predictions = clf.predict(test[columns])
print('Test error (tree):', roc_auc_score(test["high_income"], predictions))

# Random forest
# (150 trees, same min_samples_leaf as the single tree above)
clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5)
clf.fit(train[columns], train["high_income"])

# Training and test error
predictions = clf.predict(train[columns])
print('Train error (forest):', roc_auc_score(train["high_income"], predictions))

predictions = clf.predict(test[columns])
print('Test error (forest):', roc_auc_score(test["high_income"], predictions))

Train error (tree): 0.8189999316640605
Test error (tree): 0.7252679394578748
Train error (forest): 0.7930105518730077
Test error (forest): 0.7477108927055527
