In [39]:

import pandas as pd
# Add asset from file system
headings = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'high_income']
income = pd.read_csv('../datasets/adult.data.csv',header=None)
income.head()
income.columns = headings
income.info()
features_to_convert = ["workclass","education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "high_income"]
for column in features_to_convert:    
    col = pd.Categorical(income[column])
    income[column] = col.codes
income.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
high_income       32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [40]:
import numpy
import math

# Set a random seed so the shuffle is the same every time
numpy.random.seed(1)

# Shuffle the rows  
# This permutes the index randomly using numpy.random.permutation
# Then, it reindexes the dataframe with the result
# The net effect is to put the rows into random order
income = income.reindex(numpy.random.permutation(income.index))

train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[0:train_max_row,:]
test = income.iloc[train_max_row:,:]

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])

clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

predictions = clf2.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.6878964226062301
0.6759853906508785


In [42]:
#create an ensemble and average probabilities of predictions; generate a prediction from 2 trees
import numpy as np
predictions = clf.predict_proba(test[columns])[:,1]
predictions2 = clf2.predict_proba(test[columns])[:,1]

mean = np.round((predictions2 + predictions)/2,0)
print(roc_auc_score(test["high_income"], mean))

0.7150846804038882


In [43]:
# We'll build 10 trees
tree_count = 10

# Each "bag" will have 60% of the number of original rows
bag_proportion = .6

predictions = []
for i in range(tree_count):
    # We select 60% of the rows from train, sampling with replacement
    # We set a random state to ensure we'll be able to replicate our results
    # We set it to i instead of a fixed value so we don't get the same sample in every loop
    # That would make all of our trees the same
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    
    # Fit a decision tree model to the "bag"
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag["high_income"])
  
    # Using the model, make predictions on the test data
    predictions.append(clf.predict_proba(test[columns])[:,1])

length = len(predictions[0])
 
probs = []
for n in range(0,length,1): 
    proba = 0
    for i in range(0,tree_count,1):
        proba += predictions[i][n]
    probs.append(proba/tree_count)

mean = np.round(probs,0)
print(roc_auc_score(test["high_income"], mean))
    



0.7329963297474371


In [44]:
def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    # Compute the counts of each unique value in the column
    counts = numpy.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)
    
    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            entropy += prob * math.log(prob, 2)
    
    return -entropy

def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    
    # Find the median of the column we're splitting
    column = data[split_name]
    median = column.median()
    
    # Make two subsets of the data, based on the median
    left_split = data[column <= median]
    right_split = data[column > median]
    
    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    # Return information gain
    return original_entropy - to_subtract

In [45]:
### example of randomly selecting features

# Create the data set that we used two missions ago
data = pandas.DataFrame([
    [0,4,20,0],
    [0,4,60,2],
    [0,5,40,1],
    [1,4,25,1],
    [1,5,35,2],
    [1,5,55,1]
    ])
data.columns = ["high_income", "employment", "age", "marital_status"]

# Set a random seed to make the results reproducible
numpy.random.seed(1)

# The dictionary to store our tree
tree = {}
nodes = []

# The function to find the column to split on
def find_best_column(data, target_name, columns):
    information_gains = []
    
    # Insert your code here
    
    for col in columns:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)

    # Find the name of the column with the highest gain
    highest_gain_index = information_gains.index(max(information_gains))
    highest_gain = columns[highest_gain_index]
    return highest_gain

# The function to construct an ID3 decision tree
def id3(data, target, columns, tree):
    unique_targets = pandas.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree["column"] = best_column
    tree["median"] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])


# Run the ID3 algorithm on our data set and print the resulting tree
id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)
def find_best_column(data, target_name, columns):
    information_gains = []
    
    # Select two columns randomly
    cols = numpy.random.choice(columns, 2)
    
    for col in cols:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)

    highest_gain_index = information_gains.index(max(information_gains))
    
    # Get the highest gain by indexing "cols"
    highest_gain = cols[highest_gain_index]
    
    return highest_gain

id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

{'left': {'left': {'left': {'label': 0, 'number': 4}, 'column': 'age', 'median': 22.5, 'number': 3, 'right': {'label': 1, 'number': 5}}, 'column': 'age', 'median': 25.0, 'number': 2, 'right': {'label': 0, 'number': 6}}, 'column': 'employment', 'median': 4.5, 'number': 1, 'right': {'left': {'left': {'label': 1, 'number': 9}, 'column': 'age', 'median': 37.5, 'number': 8, 'right': {'label': 0, 'number': 10}}, 'column': 'age', 'median': 40.0, 'number': 7, 'right': {'label': 1, 'number': 11}}}
{'left': {'left': {'left': {'label': 0, 'number': 15}, 'column': 'age', 'median': 22.5, 'number': 14, 'right': {'label': 1, 'number': 16}}, 'column': 'employment', 'median': 4.0, 'number': 13, 'right': {'label': 1, 'number': 17}}, 'column': 'age', 'median': 37.5, 'number': 12, 'right': {'left': {'left': {'label': 0, 'number': 20}, 'column': 'age', 'median': 47.5, 'number': 19, 'right': {'label': 1, 'number': 21}}, 'column': 'age', 'median': 55.0, 'number': 18, 'right': {'label': 0, 'number': 22}}}


In [46]:
### example of selecting features randomly with SKLearn (changing parameters to splitter=random and max_features=auto)
# We'll build 10 trees
tree_count = 10

# Each "bag" will have 60% of the number of original rows
bag_proportion = .6

predictions = []
for i in range(tree_count):
    # We select 60% of the rows from train, sampling with replacement
    # We set a random state to ensure we'll be able to replicate our results
    # We set it to i instead of a fixed value so we don't get the same sample every time
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    
    # Fit a decision tree model to the "bag"
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter="random", max_features="auto")
    clf.fit(bag[columns], bag["high_income"])
    
    # Using the model, make predictions on the test data
    predictions.append(clf.predict_proba(test[columns])[:,1])

combined = numpy.sum(predictions, axis=0) / 10
rounded = numpy.round(combined)

print(roc_auc_score(test["high_income"], rounded))

0.7345958637997538


Using random subsets from the previous screen improved the accuracy over using bagging alone:

settings	test AUC
min_samples_leaf: 2	0.688
max_depth: 2	0.676
combined predictions	0.715
min_samples_leaf: 2, with bagging	0.732
min_samples_leaf: 2, with bagging and random subsets	0.735

In [47]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.7379403213124711


In [64]:
import timeit

start_time = timeit.default_timer()

clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=5)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(train[columns])
print("1 Tree training data acc: " + str(roc_auc_score(train["high_income"], predictions)))

predictions = clf.predict(test[columns])
print("1 Tree test data acc: " + str(roc_auc_score(test["high_income"], predictions)))
print(timeit.default_timer() - start_time)

start_time = timeit.default_timer()

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5)
clf.fit(train[columns], train["high_income"])


predictions = clf.predict(train[columns])
print("{} {}".format("150 Tree training data acc: ", roc_auc_score(train["high_income"], predictions)))

predictions = clf.predict(test[columns])
print("150 Tree test data acc: " + str(roc_auc_score(test["high_income"], predictions)))

print(timeit.default_timer() - start_time)

1 Tree training data acc: 0.8192570489534683
1 Tree test data acc: 0.7139325899284541
0.09822869999698014
150 Tree training data acc:  0.7917047295143252
150 Tree test data acc: 0.7498874343962398
2.9832977000005485
