#### Load and clean the data set

In [8]:
import pandas as pd

# Read the raw file. header=None tells pandas the first line is data, not column
# names, so the names are assigned explicitly on the next line.
income = pd.read_csv("adult.data", header=None)
income.columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "high_income"]

# Columns that are converted below from their raw values to integer category codes.
categories = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'high_income']

for i in categories:
    # Use the Categorical constructor directly: Categorical.from_array was
    # deprecated and has since been removed from pandas. The resulting codes
    # are the same -- one integer per distinct value in the column.
    col = pd.Categorical(income[i])
    income[i] = col.codes

#### Split the data

In [9]:
import numpy
import math

# Seed numpy's global generator so the permutation below is reproducible.
numpy.random.seed(1)

# Draw a random ordering of the existing index labels, then reindex the frame
# by that ordering. The effect is that the rows end up in random order.
shuffled_index = numpy.random.permutation(income.index)
income = income.reindex(shuffled_index)

# The first 80% of the shuffled rows become the training set; the rest is the test set.
train_max_row = math.floor(len(income) * 0.8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

#### Create and test 2 decision trees

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Feature columns fed to every model in this notebook.
columns = [
    "age",
    "workclass",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

# First tree: every leaf must contain at least two samples.
clf = DecisionTreeClassifier(min_samples_leaf=2, random_state=1)
clf.fit(train[columns], train["high_income"])
clf_predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], clf_predictions))

# Second tree: depth capped at five levels instead.
clf2 = DecisionTreeClassifier(max_depth=5, random_state=1)
clf2.fit(train[columns], train["high_income"])
clf2_predictions = clf2.predict(test[columns])
print(roc_auc_score(test["high_income"], clf2_predictions))

0.687896422606
0.675985390651


In [11]:
# Column 1 of predict_proba holds each tree's probability for the second class.
predictions = clf.predict_proba(test[columns])[:, 1]
predictions2 = clf2.predict_proba(test[columns])[:, 1]

# Average the two probability vectors, then round to get a hard 0/1 label per row.
mean_predictions = numpy.round((predictions + predictions2) / 2)

print(roc_auc_score(test["high_income"], mean_predictions))

0.715084680404


#### Two ways to introduce variation in a random forest
* Bagging - Each tree is trained with a random sample of data called a bag. This is performed with replacement, so the same row may appear more than once in a bag, and rows which are included for one tree may be included for other trees as well.
* Random feature subsets

##### Use bagging to generate random forest

In [21]:
import numpy as np

# We'll build 10 trees
tree_count = 10

# Each "bag" will have 60% of the number of original rows.
bag_proportion = .6

# One array of second-class probabilities per tree, each aligned with the rows of test.
predictions = []
for i in range(tree_count):
    # We select 60% of the rows from train, sampling with replacement.
    # We set a random state to ensure we'll be able to replicate our results.
    # We set it to i instead of a fixed value so we don't get the same sample every loop.
    # That would make all of our trees the same.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)

    # Fit a decision tree model to the "bag".
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag["high_income"])

    # Using the model, make predictions on the test data.
    predictions.append(clf.predict_proba(test[columns])[:,1])

# Average the per-tree probabilities. Divide by tree_count rather than a literal 10
# so the average stays correct if the number of trees is changed above.
result = np.sum(predictions, axis=0) / tree_count
# Round the averaged probability to a hard 0/1 prediction.
result = np.round(result)

print("AUC score from random forest:", roc_auc_score(test.high_income, result))

AUC score from random forest: 0.732996329747


##### Using scikit learn to generate random subsets of features

In [32]:
# We'll build 10 trees
tree_count = 10

# Each "bag" will have 60% of the number of original rows.
bag_proportion = .6

# One array of second-class probabilities per tree, each aligned with the rows of test.
predictions = []
for i in range(tree_count):
    # We select 60% of the rows from train, sampling with replacement.
    # We set a random state to ensure we'll be able to replicate our results.
    # We set it to i instead of a fixed value so we don't get the same sample every time.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)

    # Fit a decision tree model to the "bag".
    # splitter="random" together with max_features="sqrt" makes each split consider
    # only a random subset of the features. "sqrt" is what "auto" meant for
    # DecisionTreeClassifier; "auto" is deprecated and rejected by newer scikit-learn releases.
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter="random", max_features="sqrt")
    clf.fit(bag[columns], bag["high_income"])

    # Using the model, make predictions on the test data.
    predictions.append(clf.predict_proba(test[columns])[:,1])

# Average over the trees actually built (tree_count, not a hard-coded 10),
# then round the averaged probability to a hard 0/1 prediction.
combined = numpy.sum(predictions, axis=0) / tree_count
rounded = numpy.round(combined)

print(roc_auc_score(test["high_income"], rounded))

0.7345958638


##### Using scikit learn to create random forest classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

# A forest of five trees, each leaf holding at least two samples.
clf = RandomForestClassifier(
    n_estimators=5,
    min_samples_leaf=2,
    random_state=1,
)
clf.fit(train[columns], train["high_income"])

# Score the forest's hard class predictions on the held-out rows.
predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.734746139194


##### Increase the number of trees to 150

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Same forest settings as before, but with 150 trees.
clf = RandomForestClassifier(
    n_estimators=150,
    min_samples_leaf=2,
    random_state=1,
)
clf.fit(train[columns], train["high_income"])

# Score the forest's hard class predictions on the held-out rows.
predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.737940321312


In [41]:
# Feature matrices and targets for both splits, pulled out once for reuse below.
train_X, train_y = train[columns], train["high_income"]
test_X, test_y = test[columns], test["high_income"]

# Single decision tree: score it on the rows it was trained on and on the held-out rows.
clf = DecisionTreeClassifier(min_samples_leaf=5, random_state=1)
clf.fit(train_X, train_y)

predictions = clf.predict(train_X)
print("Training set with decision tree  ", roc_auc_score(train_y, predictions))

predictions = clf.predict(test_X)
print("Test set with decision tree      ", roc_auc_score(test_y, predictions))

# Random forest with the same leaf-size setting, scored the same way.
clf = RandomForestClassifier(n_estimators=150, min_samples_leaf=5, random_state=1)
clf.fit(train_X, train_y)

predictions = clf.predict(train_X)
print("\nTraining set with random forest  ", roc_auc_score(train_y, predictions))

predictions = clf.predict(test_X)
print("Test set with random forest      ", roc_auc_score(test_y, predictions))

Training set with decision tree   0.819257048953
Test set with decision tree       0.713932589928

Training set with random forest   0.791704729514
Test set with random forest       0.749887434396


### When to use random forests

##### Strengths
* Very accurate predictions -- Random forests achieve near state of the art performance on many machine learning tasks. Along with neural networks and gradient boosted trees, they are typically one of the top performing algorithms.
* Resistance to overfitting -- due to how they're constructed, random forests are fairly resistant to overfitting. Parameters like max_depth still have to be set and tweaked, though.

##### Weaknesses
* Hard to interpret -- because we're averaging the results of many trees, it can be hard to figure out why a random forest is making predictions the way it is.
* Longer creation time -- making two trees takes twice as long as making one, 3 takes three times as long, and so on. Luckily, we can exploit multicore processors to parallelize tree construction. Scikit allows us to do this through the n_jobs parameter on RandomForestClassifier. We'll get more into parallelization later.