In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split



  from numpy.core.umath_tests import inner1d


In [2]:
# The original file from UCI named adult.test has 16,281 records
# https://archive.ics.uci.edu/ml/datasets/Census+Income
# Renamed adult.test to adult_data_small...
# Changed all categorical fields with text to categorical with integers
# Manually split this file into 2 files.  One for train/test and one for validation
# adult_data_small_train_test has 14,500 records
# adult_data small_val has 1,781 records


# The original file from UCI named adult.data has 32,561 records
# https://archive.ics.uci.edu/ml/datasets/Census+Income
# Renamed adult.data to adult_data_large...
# Changed all categorical fields with text to categorical with integers
# Manually split this file into 2 files.  One for train/test and one for validation
# adult_data_large_train_test has 29,000 records
# adult_data_large_val has 3,561 records

In [3]:
# Reading and prepping csv input files
# Set input file names
train_test_file_name = "adult_data_small_train_test.csv"
val_file_name = "adult_data_small_val.csv"


# Read the input files (train/test and validation) into dataframes and add the column names
colnames=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", 
"relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"] 
train_test_df = pd.read_csv(train_test_file_name, names=colnames, header=None) 
val_df = pd.read_csv(val_file_name, names=colnames, header=None) 

# View shape of dataframes that were created to make sure they are correct
print("\nShape of df created from " + train_test_file_name + " is " + str(train_test_df.shape))
print("Shape of df created from " + val_file_name + " is " + str(val_df.shape))

# Drop any rows that have a ? in a field since values can't be imputed
indexNames = train_test_df[(train_test_df["workclass"] == "?") | (train_test_df["native-country"] == "?") | 
(train_test_df["occupation"] == "?")].index
train_test_df.drop(indexNames , inplace=True)

indexNames = val_df[(val_df["workclass"] == "?") | (val_df["native-country"] == "?") | (val_df["occupation"] == "?")].index
val_df.drop(indexNames , inplace=True)

# Reset the indices
train_test_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

# View shape of dataframes after the rows with ? were removed
print("\nShape of train-test df after dropping rows with ? is " + str(train_test_df.shape))
print("Shape of val df after dropping rows with ? is " + str(val_df.shape))


Shape of df created from adult_data_small_train_test.csv is (14500, 15)
Shape of df created from adult_data_small_val.csv is (1781, 15)

Shape of train-test df after dropping rows with ? is (13424, 15)
Shape of val df after dropping rows with ? is (1636, 15)


In [4]:
# Do train/test split for the train_test_df
# Get the columns that are needed for the train_test_df_data dataframe
cols = [col for col in train_test_df.columns if col not in ['income']]
train_test_df_data = train_test_df[cols]
train_test_df_target = train_test_df['income']
X_train, X_test, y_train, y_test = train_test_split(train_test_df_data, train_test_df_target, test_size=0.2)

# View shape of test and train dataframes
print("\nX_train shape = " + str(X_train.shape) + " y_train shape  = " + str(y_train.shape))
print("X_test shape = " + str(X_test.shape) + " y_test shape  = " + str(y_test.shape))


X_train shape = (10739, 14) y_train shape  = (10739,)
X_test shape = (2685, 14) y_test shape  = (2685,)


In [5]:
# ************ Train/Test *********************
print("\n***************************************************************")
print("*****Random Forest With Default Hyperparameters Train/Test*****")
print("***************************************************************\n")

# ************ Train/Test *********************
# Create Random Forest Classifier using default hyperparameters
default_rf_clf = RandomForestClassifier(random_state=0)

# Train the Classifier
default_rf_clf.fit(X_train, y_train)

# Perform predictions
default_train_test_preds = default_rf_clf.predict(X_test)

# View the first 5 predictions and the predicted probabilities 
print("*****Default Hyperparameters Test Predictions*****")
for i in range(0,5):
	print("Prediction " + str(i+1) + " is " + str(default_train_test_preds[i]) + " with probabilities of " 
		+ str(default_rf_clf.predict_proba(X_test)[i]))

# Create confusion matrix
print("\n*****Default Hyperparameters Test Confusion Matrix*****")
confusion_matrix = pd.crosstab(y_test, default_train_test_preds, rownames=['Actual Income'], colnames=['Predicted Income'])
print(confusion_matrix)

# Print accuracy
accuracy = (((confusion_matrix[0][0] + confusion_matrix[1][1])) / (X_test.shape[0]) * 100)
print("\nDefault Hyperparameters Test Accuracy = {0:0.4f}%".format(accuracy))



***************************************************************
*****Random Forest With Default Hyperparameters Train/Test*****
***************************************************************

*****Default Hyperparameters Test Predictions*****
Prediction 1 is 1 with probabilities of [0.2 0.8]
Prediction 2 is 1 with probabilities of [0.2 0.8]
Prediction 3 is 1 with probabilities of [0. 1.]
Prediction 4 is 1 with probabilities of [0.2 0.8]
Prediction 5 is 0 with probabilities of [0.9 0.1]

*****Default Hyperparameters Test Confusion Matrix*****
Predicted Income    0     1
Actual Income              
0                 437   249
1                 209  1790

Default Hyperparameters Test Accuracy = 82.9423%


In [6]:
# ************ Validation *********************
print("\n\n***************************************************************")
print("*****Random Forest With Default Hyperparameters Validation*****")
print("***************************************************************\n")


# Apply the classifier we trained (default_rf_clf) to the validation data
cols = [col for col in val_df.columns if col not in ['income']]
val_df_data = val_df[cols]
val_df_target = val_df['income']
val_preds = default_rf_clf.predict(val_df_data)

# View the first 5 predictions and the predicted probabilities 
print("*****Default Hyperparameters Validation Predictions*****")
for i in range(0,5):
	print("Prediction " + str(i + 1) + " is " + str(val_preds[i]) + " with probabilities of " 
		+ str(default_rf_clf.predict_proba(val_df_data)[i]))

# Create confusion matrix
print("*****Default Hyperparameters Validation Confusion Matrix*****")
confusion_matrix = pd.crosstab(val_df_target, val_preds, rownames=['Actual Income'], colnames=['Predicted Income'])
print(confusion_matrix)

# Print accuracy
default_accuracy = (((confusion_matrix[0][0] + confusion_matrix[1][1])) / (val_df_data.shape[0]) * 100)
print("\nDefault Hyperparameters Validation Accuracy = {0:0.4f}%\n\n".format(default_accuracy))



***************************************************************
*****Random Forest With Default Hyperparameters Validation*****
***************************************************************

*****Default Hyperparameters Validation Predictions*****
Prediction 1 is 1 with probabilities of [0. 1.]
Prediction 2 is 1 with probabilities of [0. 1.]
Prediction 3 is 1 with probabilities of [0.2 0.8]
Prediction 4 is 1 with probabilities of [0.4 0.6]
Prediction 5 is 1 with probabilities of [0. 1.]
*****Default Hyperparameters Validation Confusion Matrix*****
Predicted Income    0     1
Actual Income              
0                 252   130
1                 119  1135

Default Hyperparameters Validation Accuracy = 84.7800%




In [7]:
# ************ Random Forest Classifier using grid search for hyper-parameters ******************
print("\n\n***************************************************************")
print("*****************Random Forest Grid Search*********************")
print("***************************************************************\n")

# Number of trees in random forest
# Using num=2 will significantly reduce run time (~3 min vs ~65 min)
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 2)]
# Using num=10 will significantly increase run time
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
# Using num=2 will significantly reduce run time (~3 min vs ~65 min)
max_depth = [int(x) for x in np.linspace(10, 110, num = 2)]
# Using num=10 will significantly increase run time
#max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print("The values that will be used for the grid search are:")
pprint(grid)

print("\n*****Performing the Grid Search of the Hyperparameters*****")
# Create a regressor using values from grid
rf_reg = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf_reg, param_grid = grid, cv = 3, n_jobs = -1, verbose = 1)

# Train the classifier
best_grid = grid_search.fit(X_train, y_train)

# Make predictions
grid_preds = best_grid.predict(X_test)

print("The best hyperparameters found during the grid search are:")
pprint(best_grid.best_params_)




***************************************************************
*****************Random Forest Grid Search*********************
***************************************************************

The values that will be used for the grid search are:
{'bootstrap': [True, False],
 'max_depth': [10, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 2000]}

*****Performing the Grid Search of the Hyperparameters*****
Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 290 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed:  3.6min finished


The best hyperparameters found during the grid search are:
{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 2000}


In [8]:
# ************ Random Forest Classifier using grid search with best hyper-parameters ******************
print("\n************************************************************************")
print("*****Random Forest With Hyperparameters from Grid Search Train/Test*****")
print("************************************************************************\n")

# Train the Classifier with new values from grid search
rf_clf = RandomForestClassifier(max_depth = best_grid.best_params_["max_depth"], 
	min_samples_split = best_grid.best_params_["min_samples_split"], 
	min_samples_leaf = best_grid.best_params_["min_samples_leaf"], 
	bootstrap = best_grid.best_params_["bootstrap"], 
	max_features = best_grid.best_params_["max_features"], 
	n_estimators = best_grid.best_params_["n_estimators"], n_jobs=-1, random_state=0)
rf_clf.fit(X_train, y_train)

# Perform predictions
train_test_preds = rf_clf.predict(X_test)

# View the first 5 predictions and the predicted probabilities 
print("\n")
print("*****Grid Search Hyperparameters Test Predictions*****")
for i in range(0,5):
	print("Prediction " + str(i+1) + " is " + str(train_test_preds[i]) + " with probabilities of " + str(rf_clf.predict_proba(X_test)[i]))

# Create confusion matrix
print("\n*****Grid Search Hyperparameters Test Confusion Matrix*****")
confusion_matrix = pd.crosstab(y_test, train_test_preds, rownames=['Actual Income'], colnames=['Predicted Income'])
print(confusion_matrix)

# Print accuracy
grid_search_accuracy = (((confusion_matrix[0][0] + confusion_matrix[1][1])) / (X_test.shape[0]) * 100)
print("\nGrid Search Hyperparameters Test Accuracy = {0:0.4f}%".format(grid_search_accuracy))

print("\nCompared to the random forest classifier created with default hyperparameters, the accuracy of the " +
	"classifier created with optimal hyperparameters determined by grid search is " +
	"{0:0.4f}% better".format((grid_search_accuracy - default_accuracy)))


************************************************************************
*****Random Forest With Hyperparameters from Grid Search Train/Test*****
************************************************************************



*****Grid Search Hyperparameters Test Predictions*****
Prediction 1 is 1 with probabilities of [0.20075775 0.79924225]
Prediction 2 is 1 with probabilities of [0.08290475 0.91709525]
Prediction 3 is 1 with probabilities of [0.00358695 0.99641305]
Prediction 4 is 1 with probabilities of [0.28043365 0.71956635]
Prediction 5 is 0 with probabilities of [0.96604775 0.03395225]

*****Grid Search Hyperparameters Test Confusion Matrix*****
Predicted Income    0     1
Actual Income              
0                 386   300
1                  98  1901

Grid Search Hyperparameters Test Accuracy = 85.1769%

Compared to the random forest classifier created with default hyperparameters, the accuracy of the classifier created with optimal hyperparameters determined by grid search i