In [1]:
from pydataset import data
from sklearn.cross_validation import train_test_split
import pandas as pd

## Get data

In [2]:
# Load the "Males" example dataset through pydataset's `data` helper.
# NOTE(review): the preview below suggests several yearly rows per `nr` value — confirm
# against the dataset documentation before treating rows as independent samples.
df = data("Males")

In [3]:
# Preview the first rows to check the column names and how each value is formatted.
df.head()

Unnamed: 0,nr,year,school,exper,union,ethn,married,health,wage,industry,occupation,residence
1,13,1980,14,1,no,other,no,no,1.19754,Business_and_Repair_Service,Service_Workers,north_east
2,13,1981,14,2,yes,other,no,no,1.85306,Personal_Service,Service_Workers,north_east
3,13,1982,14,3,no,other,no,no,1.344462,Business_and_Repair_Service,Service_Workers,north_east
4,13,1983,14,4,no,other,no,no,1.433213,Business_and_Repair_Service,Service_Workers,north_east
5,13,1984,14,5,no,other,no,no,1.568125,Personal_Service,"Craftsmen, Foremen_and_kindred",north_east


## Create dummy variables and split data

In [4]:
# Columns that are expanded into one indicator column per distinct value.
# NOTE(review): `school` appears numeric in the preview, so listing it here turns every
# distinct value into its own category — confirm that this is intended.
dummy_vars = [
    'school',
    'union',
    'ethn',
    'married',
    'health',
    'industry',
    'occupation',
    'residence',
]

# Columns not listed above (including the `wage` target) are carried over unchanged.
df_dummies = pd.get_dummies(data=df, columns=dummy_vars)

In [5]:
# Take the target and the features from the same encoded frame so their rows stay aligned.
y = df_dummies['wage']

# `drop` returns a new frame and leaves `df_dummies` untouched, so this cell can be
# re-run safely (an in-place `del df_dummies['wage']` raises KeyError the second time).
# NOTE(review): `nr` and `year` remain in X as numeric features; `nr` looks like a row
# identifier in the preview — confirm it should be used as a predictor.
X = df_dummies.drop('wage', axis=1)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## First Model - Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline: a random forest with default hyperparameters. The forest is seeded so the
# MSE printed below can be reproduced on a fresh kernel, matching the seeded split.
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); MSE is symmetric, so the value is unchanged.
print("MSE: {}".format(mean_squared_error(y_test, predictions)))

MSE: 0.208287219606


## Grid Search Hyperparameters

In [8]:
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.grid_search import GridSearchCV
import numpy as np

# `None` means "consider every feature at each split", which is what 'auto' meant for
# RandomForestRegressor; recent scikit-learn releases no longer accept 'auto'.
parameters = {'n_estimators': [10, 100, 1000], 'max_features': [None, 'sqrt', 'log2']}

# Seed the forest so that differences between candidates (and between the searches in
# later cells that reuse `model`) are not just run-to-run noise.
model = RandomForestRegressor(random_state=42)

# No `scoring` is passed, so candidates are ranked by the estimator's default `score`
# (R^2 for regressors) rather than by the MSE/MAE printed below.
clf = GridSearchCV(model, parameters, cv=5, n_jobs=6)
clf.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out rows.
predictions = clf.predict(X_test)
print("MSE: {}".format(mean_squared_error(y_test, predictions)))
print("MAE: {}".format(np.mean(np.abs(predictions - y_test))))

MSE: 0.193103387872
MAE: 0.29340976703


## Custom Scoring Function - Negated MAE (the Search Maximizes the Score)

In [9]:
def mean_abs_error(estimator, X, y):
    """Return the negated mean absolute error of `estimator` on (X, y).

    The value is negated so that a larger score means a better fit, which is the
    convention expected when this callable is passed as a `scoring` argument.

    Parameters
    ----------
    estimator : object
        Fitted model exposing `predict(X)`.
    X : array-like
        Feature rows, passed to `estimator.predict` unchanged.
    y : array-like
        True target values, one per prediction.

    Returns
    -------
    float
        `-mean(|prediction - target|)`; 0 is a perfect score.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.
    """
    # Flatten both sides so an (n, 1) column and an (n,) vector are compared element by
    # element instead of silently broadcasting to an (n, n) matrix.
    predictions = np.asarray(estimator.predict(X), dtype=float).ravel()
    targets = np.asarray(y, dtype=float).ravel()
    if predictions.shape != targets.shape:
        raise ValueError(
            "predictions and targets differ in size: {} vs {}".format(
                predictions.size, targets.size
            )
        )
    return -np.mean(np.abs(predictions - targets))

# Repeat the grid search over the same estimator and grid, this time ranking the
# candidates with the custom scorer instead of the estimator's default score.
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=mean_abs_error,
    cv=5,
    n_jobs=6,
)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# Report both error metrics on the held-out rows.
test_mse = mean_squared_error(predictions, y_test)
test_mae = np.mean(np.abs(predictions - y_test))
print("MSE: {}".format(test_mse))
print("MAE: {}".format(test_mae))

MSE: 0.193477716119
MAE: 0.294052963377


## Random Search

In [10]:
from scipy.stats import randint as sp_randint
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.grid_search import RandomizedSearchCV

# `n_estimators` is sampled from the list; `max_features` is drawn from a discrete
# uniform distribution (scipy's randint excludes the upper bound, so 1..10).
parameters = {'n_estimators': [10, 100, 1000], "max_features": sp_randint(1, 11)}

# `random_state` fixes which 10 candidates are sampled, so a re-run evaluates the same
# hyperparameter combinations.
clf = RandomizedSearchCV(model, param_distributions=parameters, cv=5, n_jobs=6,
                         scoring=mean_abs_error, n_iter=10, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out rows.
predictions = clf.predict(X_test)
print("MSE: {}".format(mean_squared_error(y_test, predictions)))
print("MAE: {}".format(np.mean(np.abs(predictions - y_test))))

MSE: 0.192518791677
MAE: 0.290623108254
