Random forests have several advantages, but explorations in R have been very slow. This is an attempt to see how scikit-learn fares. There are two passes we make here: first, a toy example to fix the syntax and then a real example that takes forever in R.

We are focusing on regression problems.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
boston = datasets.load_boston()

In [30]:
features = pd.DataFrame(boston.data, columns = boston.feature_names); features.shape

(506, 13)

In [32]:
features.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [20]:
target = boston.target; target.shape

(506,)

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, target, train_size = 0.8, random_state = 20130810)



In [10]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index = X_train.index.values, columns = X_train.columns.values)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index = X_test.index.values, columns = X_test.columns.values)

Is scaling necessary for Random Forests?

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
%time 
rf = RandomForestRegressor(n_estimators = 500, oob_score=True, random_state=20130810, verbose = 1)
rf.fit(X_train, y_train)

Wall time: 0 ns


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.8s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=True, random_state=20130810, verbose=1,
           warm_start=False)

In [26]:
predicted_train = rf.predict(X_train)
predicted_test = rf.predict(X_test)

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [27]:
rf.oob_score_

0.86146083124116513

Lets now move to some pesky data

In [40]:
adult_train = pd.read_csv("data/general/adult.data", header=None, names = ["age", "workclass", "fnlwgt", "education", "education_num",
                                                                           "marital_status", "occupation", "relationship", "race",
                                                                           "sex", "capital_gain", "capital_loss", "hours_per_week",
                                                                           "native_country", "target"])

In [41]:
adult_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [43]:
adult_train.shape

(32561, 15)

In [91]:
adult_test = pd.read_csv("data/general/adult.test", 
                         header=None, 
                         names = ["age", "workclass", "fnlwgt", "education", "education_num",
                                                                          "marital_status", "occupation", "relationship", "race",
                                                                          "sex", "capital_gain", "capital_loss", "hours_per_week",
                                                                          "native_country", "target"],
                         skiprows = 1)

In [92]:
adult_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [93]:
adult_train.replace(r'\?', np.nan, regex = True)
adult_test.replace(r'\?', np.nan, regex = True)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.
6,29,,227026,HS-grad,9,Never-married,,Unmarried,Black,Male,0,0,40,United-States,<=50K.
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K.
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K.
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K.


In [99]:
X = adult_train.drop('target', axis = 1)
y = adult_train['target']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 20130810)



In [101]:
from sklearn.ensemble import RandomForestClassifier

In [102]:
rf = RandomForestClassifier(n_estimators = 30, max_depth=10, random_state=20130810)

In [103]:
rf.fit(X_train, y_train)

ValueError: could not convert string to float: ' United-States'