# Random Forest Regressor (scikit-learn)

In [None]:
%%javascript
// Calls clear_all_output() on the front end's notebook object, presumably wiping every cell's output.
// NOTE(review): `IPython.notebook` looks like the classic Notebook JS API; confirm it exists in the front end in use.
IPython.notebook.clear_all_output();

<IPython.core.display.Javascript object>

## sklearn.ensemble.RandomForestRegressor

**sklearn.ensemble.RandomForestRegressor** [[LINK](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)]

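criterion = "**mse**/friedman_mse/mae" (default in bold; scikit-learn 1.0 renamed `mse` to `squared_error` and `mae` to `absolute_error`, and 1.2 removed the old names)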

others are similar to `RandomForestClassifier`

# Import Dataset
## UCI Abalone  Dataset

The example is from https://archive.ics.uci.edu/ml/datasets/abalone

We will use all 4177 instances and all 9 features.

We use the features: {'sex', 'length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'rings'} where the {'rings'} feature serves as our target feature and represents the age of the abalone.

The first ten rows of the dataset look as follows

In [1]:
import pandas as pd

# Column names in file order; they are supplied here because the CSV is read with header=None.
headers = [
    'sex',
    'length', 'diameter', 'height',
    'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight',
    'rings',
]

# "?" cells are parsed as missing values (NaN).
dataset = pd.read_csv(
    "abalone.data.csv",
    names=headers,
    header=None,
    na_values="?",
)
dataset.head(10)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [5]:
# Shuffle all rows (frac=1 keeps every row, in random order) so the positional
# train/test split further down is not tied to the file's original ordering.
# random_state pins the permutation: without it each run yields a different split
# and therefore a different error score. The original row labels are kept by sample().
dataset = dataset.sample(frac=1, random_state=0)
dataset.head(10)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
2010,I,0.435,0.325,0.12,0.346,0.159,0.084,0.095,7
3208,I,0.32,0.25,0.08,0.1565,0.057,0.034,0.06,9
1522,M,0.68,0.515,0.17,1.6115,0.8415,0.306,0.395,11
291,F,0.565,0.505,0.21,1.2765,0.501,0.279,0.355,12
4170,M,0.55,0.43,0.13,0.8395,0.3155,0.1955,0.2405,10
1919,I,0.605,0.49,0.165,1.071,0.482,0.1935,0.352,10
2118,F,0.48,0.38,0.12,0.608,0.2705,0.1405,0.185,8
1966,F,0.665,0.5,0.15,1.2475,0.4625,0.2955,0.3595,10
3680,F,0.61,0.495,0.165,1.0835,0.4525,0.273,0.317,9
3268,M,0.41,0.31,0.125,0.3595,0.1415,0.0885,0.115,11


**One Hot Encoding**

In [2]:
# Frequency of each level in the categorical column that is about to be one-hot encoded.
sex_column = dataset["sex"]
sex_column.value_counts()

M    1528
I    1342
F    1307
Name: sex, dtype: int64

In [3]:
# Replace the categorical 'sex' column with one indicator column per level (sex_F, sex_I, sex_M).
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas version:
# pandas >= 2.0 would otherwise emit True/False columns, while older releases defaulted to uint8.
dataset = pd.get_dummies(dataset, columns=["sex"], dtype="uint8")
dataset.head(10)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,sex_F,sex_I,sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0
5,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8,0,1,0
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20,1,0,0
7,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16,1,0,0
8,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9,0,0,1
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19,1,0,0


**Switch Columns**

In [5]:
# Snapshot the column labels as a plain Python list; it is edited in the next cells
# to build the new column order.
cols = dataset.columns.tolist()
print(cols)

['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'rings', 'sex_F', 'sex_I', 'sex_M']


In [6]:
# Position of the target column ('rings') inside the label list.
j = cols.index("rings")
print(j)

7


In [7]:
# Take the target label out of the list (pop mutates cols in place) and keep it
# so it can be re-attached at the end.
col_j = cols.pop(j)
print(cols, col_j, sep="\n")

['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'sex_F', 'sex_I', 'sex_M']
rings


In [8]:
# Put the target label back at the tail of the list, then reorder the frame's
# columns by label so the target becomes the last column.
cols.append(col_j)
dataset = dataset[cols]
dataset.head(10)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M,rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0,7
5,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,0,1,0,8
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,1,0,0,20
7,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,1,0,0,16
8,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,0,0,1,9
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,1,0,0,19


In [10]:
# Positional split: the leading 70% of the rows are used for training,
# the trailing 30% are held out for testing.
n_train = int(0.7*len(dataset))
training_data = dataset.iloc[:n_train]
training_data.head(10)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M,rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0,7
5,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,0,1,0,8
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,1,0,0,20
7,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,1,0,0,16
8,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,0,0,1,9
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,1,0,0,19


In [11]:
# Renumber the row labels from 0; drop=True discards the old labels instead of
# keeping them as an extra column.
training_data = training_data.reset_index(drop=True)
training_data.head(10)

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M,rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0,7
5,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,0,1,0,8
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,1,0,0,20
7,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,1,0,0,16
8,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,0,0,1,9
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,1,0,0,19


In [12]:
# Rows after the 70% mark form the test set; row labels are renumbered from 0.
testing_data = dataset.iloc[int(0.7*len(dataset)):].reset_index(drop=True)

# Every column except the last is a feature; the last column is the target.
X_train, y_train = training_data.iloc[:, :-1], training_data.iloc[:, -1]
X_test, y_test = testing_data.iloc[:, :-1], testing_data.iloc[:, -1]

# Report the shapes in the order: X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(2923, 10)
(1254, 10)
(2923,)
(1254,)


In [18]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 10 trees, each limited to depth 10; random_state=0 fixes the model's own
# randomness so repeated fits on the same data agree.
# The split criterion is left at its default, which is mean squared error on every
# scikit-learn version. Passing criterion="mse" explicitly is rejected from
# scikit-learn 1.2 on (the option was renamed "squared_error" in 1.0), while the new
# spelling is unknown to older releases; omitting the argument works everywhere.
tree = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=0)

# training
tree.fit(X_train, y_train)

# testing
y_predict = tree.predict(X_test)
print(y_predict)
# .values exposes the Series' underlying NumPy array
# (see https://blog.csdn.net/weixin_39223665/article/details/79935467)
print(y_test.values)

[13.38768068 17.175      12.74880745 ...  9.70562303 11.65057146
  7.91481481]
[12 13 13 ... 15 11  9]


  from numpy.core.umath_tests import inner1d


In [19]:
import numpy as np

# Root-mean-square error: square the per-sample residuals, average them over
# the test set, then take the square root.
squared_errors = ((y_test - y_predict)**2).values
RMSE = np.sqrt(np.sum(squared_errors) / len(y_test))
print(RMSE)

2.322789868942676


#### You can try parameter tuning here, but we will illustrate with another model, SVM, so that you can see more models