# Hyperparameter Tuning
* Use the previous dataset to model a random forest.

- Three hyperparameters to tune:
 1. Feature Sampling
 2. Number of Estimators
 3. Min Samples Leaf

Show how you use each of these parameters by evaluating against your validation set. Do so by both plotting accuracy scores of the hyperparameters against the validation set, and with a sentence or two about how you made your decision.

- A couple of notes: Remember that there is an interaction between feature sampling and the number of estimators. Feature sampling increases the variance of the individual estimators, which tends to lead to a greater benefit from adding additional estimators.

For choosing the number of estimators, we can initially work with a lower number than we will in our final model. The goal initially is to choose a number so that we can model relatively quickly, yet still find the general trend in the data.

## My data in use: Automobile (insurance) Data Set
http://mlr.cs.umass.edu/ml/datasets/Automobile

In [1]:
import pandas as pd

# Read the imputed, feature-engineered CSV directly from its raw GitHub URL.
DATA_URL = "https://raw.githubusercontent.com/mhan1/Data-Science/master/Machine%20Learning_Linear%20Regression%20project_Minyeong%20Han_Data%20Science/imputed_engineered_features.csv"
df1 = pd.read_csv(DATA_URL)
df1.head(2)

Unnamed: 0.1,Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,0,0,0,0,0,0,0,1,0,122.0
1,1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,0,0,0,0,0,0,0,1,0,122.0


In [2]:
#checking the number of rows and columns
df1.shape

(205, 50)

In [3]:
# Remove the 'Unnamed: 0' column, which is not needed as a feature.
df1 = df1.drop(columns=['Unnamed: 0'])
df1.head(2)

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,122.0
1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,122.0


In [4]:
df1.columns

Index(['symboling', 'num_of_doors', 'wheel-base', 'length', 'width', 'height',
       'curb_weight', 'num_of_cylinders', 'engine_size', 'bore', 'stroke',
       'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg',
       'highway_mpg', 'price', 'engine_location_is_front', 'fuel_type_is_gas',
       'aspiration_is_std', 'make_honda', 'make_mazda', 'make_mitsubishi',
       'make_nissan', 'make_other', 'make_peugot', 'make_subaru',
       'make_toyota', 'make_volkswagen', 'make_volvo', 'body_style_hatchback',
       'body_style_other', 'body_style_sedan', 'body_style_wagon',
       'drive_wheels_fwd', 'drive_wheels_other', 'drive_wheels_rwd',
       'engine_type_dohc', 'engine_type_l', 'engine_type_ohc',
       'engine_type_ohcf', 'engine_type_ohcv', 'engine_type_other',
       'fuel_system_1bbl', 'fuel_system_2bbl', 'fuel_system_idi',
       'fuel_system_mpfi', 'fuel_system_other', 'normalized_losses'],
      dtype='object')

In [5]:
# Features: every column except the last one (the last column is used as the target).
X = df1.iloc[:, :-1]
X.head(2)

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other
0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,0,1,0
1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Target: the last column of df1.
y = df1.iloc[:, -1]
y.head(2)

0    122.0
1    122.0
Name: normalized_losses, dtype: float64

## Split the data into training, test, and validation sets

In [7]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of all rows as the test set.
X_remaining, X_test, y_remaining, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split: hold out 20% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_remaining, y_remaining, test_size=0.2, random_state=42
)

In [8]:
# Fraction of all rows (train + test + validation) that ended up in the test set.
len(X_test) / (len(X_train) + len(X_test) + len(X_val))

0.2

In [9]:
# Fraction of the non-test rows that ended up in the validation set.
len(X_val) / (len(X) - len(X_test))

0.20121951219512196

In [10]:
X_train[:1]

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other
28,-1,4.0,103.3,174.6,64.6,59.8,2535,4,122,3.34,...,0,1,0,0,0,0,1,0,0,0


In [11]:
X_val[:1]

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other
88,-1,4.0,96.3,172.4,65.4,51.6,2403,4,110,3.17,...,0,1,0,0,0,0,0,0,0,1


In [12]:
X_test[:1]

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other
15,0,4.0,103.5,189.0,66.9,55.7,3230,6,209,3.62,...,0,1,0,0,0,0,0,0,1,0


In [13]:
X_train.shape

(131, 48)

In [14]:
X_test.shape

(41, 48)

In [15]:
X_val.shape

(33, 48)

## 1. Sampling the features

In [16]:
from sklearn.ensemble import RandomForestRegressor    

def max_features_score(max_features_list, n_estimators=50, random_state=1):
    """Score a random forest on the validation set for each max_features value.

    Every forest is fit on the notebook-level ``X_train`` / ``y_train`` and
    scored on the notebook-level ``X_val`` / ``y_val``; those names must be
    defined before this function is called.

    Args:
        max_features_list: Iterable of values to pass as ``max_features`` to
            ``RandomForestRegressor`` (the notebook uses 0.5, 'sqrt', 'log2'
            and None).
        n_estimators: Number of trees in each forest. Defaults to 50, the
            value previously hard-coded here.
        random_state: Seed passed to each forest so runs are repeatable.
            Defaults to 1, the value previously hard-coded here.

    Returns:
        A list of ``[max_features, score]`` pairs, in the same order as
        ``max_features_list``, where ``score`` is the value returned by
        ``RandomForestRegressor.score(X_val, y_val)``.
    """
    feature_sampling_scores = []
    for max_feat in max_features_list:
        rfr_max_feature = RandomForestRegressor(
            n_estimators=n_estimators,
            random_state=random_state,
            max_features=max_feat,
        )
        rfr_max_feature.fit(X_train, y_train)
        score = rfr_max_feature.score(X_val, y_val)
        feature_sampling_scores.append([max_feat, score])
    return feature_sampling_scores

In [17]:
# Try four candidate max_features settings and display their validation scores.
max_features_score([0.5, 'sqrt', 'log2', None])

[[0.5, 0.42474476377455306],
 ['sqrt', 0.37851450786770213],
 ['log2', 0.35876288524526434],
 [None, 0.3931832022859586]]

### From the list above, a max_features of 0.5 or None performs best on the validation set for my dataset. Since 0.5 scores highest, I will use 0.5.

## 2. Number of Trees (Estimators for random forest)

In [18]:
from sklearn.ensemble import RandomForestRegressor    

# Baseline forest for the tree-count study: 50 trees with max_features=0.5.
rfr_n_trees = RandomForestRegressor(
    max_features=0.5,
    n_estimators=50,
    random_state=1,
)
rfr_n_trees.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [19]:
# Validation-set score of the 50-tree forest.
rfr_n_trees.score(X_val, y_val)

0.42474476377455306

In [20]:
rfr_n_trees.estimators_[47:50]

[DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=0.5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=77964601, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=0.5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1616579073, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=0.5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1567750016, splitter='best')]

In [21]:
import numpy as np

# One row of validation-set predictions per individual tree in the forest.
tree_predictions = np.vstack(
    [tree.predict(X_val) for tree in rfr_n_trees.estimators_]
)

In [22]:
tree_predictions.shape

(50, 33)

In [23]:
tree_predictions[0:2]

array([[125., 128.,  95., 161., 122.,  91.,  94., 122., 113., 104.,  94.,
         85., 154., 122., 118., 104., 154., 122., 125., 150., 186., 119.,
         65., 122.,  85.,  95., 154., 168., 145., 122.,  95.,  94., 158.],
       [153., 128.,  93.,  94., 122.,  91., 168., 122., 113., 104.,  85.,
         85., 134., 118., 101., 161., 134., 122., 153., 150., 134., 161.,
         65.,  87.,  83.,  93.,  85., 101., 145.,  94.,  74.,  94., 158.]])

In [24]:
#computing the r2_score of the averaged prediction as trees are added one at a time
import numpy as np
from sklearn.metrics import r2_score

# Entry k-1 is the validation r2_score of the mean prediction of the first k trees.
r2_scores = []
for n_trees in range(1, len(rfr_n_trees.estimators_) + 1):
    running_mean = np.mean(tree_predictions[:n_trees], axis=0)
    r2_scores.append(r2_score(y_val, running_mean))

In [25]:
r2_scores

[-0.0005296970521631028,
 0.31627530639815715,
 0.44276329075238574,
 0.3976515346635694,
 0.35311695127759035,
 0.3698252620677517,
 0.3954791482241312,
 0.3930856690827007,
 0.4168706201301199,
 0.4054439007085304,
 0.39983313880183313,
 0.40009379039547466,
 0.42410550612911335,
 0.4246098951297028,
 0.41787516428168003,
 0.4083073787346072,
 0.40608646648161584,
 0.3928131782075107,
 0.3920442703912769,
 0.40289901009826135,
 0.39539060091375144,
 0.39448663343285206,
 0.4018277648709866,
 0.4115868513505926,
 0.4142532126855155,
 0.4160447317529129,
 0.42369765659400327,
 0.4187171579893607,
 0.4162057793624009,
 0.4147085991267828,
 0.41035769541969946,
 0.4101035465290568,
 0.4117942735384833,
 0.40861647782753496,
 0.40423256804787233,
 0.41529720457765273,
 0.417691481485565,
 0.41210532292700985,
 0.42010068918286647,
 0.4151873150312474,
 0.4205369732786254,
 0.42618029167950167,
 0.4303464303507998,
 0.428061720198908,
 0.4234065717951587,
 0.42260655816045345,
 0.421934688

In [26]:
from graph import trace_values, plot
# x-axis: ensemble size, from 1 up to the total number of trees in the forest.
total_trees = len(rfr_n_trees.estimators_)
x_vals = list(range(1, total_trees + 1))

# Plot the running r2_score against the number of trees averaged.
trace = trace_values(x_vals, r2_scores)
plot([trace])

### In a RandomForestRegressor, adding more trees does not hurt the model, but it takes more time to train a random forest with more trees (estimators). Based on the scores above, with max_features of 0.5, the r2_score of the mean prediction of the estimators shows an early, noisy spike at 3 estimators, reaches about 0.42 at 14 estimators (decision trees), and stays on a plateau of roughly 0.39–0.43 after that. Hence, I will use 14 estimators (decision trees) to limit the number of estimators.

## 3. Minimum samples per leaf (min_samples_leaf)

In [27]:
from sklearn.ensemble import RandomForestRegressor

# Candidate min_samples_leaf values: every integer from 2 through 50.
min_samples = list(range(2, 51))
def find_min_samples_leaf(min_samples, n_estimators=14, max_features=0.5,
                          random_state=1):
    """Score a random forest on the validation set for each min_samples_leaf.

    Every forest is fit on the notebook-level ``X_train`` / ``y_train`` and
    scored on the notebook-level ``X_val`` / ``y_val``; those names must be
    defined before this function is called.

    Args:
        min_samples: Iterable of values to pass as ``min_samples_leaf`` to
            ``RandomForestRegressor``.
        n_estimators: Number of trees in each forest. Defaults to 14, the
            value previously hard-coded here.
        max_features: Feature-sampling setting for each forest. Defaults to
            0.5, the value previously hard-coded here.
        random_state: Seed passed to each forest so runs are repeatable.
            Defaults to 1, the value previously hard-coded here.

    Returns:
        A list of ``[min_samples_leaf, score]`` pairs, in the same order as
        ``min_samples``, where ``score`` is the value returned by
        ``RandomForestRegressor.score(X_val, y_val)``.
    """
    scores = []
    for min_sample in min_samples:
        rfr_min_sample = RandomForestRegressor(
            min_samples_leaf=min_sample,
            n_estimators=n_estimators,
            random_state=random_state,
            max_features=max_features,
        )
        rfr_min_sample.fit(X_train, y_train)
        score = rfr_min_sample.score(X_val, y_val)
        scores.append([min_sample, score])
    return scores

In [28]:
# Validation score for every candidate min_samples_leaf value.
scores = find_min_samples_leaf(min_samples)
scores

[[2, 0.4427916863445995],
 [3, 0.44611710774064506],
 [4, 0.46286045376950197],
 [5, 0.3629140727485617],
 [6, 0.33383081295145467],
 [7, 0.34425196758385745],
 [8, 0.30853740043735844],
 [9, 0.31791664986498214],
 [10, 0.3067690796229522],
 [11, 0.2812803706643987],
 [12, 0.2840319654048178],
 [13, 0.2723435761406978],
 [14, 0.22143403734507794],
 [15, 0.168629456251969],
 [16, 0.16259471818415083],
 [17, 0.16444704849337355],
 [18, 0.1547625634926445],
 [19, 0.13592442421392958],
 [20, 0.12880301645275838],
 [21, 0.12598783712411243],
 [22, 0.11910855515897924],
 [23, 0.11186840012360165],
 [24, 0.10913119760182455],
 [25, 0.11867702813635095],
 [26, 0.10364405345886507],
 [27, 0.1038644127021755],
 [28, 0.09603748128599554],
 [29, 0.05541565712191444],
 [30, 0.05688672340669044],
 [31, 0.05688672340669044],
 [32, 0.07966958865888663],
 [33, 0.08023273869087999],
 [34, 0.06193726799750432],
 [35, 0.05413282322218482],
 [36, 0.047311516182212965],
 [37, 0.043687112889255975],
 [38, 0.

In [29]:
import numpy as np
# Turn the list of [min_samples_leaf, score] pairs into a 2-D array so each column can be sliced out.
wrapped_scores = np.array(scores)

In [30]:
wrapped_scores[:5]

array([[2.        , 0.44279169],
       [3.        , 0.44611711],
       [4.        , 0.46286045],
       [5.        , 0.36291407],
       [6.        , 0.33383081]])

In [31]:
from graph import trace_values, plot
# Column 0 holds min_samples_leaf, column 1 the matching validation score.
leaf_sizes, leaf_scores = wrapped_scores[:, 0], wrapped_scores[:, 1]
trace_1 = trace_values(leaf_sizes, leaf_scores)
plot([trace_1])

### Based on the graph above, the validation score trends downward once min_samples_leaf is larger than 4. Hence, I will choose 4 for my final model.

In [32]:
X_val.shape

(33, 48)

In [33]:
X_train.shape

(131, 48)

## 4. Evaluation

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Final model built from the three hyperparameter values chosen in sections 1-3.
rfr_tuned = RandomForestRegressor(
    n_estimators=14,
    max_features=0.5,
    min_samples_leaf=4,
    random_state=1,
)

In [35]:
# Merge the training and validation rows (training first) for the final fit.
# pd.concat keeps the result a DataFrame, whereas np.vstack would return a
# bare array without column names; keeping the names means the data the model
# is fit on has the same labelled columns as the X_test DataFrame it is
# scored on.
combined_X = pd.concat([X_train, X_val])
combined_X.shape

(164, 48)

In [36]:
# Join the training and validation targets, training first, to line up with combined_X.
combined_y = np.concatenate((y_train, y_val))
combined_y.shape

(164,)

In [37]:
# Fit on the training split only and score on the validation split.
rfr_tuned.fit(X_train, y_train)
rfr_tuned.score(X_val, y_val)

0.46286045376950197

In [38]:
# Refit on the training + validation rows, then score on the held-out test set.
rfr_tuned.fit(combined_X, combined_y)
rfr_tuned.score(X_test, y_test)

0.548243519192865

### My model's R² score on the test set is now 0.55, which is the best result so far.