# Exercise 11

## Car Price Prediction

Predict if the price of a car is low or high

In [1]:
%matplotlib inline
import pandas as pd

data = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTrain_carListings.zip')
data = data.loc[data['Model'].str.contains('Camry')].drop(['Make', 'State'], axis=1)
data = data.join(pd.get_dummies(data['Model'], prefix='M'))
data['HighPrice'] = (data['Price'] > data['Price'].mean()).astype(int)
data = data.drop(['Model', 'Price'], axis=1)

data.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE,HighPrice
15,2016,29242,0,0,0,0,1,0,0,1
47,2015,26465,0,0,0,0,1,0,0,1
85,2012,46739,0,1,0,0,0,0,0,1
141,2017,41722,0,0,0,0,0,1,0,1
226,2014,77669,0,0,0,0,0,0,1,0


In [2]:
data.shape

(13150, 10)

In [3]:
y = data['HighPrice']
X = data.drop(['HighPrice'], axis=1)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


# Exercise 11.1

Estimate a Decision Tree Classifier Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [5]:
import numpy as np
max_depth = None
num_pct = 10
max_features = None
min_gain=0.001

In [15]:
def gini(y):
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)

In [16]:
def gini_impurity(X_col, y, split):
    "Calculate the gain of an split k on feature j"
    
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]
    
    n_l = y_l.shape[0]
    n_r = y_r.shape[0]
    
    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)
    
    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)
    
    return gini_impurity_

In [17]:
def best_split(X, y, num_pct=10):
    
    features = range(X.shape[1])
    
    best_split = [0, 0, 0]  # j, split, gain
    
    # For all features
    for j in features:
        
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct+1)).tolist())
        splits = np.unique(splits)[1:]
        
        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
                        
            if gain > best_split[2]:
                best_split = [j, split, gain]
    
    return best_split

In [18]:
def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10):
    
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # Calculate the best split
    j, split, gain = best_split(X, y, num_pct)
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    
    return tree

In [19]:
tree_grow(X, y, level=0, min_gain=0.001, max_depth=1, num_pct=10)

{'y_pred': 1,
 'y_prob': 0.5795316301703163,
 'level': 0,
 'split': [1, 51704.54545454545],
 'n_samples': 13150,
 'gain': 0.23348567756020572,
 'sl': {'y_pred': 1,
  'y_prob': 0.8377538829151733,
  'level': 1,
  'split': -1,
  'n_samples': 8368,
  'gain': 0.0359166442135464},
 'sr': {'y_pred': 0,
  'y_prob': 0.12771739130434784,
  'level': 1,
  'split': -1,
  'n_samples': 4782,
  'gain': 0.04846022210319853}}

In [20]:
tree = tree_grow(X, y, level=0, min_gain=0.001, max_depth=3, num_pct=10)
tree

{'y_pred': 1,
 'y_prob': 0.5795316301703163,
 'level': 0,
 'split': [1, 51704.54545454545],
 'n_samples': 13150,
 'gain': 0.23348567756020572,
 'sl': {'y_pred': 1,
  'y_prob': 0.8377538829151733,
  'level': 1,
  'split': [0, 2014.0],
  'n_samples': 8368,
  'gain': 0.0359166442135464,
  'sl': {'y_pred': 0,
   'y_prob': 0.3403880070546737,
   'level': 2,
   'split': [0, 2012.0],
   'n_samples': 565,
   'gain': 0.06001982703810749,
   'sl': {'y_pred': 0,
    'y_prob': 0.058823529411764705,
    'level': 3,
    'split': -1,
    'n_samples': 151,
    'gain': 0.005661757290357922},
   'sr': {'y_pred': 0,
    'y_prob': 0.44471153846153844,
    'level': 3,
    'split': -1,
    'n_samples': 414,
    'gain': 0.030742158715547196}},
  'sr': {'y_pred': 1,
   'y_prob': 0.8737988468930173,
   'level': 2,
   'split': [0, 2015.0],
   'n_samples': 7803,
   'gain': 0.015250286354762527,
   'sl': {'y_pred': 1,
    'y_prob': 0.731399157697707,
    'level': 3,
    'split': -1,
    'n_samples': 2135,
    'ga

In [21]:
def tree_predict(X, tree, proba=False):
    
    predicted = np.ones(X.shape[0])

    # Check if final node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
            
    else:
        
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]

        if X_l.shape[0] == 0:  # If left node is empty only continue with right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If right node is empty only continue with left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)

    return predicted

In [23]:
tree_predict(X_test, tree)

array([0., 1., 1., ..., 1., 1., 0.])

In [35]:
from sklearn import metrics
Mtrs=pd.DataFrame(columns=['Model','RMSE','ACC',"F1"],data=[])
Mtrs.shape[0]
y_pred1 = tree_predict(X_test, tree)
RMSE=np.sqrt(metrics.mean_squared_error(y_pred1, y_test))
ACC=metrics.accuracy_score(y_pred1, y_test)
F1=metrics.f1_score(y_pred1, y_test)
Mtrs.loc[0] = ['DTree',RMSE,ACC,F1]
Mtrs

Unnamed: 0,Model,RMSE,ACC,F1
0,DTree,0.371508,0.861982,0.883304


# Exercise 11.2

Estimate a Bagging of 10 Decision Tree Classifiers Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [25]:
n_estimators = 10
# set a seed for reproducibility
np.random.seed(123)
n_samples = X_train.shape[0]
# create bootstrap samples (will be used to select rows from the DataFrame)
samples = [np.random.choice(a=n_samples, size=n_samples, replace=True) for _ in range(n_estimators)]

In [29]:
np.random.seed(123)
seeds = np.random.randint(1, 10000, size=n_estimators)
trees = {}
for i in range(n_estimators):
    trees[i] = tree_grow(X_train.iloc[samples[i]], y_train.iloc[samples[i]], level=0, min_gain=0.001, max_depth=3, num_pct=10)
    
# Predict 
y_pred_df = pd.DataFrame(index=X_test.index, columns=list(range(n_estimators)))
for i in range(n_estimators):
    y_pred_df.iloc[:, i] = tree_predict(X_test, trees[i])

In [31]:
y_pred_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
332784,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
146436,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
130476,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
85618,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
y_pred2 = (y_pred_df.sum(axis=1) >= (n_estimators / 2)).astype(np.int)
RMSE=np.sqrt(metrics.mean_squared_error(y_pred2, y_test))
ACC=metrics.accuracy_score(y_pred2, y_test)
F1=metrics.f1_score(y_pred2, y_test)
Mtrs.loc[1] = ['MV_Bagging_10_DTree',RMSE,ACC,F1]
Mtrs

Unnamed: 0,Model,RMSE,ACC,F1
0,DTree,0.371508,0.861982,0.883304
1,MV_Bagging_10_DTree,0.364622,0.867051,0.891562


# Exercise 11.3

Implement the variable max_features on the Decision Tree Classifier created in 11.1.

Compare the impact in the results by varing the parameter max_features

Evaluate the accuracy on the testing set

In [121]:
def tree_grow2(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10, max_features=None):
    
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
   
    # Calculate the X with the max_features
    # Handlig possible errors:
    if max_features > len(X.columns):
        return print("Error, el parámetro max_features no debe ser mayor a\nX: ",len(X.columns))
    else:
        # If None, then max_features = n_features, X = X
        np.random.seed(123)
        if max_features != None:
            # Select max_features random features:
            max_feat = max_features
            feat=np.random.choice(a=len(X.columns), size=max_feat, replace=False)
            feat.sort()
            X = X.iloc[:,feat]

    # Calculate the best split
    j, split, gain = best_split(X, y, num_pct)
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    
    return tree

In [122]:
max_feat = range(0,X_train.shape[1]+1)

for i in max_feat:
    tree2 = tree_grow2(X_train, y_train, level=0, min_gain=0.001, max_depth=3, num_pct=10, max_features=i)
    y_pred3 = pd.DataFrame(tree_predict(X_test, tree2))
    RMSE=np.sqrt(metrics.mean_squared_error(y_pred3, y_test))
    ACC=metrics.accuracy_score(y_pred3, y_test)
    F1=metrics.f1_score(y_pred3, y_test)
    Mtrs.loc[i+2] = ["DTree_MF: %.0f%s"%(i,'f'),RMSE,ACC,F1]

#tree2 = tree_grow2(X, y, level=0, min_gain=0.001, max_depth=3, num_pct=10)

In [123]:
Mtrs

Unnamed: 0,Model,RMSE,ACC,F1
0,DTree,0.371508,0.861982,0.883304
1,MV_Bagging_10_DTree,0.364622,0.867051,0.891562
2,DTree_MF: 0f,0.646151,0.582488,0.736168
3,DTree_MF: 1f,0.646151,0.582488,0.736168
4,DTree_MF: 2f,0.387893,0.849539,0.881596
5,DTree_MF: 3f,0.387893,0.849539,0.881596
6,DTree_MF: 4f,0.396122,0.843088,0.875616
7,DTree_MF: 5f,0.387596,0.84977,0.881756
8,DTree_MF: 6f,0.372747,0.86106,0.882799
9,DTree_MF: 7f,0.372747,0.86106,0.882799


# Exercise 11.4

Estimate a Bagging of 10 Decision Tree Classifiers with `max_features = log(n_features)`

Evaluate the accuracy on the testing set

In [124]:
n_estimators = 10
# set a seed for reproducibility
np.random.seed(123)
n_samples = X_train.shape[0]
# create bootstrap samples (will be used to select rows from the DataFrame)
samples = [np.random.choice(a=n_samples, size=n_samples, replace=True) for _ in range(n_estimators)]

In [125]:
np.random.seed(123)
seeds = np.random.randint(1, 10000, size=n_estimators)
max_feat = np.int(np.log(len(X_train.columns)))
trees = {}
for i in range(n_estimators):
    trees[i] = tree_grow2(X_train.iloc[samples[i]], y_train.iloc[samples[i]], level=0, min_gain=0.001, max_depth=3, num_pct=10, max_features=max_feat)
    
# Predict 
y_pred_df = pd.DataFrame(index=X_test.index, columns=list(range(n_estimators)))
for i in range(n_estimators):
    y_pred_df.iloc[:, i] = tree_predict(X_test, trees[i])

In [126]:
y_pred4 = (y_pred_df.sum(axis=1) >= (n_estimators / 2)).astype(np.int)
RMSE=np.sqrt(metrics.mean_squared_error(y_pred4, y_test))
ACC=metrics.accuracy_score(y_pred4, y_test)
F1=metrics.f1_score(y_pred4, y_test)
Mtrs.loc[12] = ['MV_Bagging_10_DTree_MF:log',RMSE,ACC,F1]
Mtrs

Unnamed: 0,Model,RMSE,ACC,F1
0,DTree,0.371508,0.861982,0.883304
1,MV_Bagging_10_DTree,0.364622,0.867051,0.891562
2,DTree_MF: 0f,0.646151,0.582488,0.736168
3,DTree_MF: 1f,0.646151,0.582488,0.736168
4,DTree_MF: 2f,0.387893,0.849539,0.881596
5,DTree_MF: 3f,0.387893,0.849539,0.881596
6,DTree_MF: 4f,0.396122,0.843088,0.875616
7,DTree_MF: 5f,0.387596,0.84977,0.881756
8,DTree_MF: 6f,0.372747,0.86106,0.882799
9,DTree_MF: 7f,0.372747,0.86106,0.882799


# Exercise 11.5

Using sklearn, train a RandomForestClassifier 10 arboles

Evaluate the accuracy on the testing set

In [130]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train,y_train)
y_pred5 = clf.predict(X_test)
RMSE=np.sqrt(metrics.mean_squared_error(y_pred5, y_test))
ACC=metrics.accuracy_score(y_pred5, y_test)
F1=metrics.f1_score(y_pred5, y_test)
Mtrs.loc[13] = ['RandomForest_10_Tree',RMSE,ACC,F1]
Mtrs

Unnamed: 0,Model,RMSE,ACC,F1
0,DTree,0.371508,0.861982,0.883304
1,MV_Bagging_10_DTree,0.364622,0.867051,0.891562
2,DTree_MF: 0f,0.646151,0.582488,0.736168
3,DTree_MF: 1f,0.646151,0.582488,0.736168
4,DTree_MF: 2f,0.387893,0.849539,0.881596
5,DTree_MF: 3f,0.387893,0.849539,0.881596
6,DTree_MF: 4f,0.396122,0.843088,0.875616
7,DTree_MF: 5f,0.387596,0.84977,0.881756
8,DTree_MF: 6f,0.372747,0.86106,0.882799
9,DTree_MF: 7f,0.372747,0.86106,0.882799


# Exercise 11.6

Find the best parameters of the RandomForestClassifier (max_depth, max_features, n_estimators)

Evaluate the accuracy on the testing set

In [138]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [1,10,20,30,40,50,60,70,80,90,100,110],
    'max_features': [1,2, 3,4,5,6,7,8,9],
    'n_estimators': [1,10, 20, 30, 40,50,60,70,80,90,100,110]
}

# Create a base model
rf = RandomForestClassifier(random_state = 42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)
grid_search.fit(X_train,y_train);
grid_search.best_params_

Fitting 3 folds for each of 2592 candidates, totalling 7776 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 7776 out of 7776 | elapsed: 20.1min finished


{'bootstrap': True, 'max_depth': 10, 'max_features': 5, 'n_estimators': 40}

In [139]:
clf2 = RandomForestClassifier(n_estimators=40, max_features=5, max_depth=10,bootstrap=True)
clf2.fit(X_train,y_train)
y_pred6 = clf2.predict(X_test)
RMSE=np.sqrt(metrics.mean_squared_error(y_pred6, y_test))
ACC=metrics.accuracy_score(y_pred6, y_test)
F1=metrics.f1_score(y_pred6, y_test)
Mtrs.loc[14] = ['RandomForest_Best_Param',RMSE,ACC,F1]
Mtrs

Unnamed: 0,Model,RMSE,ACC,F1
0,DTree,0.371508,0.861982,0.883304
1,MV_Bagging_10_DTree,0.364622,0.867051,0.891562
2,DTree_MF: 0f,0.646151,0.582488,0.736168
3,DTree_MF: 1f,0.646151,0.582488,0.736168
4,DTree_MF: 2f,0.387893,0.849539,0.881596
5,DTree_MF: 3f,0.387893,0.849539,0.881596
6,DTree_MF: 4f,0.396122,0.843088,0.875616
7,DTree_MF: 5f,0.387596,0.84977,0.881756
8,DTree_MF: 6f,0.372747,0.86106,0.882799
9,DTree_MF: 7f,0.372747,0.86106,0.882799
