## Pre-work - Import Essential Libraries

In [1]:
import pandas as pd
import numpy as np
from numpy import sqrt

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, confusion_matrix, auc, f1_score
from sklearn.metrics import precision_score, recall_score, roc_curve

import warnings
warnings.filterwarnings('ignore')
from subprocess import check_output

In [2]:
#Data visualization
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
#set visual parameters
%matplotlib inline
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 14,7

In [3]:
#for printing in bold style
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

## Regression

In [4]:
petrol = pd.read_csv("petrol_consumption.csv")
petrol.head(2)

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524


In [5]:
petrol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
Petrol_tax                      48 non-null float64
Average_income                  48 non-null int64
Paved_Highways                  48 non-null int64
Population_Driver_licence(%)    48 non-null float64
Petrol_Consumption              48 non-null int64
dtypes: float64(2), int64(3)
memory usage: 2.0 KB


### This dataset is a small and clean dataset without any missing values.We can now separate the target variable and split the data for train and test datasets.

In [6]:
#separate the dependent and independent variables.
target = petrol['Petrol_Consumption']
train = petrol.copy()
train.drop('Petrol_Consumption', axis =1, inplace = True)

#### SPLIT DATA

In [7]:
train_x, test_x, train_y, test_y = train_test_split(train, target, test_size = 0.2, random_state = 32)

### Now lets HOLD out the TEST dataset for final predictions. And by using bootstrapping, lets generate random sample splits on the TRAIN dataset.We will NOT use the TEST dataset for tuning our model.

In [8]:
nrows = train_x.shape[0] #the number of rows of train dataset.
all_indices = list(range(nrows))

## BootStrapping

### We will train our initial Base Model using Bootstrapping Sampling technique
Note: Why Bootstrapping? Since this is a smaller dataset, KFold doesnot hold good. We can consider a sampling technique with replacement. Hence we will use BootStrapping resampling technique. LOOCV can be another option.
STEP 1: Firstly, using bootstrapping we will generate new TRAIN set and VALIDATION set.
STEP 2: We will iterate the resampling technique for n=10 number of times to generate random train and validation sets.
STEP 3: For each iteration, we will predict the labels on VALIDATION set and calculate the evaluation metrics(in this case-rmse, r2, mae).
STEP 4: We will calculate the mean of all evaluation metrics to get the overall scores of our base model.
STEP 5: Finally, we will use the TEST set, only once, to evaluate the base model on unseen observations.

In [9]:
dtree_boot = DecisionTreeRegressor(random_state=123)
mean_rmse = 0.0
mean_r2 = 0.0
mean_mae = 0.0
n = 10 #running the bootstrap for 10 iterations.
for i in range(n):
    #for each iteration, randomly generate new train and validation set.
    train_indices = resample(all_indices, replace=True, n_samples=int(0.7*nrows), random_state=i*32)
    validation_indices = [index for index in list(range(nrows)) if index not in train_indices]
    
    new_train_x = train_x.take(train_indices) #get the new train_x samples
    validation_x = train_x.take(validation_indices) #get the new validation_x samples
    
    new_train_y = train_y.take(train_indices)
    validation_y = train_y.take(validation_indices)
    
    dtree_boot.fit(new_train_x, new_train_y) #fit the model on the new train set
    
    preds_boot = dtree_boot.predict(validation_x)#predict the labels on the validation set
    rmse_boot = np.sqrt(mean_squared_error(validation_y, preds_boot))
    r2_boot = r2_score(validation_y, preds_boot)
    mae_boot = mean_absolute_error(validation_y, preds_boot)

    mean_rmse += rmse_boot
    mean_r2 += r2_boot
    mean_mae += mae_boot

### Now that we got the evaulation metrics from 10 iterations of validation sets, lets calculate the mean to get an overall metrics.

In [10]:
mean_rmse = mean_rmse/n
mean_r2 = mean_r2/n
mean_mae = mean_mae/n
printmd('**Evaluation Metrics on Validation Set:**')
print("Mean RMSE:{}  | Mean R2 Score:{}  |  Mean MAE:{}".format(mean_rmse,mean_r2, mean_mae))

#predict on the test set for base test evaluation metrics.
preds_base = dtree_boot.predict(test_x)
rmse_base = np.sqrt(mean_squared_error(test_y, preds_base))
r2_base = r2_score(test_y, preds_base)
mae_base = mean_absolute_error(test_y, preds_base)

printmd('**Evaluation Metrics on Test Set:**')
print("Test RMSE:{}  | Test R2 Score:{}  |  Test MAE:{}".format(rmse_base,r2_base, mae_base))

**Evaluation Metrics on Validation Set:**

Mean RMSE:145.74540117052433  | Mean R2 Score:-0.5529971779064724  |  Mean MAE:110.81994754041969


**Evaluation Metrics on Test Set:**

Test RMSE:44.74371464239419  | Test R2 Score:0.4897230944904368  |  Test MAE:34.4


## GridSearchCV

### une the base model by tweaking hyperparameters using GridSearchCV

In [11]:
#define list of parameters for the the DecisionTreeRegressor
param_grid = {'criterion' : ['mse', 'mae'],
              'max_depth': [2,4,6,8,None],
              'min_samples_leaf': [0.5, 1, 2],
              'min_samples_split':[2, 3, 4, 6],
              'max_leaf_nodes' : [None,2, 4, 6],
              'max_features': [2,3,4]}

#define a tuned_tree using GridSearchCV with the above set param_grid and with different set of folds.
dtree_gs = GridSearchCV(DecisionTreeRegressor(random_state=123), param_grid=param_grid, 
                           scoring = 'neg_mean_absolute_error', cv =10)#Using scoring method as 'neg_mean_absolute_error', model aims at reducing the MAE. 
#We can also try with other scoring methods.
dtree_gs.fit(train_x,train_y)

printmd("**Best Parameters After Tuning:**")
print(dtree_gs.best_params_)

**Best Parameters After Tuning:**

{'criterion': 'mse', 'max_depth': 2, 'max_features': 2, 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


## Final Model

### Using Best Parameters from GridSearchCV, we will train a final model.

In [13]:
dtree_final = DecisionTreeRegressor(criterion = 'mse', max_depth = 2, max_features = 2,
                                    min_samples_leaf = 2, min_samples_split = 2, random_state=123)
dtree_final.fit(train_x, train_y)

#Final Test Predictions
preds_final = dtree_final.predict(test_x)
rmse_final = np.sqrt(mean_squared_error(test_y, preds_final))
r2_final = r2_score(test_y, preds_final)
mae_final = mean_absolute_error(test_y, preds_final)

#print base evaluation metrics
printmd('**Base Evaluation Metrics on Validation Set:**')
print("Mean RMSE:{}  | Mean R2 Score:{}  |  Mean MAE:{}".format(mean_rmse,mean_r2, mean_mae))
printmd('**Base Evaluation Metrics on Test Set:**')
print("Test RMSE:{}  | Test R2 Score:{}  |  Test MAE:{}".format(rmse_base,r2_base, mae_base))
printmd('**Final Evaluation Metrics on Test Set:**')
print("Test RMSE:{}  | Test R2 Score:{}  |  Test MAE:{}".format(rmse_final,r2_final, mae_final))

printmd("**Parameters Before Tuning:**")
print(dtree_boot.get_params())
printmd("**Parameters After Tuning:**")
print(dtree_final.get_params())

**Base Evaluation Metrics on Validation Set:**

Mean RMSE:145.74540117052433  | Mean R2 Score:-0.5529971779064724  |  Mean MAE:110.81994754041969


**Base Evaluation Metrics on Test Set:**

Test RMSE:44.74371464239419  | Test R2 Score:0.4897230944904368  |  Test MAE:34.4


**Final Evaluation Metrics on Test Set:**

Test RMSE:36.83345514635533  | Test R2 Score:0.6541985904891294  |  Test MAE:28.088888888888892


**Parameters Before Tuning:**

{'criterion': 'mse', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 123, 'splitter': 'best'}


**Parameters After Tuning:**

{'criterion': 'mse', 'max_depth': 2, 'max_features': 2, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 123, 'splitter': 'best'}


### ------------------------------------------INFERENCE------------------------------------------
we can conclude that with the best parameters,
TEST Evaluation metrics have further improved significantly.
R2 Score improved from 0.48 to 0.65
RMSE reduced from 44.74 to 36.83
MAE reduced from 34.4 to 28.08.
We can also play around with more parameters to get a fine tuned model.