### Regression with Decision trees and random forests

In [None]:
# We already know how to solve one type of supervised problem with tree-based models: classification
# However, these methods can also be used for the other class of supervised learning: regression
# In the following, we will go through the process of applying Regression Trees and Random Forest Regression

In [1]:
# Here are the important libraries
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Classification performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Logistic regression
from sklearn.linear_model import LogisticRegression

# Decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Grid search
from sklearn.model_selection import GridSearchCV

In [2]:
# Classification trees grow branches from optimal splits and predict, in each final branch,
# the class that holds the majority of the datapoints in that branch.
# Regression trees follow a similar idea: we split the data (creating branches)
# so that each resulting subset is homogeneous, i.e. contains similar datapoints.
#   - classification: the points should mainly come from the same class
#   - regression: the outcome values of the points should be as close to each other as possible
# In practice, the mean squared error is used to measure this.
# Once we decide that a branch is final, its prediction is the average of the values in the branch.
# We call this a regression tree, in contrast to a classification tree.

# We start with a dataset describing cars:
# we check whether some measurements can predict the fuel consumption of a car.

cars = pd.read_csv(filepath_or_buffer='auto.csv')

cars.head(n=5)

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [None]:
# One data transformation is needed before modelling: convert origin to dummy variables.
# Steps: build the dummy columns (dropping the first level, which the remaining columns already imply),
# append them to the original dataset, and remove the now-redundant origin column.

cars_origin = pd.get_dummies(cars['origin'], drop_first=True)

cars = pd.concat(objs=[cars, cars_origin], axis='columns').drop(columns='origin')

cars.head()

In [None]:
# To perform regression with trees, we have a new estimator, DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

# In regression problems the error is not measured by the confusion matrix
# There are alternative measures; mean squared error is one that is typically used
# We want it to get as low as possible (error = prediction - original value, we want the error to be small)

from sklearn.metrics import mean_squared_error as MSE

# The start of the process is the same, we create train and test sets
# The features are every column except the target. They are selected by name rather than
# by position, so the target cannot slip into X if the column order of the file ever changes.
X = cars.drop(columns='mpg')
y = cars['mpg']

# 20% of the rows are held out for testing; random_state makes the split reproducible
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

# We create an instance of a DecisionTreeRegressor
# (no depth or leaf-size limit is set here, so the tree is grown without any pruning)
cars_reg = DecisionTreeRegressor(random_state = 0)

# Finally we fit the data

cars_tree_fit = cars_reg.fit(X_train, y_train)

In [None]:
# The fitted tree can also be visualized: first we export its description as text

viz_tree = tree.export_graphviz(
    decision_tree=cars_tree_fit,
    feature_names=X.columns,
    out_file=None,
)

# Then we turn that description into a graph object

graph = graphviz.Source(source=viz_tree)

# And save the drawing in an external file
graph.render(filename="viz_tree")

# As we can see when we open the file, the same thing happened as with the classification tree:
# there is no pruning, so the tree keeps splitting until its leaves hold very few datapoints
# (often a single one), as here we do not differentiate classes but numeric values

In [None]:
# To see how well the tree performs, we predict the test set and calculate the MSE

y_pred = cars_tree_fit.predict(X_test)

mse_1 = MSE(y_true=y_test, y_pred=y_pred)

print('MSE of unpruned tree is', mse_1)

In [None]:
# In the case of regression trees it is even more crucial to prune the tree
# Let's do that using some of the available arguments (mostly the same ones as with classification trees)

# We limit the depth of the tree to 8 and require every leaf to contain at least 10 samples
cars_reg_2 = DecisionTreeRegressor(
    random_state=0,
    max_depth=8,
    min_samples_leaf=10,
)

# Fit the pruned tree on the training data

cars_tree_fit_2 = cars_reg_2.fit(X_train, y_train)

# Predict the test set and measure the error

y_pred = cars_tree_fit_2.predict(X_test)
mse_2 = MSE(y_true=y_test, y_pred=y_pred)

# We improved a lot. In regression we do not have any specific error value to target:
# we just try to decrease this value as much as we can by changing the parameters

print('MSE of pruned tree is', mse_2)

In [3]:
# Preview of the raw data file, for reference.
# It is loaded under its own name so that the dummy-encoded `cars` frame built earlier
# is not overwritten: rebinding `cars` to the raw data would put the text column `origin`
# back, and re-running the train/test split afterwards would hand strings to the models.
cars_raw = pd.read_csv('auto.csv')
cars_raw.head()

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [None]:
# To get an idea of how good the trees are, we compare them to a linear regression baseline

from sklearn.linear_model import LinearRegression

# Same training data as for the trees
cars_lr = LinearRegression()
cars_lr_fit = cars_lr.fit(X_train, y_train)

# Same test data and same error measure as for the trees
y_pred = cars_lr_fit.predict(X_test)
mse_3 = MSE(y_true=y_test, y_pred=y_pred)

# Simple linear regression performs better

print('MSE of linear regression is', mse_3)

In [None]:
# We can use our knowledge of grid search to try to improve the regression tree

# Candidate values for the two pruning parameters
params_dt = dict(
    max_depth=[2, 3, 4],
    min_samples_leaf=[10, 15, 20, 25],
)

# 5-fold cross-validated search; the scoring is the negated MSE, so a higher score means a lower error
grid_dt = GridSearchCV(
    cars_reg,
    params_dt,
    cv=5,
    scoring='neg_mean_squared_error',
)

grid_dt.fit(X_train, y_train)

In [None]:
# The grid search gives us the best model it found

best_model = grid_dt.best_estimator_

# which we evaluate on the test set in the same way as before

y_pred = best_model.predict(X_test)
mse_4 = MSE(y_true=y_test, y_pred=y_pred)

# We improved somewhat, but still worse than linear regression

print('MSE of optimized regression tree is', mse_4)

In [None]:
# Random forests can be used for regression as well
# They work the same way as regression trees

# A new estimator has to be imported for this
from sklearn.ensemble import RandomForestRegressor

# The regressor is created with the same main parameter as before: the number of base estimators

cars_rf = RandomForestRegressor(random_state=0, n_estimators=200)

# Fit on the training set
cars_rf.fit(X_train, y_train)

# Predict the test set and compute the error
y_pred_rf = cars_rf.predict(X_test)
mse_rf = MSE(y_true=y_test, y_pred=y_pred_rf)

# As we can see, without any parameter selection, we already got better results than linear regression

print('MSE of random forests is', mse_rf)

In [None]:
# Let's try to optimize the random forest with a grid search

# We define the grid of candidate values for the pruning parameters
grid = dict(max_depth = [4, 6, 8], min_samples_leaf = [10, 15, 20])

forest_cars = RandomForestRegressor(n_estimators = 200, random_state = 0)

# 5-fold cross-validation, scored with the negated MSE (higher score = lower error),
# i.e. the same setup as the grid search for the regression tree
grid_search = GridSearchCV(estimator=forest_cars, param_grid=grid, scoring='neg_mean_squared_error', cv=5)

# The search runs on the training data only. Tuning on the full dataset would let the
# test rows influence the parameter choice, and the test MSE would no longer be an
# honest estimate of the error on unseen data.
grid_result = grid_search.fit(X_train, y_train)

# Print out the best result
print("Best result is obtained using", grid_result.best_params_)

In [None]:
# We build the forest from the parameters selected by the grid search instead of typing
# values by hand, so the model always matches the search result
# (best_params_ holds the chosen max_depth and min_samples_leaf)
cars_rf = RandomForestRegressor(n_estimators = 200, random_state = 0, **grid_result.best_params_)

# We fit the training set
cars_rf_fit = cars_rf.fit(X_train, y_train)

# Create predictions
y_pred_rf = cars_rf_fit.predict(X_test)

mse_rf = MSE(y_test, y_pred_rf)

# This is the error of the tuned forest: compare it with the MSE of the untuned forest
# and of the linear regression to see whether the parameter selection helped

print('MSE of random forests is', mse_rf)

### Feature importance

When applying tree-based models, it can also be important to understand which variables have the greatest impact on determining the outcome. This is not really possible to see from the tree itself, and especially not from a forest, but it can be extracted easily from a fitted model. This works the same way for classification and regression problems; we will look at it now for the regression models created above.

In [None]:
# We start with decision trees
# Feature importance is available directly through the feature_importances_ attribute
# Here we inspect our best regression tree model, labelling each value with its column name

# We can see that only three variables play a role in the prediction

pd.Series(best_model.feature_importances_, index=X_train.columns)

In [None]:
# Now the same for the best random forest
# In this case at least some importance is assigned to each variable, but the most important one is the same
# Interestingly, size, which had no role in the regression tree model, is the second most important here

pd.Series(cars_rf_fit.feature_importances_, index=X_train.columns)