<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data-/-add-dummies" data-toc-modified-id="Load-data-/-add-dummies-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data / add dummies</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Train test split</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Random Forest</a></span></li><li><span><a href="#Ridge-Regression" data-toc-modified-id="Ridge-Regression-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Ridge Regression</a></span></li><li><span><a href="#Model-Stacking" data-toc-modified-id="Model-Stacking-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Model Stacking</a></span></li></ul></div>

# Model stacking example 
Author: Andrew Szwec


In [1]:
import numpy as np
import pandas as pd

## Load data / add dummies

In [2]:
# read in vehicle data
vehicles = pd.read_csv('used_vehicles.csv')

In [3]:
vehicles

Unnamed: 0,price,year,miles,doors,type
0,22000,2012,13000,2,car
1,14000,2010,30000,2,car
2,13000,2010,73500,4,car
3,9500,2009,78000,4,car
4,9000,2007,47000,4,car
5,4000,2006,124000,2,car
6,3000,2004,177000,4,car
7,2000,2004,209000,4,truck
8,3000,2003,138000,2,car
9,1900,2003,160000,4,car


In [4]:
# convert car to 0 and truck to 1
# vehicles['type'] = vehicles.type.map({'car':0, 'truck':1})
# OR

# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
type_dummies = pd.get_dummies(vehicles.type, prefix='type')

In [5]:
# concatenate the two dummy variable columns onto the original DataFrame
# note: axis=0 means rows, axis=1 means columns
vehicles = pd.concat([vehicles, type_dummies], axis=1)
vehicles.drop(['type'], axis=1, inplace=True)

In [6]:
vehicles

Unnamed: 0,price,year,miles,doors,type_car,type_truck
0,22000,2012,13000,2,1,0
1,14000,2010,30000,2,1,0
2,13000,2010,73500,4,1,0
3,9500,2009,78000,4,1,0
4,9000,2007,47000,4,1,0
5,4000,2006,124000,2,1,0
6,3000,2004,177000,4,1,0
7,2000,2004,209000,4,0,1
8,3000,2003,138000,2,1,0
9,1900,2003,160000,4,1,0


In [7]:
vehicles.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
price         14 non-null int64
year          14 non-null int64
miles         14 non-null int64
doors         14 non-null int64
type_car      14 non-null uint8
type_truck    14 non-null uint8
dtypes: int64(4), uint8(2)
memory usage: 556.0 bytes


In [8]:
vehicles.shape

(14, 6)

## Train test split

In [9]:
from sklearn.model_selection import train_test_split
X = vehicles.drop(['price'], axis=1)
y = vehicles['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state=121)

## Random Forest

In [11]:
# import class, instantiate estimator, fit with all data
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, median_absolute_error

rfreg = RandomForestRegressor(n_estimators=100, max_features='auto', oob_score=True, random_state=121)
rfreg.fit(X_train, y_train)
rf_mse = mean_squared_error( rfreg.predict(X_test), y_test)
rf_mae = median_absolute_error(rfreg.predict(X_test), y_test)
print('Random forest MSE =',rf_mse)
print('Random forest MAE =', rf_mae)

Random forest MSE = 560586.0
Random forest MAE = 695.0


## Ridge Regression

In [14]:
from sklearn.linear_model import RidgeCV

# 5 fold cross validation
ridge = RidgeCV(cv=5, normalize=True)
ridge.fit(X_train, y_train)
rd_mse = mean_squared_error( ridge.predict(X_test), y_test)
rd_mae = median_absolute_error( ridge.predict(X_test), y_test)
print('Ridge Regression MSE =',rd_mse)
print('Ridge Regression MAE =',rd_mae)

Ridge Regression MSE = 2034237.0831707644
Ridge Regression MAE = 289.79234546539374


## Model Stacking
1. Concatenate predictions from random forest and ridge regression into one df, using training set

In [16]:
df2 = pd.DataFrame( {'rf':rfreg.predict(X_train), 'rdg':ridge.predict(X_train)})
df2

Unnamed: 0,rdg,rf
0,4258.853615,2897.0
1,6606.370294,5291.0
2,2335.383256,2272.0
3,14021.046538,18395.0
4,9726.674783,9930.0
5,3036.779577,2735.0
6,12563.261718,15000.0
7,9817.938416,8180.0
8,5895.830471,2915.0
9,7711.38151,4756.0


'2. Use a linear model to stack the RF and Ridge together. This is like a weighted average of models

In [17]:
from sklearn.linear_model import LinearRegression

lin = LinearRegression()
lin.fit(df2, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

'3. Now predict on test set to get MSE

In [19]:
# Now predict on test set
testdf = pd.DataFrame( {'rf':rfreg.predict(X_test), 'rdg':ridge.predict(X_test)})
stacked_mse = mean_squared_error( lin.predict(testdf) , y_test)
stacked_mae = median_absolute_error( lin.predict(testdf) , y_test)
print('Stacked Model MSE =',stacked_mse)
print('Stacked Model MAE =',stacked_mae)

Stacked Model MSE = 123421.16884607814
Stacked Model MAE = 353.19440828881625


In [20]:
if stacked_mse < rd_mse and stacked_mse < rf_mse:
    print('Stacked Model MSE less than both RF and Ridge')

Stacked Model MSE less than both RF and Ridge


In [None]:
# Test Data
# 4000	2006	124000	2	1	0

In [21]:
newcar = pd.DataFrame({'year':2006, 'miles':124000, 'doors':2, 'type_car':1, 'type_truck':0}, index=[0])
newcar['type_car'] = newcar.type_car.astype('uint8')
newcar['type_truck'] = newcar.type_truck.astype('uint8')

In [22]:
newcar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 5 columns):
doors         1 non-null int64
miles         1 non-null int64
type_car      1 non-null uint8
type_truck    1 non-null uint8
year          1 non-null int64
dtypes: int64(3), uint8(2)
memory usage: 34.0 bytes


In [23]:
newcar = X_train.loc[5].values.reshape(1,5)

In [24]:
x1 = rfreg.predict(newcar)
x2 = ridge.predict(newcar)

In [25]:
x1

array([4756.])

In [29]:
x2

array([7711.38150995])

In [26]:
zz = pd.DataFrame({'rdg': x1, 'rf': x2}, index=[0])

In [27]:
lin.predict(zz)

array([8112.76296542])

In [30]:
X_train.loc[5]

year            2006
miles         124000
doors              2
type_car           1
type_truck         0
Name: 5, dtype: int64