In [1]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
import pandas as pd
import numpy as np
import preprocessing as pp
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [3]:
df_train = pd.read_csv('housing_train.csv')
df_test = pd.read_csv('housing_test.csv')

target = 'median_house_value'
y_train = df_train[target]
X_train = df_train.drop(columns=target)
y_test = df_test[target]
X_test = df_test.drop(columns=target)

In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [5]:
estimator = [DecisionTreeRegressor(),RandomForestRegressor(n_estimators=300),LinearRegression()]

In [6]:
preds = {}
for e in estimator:
    pipeline = pp.pipe(X_train,e)
    pipeline.fit(X_train,y_train)
    preds[str(e)] = pipeline.predict(X_test)

In [7]:
d = pd.DataFrame(preds)
d

Unnamed: 0,DecisionTreeRegressor(),RandomForestRegressor(n_estimators=300),LinearRegression()
0,399300.0,399483.333333,318657.0
1,71600.0,97242.000000,80449.0
2,85700.0,96615.000000,69761.0
3,242900.0,270736.333333,250113.0
4,339200.0,320711.333333,310337.0
...,...,...,...
3930,151000.0,203431.666667,242945.0
3931,93800.0,175310.666667,73505.0
3932,57300.0,65199.333333,66001.0
3933,135300.0,122631.333333,159425.0


In [8]:
final_preds = d.copy()
final_preds['mean'] = d.mean(axis=1)
final_preds['median'] = d.median(axis=1)

In [9]:
score = mean_absolute_error(y_test, final_preds['mean'])
print('MAE:', score)
r2score = r2_score(y_test, final_preds['mean'])
print('R2:', r2score)

MAE: 31680.174780742625
R2: 0.769390086790086


In [10]:
score = mean_absolute_error(y_test, final_preds['median'])
print('MAE:', score)
r2score = r2_score(y_test, final_preds['median'])
print('R2:', r2score)

MAE: 29644.609104616688
R2: 0.7855865228644684


In [11]:
#Bagging takes subsets of the training data, train the same model on all the subsets and average the final results
bagging = BaggingRegressor(DecisionTreeRegressor(), max_samples=0.5, max_features=0.5)

#Boosting uses weak learner and learn from errors and train new model based on the errors
boosting = AdaBoostRegressor(n_estimators=100)

#Stacking trains different model on part of the training set and use one model to decide which prediction to choose
estimators = [('ridge', RidgeCV()),
              ('lasso', LassoCV(random_state=42)),
              ('knr', KNeighborsRegressor(n_neighbors=20,metric='euclidean'))]
final_estimator = GradientBoostingRegressor(n_estimators=25, subsample=0.5, 
                                            min_samples_leaf=25, max_features=1,random_state=42)
stacking = StackingRegressor(estimators=estimators,final_estimator=final_estimator)

In [12]:
ensembles = [bagging, boosting, stacking]

for e in ensembles :
    print(e)
    model = e
    my_pipeline = pp.pipe(X_train, model)
    my_pipeline.fit(X_train,y_train)
    preds = my_pipeline.predict(X_test)
    score = mean_absolute_error(y_test, preds)
    print('MAE:', score)
    r2score = r2_score(y_test, preds)
    print('R2:', r2score, '\n')

BaggingRegressor(base_estimator=DecisionTreeRegressor(), max_features=0.5,
                 max_samples=0.5)
MAE: 33536.68343074968
R2: 0.755171902885788 

AdaBoostRegressor(n_estimators=100)
MAE: 52487.90285411425
R2: 0.5570060261432686 

StackingRegressor(estimators=[('ridge', RidgeCV()),
                              ('lasso', LassoCV(random_state=42)),
                              ('knr',
                               KNeighborsRegressor(metric='euclidean',
                                                   n_neighbors=20))],
                  final_estimator=GradientBoostingRegressor(max_features=1,
                                                            min_samples_leaf=25,
                                                            n_estimators=25,
                                                            random_state=42,
                                                            subsample=0.5))
MAE: 36250.660329051876
R2: 0.727342571296173 

