In [1]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
import pandas as pd
import numpy as np
import preprocessing as pp
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [3]:
df_train = pd.read_csv('housing_train.csv')
df_test = pd.read_csv('housing_test.csv')

target = 'median_house_value'
y_train = df_train[target]
X_train = df_train.drop(columns=target)
y_test = df_test[target]
X_test = df_test.drop(columns=target)

In [19]:
df_test.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,3935.0,3935.0,3935.0,3935.0,3901.0,3935.0,3935.0,3935.0,3935.0
mean,-119.591532,35.669144,28.500127,2672.709276,551.404512,1474.880559,512.887421,3.661974,192564.345616
std,2.008371,2.145469,12.554383,2303.064646,443.31103,1217.637301,403.753704,1.575484,97883.225468
min,-124.35,32.55,2.0,16.0,3.0,8.0,2.0,0.536,17500.0
25%,-121.8,33.94,18.0,1461.5,301.0,814.5,286.5,2.5,115200.0
50%,-118.58,34.29,28.0,2131.0,443.0,1200.0,417.0,3.4005,174100.0
75%,-118.015,37.72,37.0,3170.0,655.0,1756.0,615.0,4.5439,251200.0
max,-114.47,41.81,52.0,37937.0,6445.0,28566.0,6082.0,13.1107,500000.0


In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [5]:
estimator = [DecisionTreeRegressor(),RandomForestRegressor(n_estimators=300),LinearRegression()]

In [6]:
preds = {}
for e in estimator:
    pipeline = pp.pipe(X_train,e)
    pipeline.fit(X_train,y_train)
    preds[str(e)] = pipeline.predict(X_test)

In [7]:
d = pd.DataFrame(preds)
d

Unnamed: 0,DecisionTreeRegressor(),RandomForestRegressor(n_estimators=300),LinearRegression()
0,399300.0,399350.666667,318657.0
1,71600.0,95746.000000,80449.0
2,88900.0,94427.000000,69761.0
3,242900.0,265101.000000,250113.0
4,339200.0,320025.000000,310337.0
...,...,...,...
3930,151000.0,202697.000000,242945.0
3931,93800.0,175463.333333,73505.0
3932,57300.0,66793.666667,66001.0
3933,135300.0,125687.666667,159425.0


In [8]:
final_preds = d.copy()
final_preds['mean'] = d.mean(axis=1)
final_preds['median'] = d.median(axis=1)

In [13]:
pd.concat([final_preds,y_test], axis=1)

Unnamed: 0,DecisionTreeRegressor(),RandomForestRegressor(n_estimators=300),LinearRegression(),mean,median,median_house_value
0,399300.0,399350.666667,318657.0,372435.888889,399300.0,461300.0
1,71600.0,95746.000000,80449.0,82598.333333,80449.0,75000.0
2,88900.0,94427.000000,69761.0,84362.666667,88900.0,87500.0
3,242900.0,265101.000000,250113.0,252704.666667,250113.0,229300.0
4,339200.0,320025.000000,310337.0,323187.333333,320025.0,324700.0
...,...,...,...,...,...,...
3930,151000.0,202697.000000,242945.0,198880.666667,202697.0,143800.0
3931,93800.0,175463.333333,73505.0,114256.111111,93800.0,98200.0
3932,57300.0,66793.666667,66001.0,63364.888889,66001.0,53500.0
3933,135300.0,125687.666667,159425.0,140137.555556,135300.0,131400.0


In [9]:
score = mean_absolute_error(y_test, final_preds['mean'])
print('MAE:', score)
r2score = r2_score(y_test, final_preds['mean'])
print('R2:', r2score)

MAE: 31680.174780742625
R2: 0.769390086790086


In [10]:
score = mean_absolute_error(y_test, final_preds['median'])
print('MAE:', score)
r2score = r2_score(y_test, final_preds['median'])
print('R2:', r2score)

MAE: 29644.609104616688
R2: 0.7855865228644684


In [16]:
#Bagging takes subsets of the training data, train the same model on all the subsets and average the final results
bagging = BaggingRegressor(DecisionTreeRegressor(), max_samples=0.5, max_features=0.5)

#Boosting uses weak learner and learn from errors and train new model based on the errors
boosting = AdaBoostRegressor(n_estimators=100)

#Stacking trains different model on part of the training set and use one model to decide which prediction to choose
estimators = [('decision_tree', DecisionTreeRegressor()),
              ('rf', RandomForestRegressor(n_estimators=300)),
              ('lr', LinearRegression())]
final_estimator = LinearRegression()
stacking = StackingRegressor(estimators=estimators,final_estimator=final_estimator)

In [17]:
ensembles = [bagging, boosting, stacking]

for e in ensembles :
    print(e)
    model = e
    my_pipeline = pp.pipe(X_train, model)
    my_pipeline.fit(X_train,y_train)
    preds = my_pipeline.predict(X_test)
    score = mean_absolute_error(y_test, preds)
    print('MAE:', score)
    r2score = r2_score(y_test, preds)
    print('R2:', r2score, '\n')

BaggingRegressor(base_estimator=DecisionTreeRegressor(), max_features=0.5,
                 max_samples=0.5)
MAE: 36376.24393900889
R2: 0.7284353567531051 

AdaBoostRegressor(n_estimators=100)
MAE: 54374.01864095516
R2: 0.5325157128609047 

StackingRegressor(estimators=[('decision_tree', DecisionTreeRegressor()),
                              ('rf', RandomForestRegressor(n_estimators=300)),
                              ('lr', LinearRegression())],
                  final_estimator=LinearRegression())
MAE: 27675.07826428215
R2: 0.8069610828241802 

