### 【問題1】ブレンディングのスクラッチ実装

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

df = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")
x = df.loc[:,['GrLivArea','YearBuilt']]
y = df['SalePrice']
display(x.describe())
display(y.describe())

# 標準化
sc = StandardScaler()
x = np.array(x)
y = np.array(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=23)
sc.fit(x_train)
x_train = sc.transform(x_train)
sc.fit(x_test)
x_test = sc.transform(x_test)

Unnamed: 0,GrLivArea,YearBuilt
count,1460.0,1460.0
mean,1515.463699,1971.267808
std,525.480383,30.202904
min,334.0,1872.0
25%,1129.5,1954.0
50%,1464.0,1973.0
75%,1776.75,2000.0
max,5642.0,2010.0


count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [5]:
# パターン1

clf1 = lgb.LGBMRegressor()
clf1.fit(x_train, y_train)
y_pred1 = clf1.predict(x_test)
res1 = np.log(mean_squared_error(y_test ,y_pred1))

clf2 = DecisionTreeRegressor(max_depth=2)
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test)
res2 = np.log(mean_squared_error(y_test, y_pred2))

clf3 = ElasticNet()
clf3.fit(x_train, y_train)
y_pred3 = clf3.predict(x_test)
res3 = np.log(mean_squared_error(y_test, y_pred3))

# predの平均を取る
y_pred_all = (y_pred1 + y_pred2 + y_pred3) / 3
res_all = np.log(mean_squared_error(y_test, y_pred_all))

# 値が小さいほど優秀
display(res1)
display(res2)
display(res3)
display(res_all) # ブレンディング後が最優秀

21.3135193002795

21.734726609958688

21.292246763650407

21.198264315267572

In [6]:
# パターン2
from sklearn.linear_model import LinearRegression

clf4 = LinearRegression()
clf4.fit(x_train, y_train)
y_pred4 = clf4.predict(x_test)
res4 = np.log(mean_squared_error(y_test, y_pred4))

# predの平均を取る
y_pred_all = (y_pred1 + y_pred2 + y_pred4) / 3
res_all = np.log(mean_squared_error(y_test, y_pred_all))

# 値が小さいほど優秀
display(res1)
display(res2)
display(res4)
display(res_all) # ブレンディング後が最優秀

21.3135193002795

21.734726609958688

21.250318074189572

21.200153162720344

In [16]:
# パターン3
clf5 = DecisionTreeRegressor(max_depth=3)
clf5.fit(x_train, y_train)
y_pred5 = clf5.predict(x_test)
res5 = np.log(mean_squared_error(y_test, y_pred5))

# predの平均を取る
y_pred_all1 = (y_pred1 + y_pred2 + y_pred3 + y_pred4 + y_pred5) / 5
y_pred_all2 = y_pred1*0.1 + y_pred2*0.05 + y_pred3*0.3 + y_pred4*0.5 + y_pred5*0.05 # 重み付けパターン
res_all1 = np.log(mean_squared_error(y_test, y_pred_all1))
res_all2 = np.log(mean_squared_error(y_test, y_pred_all2))

# 値が小さいほど優秀
display(res1)
display(res2)
display(res3)
display(res4)
display(res5)
display(res_all1) # ブレンディング後が最優秀
display(res_all2) # ブレンディング後が最優秀 (重み付け調整が最優秀)

21.3135193002795

21.734726609958688

21.292246763650407

21.250318074189572

21.534825890573824

21.20278920487294

21.16153362237194

```
memo
https://qiita.com/Moby-Dick/items/e2f1efd923fddd72b17d
```

### 【問題2】バギングのスクラッチ実装

In [22]:
x_train_a, x_train_b, y_train_a, y_train_b = train_test_split(x, y, test_size=0.5, random_state=23)
x_train1, x_train2, y_train1, y_train2 = train_test_split(x_train_a, y_train_a, test_size=0.5, random_state=23)
x_train3, x_test, y_train3, y_test = train_test_split(x_train_b, y_train_b, test_size=0.5, random_state=23)

clf1 = lgb.LGBMRegressor()
clf1.fit(x_train1, y_train1)
y_pred1 = clf1.predict(x_test)
res1 = np.log(mean_squared_error(y_test ,y_pred1))

clf1 = lgb.LGBMRegressor()
clf1.fit(x_train2, y_train2)
y_pred2 = clf1.predict(x_test)
res2 = np.log(mean_squared_error(y_test ,y_pred2))

clf1 = lgb.LGBMRegressor()
clf1.fit(x_train3, y_train3)
y_pred3 = clf1.predict(x_test)
res3 = np.log(mean_squared_error(y_test ,y_pred3))

y_pred_all1 = (y_pred1 + y_pred2 + y_pred3) / 3
res_all1 = np.log(mean_squared_error(y_test, y_pred_all1))


display(res1)
display(res2)
display(res3)
display(res_all1) # バギングしたものが最も結果が良い

21.659043803814455

21.706264040679674

21.64668295681728

21.591500176987264

### 【問題3】スタッキングのスクラッチ実装

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.6, random_state=3)
x_test1, x_test2, y_test1, y_test2 = train_test_split(x, y, test_size=0.5, random_state=3)

# 標準化
sc.fit(x_train)
x_train = sc.transform(x_train)
sc.fit(x_test1)
x_test1 = sc.transform(x_test1)
x_test2 = sc.transform(x_test2)

# モデル1
clf1 = lgb.LGBMRegressor()
clf1.fit(x_train, y_train)
y_pred1 = clf1.predict(x_test1)
res1 = np.log(mean_squared_error(y_test1 ,y_pred1))
# モデル2
clf2 = DecisionTreeRegressor(max_depth=2)
clf2.fit(x_train, y_train)
y_pred2 = clf2.predict(x_test1)
res2 = np.log(mean_squared_error(y_test1, y_pred2))
# モデル3
clf3 = ElasticNet()
clf3.fit(x_train, y_train)
y_pred3 = clf3.predict(x_test1)
res3 = np.log(mean_squared_error(y_test1, y_pred3))

In [55]:
# メタモデル モデル1~3のpredの組み合わせを特徴量として、対応するy_test1を学習
stacked_predictions = np.column_stack((y_pred1, y_pred2, y_pred3))
meta_model = LinearRegression()
meta_model.fit(stacked_predictions, y_test1)

# 新しい特徴量で 各モデル+メタモデル の結果をだす
valid_pred_1 = clf1.predict(x_test2)
valid_pred_2 = clf2.predict(x_test2)
valid_pred_3 = clf3.predict(x_test2)
stacked_valid_predictions = np.column_stack((valid_pred_1, valid_pred_2, valid_pred_3))
meta_valid_pred = meta_model.predict(stacked_valid_predictions)

print ("mean squared error of model 1: {:.4f}".format( np.log(mean_squared_error(y_test2, valid_pred_1))) )
print ("mean squared error of model 2: {:.4f}".format( np.log(mean_squared_error(y_test2, valid_pred_2))) )
print ("mean squared error of model 3: {:.4f}".format( np.log(mean_squared_error(y_test2, valid_pred_3))) )

# スタッキングモデルが結果最大化
print ("mean squared error of meta model: {:.4f}".format( np.log(mean_squared_error(y_test2, meta_valid_pred))) )

mean squared error of model 1: 21.4691
mean squared error of model 2: 22.1056
mean squared error of model 3: 21.4981
mean squared error of meta model: 21.4368
