# **Sprint8 アンサンブル学習**

### **データの読み込み**

In [279]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [305]:
# Load the training data from the working directory and preview the first rows.
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [306]:
# Two numeric columns are used as features; SalePrice is the regression target.
feature_columns = ['GrLivArea', 'YearBuilt']
X = df[feature_columns]
y = df['SalePrice']

In [189]:
# Label-based selection of the rows labelled 1, 2 and 3 (all columns).
X.loc[[1, 2, 3]]

Unnamed: 0,GrLivArea,YearBuilt
1,1262,1976
2,1786,2001
3,1717,1915


In [238]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### **【問題1】ブレンディングのスクラッチ実装**

In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error

In [50]:
# Blending by simple averaging: linear regression, SVR and k-nearest neighbours.
model_1, model_2, model_3 = LinearRegression(), SVR(), KNeighborsRegressor()

for estimator in (model_1, model_2, model_3):
    estimator.fit(X_train, y_train)

pred_1, pred_2, pred_3 = [
    estimator.predict(X_test) for estimator in (model_1, model_2, model_3)
]

# Report every single model first so the blend can be compared against them.
for label, prediction in (
    ('model LinearRegression:', pred_1),
    ('model SVR:', pred_2),
    ('model KNeighbors:', pred_3),
):
    print(label, mean_squared_error(y_test, prediction))

brend_pred = (pred_1 + pred_2 + pred_3) / 3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model SVR: 7243319908.928937
model KNeighbors: 3119613673.729041
model Brending: 3156990574.0246778




In [85]:
# Blending by simple averaging, this time with a linear-kernel SVR (C=100).
model_1 = LinearRegression()
# `kernel` is passed by keyword: recent scikit-learn releases reject
# positional constructor arguments for SVR.
model_2 = SVR(kernel='linear', C=100)
model_3 = KNeighborsRegressor()

model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

print('model LinearRegression:', mean_squared_error(y_test, pred_1))
print('model SVR:', mean_squared_error(y_test, pred_2))
print('model KNeighbors:', mean_squared_error(y_test, pred_3))

# Equal-weight average of the three test-set predictions.
brend_pred = (pred_1 + pred_2 + pred_3) / 3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model SVR: 2933120591.9921155
model KNeighbors: 3119613673.729041
model Brending: 2816564414.3172693


**単一モデルよりも精度が高くなった**

In [69]:
# Weighted blending: linear regression, SVR and k-nearest neighbours.
model_1, model_2, model_3 = LinearRegression(), SVR(), KNeighborsRegressor()

for estimator in (model_1, model_2, model_3):
    estimator.fit(X_train, y_train)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

single_model_results = [
    ('model LinearRegression:', pred_1),
    ('model SVR:', pred_2),
    ('model KNeighbors:', pred_3),
]
for label, prediction in single_model_results:
    print(label, mean_squared_error(y_test, prediction))

# Hand-picked weights (they sum to 1).
weight_1, weight_2, weight_3 = 0.5, 0.1, 0.4
brend_pred = weight_1*pred_1 + weight_2*pred_2 + weight_3*pred_3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model SVR: 7243319908.928937
model KNeighbors: 3119613673.729041
model Brending: 2814478878.1220684




**単一モデルよりも精度が高くなった**

In [53]:
# Blending by simple averaging: linear regression, LightGBM and k-nearest neighbours.
model_1, model_2, model_3 = LinearRegression(), LGBMRegressor(), KNeighborsRegressor()

for estimator in (model_1, model_2, model_3):
    estimator.fit(X_train, y_train)

pred_1, pred_2, pred_3 = [
    estimator.predict(X_test) for estimator in (model_1, model_2, model_3)
]

for label, prediction in (
    ('model LinearRegression:', pred_1),
    ('model LGBMRegressor:', pred_2),
    ('model KNeighbors:', pred_3),
):
    print(label, mean_squared_error(y_test, prediction))

brend_pred = (pred_1 + pred_2 + pred_3) / 3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model LGBMRegressor: 1980418355.58938
model KNeighbors: 3119613673.729041
model Brending: 2412116418.3228445


In [72]:
# Weighted blending: linear regression, LightGBM and k-nearest neighbours.
model_1, model_2, model_3 = LinearRegression(), LGBMRegressor(), KNeighborsRegressor()

for estimator in (model_1, model_2, model_3):
    estimator.fit(X_train, y_train)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

single_model_results = [
    ('model LinearRegression:', pred_1),
    ('model LGBMRegressor:', pred_2),
    ('model KNeighbors:', pred_3),
]
for label, prediction in single_model_results:
    print(label, mean_squared_error(y_test, prediction))

# Most of the weight goes to the LightGBM prediction.
weight_1, weight_2, weight_3 = 0.1, 0.8, 0.1
brend_pred = weight_1*pred_1 + weight_2*pred_2 + weight_3*pred_3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model LGBMRegressor: 1980418355.58938
model KNeighbors: 3119613673.729041
model Brending: 2047871505.5311673


In [73]:
# Weighted blending: linear regression, LightGBM and a decision tree.
model_1 = LinearRegression()
model_2 = LGBMRegressor()
# A fixed random_state makes the tree (and therefore the printed MSE)
# reproducible across kernel restarts.
model_3 = DecisionTreeRegressor(random_state=0)

model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

print('model LinearRegression:', mean_squared_error(y_test, pred_1))
print('model LGBMRegressor:', mean_squared_error(y_test, pred_2))
print('model DecisionTreeRegressor:', mean_squared_error(y_test, pred_3))

# Hand-picked weights (they sum to 1).
brend_pred = 0.2*pred_1 + 0.7*pred_2 + 0.1*pred_3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model LGBMRegressor: 1980418355.58938
model DecisionTreeRegressor: 3391334438.67618
model Brending: 2044584100.9107177


In [79]:
# Weighted blending: linear regression, LightGBM and XGBoost.
model_1, model_2, model_3 = LinearRegression(), LGBMRegressor(), XGBRegressor()

for estimator in (model_1, model_2, model_3):
    estimator.fit(X_train, y_train)

pred_1, pred_2, pred_3 = [
    estimator.predict(X_test) for estimator in (model_1, model_2, model_3)
]

for label, prediction in (
    ('model LinearRegression:', pred_1),
    ('model LGBMRegressor:', pred_2),
    ('model XGBRegressor:', pred_3),
):
    print(label, mean_squared_error(y_test, prediction))

# Most of the weight goes to the XGBoost prediction.
weight_1, weight_2, weight_3 = 0.05, 0.25, 0.7
brend_pred = weight_1*pred_1 + weight_2*pred_2 + weight_3*pred_3
print('model Brending:', mean_squared_error(y_test, brend_pred))

model LinearRegression: 2942066921.672108
model LGBMRegressor: 1980418355.58938
model XGBRegressor: 1360699331.7691965
model Brending: 1455973321.8518739


  if getattr(data, 'base', None) is not None and \


### **【問題2】バギングのスクラッチ実装**

In [194]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split with a fixed seed, used as the basis for bagging.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [180]:
# Demonstration of sampling with replacement: values may repeat in the result.
np.random.choice([0, 8, 9], size=5, replace=True)

array([0, 8, 8, 8, 9])

In [173]:
# Compare the number of distinct row labels with the number of rows.
train_row_labels = X_train.index
print(train_row_labels.nunique())
print(len(train_row_labels))

1168
1168


In [202]:
# Draw three bootstrap samples of the training set: row labels are sampled
# with replacement, each draw using its own seed for the global NumPy RNG.
n_bootstrap_rows = X_train.shape[0]

np.random.seed(1)
X_train1_index = np.random.choice(X_train.index, size=n_bootstrap_rows, replace=True)
X_train1 = X_train.loc[X_train1_index]
y_train1 = y_train.loc[X_train1.index]

np.random.seed(2)
X_train2_index = np.random.choice(X_train.index, size=n_bootstrap_rows, replace=True)
X_train2 = X_train.loc[X_train2_index]
y_train2 = y_train.loc[X_train2.index]

np.random.seed(3)
X_train3_index = np.random.choice(X_train.index, size=n_bootstrap_rows, replace=True)
X_train3 = X_train.loc[X_train3_index]
y_train3 = y_train.loc[X_train3.index]

print(X_train2_index)
# Number of distinct rows that made it into the first bootstrap sample.
print(np.unique(X_train1_index).size)

[ 493  225  864 ... 1425 1451  194]
741


In [203]:
# Bagging with decision trees: the same depth-limited tree is fitted on each
# of the three bootstrap samples and the predictions are averaged.
# random_state is fixed so the trees are reproducible across kernel restarts.
model_1 = DecisionTreeRegressor(max_depth=5, random_state=0)
model_2 = DecisionTreeRegressor(max_depth=5, random_state=0)
model_3 = DecisionTreeRegressor(max_depth=5, random_state=0)

model_1.fit(X_train1, y_train1)
model_2.fit(X_train2, y_train2)
model_3.fit(X_train3, y_train3)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

# All three trees share max_depth=5; they differ only in their bootstrap
# sample, so the labels identify the model rather than a depth.
print('MSE of model 1:', mean_squared_error(y_test, pred_1))
print('MSE of model 2:', mean_squared_error(y_test, pred_2))
print('MSE of model 3:', mean_squared_error(y_test, pred_3))

bagging_pred = (pred_1 + pred_2 + pred_3)/3
print('model Bagging:', mean_squared_error(y_test, bagging_pred))

MSE of max_depth=3: 2858916527.31606
MSE of max_depth=4: 1645326912.2082877
MSE of max_depth=5: 1599022771.990175
model Bagging: 1605360844.8657339


In [204]:
# Bagging with linear-kernel SVR: the same model (C=1) is fitted on each of
# the three bootstrap samples and the predictions are averaged.
# `kernel` is passed by keyword: recent scikit-learn releases reject
# positional constructor arguments for SVR.
model_1 = SVR(kernel='linear', C=1)
model_2 = SVR(kernel='linear', C=1)
model_3 = SVR(kernel='linear', C=1)

model_1.fit(X_train1, y_train1)
model_2.fit(X_train2, y_train2)
model_3.fit(X_train3, y_train3)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

# All three models use C=1; they differ only in their bootstrap sample, so
# the labels identify the model rather than a value of C.
print('MSE of model 1:', mean_squared_error(y_test, pred_1))
print('MSE of model 2:', mean_squared_error(y_test, pred_2))
print('MSE of model 3:', mean_squared_error(y_test, pred_3))

bagging_pred = (pred_1 + pred_2 + pred_3) / 3
print('model Bagging:', mean_squared_error(y_test, bagging_pred))

MSE of C=1: 2990936291.450201
MSE of C=10: 2934765094.635084
MSE of C=100: 2984776518.4681315
model Bagging: 2965183989.7504797


In [205]:
# Bagging with unrestricted decision trees, one per bootstrap sample.
# random_state is fixed so the trees are reproducible across kernel restarts.
model_1 = DecisionTreeRegressor(random_state=0)
model_2 = DecisionTreeRegressor(random_state=0)
model_3 = DecisionTreeRegressor(random_state=0)

model_1.fit(X_train1, y_train1)
model_2.fit(X_train2, y_train2)
model_3.fit(X_train3, y_train3)

pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

print('MSE of model 1:', mean_squared_error(y_test, pred_1))
print('MSE of model 2:', mean_squared_error(y_test, pred_2))
print('MSE of model 3:', mean_squared_error(y_test, pred_3))

# Equal-weight average of the three trees' predictions.
bagging_pred = (pred_1 + pred_2 + pred_3) / 3
print('model Bagging:', mean_squared_error(y_test, bagging_pred))

MSE of model 1: 3821218744.4509487
MSE of model 2: 2565987171.527397
MSE of model 3: 2773936393.932602
model Bagging: 2098015586.4248374


**単一のモデルよりも精度が高くなった**

### **【問題3】スタッキングのスクラッチ実装**

In [None]:
from sklearn.model_selection import train_test_split
# Reserve 20% of the data for evaluating the meta model.
X_train_valid, X_meta_valid, y_train_valid, y_meta_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Split the remainder in half: one part fits the base models, the other
# part provides the base-model predictions that the meta model is fitted on.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=0.5,
    random_state=0,
)

In [32]:
# Stage 0 of stacking: fit the base models and predict on both hold-out sets.
base_model_1, base_model_2, base_model_3 = (
    LinearRegression(),
    SVR(),
    KNeighborsRegressor(),
)
base_models = (base_model_1, base_model_2, base_model_3)

for base_model in base_models:
    base_model.fit(X_train, y_train)

# Predictions on X_valid are later used as inputs for fitting the meta model.
base_pred_1, base_pred_2, base_pred_3 = [
    base_model.predict(X_valid) for base_model in base_models
]

# Predictions on X_meta_valid are later used for scoring.
valid_pred_1, valid_pred_2, valid_pred_3 = [
    base_model.predict(X_meta_valid) for base_model in base_models
]



In [35]:
# MSE of each base model on the meta-validation set.
for template, base_prediction in (
    ('MSE of model LinearRegression: {:.4f}', valid_pred_1),
    ('MSE of model SVR: {:.4f}', valid_pred_2),
    ('MSE of model KNeighbors: {:.4f}', valid_pred_3),
):
    print(template.format(mean_squared_error(y_meta_valid, base_prediction)))

MSE of model LinearRegression: 2888168119.8862
MSE of model SVR: 7285911255.4753
MSE of model KNeighbors: 3070952953.0530


In [34]:
# Stage 1 of stacking: the base-model predictions become the meta model's
# features, one column per base model.
stacked_predictions = np.stack([base_pred_1, base_pred_2, base_pred_3], axis=1)
stacked_valid_predictions = np.stack(
    [valid_pred_1, valid_pred_2, valid_pred_3], axis=1
)

meta_model = LinearRegression()
meta_model.fit(stacked_predictions, y_valid)

meta_valid_pred = meta_model.predict(stacked_valid_predictions)
meta_mse = mean_squared_error(y_meta_valid, meta_valid_pred)
print('MSE of meta model: {:.4f}'.format(meta_mse))

MSE of meta model: 2846620046.6803


**単一モデルよりも精度が高くなった**

In [41]:
# Stage 0 of stacking with LightGBM in place of SVR.
base_model_1, base_model_2, base_model_3 = (
    LinearRegression(),
    LGBMRegressor(),
    KNeighborsRegressor(),
)
base_models = (base_model_1, base_model_2, base_model_3)

for base_model in base_models:
    base_model.fit(X_train, y_train)

# Predictions on X_valid are later used as inputs for fitting the meta model.
base_pred_1, base_pred_2, base_pred_3 = [
    base_model.predict(X_valid) for base_model in base_models
]

# Predictions on X_meta_valid are later used for scoring.
valid_pred_1, valid_pred_2, valid_pred_3 = [
    base_model.predict(X_meta_valid) for base_model in base_models
]

In [42]:
# MSE of each base model on the meta-validation set.
for template, base_prediction in (
    ('MSE of model LinearRegression: {:.4f}', valid_pred_1),
    ('MSE of model LGBMRegressor: {:.4f}', valid_pred_2),
    ('MSE of model KNeighbors: {:.4f}', valid_pred_3),
):
    print(template.format(mean_squared_error(y_meta_valid, base_prediction)))

MSE of model LinearRegression: 2942066921.6721
MSE of model LGBMRegressor: 1980418355.5894
MSE of model KNeighbors: 3119613673.7290


In [43]:
# Stage 1 of stacking: one meta-feature column per base model.
stacked_predictions = np.stack([base_pred_1, base_pred_2, base_pred_3], axis=1)
stacked_valid_predictions = np.stack(
    [valid_pred_1, valid_pred_2, valid_pred_3], axis=1
)

meta_model = LinearRegression()
meta_model.fit(stacked_predictions, y_valid)

meta_valid_pred = meta_model.predict(stacked_valid_predictions)
meta_mse = mean_squared_error(y_meta_valid, meta_valid_pred)
print('MSE of meta model: {:.4f}'.format(meta_mse))

MSE of meta model: 2008120847.0258


**単一モデルよりも精度が低くなった**

In [266]:
# Cut the training data into three contiguous folds. The cut points are
# derived from the current number of rows instead of being hard-coded, so the
# folds stay non-empty whichever split produced X_train (for 1168 rows this
# gives the cut points 389 and 778).
n_train_rows = len(X_train)
first_cut, second_cut = n_train_rows // 3, 2 * n_train_rows // 3

X_train1 = X_train.iloc[:first_cut]
X_train2 = X_train.iloc[first_cut:second_cut]
X_train3 = X_train.iloc[second_cut:]

y_train1 = y_train.iloc[:first_cut]
y_train2 = y_train.iloc[first_cut:second_cut]
y_train3 = y_train.iloc[second_cut:]

# X_train_k / y_train_k hold every fold except fold k, i.e. the data a base
# model is fitted on before it predicts fold k.
X_train_1 = np.vstack((X_train2, X_train3))
y_train_1 = np.hstack((y_train2, y_train3))

X_train_2 = np.vstack((X_train1, X_train3))
y_train_2 = np.hstack((y_train1, y_train3))

X_train_3 = np.vstack((X_train1, X_train2))
y_train_3 = np.hstack((y_train1, y_train2))

In [267]:
base_model_1 = LinearRegression()
base_model_2 = LGBMRegressor()
base_model_3 = KNeighborsRegressor()

# Out-of-fold predictions for base_model_1: fit on the two remaining folds,
# then predict the held-out fold *before* refitting. Fitting three times in a
# row and predicting afterwards would only ever use the last fit.
base_model_1.fit(X_train_1, y_train_1)
base_pred_11 = base_model_1.predict(X_train1)

base_model_1.fit(X_train_2, y_train_2)
base_pred_21 = base_model_1.predict(X_train2)

base_model_1.fit(X_train_3, y_train_3)
base_pred_31 = base_model_1.predict(X_train3)

In [268]:
# Predictions of base_model_1 on the three folds, concatenated in fold order.
# Only base_model_1's own predictions (suffix "1") belong here.
brend_data1 = np.hstack((base_pred_11, base_pred_21, base_pred_31))

In [269]:
# Out-of-fold predictions for base_model_2: fit on the two remaining folds,
# then predict the held-out fold *before* refitting. Fitting three times in a
# row and predicting afterwards would only ever use the last fit.
base_model_2.fit(X_train_1, y_train_1)
base_pred_12 = base_model_2.predict(X_train1)

base_model_2.fit(X_train_2, y_train_2)
base_pred_22 = base_model_2.predict(X_train2)

base_model_2.fit(X_train_3, y_train_3)
base_pred_32 = base_model_2.predict(X_train3)

In [270]:
# Predictions of base_model_2 on the three folds, concatenated in fold order.
brend_data2 = np.concatenate([base_pred_12, base_pred_22, base_pred_32])

In [271]:
# Out-of-fold predictions for base_model_3: fit on the two remaining folds,
# then predict the held-out fold *before* refitting. Fitting three times in a
# row and predicting afterwards would only ever use the last fit.
base_model_3.fit(X_train_1, y_train_1)
base_pred_13 = base_model_3.predict(X_train1)

base_model_3.fit(X_train_2, y_train_2)
base_pred_23 = base_model_3.predict(X_train2)

base_model_3.fit(X_train_3, y_train_3)
base_pred_33 = base_model_3.predict(X_train3)

In [272]:
# Predictions of base_model_3 on the three folds, concatenated in fold order.
brend_data3 = np.concatenate([base_pred_13, base_pred_23, base_pred_33])

In [275]:
# Element-wise mean of the three base models' fold predictions.
# NOTE(review): a stacking meta model would normally receive these as three
# separate feature columns rather than their average - confirm the intent.
brend_total = brend_data1 + brend_data2 + brend_data3
brend_data = brend_total / 3

### **スタッキングクラスに挑戦**

In [336]:
from collections import defaultdict
from copy import deepcopy

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [337]:
# Build the (name, estimator) pairs for the stacking class. The estimators
# are created here so this cell does not depend on a later cell having run.
svm = SVC()
lr = LogisticRegression()
knn = KNeighborsClassifier()

estimators = list(zip(['svm', 'lr', 'knn'], [svm, lr, knn]))
dict(estimators)

{'svm': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='rbf', max_iter=-1, probability=False, random_state=None,
     shrinking=True, tol=0.001, verbose=False),
 'lr': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'knn': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                      weights='uniform')}

In [347]:
# Inspect the train/test sizes produced by an unshuffled 5-fold split.
skf = KFold(n_splits=5)
index_list = list(skf.split(X, y))
for train_index, test_index in index_list:
    print(len(train_index))
    print(len(test_index))
# Share of all samples used for training in the last fold; the denominator
# is taken from the data rather than hard-coded.
print(len(train_index)/len(X))

1168
292
1168
292
1168
292
1168
292
1168
292
0.8


In [3]:
class StackingClassifier:
    """Stacking ensemble built from K-fold out-of-fold predictions.

    For every base estimator an independent copy is fitted on each K-fold
    training split. Its predictions on the matching held-out split become
    one meta-feature column, and the merge estimator is fitted on those
    columns. At prediction time the per-fold copies of each base estimator
    are averaged to produce that estimator's meta-feature column, so base
    estimators must return numeric predictions.

    Parameters
    ----------
    estimators : iterable of (name, estimator) pairs
        Base estimators exposing ``fit(X, y)`` and ``predict(X)``.
    merge_estimator : estimator
        Meta estimator fitted on the stacked base predictions.
    n_splits : int, default 5
        Number of K-fold splits used to build the out-of-fold predictions.
    """

    def __init__(self, estimators, merge_estimator, n_splits=5):
        self.original_clfs = dict(estimators)
        self.m_clf = merge_estimator
        self.n_splits = n_splits

        # name -> list of fitted per-fold copies (filled by fit)
        self.clfs_dict = defaultdict(list)
        # fixed column order of the meta features
        self.clfs_index = sorted(self.original_clfs.keys())

    def fit(self, X, y):
        """Fit the per-fold base estimators and the merge estimator.

        ``X`` and ``y`` may be pandas objects or arrays; they are converted
        to NumPy arrays so that fold indices are always positional.
        Returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.clfs_dict = defaultdict(list)

        skf = KFold(n_splits=self.n_splits)
        index_list = list(skf.split(X, y))

        merge_feature_list = []
        for clf_name in self.clfs_index:
            clf_origin = self.original_clfs[clf_name]
            preds_tmp_list = []
            for train_index, test_index in index_list:
                # A fresh copy per fold: refitting one shared object would
                # leave every stored entry pointing at the last fit.
                clf_copy = deepcopy(clf_origin)
                clf_copy.fit(X[train_index], y[train_index])
                preds_tmp_list.append(clf_copy.predict(X[test_index]))
                self.clfs_dict[clf_name].append(clf_copy)
            # One long column of out-of-fold predictions, in fold order.
            merge_feature_list.append(np.concatenate(preds_tmp_list))

        # Shape (n_samples, n_estimators); rows follow the same fold order
        # as y_merged below.
        X_merged = np.column_stack(merge_feature_list)
        y_merged = np.concatenate([y[test_index]
                                   for _, test_index in index_list])

        self.m_clf.fit(X_merged, y_merged)
        return self

    def predict(self, X):
        """Predict with the merge estimator on averaged base predictions.

        Raises ``RuntimeError`` if no fitted base estimators are available.
        """
        X = np.asarray(X)
        merge_feature_list = []
        for clf_name in self.clfs_index:
            fold_clfs = self.clfs_dict.get(clf_name, [])
            if not fold_clfs:
                raise RuntimeError(
                    'StackingClassifier must be fitted before calling predict')
            fold_preds = [clf.predict(X) for clf in fold_clfs]
            merge_feature_list.append(np.mean(fold_preds, axis=0))
        # One column per base estimator, matching the layout used in fit.
        X_merged = np.column_stack(merge_feature_list)

        return self.m_clf.predict(X_merged)

In [344]:
# svm = SVC(C=5, gamma=0.001)
# lr = LogisticRegression()
# knn = KNeighborsClassifier()

# estimators = list(zip(['svm', 'lr', 'knn'], [svm, lr, knn]))

# # for name, clf in estimators:
# #     clf.fit(X_train, y_train)
# #     preds = clf.predict(X_test)
# #     print(name)
# #     print('mse:', mean_squared_error(y_test, preds))
    
# stcl = ScratchStacking(estimators, RandomForestClassifier(n_estimators=10, n_jobs=-1))
# stcl.fit(X_train, y_train)
# preds = stcl.predict(X_test)
# print('stacking')
# print('mse:', mean_squared_error(y_test, preds))

In [345]:
# Baseline: fit each base estimator on its own and report its test-set MSE.
svm, lr, knn = SVC(), LogisticRegression(), KNeighborsClassifier()

estimators = [('svm', svm), ('lr', lr), ('knn', knn)]

for name, clf in estimators:
    preds = clf.fit(X_train, y_train).predict(X_test)
    print(name)
    print('mse:', mean_squared_error(y_test, preds))



svm
mse: 5873069339.79452




lr
mse: 4066152564.729452
knn
mse: 3126137578.6643834


In [348]:
# stcl = ScratchStacking(estimators, LinearRegression())
# stcl.fit(X_train, y_train)
# preds = stcl.predict(X_test)
# print('stacking')
# print('mse:', mean_squared_error(y_test, preds))