# アンサンブル学習（Stacking）と提出

In [42]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
import copy

DIVEINTOCODE課題で作成したスクラッチ実装を使用

In [2]:
class Stacking_Regression():
    """
    回帰問題スタッキングのスクラッチ実装

    Parameters
    ----------
    submodels : list
      サブモデルのオブジェクト2次元リスト
    model : object
      メインモデルのオブジェクト
    n_split : int
      データの分割数
    random_state : int
      乱数シード

    Attributes
    ----------
    self.fitted_submodels : list
      学習済みサブモデルのオブジェクト3次元リスト
    """
    
    def __init__(self, submodels, mainmodel, n_split, max_stage=1, random_state=None):
        # ハイパーパラメータを属性として記録
        # 該当するstage・種類での2次元サブモデルリスト
        self.submodels = submodels
        # メインモデルオブジェクト
        self.mainmodel = mainmodel
        # 各stageでのデータ分割数
        self.n_split = n_split
        # ステージ2以上を設定できるようにする
        self.max_stage = max_stage
        # 再現性のため、乱数シードを渡す
        self.random_state = random_state
        # 学習済みサブモデルを保持する3次元リスト
        self.fitted_submodels = []
    
    def fit(self, X, y):
        """
   　　 メインモデルによる最終学習

    　　Parameters
    　　----------
    　　X : 次の形のndarray, shape (n_samples, n_features)
      　　訓練データの説明変数
        y : 次の形のndarray, shape (n_samples, )
      　　訓練データの目的変数
    　　
        """
        # stage0からサブモデルによる再帰的学習を開始
        X_main = self._sub_fit(X, y, stage=0, max_stage=self.max_stage)
        # サブモデルからの最終説明変数をメインモデルで学習
        self.mainmodel.fit(X_main, y)
        
        return self
        
    
    def _sub_fit(self, X, y, stage, max_stage):
        """
   　　 サブモデルによる再帰的学習

    　　Parameters
    　　----------
    　　X : 次の形のndarray, shape (n_samples, n_features)
      　　訓練データの説明変数
        y : 次の形のndarray, shape (n_samples, )
      　　訓練データの目的変数
    　　
        """
        
        # 乱数シード固定
        np.random.seed(self.random_state)
        # サンプルのインデックスをシャッフル
        indices = np.arange(X.shape[0])
        np.random.shuffle(indices)
        # 分割数より検証データ数を算出しておく
        n_val = int(X.shape[0]/self.n_split)
        # 出力する説明変数を初期値0で作成
        X_main = np.zeros([X.shape[0], len(self.submodels[stage])])
        
        # 各ステージでの学習済みサブモデル格納2次元リスト
        submodels_2 = []
        # サブモデルの種類ループ
        
        for i in range(len(self.submodels[stage])):
            # モデルの種類ごとの出力結果を初期値0で作成
            submodel_preds = np.zeros(X.shape[0])
            # モデル種別の学習済みサブモデル格納1次元リスト
            submodels_1 = []
            # 分割によるループ
            for j in range(self.n_split):
                # 検証/訓練データのインデックス取得
                idx_val = np.arange(n_val*j, n_val*(j+1))
                idx_train = np.array(list(set(indices) - set(idx_val)))
                X_val = X[idx_val]
                X_train = X[idx_train]
                y_train = y[idx_train]
                # 該当stage・種類のモデルオブジェクトの複製を作り学習
                submodel = copy.deepcopy(self.submodels[stage][i])
                submodel.fit(X_train, y_train)
                # 種類別予測結果の該当インデックスを推定結果で更新
                submodel_preds[idx_val] = submodel.predict(X_val)
                # 学習済みモデルを後に呼び出せるよう、リストに格納
                submodels_1.append(submodel)
            # メインモデルへの説明変数の該当する列を更新
            X_main[:, i] = submodel_preds
            # リストをリストに格納し、2次元リストで学習済みサブモデルを保持
            submodels_2.append(submodels_1)
        # 2次元リストをリストに格納し、3次元リストで学習済みサブモデルを保持
        self.fitted_submodels.append(submodels_2)
        
        # 最終の1stage前なら、サブモデルからの説明変数を出力
        if stage == max_stage - 1:
            output = X_main
        # 上記以外では、再帰的に学習を継続
        else:
            output = self._sub_fit(X_main, y, stage+1, max_stage)
        
        return output
    

    def predict(self, X):
        """
        メインモデルによる最終推定

        Parameters
        ----------
        X : 次の形のndarray, shape (n_samples, n_features)
        　予測するデータの説明変数
        Returns
        -------
        y_pred : 次の形のndarray, shape (n_samples, 1)
        　スタッキング後のメインモデルによる推定結果
        """
        # stage0からサブモデルによる再帰的推定を開始
        sub_preds = self._sub_predict(X, stage=0, max_stage=self.max_stage)

        # サブモデル推定結果からメインモデルが推定し結果を出力
        return self.mainmodel.predict(sub_preds)

    
    def _sub_predict(self, X, stage, max_stage):
        """
        サブモデルによる再帰的推定

        Parameters
        ----------
        X : 次の形のndarray, shape (n_samples, n_features)
        　予測するデータの説明変数
        Returns
        -------
        y_pred : 次の形のndarray, shape (n_samples, 1)
        　スタッキング後のメインモデルによる推定結果
        """
        # サブモデルの推定結果を初期値0で作成
        sub_preds = np.zeros([X.shape[0], len(self.submodels[stage])])

        for i in range(len(self.submodels[stage])):
            # モデルの種類ごとの予測結果を初期値0で作成
            submodel_preds = np.zeros([X.shape[0], self.n_split])
            for j in range(self.n_split):
                # 学習済みモデル3次元リストから、該当stageの該当モデル呼び出し
                submodel = self.fitted_submodels[stage][i][j]
                # 種類別予測行列の該当列を更新
                submodel_preds[:, j] = submodel.predict(X)  
            
            # 予測行列を行方向に平均し、サブモデル推定結果の該当列を更新
            sub_preds[:, i] = np.mean(submodel_preds, axis=1)
        
        # 最終の1stage前なら、サブモデルからの予測結果を出力
        if stage == max_stage-1:
            output = sub_preds
        # 上記以外では、再帰的に予測を継続
        else:
            output = self._sub_predict(sub_preds, stage+1, max_stage)
        
        return output

まずは真値のわかっている訓練データをtrain/validationに分けて、色々なモデルの組み合わせを試す。

In [3]:
# 訓練データのロード
df_7 = pd.read_csv('data_7.csv')
df_7.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.00607,0.014466,0.004136,0.000658,...,-0.203459,0.630524,0.227728,0.407087,0.450352,-0.074469,0.151244,-0.105504,0.626158,0.352528
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.306443,-0.047546,0.26053,-0.207901,0.595487,0.809261,0.648386,0.05219,0.772152,0.271778
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.276111,0.39304,0.370413,0.04279,0.437297,0.560578,0.440411,0.032559,0.923573,0.112833
3,10005,66.53263,51.847306,60.052535,52.108977,69.993075,-0.000398,0.006878,0.009051,0.000369,...,-0.248033,0.522701,0.10787,0.605457,0.660222,0.7115,0.321404,0.077306,1.06345,0.124246
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.01216,-0.00092,...,-0.178258,0.387695,0.166441,0.176302,0.504423,0.821638,0.517638,0.372172,0.787271,0.328486


In [4]:
# 前処理
X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values

# train/validationデータに分割
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=2020, train_size=0.8)

sc_x = StandardScaler()
sc_y = StandardScaler()
X_train_std = sc_x.fit_transform(X_train)
X_val_std = sc_x.transform(X_val)
y_train_std = sc_y.fit_transform(y_train)
y_val_std = sc_y.transform(y_val)

### 1. SVR→LGM→Lasso

In [6]:
svm_1 = SVR()
lgm_1 = lgb.LGBMRegressor()
lasso_1 = Lasso()

stk_1 = Stacking_Regression(submodels=[[svm_1], [lgm_1]], 
                            mainmodel=lasso_1, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

targets = df_7.columns[1:6]
models_1 = []
preds = np.empty_like(y_val)

for i, target in enumerate(targets):
    y_tr = y_train_std[:, i]
    
    model = copy.deepcopy(stk_1)
    model.fit(X_train_std, y_tr)
    
    models_1.append(model)
    preds[:, i] = model.predict(X_val_std)

preds = sc_y.inverse_transform(preds)
score_1 = np.sum(np.abs(y_val - preds), axis=0) / np.sum(preds, axis=0)
print(score_1)

[0.21968424 0.14023924 0.13922421 0.18514698 0.17292105]


### 2. SVR→LGM→Ridge

In [7]:
svm_2 = SVR()
lgm_2 = lgb.LGBMRegressor()
ridge_2 = Ridge()

stk_2 = Stacking_Regression(submodels=[[svm_2], [lgm_2]], 
                            mainmodel=ridge_2, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

targets = df_7.columns[1:6]
models_2 = []
preds = np.empty_like(y_val)

for i, target in enumerate(targets):
    y_tr = y_train_std[:, i]
    
    model = copy.deepcopy(stk_2)
    model.fit(X_train_std, y_tr)
    
    models_2.append(model)
    preds[:, i] = model.predict(X_val_std)

preds = sc_y.inverse_transform(preds)
score_2 = np.sum(np.abs(y_val - preds), axis=0) / np.sum(preds, axis=0)
print(score_2)

[0.14314439 0.13742073 0.13909655 0.18124603 0.17158811]


### 3. SVR→XGB→Lasso

In [10]:
svm_3 = SVR()
xgb_3 = xgb.XGBRegressor()
lasso_3 = Lasso()

stk_3 = Stacking_Regression(submodels=[[svm_3], [xgb_3]], 
                            mainmodel=lasso_3, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

models_3 = []
preds = np.empty_like(y_val)

for i, target in enumerate(targets):
    y_tr = y_train_std[:, i]
    
    model = copy.deepcopy(stk_3)
    model.fit(X_train_std, y_tr)
    
    models_3.append(model)
    preds[:, i] = model.predict(X_val_std)

preds = sc_y.inverse_transform(preds)
score_3 = np.sum(np.abs(y_val - preds), axis=0) / np.sum(preds, axis=0)
print(score_3)

[0.21968424 0.14023924 0.13922421 0.18514698 0.17292105]


### 4. SVR→XGB→Ridge

In [None]:
svm_4 = SVR()
xgb_4 = xgb.XGBRegressor()
ridge_4 = Ridge()

stk_4 = Stacking_Regression(submodels=[[svm_4], [xgb_4]], 
                            mainmodel=lasso_4, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

models_4 = []
preds = np.empty_like(y_val)

for i, target in enumerate(targets):
    y_tr = y_train_std[:, i]
    
    model = copy.deepcopy(stk_4)
    model.fit(X_train_std, y_tr)
    
    models_4.append(model)
    preds[:, i] = model.predict(X_val_std)

preds = sc_y.inverse_transform(preds)
score_4 = np.sum(np.abs(y_val - preds), axis=0) / np.sum(preds, axis=0)
print(score_4)

### 5. SVR→LGB→KNN→Ridge

In [12]:
svm_5 = SVR()
lgm_5 = lgb.LGBMRegressor()
knn_5 = KNeighborsRegressor()
ridge_5 = Ridge()

stk_5 = Stacking_Regression(submodels=[[svm_5], [lgm_5], [knn_5]], 
                            mainmodel=ridge_5, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

models_5 = []
preds = np.empty_like(y_val)

for i, target in enumerate(targets):
    y_tr = y_train_std[:, i]
    
    model = copy.deepcopy(stk_5)
    model.fit(X_train_std, y_tr)
    
    models_5.append(model)
    preds[:, i] = model.predict(X_val_std)

preds = sc_y.inverse_transform(preds)
score_5 = np.sum(np.abs(y_val - preds), axis=0) / np.sum(preds, axis=0)
print(score_5)

[0.14314439 0.13742073 0.13909655 0.18124603 0.17158811]


### 6. SVR+XGB+KNN→Lasso

In [13]:
svm_6 = SVR()
xgb_6 = xgb.XGBRegressor()
knn_6 = KNeighborsRegressor()
lasso_6 = Lasso()

stk_6 = Stacking_Regression(submodels=[[svm_6, xbg_6, knn_6],], 
                            mainmodel=lasso_6, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

models_6 = []
preds = np.empty_like(y_val)

for i, target in enumerate(targets):
    y_tr = y_train_std[:, i]
    
    model = copy.deepcopy(stk_6)
    model.fit(X_train_std, y_tr)
    
    models_6.append(model)
    preds[:, i] = model.predict(X_val_std)

preds = sc_y.inverse_transform(preds)
score_6 = np.sum(np.abs(y_val - preds), axis=0) / np.sum(preds, axis=0)
print(score_6)

[0.21968424 0.14023924 0.13922421 0.18514698 0.17292105]


5.のSVR,LGB,KNN,Ridgeの組み合わせがscoreが良さそうなので、これらを使用することにする。
以後は、真値のわかっている訓練データを全て使用し、貪欲的に学習させる。
SVR→LGM→KNN...と1stageあたりに1modelで推移させるか、SVR+LGM→KNNというように1stageに2つ以上のmodelを組み込むかで、いくつかの組み合わせを試す。  

### 5s. SVR→LGM→KNN→Ridge（4stage）

In [41]:
# submit用

X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)

svm_5s = SVR()
lgm_5s = lgb.LGBMRegressor()
knn_5s = KNeighborsRegressor()
ridge_5s = Ridge()

stk_5s = Stacking_Regression(submodels=[[svm_5s], [lgm_5s], [knn_5s]], 
                             mainmodel=ridge_5s, 
                             n_split=5,
                             max_stage=1,
                             random_state=2020)

models_5s = []
preds = np.empty_like(y_std)
targets = df_7.columns[1:6]

for i, target in enumerate(targets):
    y_tr = y_std[:, i]
    
    model = copy.deepcopy(stk_5s)
    model.fit(X_std, y_tr)
    
    models_5s.append(model)
    preds[:, i] = model.predict(X_std)

preds = sc_y.inverse_transform(preds)
score_5s = np.sum(np.abs(y - preds), axis=0) / np.sum(preds, axis=0)
print(score_5s)

[0.06196852 0.08673237 0.12913163 0.13207412 0.13992317]


### 6s. SVR+LGM→KNN→Ridge（3stage）

In [8]:
# submit用

X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)

svm_6s = SVR()
lgm_6s = lgb.LGBMRegressor()
knn_6s = KNeighborsRegressor()
ridge_6s = Ridge()

stk_6s = Stacking_Regression(submodels=[[svm_6s, lgm_6s], [knn_6s]], 
                            mainmodel=ridge_6s, 
                            n_split=5,
                            max_stage=1,
                            random_state=2020)

models_5s = []
preds = np.empty_like(y_std)
targets = df_7.columns[1:6]

for i, target in enumerate(targets):
    y_tr = y_std[:, i]
    
    model = copy.deepcopy(stk_6s)
    model.fit(X_std, y_tr)
    
    models_6s.append(model)
    preds[:, i] = model.predict(X_std)

preds = sc_y.inverse_transform(preds)
score_6s = np.sum(np.abs(y - preds), axis=0) / np.sum(preds, axis=0)
print(score_6s)

[0.06550215 0.09729177 0.12993031 0.1354467  0.14413561]


### 7s. SVR→LGM+KNN→Ridge（3stage）

In [20]:
# submit用

X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)

svm_7s = SVR()
lgm_7s = lgb.LGBMRegressor()
knn_7s = KNeighborsRegressor()
ridge_7s = Ridge()

stk_7s = Stacking_Regression(submodels=[[svm_7s], [knn_7s, lgm_7s]], 
                             mainmodel=ridge_7s, 
                             n_split=5,
                             max_stage=1,
                             random_state=2020)

models_7s = []
preds = np.empty_like(y_std)
targets = df_7.columns[1:6]

for i, target in enumerate(targets):
    y_tr = y_std[:, i]
    
    model = copy.deepcopy(stk_7s)
    model.fit(X_std, y_tr)
    
    models_7s.append(model)
    preds[:, i] = model.predict(X_std)

preds = sc_y.inverse_transform(preds)
score_7s = np.sum(np.abs(y - preds), axis=0) / np.sum(preds, axis=0)
print(score_7s)

[0.06550215 0.09729177 0.12993031 0.1354467  0.14413561]


### 8s. SVR+LGM+KNN→Ridge（2stage）

In [20]:
# submit用

X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values
sc_x = StandardScaler()
sc_y = StandardScaler()
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)

svm_8s = SVR()
lgm_8s = lgb.LGBMRegressor()
knn_8s = KNeighborsRegressor()
ridge_8s = Ridge()

stk_8s = Stacking_Regression(submodels=[[svm_8s, knn_8s, lgm_8s]], 
                             mainmodel=ridge_8s, 
                             n_split=5,
                             max_stage=1,
                             random_state=2020)

models_8s = []
preds = np.empty_like(y_std)
targets = df_7.columns[1:6]

for i, target in enumerate(targets):
    y_tr = y_std[:, i]
    
    model = copy.deepcopy(stk_8s)
    model.fit(X_std, y_tr)
    
    models_8s.append(model)
    preds[:, i] = model.predict(X_std)

preds = sc_y.inverse_transform(preds)
score_8s = np.sum(np.abs(y - preds), axis=0) / np.sum(preds, axis=0)
print(score_8s)

[0.06550215 0.09729177 0.12993031 0.1354467  0.14413561]


In [43]:
# 後に使用するかもしれないので、モデルを保存しておく
import pickle
filename = 'stks_5.sav'
pickle.dump(models_5s, open(filename, 'wb'))
filename = 'stks_6.sav'
pickle.dump(models_6s, open(filename, 'wb'))
filename = 'stks_7.sav'
pickle.dump(models_7s, open(filename, 'wb'))

以上より、素直に1model/1stageで推移させる方法が最もscoreが良いので、5sのパターンを提出することにする。  
submodelの学習順を入れ替えると若干変化が出るかもしれないが、時間もなく今回は検証できなかった。

In [None]:
# testデータのロード
df_test7 = pd.read_csv('df_test7.csv')

X_test = df_test7.iloc[:, 1:].values
X_test_std = sc_x.transform(X_test)

In [18]:
ID_test = df_test7['Id']

# 5sの組み合わせで推論
preds = np.empty_like(y)
preds[:, 0] = models_5s[0].predict(X_test_std)
preds[:, 1] = models_5s[1].predict(X_test_std)
preds[:, 2] = models_5s[2].predict(X_test_std)
preds[:, 3] = models_5s[3].predict(X_test_std)
preds[:, 4] = models_5s[4].predict(X_test_std)
preds = sc_y.inverse_transform(preds)

# IDの後ろに変数名を付ける
ID_age = ID_test.astype('str') + np.array(['_age'], dtype=object)
df_age = pd.DataFrame({'Id': ID_age, 'Predicted': preds[:,0]})
ID_d1v1 = ID_test.astype('str') + np.array(['_domain1_var1'], dtype=object)
df_d1v1 = pd.DataFrame({'Id': ID_d1v1, 'Predicted': preds[:,1]})
ID_d1v2 = ID_test.astype('str') + np.array(['_domain1_var2'], dtype=object)
df_d1v2 = pd.DataFrame({'Id': ID_d1v2, 'Predicted': preds[:,2]})
ID_d2v1 = ID_test.astype('str') + np.array(['_domain2_var1'], dtype=object)
df_d2v1 = pd.DataFrame({'Id': ID_d2v1, 'Predicted': preds[:,3]})
ID_d2v2 = ID_test.astype('str') + np.array(['_domain2_var2'], dtype=object)
df_d2v2 = pd.DataFrame({'Id': ID_d2v2, 'Predicted': preds[:,4]})

# 提出用のCSVファイルへ出力
df_sub = pd.concat([df_age, df_d1v1, df_d1v2, df_d2v1, df_d2v2], axis=0)
df_sub = df_sub.sort_values('Id')
display(df_sub.head())
df_sub.to_csv('submission9', index=False, header=True)

Unnamed: 0,Id,Predicted
0,10003_age,59.29182
0,10003_domain1_var1,50.183164
0,10003_domain1_var2,59.348081
0,10003_domain2_var1,49.184432
0,10003_domain2_var2,53.869996


この提出が、コンペ期間中に取り組めた中でpubric leaderbordのベストスコアであった。（0.1642）