# 堆疊(Stacking)測試

## 載入相關套件

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

## 載入資料集

In [2]:
X, y = datasets.load_breast_cancer(return_X_y=True)

## 資料分割

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) # split dataset, 20% test, 80% train

## 模型訓練

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

def get_models():
    models = []
    
    models.append(('cart', DecisionTreeClassifier()))
    models.append(('knn', KNeighborsClassifier()))
    models.append(('svm', SVC()))
    models.append(('bayes', GaussianNB()))
    return models

estimators = get_models() # get base models, 
model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
) # stacking, 

model.fit(X_train,y_train) # fit model


0,1,2
,estimators,"[('cart', ...), ('knn', ...), ...]"
,final_estimator,LogisticRegression()
,cv,
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,priors,
,var_smoothing,1e-09

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [5]:
print('Stacking準確率=', model.score(X_test,y_test)) # score,  > 0.956140350877193

Stacking準確率= 0.9649122807017544


## 模型評估

In [6]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_test,y_test, cv=10)
print(f'平均分數: {np.mean(scores)}, 標準差: {np.std(scores)}')

平均分數: 0.9469696969696969, 標準差: 0.07017294652672369


In [7]:
scores

array([1.        , 0.83333333, 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.81818182, 0.90909091, 0.90909091])

## 使用迴歸模型

In [8]:
from sklearn.linear_model import RidgeCV #L2
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler

X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

estimators = [
    ('lr', RidgeCV()),
    # ('svr', LinearSVR(random_state=42))
    ('svr', SVR())
]

model = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train_std, y_train)
scores = cross_val_score(model, X_test_std, y_test, cv=10)
print(f'平均分數: {np.mean(scores)}, 標準差: {np.std(scores)}')

平均分數: 0.10383762679327509, 標準差: 0.4229785126237812


## L2 程式集2
載入不同資料集

In [9]:
from sklearn.linear_model import RidgeCV #L2
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

#X, y = datasets.load_diabetes(return_X_y=True)

X, y = datasets.fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
estimators = [
    ('lr', RidgeCV()),
    # ('svr', LinearSVR(random_state=42))
    ('svr', SVR())
]
model = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train_std, y_train)
scores = cross_val_score(model, X_test_std, y_test, cv=10)
print(f'平均分數: {np.mean(scores)}, 標準差: {np.std(scores)}')

KeyboardInterrupt: 

In [None]:
svc = LinearSVR()
svc.fit(X_train_std, y_train)
scores = cross_val_score(svc, X_test_std, y_test, cv=10)
print(f'平均分數: {np.mean(scores)}, 標準差: {np.std(scores)}')

平均分數: -1.151999780003686, 標準差: 0.685351133764436


In [None]:
svc1 = SVR()
svc1.fit(X_train_std, y_train)
scores = cross_val_score(svc1, X_test_std, y_test, cv=10)
print(f'平均分數: {np.mean(scores)}, 標準差: {np.std(scores)}')