## 多數投票法

In [1]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

In [None]:
class MajorityVoteClassifier(BaseEstimator,ClassifierMixin):
    """Ensemble that combines several classifiers by (weighted) majority vote.

    Parameters
    ----------
    classifiers : list of estimators
        Unfitted classifiers; each one is cloned and fitted in ``fit``.
    vote : str, default='classlabel'
        ``'probability'`` predicts the arg-max of the averaged class
        probabilities; any other value predicts by majority vote over
        the predicted class labels.
    weights : sequence of numbers or None, default=None
        Optional per-classifier weights used for the vote / average.
    """
    def __init__(self,classifiers,vote='classlabel',weights=None):
        self.classifiers=classifiers
        self.named_classifiers={key:value for key,value in _name_estimators(classifiers)}
        self.vote=vote
        self.weights=weights

    def fit(self,x,y):
        """Fit a clone of every classifier on ``x`` and the encoded labels.

        Labels are encoded to 0..n_classes-1 so that ``np.bincount`` and
        ``np.argmax`` can be used for voting in ``predict``.
        Returns ``self``.
        """
        self.lablenc_=LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_=self.lablenc_.classes_
        encoded_y=self.lablenc_.transform(y)
        self.classifiers_=[clone(clf).fit(x,encoded_y) for clf in self.classifiers]
        return self

    def predict(self,x):
        """Return the predicted class label (in the original label space) per sample."""
        if self.vote=='probability':
            maj_vote=np.argmax(self.predict_proba(x),axis=1)
        else:
            # One row per sample, one column per classifier.
            predictions=np.asarray([clf.predict(x) for clf in self.classifiers_]).T
            maj_vote=np.apply_along_axis(
                lambda votes:np.argmax(np.bincount(votes,weights=self.weights)),
                axis=1,
                arr=predictions)
        # The fitted classifiers work on encoded labels; map back to the
        # labels that were passed to fit().
        return self.lablenc_.inverse_transform(maj_vote)

    def predict_proba(self,x):
        """Return the (weighted) average of the classifiers' class probabilities."""
        probas=np.asarray([clf.predict_proba(x) for clf in self.classifiers_])
        avg_proba=np.average(probas,axis=0,weights=self.weights)
        return avg_proba

    def get_params(self,deep=True):
        """Return parameters; with ``deep=True`` expose nested ``name__param`` keys.

        The nested keys are what allows a grid search to address the
        parameters of the individual member classifiers.
        """
        if not deep:
            return super(MajorityVoteClassifier,self).get_params(deep=False)
        out=self.named_classifiers.copy()
        for name,step in self.named_classifiers.items():
            for key,value in step.get_params(deep=True).items():
                out['%s__%s' % (name,key)]=value
        return out
        

In [None]:
# sklearn.cross_validation was removed; cross_val_score lives in model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
# Three base classifiers; the two pipelines standardize the features
# before fitting (the decision tree is used without scaling).
clf1=LogisticRegression(penalty='l2',C=0.001,random_state=0)
clf2=DecisionTreeClassifier(max_depth=1,criterion='entropy',random_state=0)
clf3=KNeighborsClassifier(n_neighbors=1,p=2,metric='minkowski')
pipe1=Pipeline([('sc',StandardScaler()),('clf',clf1)])
pipe3=Pipeline([('sc',StandardScaler()),('clf',clf3)])
clf_labels=['Logistic Regression','Decision Tree','KNN']
print('10-fold cross validation:\n')
# x_train / y_train are expected to exist already in the notebook session.
for clf,label in zip([pipe1,clf2,pipe3],clf_labels):
    scores=cross_val_score(estimator=clf,X=x_train,y=y_train,cv=10,scoring='roc_auc')
    print("ROC AUC:%0.2f (+/- %0.2f) [%s]" % (scores.mean(),scores.std(),label))

In [None]:
# Add the majority-vote ensemble and compare it with its members.
mv_clf=MajorityVoteClassifier(classifiers=[pipe1,clf2,pipe3])
clf_labels+=['Majority Voting']
all_clf=[pipe1,clf2,pipe3,mv_clf]
for clf,label in zip(all_clf,clf_labels):
    # cross_val_score takes the feature matrix as `X` (capital).
    scores=cross_val_score(estimator=clf,X=x_train,y=y_train,cv=10,scoring='roc_auc')
    # The metric being reported is ROC AUC, not accuracy.
    print("ROC AUC:%0.2f (+/-%0.2f) [%s]" % (scores.mean(),scores.std(),label))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
# One colour / line style per classifier in all_clf.
colors=['black','orange','blue','green']
linestyles=[':','--','-.','-']
for clf,label,clr,ls in zip(all_clf,clf_labels,colors,linestyles):
    # Column 1 of predict_proba is used as the score of the positive class.
    y_pred=clf.fit(x_train,y_train).predict_proba(x_test)[:,1]
    fpr,tpr,thresholds=roc_curve(y_true=y_test,y_score=y_pred)
    roc_auc=auc(x=fpr,y=tpr)
    plt.plot(fpr,tpr,color=clr,linestyle=ls,label='%s (auc=%0.2f)' % (label,roc_auc))
plt.legend(loc='lower right')
# Diagonal reference line.
plt.plot([0,1],[0,1],linestyle='--',color='gray',linewidth=2)
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Display the parameters of the ensemble as returned by get_params()
# with its default arguments.
mv_clf.get_params()

In [None]:
# sklearn.grid_search was removed; GridSearchCV lives in model_selection.
from sklearn.model_selection import GridSearchCV
# Nested parameters are addressed as <estimator name>__<parameter>
# (double underscore), matching the keys produced by get_params(deep=True).
params={'decisiontreeclassifier__max_depth':[1,2],'pipeline-1__clf__C':[0.001,0.1,100.0]}
grid=GridSearchCV(estimator=mv_clf,param_grid=params,cv=10,scoring='roc_auc')
grid.fit(x_train,y_train)
# grid_scores_ was replaced by the cv_results_ dictionary.
grid_results=grid.cv_results_
for mean_score,std_score,candidate in zip(grid_results['mean_test_score'],
                                          grid_results['std_test_score'],
                                          grid_results['params']):
    print("%0.3f+/-%0.2f %r" % (mean_score,std_score/2,candidate))
    

In [None]:
print('Best parameters:%s' % grid.best_params_)
# The grid search was scored with 'roc_auc', so best_score_ is a ROC AUC.
print('ROC AUC:%.2f' % grid.best_score_)

## Bagging

## base_estimator:object, default=None

## n_estimators:int, default=10

## max_samples:int or float, default=1.0

## max_features:int or float, default=1.0

## bootstrap:bool, default=True

## bootstrap_features:bool, default=False

## oob_score:bool, default=False

## warm_start:bool, default=False

## n_jobs:int, default=None

## random_state:int or RandomState, default=None

## verbose:int, default=0

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
# Unpruned decision tree, used both alone and as the bagging base learner.
tree=DecisionTreeClassifier(criterion='entropy',max_depth=None)
bag=BaggingClassifier(
    base_estimator=tree,
    n_estimators=500,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=1,
    random_state=1,
)
# Single tree: train/test accuracy.
tree=tree.fit(x_train,y_train)
tree_train=accuracy_score(y_train,tree.predict(x_train))
tree_test=accuracy_score(y_test,tree.predict(x_test))
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train,tree_test))
# Bagged trees: train/test accuracy.
bag=bag.fit(x_train,y_train)
y_train_pred=bag.predict(x_train)
y_test_pred=bag.predict(x_test)
bag_train=accuracy_score(y_train,y_train_pred)
bag_test=accuracy_score(y_test,y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train,bag_test))


## Adaboosting

## base_estimator:object, default=None

## n_estimators:int, default=50

## learning_rate:float, default=1.

## algorithm{‘SAMME’, ‘SAMME.R’}, default=’SAMME.R’

## random_state:int or RandomState, default=None

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Decision stump (depth 1), used both alone and as the boosting base learner.
tree=DecisionTreeClassifier(criterion='entropy',max_depth=1)
ada=AdaBoostClassifier(base_estimator=tree,n_estimators=500,learning_rate=0.1,random_state=0)
tree=tree.fit(x_train,y_train)
y_train_pred=tree.predict(x_train)
y_test_pred=tree.predict(x_test)
tree_train=accuracy_score(y_train,y_train_pred)
tree_test=accuracy_score(y_test,y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train,tree_test))
ada=ada.fit(x_train,y_train)
y_train_pred=ada.predict(x_train)
y_test_pred=ada.predict(x_test)
ada_train=accuracy_score(y_train,y_train_pred)
ada_test=accuracy_score(y_test,y_test_pred)
# These are the AdaBoost results (the label previously said "Bagging").
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train,ada_test))

## Gradient boosting

## loss{‘deviance’, ‘exponential’}, default=’deviance’

## learning_rate:float, default=0.1

## n_estimators:int, default=100

## subsample:float, default=1.0

## criterion{‘friedman_mse’, ‘mse’, ‘mae’}, default=’friedman_mse’

## min_samples_split:int or float, default=2

## min_samples_leaf:int or float, default=1

## min_weight_fraction_leaf:float, default=0.0

## max_depth:int, default=3

## min_impurity_decrease:float, default=0.0

## min_impurity_split:float, default=None

## init:estimator or ‘zero’, default=None

## max_features{‘auto’, ‘sqrt’, ‘log2’}, int or float, default=None

## max_leaf_nodes:int, default=None

## warm_start:bool, default=False

## validation_fraction:float, default=0.1

## n_iter_no_change:int, default=None

## tol:float, default=1e-4

## ccp_alpha:non-negative float, default=0.0

In [None]:
# Minimal gradient-boosting example on a synthetic classification data set.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = GradientBoostingClassifier(random_state=0)
clf.fit(X_train, y_train)
# Predict the first two test rows (result is discarded unless it is the cell output).
clf.predict(X_test[:2])
# Score of the fitted model on the held-out split; shown as the cell output.
clf.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# Three first-level regressors, later combined by the stacking regressor.
# `normalize=False` was dropped: it was the default, and the parameter no
# longer exists in current scikit-learn releases.
linear = LinearRegression(fit_intercept=True, copy_X=True)
gdbt = GradientBoostingRegressor(tol=0.1, subsample=0.37, n_estimators=200, max_features=20, 
                                 max_depth=6, learning_rate=0.03)
rf = RandomForestRegressor(n_estimators=300, min_samples_split=9, min_samples_leaf=10, 
                           max_features='sqrt', max_depth=8, bootstrap=False)

## xgboost

## booster [default=gbtree]：選擇基分類器，gbtree: tree-based models/gblinear: linear models

## silent [default=0]:設定成1則沒有執行資訊輸出，最好是設定為0.

## nthread [default to maximum number of threads available if not set]：執行緒數

## eta [default=0.3]

## min_child_weight [default=1]

## max_depth [default=6]

## gamma [default=0]

## max_delta_step [default=0]

## subsample [default=1]

## colsample_bytree [default=1]

## lambda [default=1]

## alpha [default=0]

## scale_pos_weight [default=1]

## objective [default=reg:linear]

## eval_metric [ default according to objective ]

## seed [default=0]

In [None]:
from xgboost import XGBClassifier as xgb

In [None]:
def xgb_model(train_data, train_label, test_data, test_label):
    """Fit an XGBoost binary classifier with early stopping and return it.

    Parameters
    ----------
    train_data, train_label
        Training features and labels passed to ``fit``.
    test_data, test_label
        Evaluation set monitored (AUC) for early stopping.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    # Imported locally: the notebook-level name `xgb` is bound to the
    # XGBClassifier class itself, so `xgb.XGBClassifier` does not exist.
    from xgboost import XGBClassifier

    # `missing` is left at its default (NaN); passing None explicitly is
    # rejected by newer xgboost releases.
    clf = XGBClassifier(max_depth=7,
                        min_child_weight=1,
                        learning_rate=0.1,
                        n_estimators=500,
                        silent=True,
                        objective='binary:logistic',
                        gamma=0,
                        max_delta_step=0,
                        subsample=1,
                        colsample_bytree=1,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=0,
                        scale_pos_weight=1,
                        seed=1)
    clf.fit(train_data, train_label, eval_metric='auc', verbose=True,
            eval_set=[(test_data, test_label)], early_stopping_rounds=100)
    return clf

## catboost

## learning_rate(eta)=automatically


## depth(max_depth)=6: 树的深度

## l2_leaf_reg(reg_lambda)=3 L2正则化系数

## n_estimators(num_boost_round)(num_trees=1000)=1000: 解决ml问题的树的最大数量

## one_hot_max_size=2: 对于某些变量进行one-hot编码

## loss_function=‘Logloss’

## custom_metric=None

## eval_metric=Optimized objective

## nan_mode=None：处理NAN的方法

## leaf_estimation_method=None：迭代求解的方法，梯度和牛顿

## random_seed=None: 训练时候的随机种子

In [None]:
# Split the current training data again: 25% is held out as a validation
# set (X_cv / y_cv), stratified on the labels, with a fixed random seed.
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size = 0.25, stratify=y_train, random_state=123)

In [None]:
# CatBoostClassifier was used here before any cell imported it.
from catboost import CatBoostClassifier
# Positions of object-dtype columns; this requires X_train to be a pandas
# DataFrame (it accesses .columns).
cat_features_index = [i for i, x in enumerate(X_train.columns) if X_train[x].dtype.kind == 'O']
model = CatBoostClassifier(iterations = 50, learning_rate = 0.3, eval_metric='AUC', max_ctr_complexity=2, boosting_type = 'Plain', bootstrap_type= 'Bernoulli', use_best_model=True, random_seed=123)
model.fit(X_train, y_train, cat_features=cat_features_index, eval_set=(X_cv, y_cv))

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
# sklearn.grid_search was removed; ParameterGrid lives in model_selection.
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from itertools import product, chain
from tqdm import tqdm

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# Seed shared by the cross-validation splitter and the final train/validation split.
RANDOM_STATE = 0

def get_x(df):
    """Build the feature matrix for the Titanic data frame.

    Missing values are filled, a ``Title`` feature is extracted from
    ``Name``, ``FamilySize`` / ``IsAlone`` are derived, and the identifier
    and free-text columns are dropped.

    Note: as before, the fill-ins and the ``Title`` column are written to
    the caller's ``df``; the column drops only affect a local copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns referenced below
        (``Survived`` is optional and is excluded from the features).

    Returns
    -------
    tuple
        ``(values, cat_features)`` where ``values`` is the feature array
        and ``cat_features`` holds the positions of all columns whose
        dtype is not float.
    """
    # Assign the filled columns back instead of chained `inplace=True`,
    # which does not reliably update `df` under pandas copy-on-write.
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Embarked'] = df['Embarked'].fillna('Unknown')
    df['Age'] = df['Age'].fillna(-1)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Raw string: "\." is a regex escape, not a Python string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['Title'] = df['Title'].fillna('na')

    df = df.drop(['Name', 'PassengerId', 'Cabin', 'Embarked'], axis=1)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

    df = df.drop(['Ticket'], axis=1)
    columns = [col for col in df.columns if col != 'Survived']
    # `np.float` was only an alias of the builtin float and has been
    # removed from NumPy; the builtin compares identically.
    cat_features = np.where(df[columns].dtypes != float)[0]
    return df[columns].values, cat_features


def get_xy(df):
    """Return ``(features, target)`` for a frame that has a ``Survived`` column.

    The features come from ``get_x``; its categorical-index result is
    discarded here.
    """
    features = get_x(df)[0]
    return features, df['Survived']

#  
def cross_val(X, y, X_test, param, cat_features, n_splits=3):
    """Return the mean stratified k-fold accuracy of CatBoost for one parameter set.

    Parameters
    ----------
    X : array
        Feature matrix, indexed positionally by the fold indices.
    y : array-like
        Target labels (a pandas Series is accepted).
    X_test
        Unused; kept for interface compatibility.
    param : dict
        Must provide ``'loss_function'``, ``'depth'`` and ``'l2_leaf_reg'``.
    cat_features
        Categorical column indices forwarded to ``CatBoostClassifier.fit``.
    n_splits : int, default=3
        Number of folds.
    """
    # Fold indices are positional; converting avoids label-based lookup
    # when `y` is a pandas Series with a non-default index.
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    acc = []

    for tr_ind, val_ind in skf.split(X, y):
        X_train, y_train = X[tr_ind], y[tr_ind]
        X_valid, y_valid = X[val_ind], y[val_ind]

        clf = CatBoostClassifier(iterations=500,
                                loss_function = param['loss_function'],
                                depth=param['depth'],
                                l2_leaf_reg = param['l2_leaf_reg'],
                                eval_metric = 'Accuracy',
                                leaf_estimation_iterations = 10,
                                use_best_model=True,
                                logging_level='Silent'
        )

        clf.fit(X_train, 
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid)
        )

        y_pred = clf.predict(X_valid)
        acc.append(accuracy_score(y_valid, y_pred))
    return sum(acc)/n_splits
    
def catboost_GridSearchCV(X, y, X_test, params, cat_features, n_splits=5):
    """Exhaustively search ``params`` and return the best parameter set.

    Every combination produced by ``ParameterGrid(params)`` is scored with
    ``cross_val``; the combination with the highest mean accuracy wins.

    Returns
    -------
    dict
        The best parameter combination (an empty list if no combination
        scored above 0, as before).
    """
    ps = {'acc':0,
          'param': []
    }

    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):
        # Forward the caller's fold count (it used to be hard-coded to 5,
        # silently ignoring the `n_splits` argument).
        acc = cross_val(X, y, X_test, prms, cat_features, n_splits=n_splits)

        if acc>ps['acc']:
            ps['acc'] = acc
            ps['param'] = prms
    print('Acc: '+str(ps['acc']))
    print('Params: '+str(ps['param']))

    return ps['param']
    
    
def main():
    """Tune CatBoost on the training CSV, refit it, and write a submission CSV."""
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")

    features, target = get_xy(train_df)
    X_test, cat_features = get_x(test_df)

    # Search space explored by the manual grid search.
    search_space = {
        'depth': [2, 3, 4],
        'loss_function': ['Logloss', 'CrossEntropy'],
        'l2_leaf_reg': np.logspace(-20, -19, 3),
    }
    best = catboost_GridSearchCV(features, target, X_test, search_space, cat_features)

    clf = CatBoostClassifier(
        iterations=2500,
        loss_function=best['loss_function'],
        depth=best['depth'],
        l2_leaf_reg=best['l2_leaf_reg'],
        eval_metric='Accuracy',
        leaf_estimation_iterations=10,
        use_best_model=True,
    )

    # 80/20 stratified split; the 20% part is the evaluation set for the final fit.
    fit_X, X_valid, fit_y, y_valid = train_test_split(
        features,
        target,
        shuffle=True,
        random_state=RANDOM_STATE,
        train_size=0.8,
        stratify=target,
    )
    clf.fit(
        fit_X,
        fit_y,
        cat_features=cat_features,
        logging_level='Silent',
        eval_set=(X_valid, y_valid),
    )

    survived = np.array(clf.predict(X_test)).astype(int)
    sub = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': survived})
    sub.to_csv('cat_sub_1.csv', index=False)


if __name__=='__main__':
    main()

## lightgbm

## Task	資料的用途

## application	模型的用途

## boosting	要用的演算法

## num_boost_round	迭代次數

## learning_rate

## num_leaves

## metric

## max_depth	樹的最大深度

## min_data_in_leaf	葉子可能具有的最小記錄數

## feature_fraction	例如 為0.8時，意味著在每次迭代中隨機選擇80％的引數來建樹	

## bagging_fraction	每次迭代時用的資料比例

## early_stopping_round	如果一次驗證資料的一個度量在最近的early_stopping_round 回合中沒有提高，模型將停止訓練

## lambda	指定正則化

## min_gain_to_split	描述分裂的最小 gain

## max_cat_group	在 group 邊界上找到分割點	

## max_bin	表示 feature 將存入的 bin 的最大數量

## categorical_feature	如果 categorical_features = 0,1,2， 則列 0，1，2是 categorical 變數

## ignore_column	與 categorical_features 類似，只不過不是將特定的列視為categorical，而是完全忽略

## save_binary	這個引數為 true 時，則資料集被儲存為二進位制檔案，下次讀資料時速度會變快

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
# sklearn.cross_validation was removed; train_test_split lives in model_selection.
from sklearn.model_selection import train_test_split

canceData=load_breast_cancer()
X=canceData.data
y=canceData.target
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
params = {    
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1,
          'num_leaves':30, 
          'max_depth': 5,   
          'subsample': 0.8, 
          'colsample_bytree': 0.8, 
    }
    
# 5-fold CV with early stopping; the length of the returned metric history
# is used below as the number of boosting rounds.
data_train = lgb.Dataset(X_train, y_train)
cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())

In [None]:
# sklearn.grid_search was removed; GridSearchCV lives in model_selection.
from sklearn.model_selection import GridSearchCV

In [None]:
# Imported here so the cell works on its own with the model_selection API.
from sklearn.model_selection import GridSearchCV

# Step 1: tree complexity (max_depth x num_leaves).
params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)}
              
gsearch1 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=188, max_depth=6, bagging_fraction = 0.8,feature_fraction = 0.8), 
                       param_grid = params_test1, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(X_train,y_train)
# grid_scores_ was replaced by the cv_results_ dictionary.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Imported here so the cell works on its own with the model_selection API.
from sklearn.model_selection import GridSearchCV

# Step 2: max_bin x min_data_in_leaf.
params_test2={'max_bin': range(5,256,10), 'min_data_in_leaf':range(1,102,10)}
              
gsearch2 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=188, max_depth=4, num_leaves=10,bagging_fraction = 0.8,feature_fraction = 0.8), 
                       param_grid = params_test2, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch2.fit(X_train,y_train)
# grid_scores_ was replaced by the cv_results_ dictionary.
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Imported here so the cell works on its own with the model_selection API.
from sklearn.model_selection import GridSearchCV

# Step 3: feature / row sub-sampling.
params_test3={'feature_fraction': [0.6,0.7,0.8,0.9,1.0],
              'bagging_fraction': [0.6,0.7,0.8,0.9,1.0],
              'bagging_freq': range(0,81,10)
}
              
gsearch3 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=188, max_depth=4, num_leaves=10,max_bin=15,min_data_in_leaf=51), 
                       param_grid = params_test3, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch3.fit(X_train,y_train)
# grid_scores_ was replaced by the cv_results_ dictionary.
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Imported here so the cell works on its own with the model_selection API.
from sklearn.model_selection import GridSearchCV

# Step 4: L1 / L2 regularization strength.
params_test4={'lambda_l1': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0],
              'lambda_l2': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]
}
              
gsearch4 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=188, max_depth=4, num_leaves=10,max_bin=15,min_data_in_leaf=51,bagging_fraction=0.6,bagging_freq= 0, feature_fraction= 0.8), 
                       param_grid = params_test4, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch4.fit(X_train,y_train)
# grid_scores_ was replaced by the cv_results_ dictionary.
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# `metrics` was used here before any cell imported it.
from sklearn import metrics

model=lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.01, n_estimators=1000, max_depth=4, num_leaves=10,max_bin=15,min_data_in_leaf=51,bagging_fraction=0.6,bagging_freq= 0, feature_fraction= 0.8,
lambda_l1=1e-05,lambda_l2=1e-05,min_split_gain=0)
model.fit(X_train,y_train)
y_pre=model.predict(X_test)
# AUC is a ranking metric: score it on the positive-class probability,
# not on the thresholded labels.
y_pro=model.predict_proba(X_test)[:,1]
print("acc:",metrics.accuracy_score(y_test,y_pre))
print("auc:",metrics.roc_auc_score(y_test,y_pro))

In [None]:
# `metrics` was used here before any cell imported it.
from sklearn import metrics

# Baseline: LightGBM with default parameters, for comparison.
model=lgb.LGBMClassifier()
model.fit(X_train,y_train)
y_pre=model.predict(X_test)
# AUC is a ranking metric: score it on the positive-class probability,
# not on the thresholded labels.
y_pro=model.predict_proba(X_test)[:,1]
print("acc:",metrics.accuracy_score(y_test,y_pre))
print("auc:",metrics.roc_auc_score(y_test,y_pro))

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
# sklearn.cross_validation was removed; train_test_split lives in model_selection.
from sklearn.model_selection import train_test_split

canceData=load_breast_cancer()
X=canceData.data
y=canceData.target
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

### Convert the data into LightGBM datasets
print('資料轉換')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)

### Initial parameters (the parameters tuned by cross-validation are added later)
print('設定引數')
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1
          }

### Cross-validation (parameter tuning)
print('交叉驗證')
# Best mean AUC seen so far (shared by all tuning stages) and the
# parameter values that achieved it.
max_auc = float('0')
best_params = {}

# Stage 1 (accuracy): tune num_leaves and max_depth.
print("調參1：提高準確率")
for num_leaves in range(5,100,5):
    for max_depth in range(3,8,1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth

        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
            
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
# Keep the winning values. `'a' and 'b' in d` only tested the last key, and
# without the else-branch the last trial values would stay in `params`
# when no combination improved the score.
for key in ('num_leaves', 'max_depth'):
    if key in best_params:
        params[key] = best_params[key]
    else:
        params.pop(key, None)

# Stage 2 (over-fitting): tune max_bin and min_data_in_leaf.
print("調參2：降低過擬合")
for max_bin in range(5,256,10):
    for min_data_in_leaf in range(1,102,10):
            params['max_bin'] = max_bin
            params['min_data_in_leaf'] = min_data_in_leaf
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()

            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['max_bin']= max_bin
                best_params['min_data_in_leaf'] = min_data_in_leaf
# Keep the winning values. `'a' and 'b' in d` only tested the last key, and
# without the else-branch the last trial values would stay in `params`
# when no combination improved the score.
for key in ('max_bin', 'min_data_in_leaf'):
    if key in best_params:
        params[key] = best_params[key]
    else:
        params.pop(key, None)

# Stage 3 (over-fitting): tune feature / row sub-sampling.
print("調參3：降低過擬合")
for feature_fraction in [0.6,0.7,0.8,0.9,1.0]:
    for bagging_fraction in [0.6,0.7,0.8,0.9,1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()

            if mean_auc >= max_auc:
                max_auc=mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

# Keep the winning values. The chained `'a' and 'b' and 'c' in d` only
# tested the last key, and without the else-branch the last trial values
# would stay in `params` when no combination improved the score.
for key in ('feature_fraction', 'bagging_fraction', 'bagging_freq'):
    if key in best_params:
        params[key] = best_params[key]
    else:
        params.pop(key, None)


# Stage 4 (over-fitting): tune L1 / L2 regularization.
print("調參4：降低過擬合")
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
                
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()

        if mean_auc >= max_auc:
            max_auc=mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
# Keep the winning values. `'a' and 'b' in d` only tested the last key, and
# without the else-branch the last trial values would stay in `params`
# when no combination improved the score.
for key in ('lambda_l1', 'lambda_l2'):
    if key in best_params:
        params[key] = best_params[key]
    else:
        params.pop(key, None)

# Stage 5 (over-fitting): tune min_split_gain.
print("調參5：降低過擬合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                        params,
                        lgb_train,
                        seed=1,
                        nfold=5,
                        metrics=['auc'],
                        early_stopping_rounds=10,
                        verbose_eval=True
                        )
            
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()

    if mean_auc >= max_auc:
        max_auc=mean_auc
        
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params:
    params['min_split_gain'] = best_params['min_split_gain']
else:
    # No value improved the score: do not leave the last trial value behind.
    params.pop('min_split_gain', None)

print(best_params)

In [None]:
# Refit with a smaller learning rate / more trees and explicitly chosen parameter values.
model=lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.01, n_estimators=1000, max_depth=4, num_leaves=10,max_bin=255,min_data_in_leaf=81,bagging_fraction=0.7,bagging_freq= 30, feature_fraction= 0.8,
lambda_l1=0.1,lambda_l2=0,min_split_gain=0.1)
model.fit(X_train,y_train)
y_pre=model.predict(X_test)
# AUC is a ranking metric: score it on the positive-class probability,
# not on the thresholded labels.
y_pro=model.predict_proba(X_test)[:,1]
print("acc:",metrics.accuracy_score(y_test,y_pre))
print("auc:",metrics.roc_auc_score(y_test,y_pro))

## Stacking

## 如果使用交叉驗證，可以將StackingClassifier都變為StackingCVClassifier即可

In [None]:
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

# Iris data restricted to feature columns 1 and 2.
iris = datasets.load_iris()
X = iris.data[:, 1:3]
y = iris.target

# First-level classifiers plus a logistic-regression meta classifier.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

print('3-fold cross validation:\n')

labelled_models = [
    (clf1, 'KNN'),
    (clf2, 'Random Forest'),
    (clf3, 'Naive Bayes'),
    (sclf, 'StackingClassifier'),
]
for clf, label in labelled_models:
    scores = model_selection.cross_val_score(clf, X, y,
                                             cv=3, scoring='accuracy')
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % summary)

In [None]:
import itertools
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions

# 2x2 grid: one panel per classifier.
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10,8))
panel_titles = ['KNN',
                'Random Forest',
                'Naive Bayes',
                'StackingClassifier']
panel_cells = itertools.product([0, 1], repeat=2)
for clf, lab, grd in zip([clf1, clf2, clf3, sclf], panel_titles, panel_cells):
    clf.fit(X, y)
    row, col = grd[0], grd[1]
    ax = plt.subplot(gs[row, col])
    # Note: `fig` is rebound to the return value of plot_decision_regions.
    fig = plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(lab)
plt.show()

In [None]:
# Stacking variant configured with use_probas=True and average_probas=False
# (see the mlxtend StackingClassifier documentation for their exact effect).
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

In [None]:
# Stacking where each first-level pipeline is given a different column subset.
from sklearn.datasets import load_iris
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X = iris.data  # (150,4)
y = iris.target

pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)),  # select feature columns 0 and 2
                      LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)),  # select feature columns 1, 2 and 3
                      LogisticRegression())

sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                          meta_classifier=LogisticRegression())

sclf.fit(X, y)

In [None]:
# Stacked-generalization package mlxtend; it must be installed in the runtime environment first (pip is sufficient)
from mlxtend.regressor import StackingRegressor

# Stacking uses models as the feature source of the first layer, so StackingRegressor needs,
# besides its own (second-layer) model - meta_regressor - the first-layer single models used as encoders - regressors
# The parameters of the second-layer model (meta_regressor) also need Grid/Random Search; see the mlxtend web page referenced in the course notes
meta_estimator = GradientBoostingRegressor(tol=10, subsample=0.44, n_estimators=100, 
                                           max_features='log2', max_depth=4, learning_rate=0.1)
stacking = StackingRegressor(regressors=[linear, gdbt, rf], meta_regressor=meta_estimator)