In [39]:
import numpy as np
import pandas as pd

In [40]:
#dataset
from sklearn.datasets import load_iris,fetch_openml

In [41]:
#transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler,RobustScaler

In [42]:
#estimator
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC

In [43]:
#pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [44]:
from sklearn.model_selection import train_test_split#データセット分割

In [45]:
#パラメータ調整
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid

In [46]:
#評価関数
from sklearn.metrics import accuracy_score,f1_score

In [47]:
X,y = load_iris(as_frame=True,return_X_y=True)

In [48]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [119]:
#パイプライン構築
##特徴量
features = ['sepal length (cm)',
            'sepal width (cm)',   
            'petal length (cm)', 
            'petal width (cm)']
##変換器パイプライン
###パイプライン定義
transformer = Pipeline(steps=[
    ('scaler',StandardScaler()) #特徴量を標準化
])
###統合（特徴量を変換器にかける）
preprocesser = ColumnTransformer(transformers=[
    ('transform',transformer,features)
])
##パイプライン全体：変換器パイプラインから予測器
pipeline = Pipeline(steps=[
    ('preprocesser',preprocesser),#変換器パイプライン
    ('classifier',LogisticRegression())#予測器（推定器）
])

In [50]:
#パイプライン表示
set_config(display='diagram')
pipeline

In [51]:
set_config(display='None')

In [52]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5)

In [53]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('transform',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['sepal length (cm)',
                                                   'sepal width (cm)',
                                                   'petal length (cm)',
                                                   'petal width (cm)'])])),
                ('classifier', LogisticRegression())])

In [54]:
y_test_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_test_pred)

0.9333333333333333

In [115]:
#グリッドサーチの設定　
param_grid = [
    {
        'preprocesser__transform__scaler':[StandardScaler(),MinMaxScaler(),RobustScaler()],
        'classifier__C':[0.1,1.0,10.0,100.0],
        'classifier':[LogisticRegression()]
    }
]

grid_search = GridSearchCV(pipeline,param_grid,cv=10,verbose=3,n_jobs=-1)

In [56]:
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('transform',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['sepal '
                                                                          'length '
                                                                          '(cm)',
                                                                          'sepal '
                                                                          'width '
                                                                          '(cm)',
                                                                          'petal '
                                               

In [57]:
print(grid_search.best_params_)#最適なパイプライン

{'classifier': LogisticRegression(C=100.0), 'classifier__C': 100.0, 'preprocessor__transform__scaler': MinMaxScaler()}


In [58]:
print(grid_search.best_score_)#正答率

0.9857142857142858


In [59]:
#テストデータで精度を検証する
y_test_pred = grid_search.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_test_pred))


accuracy : 0.96


In [117]:
#ランダムサーチ
param_grid = [
    {
        "preprocesser__transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],       
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    },
    {
        "preprocesser__transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],   
        "classifier__n_estimators": [10, 100, 1000],
        "classifier": [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]
    },
    {
        "preprocesser__transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],   
        "classifier__C": [1, 10, 100, 1000],
        "classifier": [SVC(),LinearSVC()]
    }
]

rand_search = RandomizedSearchCV(pipeline, param_grid, cv=10, verbose=3, n_jobs=-1, n_iter=10)

In [120]:
# ランダムサーチの実行
rand_search.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


ValueError: Invalid parameter transform for estimator ColumnTransformer(transformers=[('num_transform',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'sibsp', 'parch', 'fare'])]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
# ランダムサーチの結果
print(rand_search.best_params_) #最適なパイプライン
print(rand_search.best_score_)  #正答率

In [64]:
#taitanic
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [65]:
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [66]:
y

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: category
Categories (2, object): ['0', '1']

In [67]:
y = y.astype('int')

In [68]:
y

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: int64

In [92]:
#パイプライン構築

##特徴量
numeric_features = ["age", "sibsp", "parch", "fare"] 
categorical_features = ["sex", "pclass"]  
###数値型
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer()),#欠損値補完
    ('scaler',StandardScaler())#標準化
])

###カテゴリ型
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

###特徴量を変換器にかける
preprocesser = ColumnTransformer(transformers=[
    ('num_transform',numeric_transformer,numeric_features)
])

pipeline = Pipeline(steps=[#変換器パイプラインから予測器へ
    ('preprocesser',preprocesser),
    ('classifier',LogisticRegression())
])




In [93]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [94]:
pipeline.fit(X_train,y_train)

In [95]:
y_test_pred1 = pipeline.predict(X_test)
print('accuracy',accuracy_score(y_test,y_test_pred1))
print('f1',f1_score(y_test,y_test_pred1))

accuracy 0.648854961832061
f1 0.28125000000000006


In [None]:
#optuna
import optuna

def objective(trial):
    x = trial.suggest_uniform('x', -10, 10)
    score = (x - 2) ** 2
    print('x: %1.3f, score: %1.3f' % (x, score))
    return score

study = optuna.create_study()
study.optimize(objective, n_trials=100)

In [85]:
study.best_params

{'x': 2.0200526150770277}

In [96]:
# グリッドサーチの設定
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "preprocesser__num_transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],       
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    }
]
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=3,n_jobs=-1)


In [97]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [99]:
print(grid_search.best_params_) #最適なパイプライン
print(grid_search.best_score_)  #正答率


{'classifier': LogisticRegression(), 'classifier__C': 1.0, 'preprocesser__num_transform__imputer__strategy': 'mean', 'preprocesser__num_transform__scaler': StandardScaler()}
0.6810256410256411


In [101]:
# テストデータで精度検証
y_test_pred = grid_search.predict(X_test)
print('accuracy:',accuracy_score(y_test, y_test_pred))
print('f1:', f1_score(y_test, y_test_pred))

accuracy: 0.648854961832061
f1: 0.28125000000000006


In [104]:
# グリッドサーチの設定その２
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "preprocesser__num_transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],   
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    },
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "preprocesser__num_transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],   
        "classifier__n_estimators": [10, 100, 1000],
        "classifier": [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]
    },
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "preprocesser__num_transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],   
        "classifier__C": [1, 10, 100, 1000],
        "classifier": [SVC(),LinearSVC()]
    }
]
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=3,n_jobs=-1)


In [105]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 126 candidates, totalling 1260 fits


In [107]:
# グリッドサーチの結果
print(grid_search.best_params_) #最適なパイプライン
print(grid_search.best_score_)  #正答率

{'classifier': SVC(C=100), 'classifier__C': 100, 'preprocesser__num_transform__imputer__strategy': 'mean', 'preprocesser__num_transform__scaler': StandardScaler()}
0.723919413919414


In [108]:
# テストデータで精度検証
y_test_pred = grid_search.predict(X_test)
print('accuracy:',accuracy_score(y_test, y_test_pred))
print('f1:', f1_score(y_test, y_test_pred))


accuracy: 0.6984732824427481
f1: 0.5269461077844312


In [110]:
rand_search = RandomizedSearchCV(pipeline, param_grid, cv=10, verbose=3, n_jobs=-1, n_iter=10)

In [111]:
# ランダムサーチの実行
rand_search.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [113]:
# ランダムサーチの結果
print(rand_search.best_params_) #最適なパイプライン
print(rand_search.best_score_)  #正答率

{'preprocesser__num_transform__scaler': StandardScaler(), 'preprocesser__num_transform__imputer__strategy': 'median', 'classifier__n_estimators': 100, 'classifier': AdaBoostClassifier(n_estimators=100)}
0.7172802197802198


In [114]:
# テストデータで精度検証
y_test_pred = rand_search.predict(X_test)
print('accuracy:',accuracy_score(y_test, y_test_pred))
print('f1:', f1_score(y_test, y_test_pred))

accuracy: 0.6755725190839694
f1: 0.5251396648044693


In [129]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target.ravel()

In [145]:
from sklearn.model_selection import train_test_split 
# 訓練データ・テストデータへ6:4の比でランダムに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) 

In [133]:
import numpy as np
import lightgbm as lgb
# 目的関数
def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 0.1,0.2),
    n_estimators, = trial.suggest_int('n_estimators', 20, 200),
    max_depth, = trial.suggest_int('max_depth', 3, 9),
    min_child_weight = trial.suggest_loguniform('min_child_weight', 0.5, 2),
    min_child_samples, = trial.suggest_int('min_child_samples', 5, 20),
    classifier = lgb.LGBMClassifier(learning_rate=learning_rate, 
                                    n_estimators=n_estimators,
                                    max_depth=max_depth, 
                                    min_child_weight=min_child_weight,
                                    min_child_samples=min_child_samples,
                                    subsample=0.8, colsample_bytree=0.8,
                                    verbose=-1, num_leaves=80)
    classifier.fit(X_train, y_train)
    #return classifier.score(X_train, y_train) # 正答率（train） の最適化
    return np.linalg.norm(y_train - classifier.predict_proba(X_train)[:, 1], ord=1) # 尤度の最適化

In [131]:
#study = optuna.create_study(direction='maximize') # 最大化
study = optuna.create_study(direction='minimize') # 最小化

[32m[I 2021-12-25 21:44:24,562][0m A new study created in memory with name: no-name-149e0114-fe25-4332-97ad-db8790652fb2[0m


In [134]:
study.optimize(objective, n_trials=100)

[32m[I 2021-12-25 21:45:04,737][0m Trial 1 finished with value: 3.363022286438531 and parameters: {'learning_rate': 0.16873617172493194, 'n_estimators': 52, 'max_depth': 5, 'min_child_weight': 1.0662962128087536, 'min_child_samples': 20}. Best is trial 1 with value: 3.363022286438531.[0m
[32m[I 2021-12-25 21:45:04,810][0m Trial 2 finished with value: 3.5124274362137418 and parameters: {'learning_rate': 0.16041622088571988, 'n_estimators': 80, 'max_depth': 6, 'min_child_weight': 1.4243835796874045, 'min_child_samples': 10}. Best is trial 1 with value: 3.363022286438531.[0m
[32m[I 2021-12-25 21:45:04,964][0m Trial 3 finished with value: 4.249353868424041 and parameters: {'learning_rate': 0.12394358346446024, 'n_estimators': 176, 'max_depth': 5, 'min_child_weight': 1.952840078125491, 'min_child_samples': 9}. Best is trial 1 with value: 3.363022286438531.[0m
[32m[I 2021-12-25 21:45:05,027][0m Trial 4 finished with value: 5.310181510582019 and parameters: {'learning_rate': 0.1644

In [138]:
best_params = study.best_params

In [136]:
study.best_value

1.0212243098271432

In [151]:
classifier = lgb.LGBMClassifier(**study.best_params,subsample=0.8,colsample_bytree=0.8,verbose=-1,num_leaves=80)

In [141]:
classifier

In [152]:
classifier.fit(X_train, y_train)

In [153]:
classifier.score(X_train, y_train)


1.0

In [154]:
classifier.score(X_test, y_test)


0.9736842105263158