In [1]:
import numpy as np
import pandas as pd

In [2]:
#dataset
from sklearn.datasets import load_iris,fetch_openml

In [3]:
#transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler,RobustScaler

In [4]:
#estimator
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC

In [5]:
#pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [6]:
from sklearn.model_selection import train_test_split#データセット分割

In [7]:
#パラメータ調整
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid

In [8]:
#評価関数
from sklearn.metrics import accuracy_score,f1_score

In [9]:
# Load the iris data as a pandas feature frame (X) and a target series (y).
X, y = load_iris(return_X_y=True, as_frame=True)

In [10]:
# Print column names, non-null counts and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [11]:
# Build the iris pipeline: scale the features, then classify.

# Feature columns routed through the preprocessing step.
features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Transformer sub-pipeline: one step named 'scaler' that standardises the features.
transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Apply the transformer sub-pipeline to the listed feature columns.
preprocesser = ColumnTransformer([
    ('transform', transformer, features),
])

# Whole pipeline: preprocessing followed by the estimator.
pipeline = Pipeline([
    ('preprocesser', preprocesser),       # transformer pipeline
    ('classifier', LogisticRegression()), # predictor (estimator)
])

In [12]:
# Display the pipeline: switch estimator reprs to the diagram view, then
# let the notebook render the pipeline object as the cell output.
set_config(display='diagram')
pipeline

In [13]:
# Switch estimator reprs back to plain text. scikit-learn documents only
# 'text' and 'diagram' for `display`; the string 'None' is not a valid option.
set_config(display='text')

In [14]:
# Split iris 50/50 into train and test sets.
# random_state pins the shuffle so a re-run reproduces the same split;
# stratify=y keeps the class proportions of y identical in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0, stratify=y
)

In [15]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train,y_train)

Pipeline(steps=[('preprocesser',
                 ColumnTransformer(transformers=[('transform',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['sepal length (cm)',
                                                   'sepal width (cm)',
                                                   'petal length (cm)',
                                                   'petal width (cm)'])])),
                ('classifier', LogisticRegression())])

In [16]:
# Predict the held-out split and show the accuracy as the cell output.
y_test_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.96

In [17]:
# Grid-search settings: try three scalers and four values of C for
# logistic regression (3 x 4 = 12 candidates).
param_grid = [
    dict(
        preprocesser__transform__scaler=[StandardScaler(), MinMaxScaler(), RobustScaler()],
        classifier__C=[0.1, 1.0, 10.0, 100.0],
        classifier=[LogisticRegression()],
    )
]

# Exhaustive search over param_grid with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=3,
)

In [18]:
# Run the grid search on the training split (12 candidates x 10 folds).
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocesser',
                                        ColumnTransformer(transformers=[('transform',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['sepal '
                                                                          'length '
                                                                          '(cm)',
                                                                          'sepal '
                                                                          'width '
                                                                          '(cm)',
                                                                          'petal '
                                               

In [19]:
print(grid_search.best_params_)  # best pipeline configuration found by the search

{'classifier': LogisticRegression(C=100.0), 'classifier__C': 100.0, 'preprocesser__transform__scaler': RobustScaler()}


In [20]:
print(grid_search.best_score_)  # accuracy (cross-validated score of the best candidate)

0.9571428571428571


In [21]:
# Verify accuracy on the test data using the search's best estimator.
y_test_pred = grid_search.predict(X_test)
print('accuracy :', accuracy_score(y_true=y_test, y_pred=y_test_pred))


accuracy : 0.96


In [22]:
# Randomised search: three alternative search spaces, one per model family.
# Every space also varies the scaler used in the preprocessing step.
param_grid = [
    {
        # Logistic regression: regularisation strength C.
        "preprocesser__transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    },
    {
        # Tree ensembles: number of estimators.
        "preprocesser__transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "classifier__n_estimators": [10, 100, 1000],
        "classifier": [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]
    },
    {
        # Support vector machines: regularisation strength C.
        "preprocesser__transform__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "classifier__C": [1, 10, 100, 1000],
        "classifier": [SVC(), LinearSVC()]
    }
]

# Sample 10 candidates from the spaces above and score each with 10-fold CV.
# random_state fixes which candidates are drawn, so a re-run evaluates the
# same 10 configurations instead of a fresh random subset.
rand_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=10,
    verbose=3,
    n_jobs=-1,
    random_state=0,
)

In [23]:
# Run the randomised search on the training split
rand_search.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('preprocesser',
                                              ColumnTransformer(transformers=[('transform',
                                                                               Pipeline(steps=[('scaler',
                                                                                                StandardScaler())]),
                                                                               ['sepal '
                                                                                'length '
                                                                                '(cm)',
                                                                                'sepal '
                                                                                'width '
                                                                                '(cm)',
                                                                

In [24]:
# Randomised search results
print(rand_search.best_params_) # best pipeline configuration
print(rand_search.best_score_)  # accuracy (cross-validated score of the best candidate)

{'preprocesser__transform__scaler': MinMaxScaler(), 'classifier__C': 10, 'classifier': SVC(C=10)}
0.9571428571428571


In [25]:
# Titanic: fetch the OpenML "titanic" dataset (version 1) as a feature frame X and target series y
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [26]:
# Display the Titanic feature frame.
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [27]:
# Display the raw target series (the output shows a categorical with string categories '0'/'1').
y

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: category
Categories (2, object): ['0', '1']

In [28]:
# Convert the categorical target to integer labels.
y = y.astype('int')

In [29]:
# Display the target again to confirm the integer dtype.
y

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: int64

In [42]:
# Build the Titanic pipeline.

# Feature columns, grouped by how they are preprocessed.
numeric_features = ["age", "sibsp", "parch", "fare"]
categorical_features = ["sex", "pclass"]

# Numeric columns: fill missing values, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer()),   # missing-value imputation
    ('scaler', StandardScaler()),   # standardisation
])

# Categorical columns: one-hot encode; categories unseen during fit are ignored.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocesser = ColumnTransformer([
    ('num_transform', numeric_transformer, numeric_features),
    ('cat_transform', categorical_transformer, categorical_features),
])

# Whole pipeline: preprocessing followed by the predictor.
pipeline = Pipeline([
    ('preprocesser', preprocesser),
    ('classifier', LogisticRegression()),
])




In [44]:
# Hold out 20% of the Titanic rows for testing.
# random_state makes the split reproducible across re-runs; stratify=y keeps
# the class ratio of y the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [43]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train,y_train)

Pipeline(steps=[('preprocesser',
                 ColumnTransformer(transformers=[('num_transform',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'sibsp', 'parch',
                                                   'fare']),
                                                 ('cat_transform',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'pclass'])])),
                ('classifier', LogisticRegression())])

In [45]:
# Score the fitted baseline pipeline on the test split.
y_test_pred1 = pipeline.predict(X_test)
print('accuracy', accuracy_score(y_true=y_test, y_pred=y_test_pred1))
print('f1', f1_score(y_true=y_test, y_pred=y_test_pred1))

accuracy 0.8053435114503816
f1 0.7213114754098361


In [34]:
#optuna
import optuna

def objective(trial):
    """Toy Optuna objective: squared distance of a sampled ``x`` from 2.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``x`` from the range [-10, 10].

    Returns
    -------
    float
        ``(x - 2) ** 2``; the value the study optimises.
    """
    # suggest_float is the current API; suggest_uniform is deprecated in Optuna.
    x = trial.suggest_float('x', -10, 10)
    score = (x - 2) ** 2
    # Progress line for each trial.
    print('x: %1.3f, score: %1.3f' % (x, score))
    return score

# Create a study with default settings and evaluate the objective for 100 trials.
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[32m[I 2021-12-26 17:58:43,141][0m A new study created in memory with name: no-name-86259136-6b75-4f43-b13a-8f7aae67102f[0m
[32m[I 2021-12-26 17:58:43,145][0m Trial 0 finished with value: 0.9575739923050356 and parameters: {'x': 1.0214429028896497}. Best is trial 0 with value: 0.9575739923050356.[0m
[32m[I 2021-12-26 17:58:43,147][0m Trial 1 finished with value: 125.7020185734259 and parameters: {'x': -9.211691155817034}. Best is trial 0 with value: 0.9575739923050356.[0m
[32m[I 2021-12-26 17:58:43,149][0m Trial 2 finished with value: 4.443017791693236 and parameters: {'x': 4.107846719212105}. Best is trial 0 with value: 0.9575739923050356.[0m
[32m[I 2021-12-26 17:58:43,153][0m Trial 3 finished with value: 14.100256996678311 and parameters: {'x': 5.755030891574437}. Best is trial 0 with value: 0.9575739923050356.[0m
[32m[I 2021-12-26 17:58:43,161][0m Trial 4 finished with value: 7.620184097092344 and parameters: {'x': 4.760468093837048}. Best is trial 0 with value: 0.9

x: 1.021, score: 0.958
x: -9.212, score: 125.702
x: 4.108, score: 4.443
x: 5.755, score: 14.100
x: 4.760, score: 7.620
x: 4.236, score: 5.000
x: -9.928, score: 142.280
x: 8.680, score: 44.621
x: -8.412, score: 108.417
x: 9.254, score: 52.617
x: -2.889, score: 23.900
x: -1.046, score: 9.281
x: 1.629, score: 0.137
x: -4.265, score: 39.244
x: 1.268, score: 0.536
x: 1.293, score: 0.500
x: -4.449, score: 41.588
x: -0.805, score: 7.865
x: 2.041, score: 0.002
x: 7.216, score: 27.204
x: 2.861, score: 0.742
x: 2.515, score: 0.265
x: 2.685, score: 0.469
x: -2.706, score: 22.146
x: 6.430, score: 19.622


[32m[I 2021-12-26 17:58:43,350][0m Trial 25 finished with value: 0.44126000639613266 and parameters: {'x': 2.664274044650348}. Best is trial 18 with value: 0.0016410879047678837.[0m
[32m[I 2021-12-26 17:58:43,363][0m Trial 26 finished with value: 3.9876749962851132 and parameters: {'x': 0.003083628119316284}. Best is trial 18 with value: 0.0016410879047678837.[0m
[32m[I 2021-12-26 17:58:43,373][0m Trial 27 finished with value: 17.520439514710855 and parameters: {'x': -2.185742409025053}. Best is trial 18 with value: 0.0016410879047678837.[0m
[32m[I 2021-12-26 17:58:43,382][0m Trial 28 finished with value: 77.22140562068174 and parameters: {'x': -6.787571087660216}. Best is trial 18 with value: 0.0016410879047678837.[0m
[32m[I 2021-12-26 17:58:43,391][0m Trial 29 finished with value: 2.1711450880243985 and parameters: {'x': 0.5265193967939725}. Best is trial 18 with value: 0.0016410879047678837.[0m
[32m[I 2021-12-26 17:58:43,400][0m Trial 30 finished with value: 0.00020

x: 2.664, score: 0.441
x: 0.003, score: 3.988
x: -2.186, score: 17.520
x: -6.788, score: 77.221
x: 0.527, score: 2.171
x: 1.986, score: 0.000
x: 2.376, score: 0.141
x: 1.586, score: 0.172
x: 3.757, score: 3.086
x: 5.476, score: 12.086
x: -0.992, score: 8.954
x: 3.699, score: 2.885
x: 4.971, score: 8.825
x: 1.904, score: 0.009
x: 7.715, score: 32.660
x: 0.252, score: 3.055


[32m[I 2021-12-26 17:58:43,587][0m Trial 41 finished with value: 0.13025270628875055 and parameters: {'x': 1.639094602023258}. Best is trial 30 with value: 0.0002020792862011061.[0m
[32m[I 2021-12-26 17:58:43,606][0m Trial 42 finished with value: 0.2889938755873387 and parameters: {'x': 1.4624184940054776}. Best is trial 30 with value: 0.0002020792862011061.[0m
[32m[I 2021-12-26 17:58:43,616][0m Trial 43 finished with value: 4.936128671787946 and parameters: {'x': 4.221740009944446}. Best is trial 30 with value: 0.0002020792862011061.[0m
[32m[I 2021-12-26 17:58:43,625][0m Trial 44 finished with value: 1.5677759619158738 and parameters: {'x': 3.252108606278175}. Best is trial 30 with value: 0.0002020792862011061.[0m
[32m[I 2021-12-26 17:58:43,634][0m Trial 45 finished with value: 6.310540707031527 and parameters: {'x': -0.5120789611458331}. Best is trial 30 with value: 0.0002020792862011061.[0m
[32m[I 2021-12-26 17:58:43,643][0m Trial 46 finished with value: 14.66760794

x: 1.639, score: 0.130
x: 1.462, score: 0.289
x: 4.222, score: 4.936
x: 3.252, score: 1.568
x: -0.512, score: 6.311
x: -1.830, score: 14.668
x: 1.967, score: 0.001
x: 0.836, score: 1.355
x: 4.631, score: 6.924
x: 6.030, score: 16.238
x: 1.904, score: 0.009
x: 2.045, score: 0.002
x: 3.458, score: 2.126
x: 2.005, score: 0.000
x: 0.793, score: 1.457
x: -0.207, score: 4.871
x: 2.175, score: 0.030
x: 3.129, score: 1.274
x: -1.738, score: 13.972
x: 5.086, score: 9.521


[32m[I 2021-12-26 17:58:43,790][0m Trial 61 finished with value: 0.0009158742161180719 and parameters: {'x': 2.0302634138212805}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:43,802][0m Trial 62 finished with value: 1.7563138976411465 and parameters: {'x': 0.6747400641228354}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:43,810][0m Trial 63 finished with value: 0.0750212120877428 and parameters: {'x': 2.2739000038111405}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:43,819][0m Trial 64 finished with value: 0.6545023197867101 and parameters: {'x': 1.190986823477695}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:43,827][0m Trial 65 finished with value: 4.837321617296304 and parameters: {'x': 4.1993911924203715}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:43,838][0m Trial 66 finished with value: 3.

x: 2.030, score: 0.001
x: 0.675, score: 1.756
x: 2.274, score: 0.075
x: 1.191, score: 0.655
x: 4.199, score: 4.837
x: 0.087, score: 3.660
x: 2.786, score: 0.618
x: 2.165, score: 0.027
x: 3.670, score: 2.790
x: 1.124, score: 0.768
x: 1.942, score: 0.003
x: 2.872, score: 0.761
x: 1.869, score: 0.017
x: 0.426, score: 2.478
x: -0.619, score: 6.858
x: 1.795, score: 0.042
x: 2.880, score: 0.775
x: -1.384, score: 11.452
x: 3.826, score: 3.334
x: 4.559, score: 6.548
x: 2.047, score: 0.002
x: 1.345, score: 0.429

[32m[I 2021-12-26 17:58:43,994][0m Trial 82 finished with value: 0.42857336833301235 and parameters: {'x': 1.345344847776318}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:44,005][0m Trial 83 finished with value: 0.14072522628566106 and parameters: {'x': 2.3751336112449284}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:44,015][0m Trial 84 finished with value: 1.7270155541657093 and parameters: {'x': 3.314159638006627}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:44,026][0m Trial 85 finished with value: 1.1705970940225314 and parameters: {'x': 0.918058645756374}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:44,035][0m Trial 86 finished with value: 0.005385362506184472 and parameters: {'x': 1.9266149708306628}. Best is trial 54 with value: 2.7600377630875524e-05.[0m
[32m[I 2021-12-26 17:58:44,042][0m Trial 87 finished with value: 4.


x: 2.375, score: 0.141
x: 3.314, score: 1.727
x: 0.918, score: 1.171
x: 1.927, score: 0.005
x: -0.157, score: 4.653
x: 2.482, score: 0.233
x: 1.448, score: 0.305
x: 9.992, score: 63.867
x: 2.029, score: 0.001
x: 3.128, score: 1.273
x: 2.064, score: 0.004
x: 0.564, score: 2.063
x: -8.085, score: 101.703
x: 1.202, score: 0.637
x: 2.633, score: 0.401
x: 4.107, score: 4.439
x: 0.245, score: 3.079


In [35]:
# Parameters of the best trial found by the study.
study.best_params

{'x': 2.005253606154907}

In [36]:
# Grid-search settings: imputation strategy x scaler x C for logistic
# regression (2 x 3 x 4 = 24 candidates).
param_grid = [
    dict(
        preprocesser__num_transform__imputer__strategy=["mean", "median"],
        preprocesser__num_transform__scaler=[StandardScaler(), MinMaxScaler(), RobustScaler()],
        classifier__C=[0.1, 1.0, 10.0, 100.0],
        classifier=[LogisticRegression()],
    )
]
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=3,
)


In [37]:
# Run the grid search on the training split (24 candidates x 10 folds).
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocesser',
                                        ColumnTransformer(transformers=[('num_transform',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['age',
                                                                          'sibsp',
                                                                          'parch',
                                                                          'fare'])])),
                                       ('classifier', LogisticRegression())]),
 

In [38]:
print(grid_search.best_params_) # best pipeline configuration
print(grid_search.best_score_)  # accuracy (cross-validated score of the best candidate)


{'classifier': LogisticRegression(), 'classifier__C': 1.0, 'preprocesser__num_transform__imputer__strategy': 'mean', 'preprocesser__num_transform__scaler': RobustScaler()}
0.671419413919414


In [39]:
# Verify accuracy and F1 on the test data using the search's best estimator.
y_test_pred = grid_search.predict(X_test)
print('accuracy:', accuracy_score(y_true=y_test, y_pred=y_test_pred))
print('f1:', f1_score(y_true=y_test, y_pred=y_test_pred))

accuracy: 0.6564885496183206
f1: 0.30769230769230765


In [40]:
# Grid-search settings, part 2: three model families, each combined with
# both imputation strategies and all three scalers (126 candidates in total).
param_grid = [
    # Logistic regression: regularisation strength C.
    dict(
        preprocesser__num_transform__imputer__strategy=["mean", "median"],
        preprocesser__num_transform__scaler=[StandardScaler(), MinMaxScaler(), RobustScaler()],
        classifier__C=[0.1, 1.0, 10.0, 100.0],
        classifier=[LogisticRegression()],
    ),
    # Tree ensembles: number of estimators.
    dict(
        preprocesser__num_transform__imputer__strategy=["mean", "median"],
        preprocesser__num_transform__scaler=[StandardScaler(), MinMaxScaler(), RobustScaler()],
        classifier__n_estimators=[10, 100, 1000],
        classifier=[RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()],
    ),
    # Support vector machines: regularisation strength C.
    dict(
        preprocesser__num_transform__imputer__strategy=["mean", "median"],
        preprocesser__num_transform__scaler=[StandardScaler(), MinMaxScaler(), RobustScaler()],
        classifier__C=[1, 10, 100, 1000],
        classifier=[SVC(), LinearSVC()],
    ),
]
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=3,
)


In [41]:
# Run the 126-candidate search (1260 fits with cv=10); this is the long-running cell.
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 126 candidates, totalling 1260 fits


KeyboardInterrupt: 

In [None]:
# Grid search results
# NOTE(review): the recorded output of the preceding fit cell is a
# KeyboardInterrupt; these attributes exist only after a fit that ran to
# completion, so confirm the search was re-run before trusting the values.
print(grid_search.best_params_) # best pipeline configuration
print(grid_search.best_score_)  # accuracy (cross-validated score of the best candidate)

{'classifier': SVC(C=100), 'classifier__C': 100, 'preprocesser__num_transform__imputer__strategy': 'mean', 'preprocesser__num_transform__scaler': StandardScaler()}
0.723919413919414


In [None]:
# Verify accuracy and F1 on the test data using the search's best estimator.
y_test_pred = grid_search.predict(X_test)
print('accuracy:', accuracy_score(y_true=y_test, y_pred=y_test_pred))
print('f1:', f1_score(y_true=y_test, y_pred=y_test_pred))


accuracy: 0.6984732824427481
f1: 0.5269461077844312


In [None]:
# Randomised search over the current param_grid: 10 sampled candidates, each
# scored with 10-fold CV. random_state fixes which candidates are drawn so a
# re-run evaluates the same configurations.
rand_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=10,
    verbose=3,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Run the randomised search on the training split
rand_search.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
# Randomised search results
print(rand_search.best_params_) # best pipeline configuration
print(rand_search.best_score_)  # accuracy (cross-validated score of the best candidate)

{'preprocesser__num_transform__scaler': StandardScaler(), 'preprocesser__num_transform__imputer__strategy': 'median', 'classifier__n_estimators': 100, 'classifier': AdaBoostClassifier(n_estimators=100)}
0.7172802197802198


In [None]:
# Verify accuracy and F1 on the test data using the search's best estimator.
y_test_pred = rand_search.predict(X_test)
print('accuracy:', accuracy_score(y_true=y_test, y_pred=y_test_pred))
print('f1:', f1_score(y_true=y_test, y_pred=y_test_pred))

accuracy: 0.6755725190839694
f1: 0.5251396648044693


In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset and pull out the features and the flattened targets.
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target.ravel()

In [None]:
from sklearn.model_selection import train_test_split 
# Randomly split into training and test data at a 6:4 ratio.
# random_state fixes the shuffle so re-runs give the same split; stratify=y
# keeps the class balance of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.4, random_state=0, stratify=y
)

In [None]:
%%time#grifsearchの場合
from sklearn.model_selection import GridSearchCV

# LightGBM
import lightgbm as lgb

# グリッドサーチを行うためのパラメーター
parameters = [{
    'learning_rate':[0.1,0.2],
    'n_estimators':[20,100,200],
    'max_depth':[3,5,7,9],
    'min_child_weight':[0.5,1,2],
    'min_child_samples':[5,10,20],
    'subsample':[0.8],
    'colsample_bytree':[0.8],
    'verbose':[-1],
    'num_leaves':[80]
}]

#グリッドサーチ実行
classifier = GridSearchCV(lgb.LGBMClassifier(), parameters, cv=3, n_jobs=-1)
classifier.fit(X_train, y_train)
print("Accuracy score (train): ", classifier.score(X_train, y_train))
print("Accuracy score (test): ", classifier.score(X_test, y_test))
print(classifier.best_estimator_) # ベストのパラメーター

Accuracy score (train):  1.0
Accuracy score (test):  0.9736842105263158
LGBMClassifier(colsample_bytree=0.8, max_depth=3, min_child_weight=2,
               n_estimators=200, num_leaves=80, subsample=0.8, verbose=-1)
CPU times: user 1.15 s, sys: 144 ms, total: 1.3 s
Wall time: 16.7 s


In [None]:
import numpy as np
import lightgbm as lgb
# 目的関数
def objective(trial):
    """Optuna objective: fit a LightGBM classifier and score it on the training data.

    Samples five hyper-parameters from the trial, fits ``lgb.LGBMClassifier``
    on the module-level ``X_train`` / ``y_train``, and returns the L1 norm of
    the difference between the labels and the predicted probability of
    class 1 (the sum of absolute errors). Lower is better, so the study
    should be created with ``direction='minimize'``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Sum of ``|y_train - P(class 1)|`` over the training rows.
    """
    # Each suggestion is assigned as a plain scalar. A trailing comma after
    # the call would wrap the value in a 1-tuple and hand a tuple to LightGBM.
    # suggest_float(..., log=True) is the current spelling of the deprecated
    # suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.2, log=True)
    n_estimators = trial.suggest_int('n_estimators', 20, 200)
    max_depth = trial.suggest_int('max_depth', 3, 9)
    min_child_weight = trial.suggest_float('min_child_weight', 0.5, 2, log=True)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 20)
    classifier = lgb.LGBMClassifier(learning_rate=learning_rate,
                                    n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_child_weight=min_child_weight,
                                    min_child_samples=min_child_samples,
                                    subsample=0.8, colsample_bytree=0.8,
                                    verbose=-1, num_leaves=80)
    classifier.fit(X_train, y_train)
    # NOTE(review): the score is computed on the same data the model was fit
    # on, so the search rewards fitting the training set; consider a
    # validation split or cross-validation instead.
    # Alternative objective (training accuracy, to be maximised):
    #return classifier.score(X_train, y_train)
    # L1 error between labels and predicted class-1 probabilities.
    return np.linalg.norm(y_train - classifier.predict_proba(X_train)[:, 1], ord=1)

In [None]:
#study = optuna.create_study(direction='maximize') # maximise
study = optuna.create_study(direction='minimize') # minimise

[32m[I 2021-12-25 21:44:24,562][0m A new study created in memory with name: no-name-149e0114-fe25-4332-97ad-db8790652fb2[0m


In [None]:
# Evaluate the objective for 100 trials.
study.optimize(objective, n_trials=100)

[32m[I 2021-12-25 21:45:04,737][0m Trial 1 finished with value: 3.363022286438531 and parameters: {'learning_rate': 0.16873617172493194, 'n_estimators': 52, 'max_depth': 5, 'min_child_weight': 1.0662962128087536, 'min_child_samples': 20}. Best is trial 1 with value: 3.363022286438531.[0m
[32m[I 2021-12-25 21:45:04,810][0m Trial 2 finished with value: 3.5124274362137418 and parameters: {'learning_rate': 0.16041622088571988, 'n_estimators': 80, 'max_depth': 6, 'min_child_weight': 1.4243835796874045, 'min_child_samples': 10}. Best is trial 1 with value: 3.363022286438531.[0m
[32m[I 2021-12-25 21:45:04,964][0m Trial 3 finished with value: 4.249353868424041 and parameters: {'learning_rate': 0.12394358346446024, 'n_estimators': 176, 'max_depth': 5, 'min_child_weight': 1.952840078125491, 'min_child_samples': 9}. Best is trial 1 with value: 3.363022286438531.[0m
[32m[I 2021-12-25 21:45:05,027][0m Trial 4 finished with value: 5.310181510582019 and parameters: {'learning_rate': 0.1644

In [None]:
# Keep the best trial's parameters under a module-level name.
best_params = study.best_params

In [None]:
# Objective value of the best trial.
study.best_value

1.0212243098271432

In [None]:
# Rebuild the classifier from the best trial's parameters plus the fixed settings.
classifier = lgb.LGBMClassifier(
    subsample=0.8,
    colsample_bytree=0.8,
    verbose=-1,
    num_leaves=80,
    **study.best_params,
)

In [None]:
# Display the configured classifier.
classifier

In [None]:
# Fit the classifier on the training split.
classifier.fit(X_train, y_train)

In [None]:
# Score on the training split.
classifier.score(X_train, y_train)


1.0

In [None]:
# Score on the held-out test split.
classifier.score(X_test, y_test)


0.9736842105263158