<a href="https://colab.research.google.com/github/peisuke/ml-works/blob/main/4/4_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- 本コードは以下のサイトを参考にしました
- https://qiita.com/tomov3/items/039d4271ed30490edf7b

In [1]:
# 必要なライブラリの import
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the iris dataset
iris = load_iris()

# Hold out part of the data as a test set; random_state is fixed so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=0,
)

# Build the classifier, then train it on the training set
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Evaluate on the held-out test set
score = logreg.score(X_test, y_test)
print('Test set score: {}'.format(score))

Test set score: 0.9736842105263158


In [3]:
import numpy as np
from sklearn.model_selection import cross_val_score

# Unfitted estimator handed to cross_val_score
logreg = LogisticRegression(max_iter=1000)

# Cross-validation. The number of folds is pinned to 5 rather than left to
# the library default (which changed from 3 to 5 in scikit-learn 0.22), so
# the five per-fold scores are the same regardless of the installed version
# and the setting matches the cv=5 used by the later grid-search cells.
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
# Score of each split
print('Cross-Validation scores: {}'.format(scores))
# Mean of the per-split scores
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [0.96666667 1.         0.93333333 0.96666667 1.        ]
Average score: 0.9733333333333334


In [4]:
from sklearn.model_selection import KFold, StratifiedKFold

# Plain k-fold: three folds, with no shuffling requested.
# The recorded output for this splitter is 0 on every fold, which is the
# motivation for the stratified variant below.
kfold = KFold(n_splits=3)
kfold_scores = cross_val_score(logreg, iris.data, iris.target, cv=kfold)
print('Cross-validation scores: \n{}'.format(kfold_scores))

# Stratified k-fold cross-validation with the same number of folds
stratifiedkfold = StratifiedKFold(n_splits=3)
stratified_scores = cross_val_score(logreg, iris.data, iris.target, cv=stratifiedkfold)
print('Cross-validation scores: \n{}'.format(stratified_scores))

Cross-validation scores: 
[0. 0. 0.]
Cross-validation scores: 
[0.98 0.96 0.98]


In [5]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=0,
)

# Candidate values; the same list is used for both gamma and C
param_list = [0.001, 0.01, 0.1, 1, 10, 100]

# Every (gamma, C) combination, with gamma varying slowest
candidates = [{'gamma': g, 'C': c} for g in param_list for c in param_list]

best_score = 0
best_parameters = {}

# NOTE: the test set is used here both to pick the parameters and to report
# the score, so the printed best score is not measured on independent data.
for params in candidates:
    svm = SVC(**params)
    svm.fit(X_train, y_train)
    score = svm.score(X_test, y_test)
    # Remember the parameters only when they strictly beat the best so far
    if score > best_score:
        best_score, best_parameters = score, params

print('Best score: {}'.format(best_score))
print('Best parameters: {}'.format(best_parameters))

Best score: 0.9736842105263158
Best parameters: {'gamma': 0.001, 'C': 100}


In [6]:
# First split: (training + validation) versus test
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=0,
)
# Second split: carve a validation set out of the training + validation part
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval,
    y_trainval,
    random_state=1,
)

# Report the shape of each of the three subsets
shapes = (X_train.shape, X_valid.shape, X_test.shape)
print('Size of trainings set: {}, validation set: {}, test set: {}'.format(*shapes))

Size of trainings set: (84, 4), validation set: (28, 4), test set: (38, 4)


In [7]:
best_score = 0
best_parameters = {}

# Try every (gamma, C) pair, gamma varying slowest
for gamma, C in [(g, c) for g in param_list for c in param_list]:
    svm = SVC(gamma=gamma, C=C)
    svm.fit(X_train, y_train)
    # The score used for selection is computed on the validation set
    score = svm.score(X_valid, y_valid)
    if score > best_score:
        best_score, best_parameters = score, {'gamma': gamma, 'C': C}

# Retrain with the selected parameters on training set + validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
# The test set is touched only now, after best_parameters has been chosen
test_score = svm.score(X_test, y_test)

print('Best score on validation set: {}'.format(best_score))
print('Best parameters: {}'.format(best_parameters))
print('Test set score with best parameters: {}'.format(test_score))

Best score on validation set: 0.9642857142857143
Best parameters: {'gamma': 0.001, 'C': 10}
Test set score with best parameters: 0.9210526315789473


In [8]:
best_score = 0
best_parameters = {}

# Try every (gamma, C) pair, gamma varying slowest
for gamma, C in [(g, c) for g in param_list for c in param_list]:
    svm = SVC(gamma=gamma, C=C)
    # 5-fold cross-validation on training set + validation set
    scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
    # The mean of the per-fold scores is the selection criterion
    score = np.mean(scores)
    if score > best_score:
        best_score, best_parameters = score, {'gamma': gamma, 'C': C}

# Retrain with the selected parameters on training set + validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
# The test set is touched only now, after best_parameters has been chosen
test_score = svm.score(X_test, y_test)

print('Best score on validation set: {}'.format(best_score))
print('Best parameters: {}'.format(best_parameters))
print('Test set score with best parameters: {}'.format(test_score))

Best score on validation set: 0.9731225296442687
Best parameters: {'gamma': 0.1, 'C': 10}
Test set score with best parameters: 0.9736842105263158


In [9]:
from sklearn.model_selection import GridSearchCV

# The parameter grid is given as a dict: parameter name -> candidate values
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# GridSearchCV builds the validation sets on its own, so only the split
# into training set and test set has to be done here
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=0,
)

grid_search = GridSearchCV(SVC(), param_grid, cv=5)

# Calling fit runs both the cross-validation and the grid search.
# It is kept as the last expression so the cell displays its return value.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [10]:
# Summarise the fitted grid search: score on the test set, the selected
# parameters, and the best cross-validation score
report = [
    ('Test set score: {}', grid_search.score(X_test, y_test)),
    ('Best parameters: {}', grid_search.best_params_),
    ('Best cross-validation: {}', grid_search.best_score_),
]
for template, value in report:
    print(template.format(value))

Test set score: 0.9736842105263158
Best parameters: {'C': 10, 'gamma': 0.1}
Best cross-validation: 0.9731225296442687
