<a href="https://colab.research.google.com/github/masayasato0407/SKlearn_classification/blob/main/Scilit_learn_RF_Bayesian_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Google Colabでのパッケージインストール
!pip install optuna scikit-learn

In [None]:
import optuna
import pandas as pd
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)

In [None]:
# Upload the CSV file to be analysed directly into the Colab session.
# `files` comes from google.colab, so this cell only works inside Colab.
from google.colab import files
# Keep the value returned by the upload call for later inspection.
uploaded = files.upload()

In [None]:
# Load the data.
# The file name is hard-coded: the CSV must be available as 'sampledata.csv'
# in the current working directory.
data = pd.read_csv('sampledata.csv')
# Trailing expression so the notebook renders a preview of the first rows.
data.head()

In [None]:
# Define the features and the target variable.
# X keeps the five predictor columns in their listed order.
X = data.loc[:, ['col1', 'col2', 'col3', 'col4', 'col5']]
# y is the outcome column the classifier learns to predict.
y = data.loc[:, 'event']

In [None]:
# Objective function for the hyperparameter optimization
def objective(trial):
    """Score one sampled random-forest configuration by cross-validation.

    Samples a set of RandomForestClassifier hyperparameters from ``trial``,
    evaluates the resulting model on the module-level ``X`` / ``y`` with
    5-fold stratified cross-validation, and returns the mean fold accuracy.

    Args:
        trial: Optuna trial object used to suggest hyperparameter values.

    Returns:
        Mean accuracy across the five cross-validation folds.
    """
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    # suggest_float replaces the deprecated suggest_uniform; it samples the
    # same uniform range. A float max_features is interpreted by scikit-learn
    # as a fraction of the available features.
    max_features = trial.suggest_float('max_features', 0.1, 1.0)

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Fixed seeds for both the forest and the fold split make every trial's
    # score reproducible and comparable across trials.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(rf, X, y, cv=cv, scoring='accuracy')
    return accuracy_scores.mean()

In [None]:
# Hyperparameter search via Bayesian optimization (TPE sampler).
# Seeding the sampler keeps the sequence of suggested trials reproducible.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction='maximize')
# Evaluate 32 configurations with the objective function.
study.optimize(func=objective, n_trials=32)

In [None]:
# Retrieve the hyperparameters of the best trial found by the study.
best_params = study.best_params

In [None]:
# Train the model with the best hyperparameters.
# The forest is seeded so the fit is reproducible, and it is fitted on the
# complete dataset (all rows of X and y).
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X=X, y=y)

In [None]:
# Accuracy and AUC of the tuned model under 5-fold stratified cross-validation.
# cross_validate evaluates both metrics from a single fit per fold, instead of
# refitting the forest on every fold once per metric as two separate
# cross_val_score calls would; the per-fold scores are unchanged.
# NOTE: the split uses the same settings and seed as the tuning objective, so
# these scores were effectively optimized for and are optimistic rather than
# an independent estimate of generalization performance.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(best_rf, X, y, cv=cv, scoring=['accuracy', 'roc_auc'])
accuracy_scores = cv_results['test_accuracy']
auc_scores = cv_results['test_roc_auc']

In [None]:
# Report the best hyperparameters together with the accuracy and AUC.
# Each metric is summarized as the mean of its per-fold scores.
best_accuracy, best_auc = accuracy_scores.mean(), auc_scores.mean()

print("Best parameters found: ", best_params)
# Both metrics are printed with four decimal places.
print("Best cross-validation accuracy: {:.4f}".format(best_accuracy))
print("Best cross-validation AUC: {:.4f}".format(best_auc))