In [22]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
#from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
#from matplotlib import pyplot as plt

# Load the wine dataset and report its dimensions and class labels
wine = load_wine()
X, y = wine.data, wine.target
print(X.shape)
print(y.shape, wine.target_names)

# Hold out 20% of the samples as a test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base XGBoost classifier; it is only defined here -- the fitting is done by the grid search below
model = XGBClassifier()

# Hyperparameter candidates for the grid search
# (3 * 3 * 3 * 2 * 2 = 108 combinations, each evaluated with 3-fold CV)
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Run the grid search on the training split
# cv: number of cross-validation folds, scoring: evaluation metric
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination and its mean cross-validated accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Predict on the held-out test set (via the search object) and score it
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-feature importances of the best estimator, printed as a table (top 10, descending)
importances = pd.Series(
    grid_search.best_estimator_.feature_importances_,
    index=wine.feature_names,
)
print(importances.sort_values(ascending=False).head(10))


(178, 13)
(178,) ['class_0' 'class_1' 'class_2']
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Best Score: 0.9858156028368793
Accuracy: 0.9722222222222222
od280/od315_of_diluted_wines    0.353045
color_intensity                 0.193652
proline                         0.157545
flavanoids                      0.077928
alcohol                         0.061694
total_phenols                   0.049474
hue                             0.035294
magnesium                       0.017033
alcalinity_of_ash               0.016824
malic_acid                      0.015773
dtype: float32


In [33]:
# Track the estimation results with MLflow
import mlflow
import mlflow.xgboost
import numpy as np
from sklearn.datasets import load_wine  # imported here too, so this cell also runs on its own
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Connect to the MLflow tracking server
mlflow.set_tracking_uri("http://127.0.0.1:5000")  # for solo development, point at localhost

# Select the experiment that the run is recorded under (fill in your experiment name)
# NOTE(review): "templete" looks like a typo of "template"; the string is left unchanged
# so that new runs keep landing in the same experiment as earlier ones -- confirm before renaming.
mlflow.set_experiment("templete_xgboost")

# Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target

# Split the dataset into a training set and a test set (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model (library defaults for every hyperparameter)
model = XGBClassifier()

# Record the results with MLflow; everything logged inside the block belongs to one run
with mlflow.start_run():
    # Train and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute the accuracy on the test set
    accuracy = accuracy_score(y_test, y_pred)
    print(f"accuracy:{accuracy}")
    params = model.get_params()

    # Log parameters and metrics
    mlflow.log_params(params)  # model parameters
    mlflow.log_metric("accuracy", accuracy)  # test accuracy

    # Save the fitted model as a run artifact
    mlflow.xgboost.log_model(model, "xgboost_model")


accuracy:0.9722222222222222
