In [1]:
import numpy as np
import xgboost as xgb

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error

In [2]:
# Load the iris dataset bundle and pull out the feature matrix and the labels.
iris = load_iris()
X, y = iris["data"], iris["target"]

In [3]:
# Echo the feature matrix as the cell output.
# NOTE(review): this prints the entire array; a slice such as X[:5] (plus
# X.shape) would keep the rendered output short.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
# List the distinct class labels present in the target vector.
np.unique(y)

array([0, 1, 2])

In [5]:
# Show the class names and the feature (column) names carried by the dataset bundle.
iris.target_names, iris.feature_names

(array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'])

In [6]:
# Hold out 20% of the samples as a final test split; the seed is fixed so the
# split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [7]:
# 5-fold splitter with shuffling and a fixed seed, shared by the
# cross-validation loops below.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

In [8]:
# Cross-validate XGBoost on the training split: fit one default-configured
# classifier per fold and score it on that fold's held-out rows.
# NOTE(review): MSE is computed on integer class labels, so it treats the
# classes as ordinal; accuracy would be the more conventional metric here.
xgb_model_list = []
xgb_mse_list = []
for fit_idx, holdout_idx in kf.split(train_x):
    # Fit on this fold's training rows and keep the fitted model.
    fold_model = xgb.XGBClassifier()
    fold_model.fit(train_x[fit_idx], train_y[fit_idx])
    xgb_model_list.append(fold_model)

    # Predict the held-out rows and record the loss against the true labels.
    holdout_pred = fold_model.predict(train_x[holdout_idx])
    xgb_mse_list.append(mean_squared_error(train_y[holdout_idx], holdout_pred))

print(f"xgb_mse_list: {xgb_mse_list}")
print(f"xgb mse 均值为: {np.mean(xgb_mse_list)}")

xgb_mse_list: [0.0, 0.08333333333333333, 0.041666666666666664, 0.0, 0.08333333333333333]
xgb mse 均值为: 0.041666666666666664


In [9]:
# Cross-validate a random forest on the training split: fit one forest per
# fold and score it on that fold's held-out rows.
# NOTE(review): MSE is computed on integer class labels, so it treats the
# classes as ordinal; accuracy would be the more conventional metric here.
rf_model_list = []
rf_mse_list = []
for fit_idx, holdout_idx in kf.split(train_x):
    # Same hyper-parameters and seed for every fold.
    forest = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=10)
    forest.fit(train_x[fit_idx], train_y[fit_idx])
    rf_model_list.append(forest)

    # Predict the held-out rows and record the loss against the true labels.
    holdout_pred = forest.predict(train_x[holdout_idx])
    rf_mse_list.append(mean_squared_error(train_y[holdout_idx], holdout_pred))

print(f"rf_mse_list: {rf_mse_list}")
print(f"rf mse 均值为: {np.mean(rf_mse_list)}")

rf_mse_list: [0.08333333333333333, 0.041666666666666664, 0.08333333333333333, 0.041666666666666664, 0.08333333333333333]
rf mse 均值为: 0.06666666666666667


In [10]:
# Model evaluation and selection: choose the model family with the lower mean
# cross-validation MSE (ties go to the random forest), then take that family's
# fold model with the smallest fold MSE.
# NOTE(review): the chosen model was trained on only part of the training
# split and is picked by its score on one fold; refitting the winning family
# on all of train_x may be the sounder choice.
rf_wins = np.mean(rf_mse_list) <= np.mean(xgb_mse_list)
if rf_wins:
    fold_scores, fold_models = rf_mse_list, rf_model_list
    message = "best estimator is random forest {}, mse is {}"
else:
    fold_scores, fold_models = xgb_mse_list, xgb_model_list
    message = "best estimator is xgb {}, mse is {}"

min_mse = min(fold_scores)
# list.index returns the first fold that attains the minimum.
ind = fold_scores.index(min_mse)
best_estimator = fold_models[ind]
print(message.format(ind, min_mse))

best estimator is xgb 0, mse is 0.0


In [11]:
# Predict the held-out test split with the selected model to estimate how well
# it discriminates on data that took no part in the cross-validation.
pred = best_estimator.predict(test_x)
# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the value
# is unchanged, but keeping the conventional order matches the other metric
# calls and stays correct if the metric is later swapped for an asymmetric one.
mse = mean_squared_error(test_y, pred)
print("test data mse is: {}".format(mse))
# Rows are the true classes, columns are the predicted classes.
print(confusion_matrix(test_y, pred))

test data mse is: 0.03333333333333333
[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]
