Method 1: Scikit-learn style API

In [15]:
import torch
import pandas as pd
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = load_iris()

pd = pd.DataFrame(data.data, columns=data.feature_names)
pd.to_csv('iris.csv')
"""
data.data是一个二维数组，其形状为(150, 4)。
这个数组的每一行都对应着一个样本，也就是一朵鸢尾花。
每一列代表一种特征，依次是萼片长度、萼片宽度、花瓣长度和花瓣宽度。
你可以通过执行print(X.shape)来查看其形状，结果会显示(150, 4)。

data.target是一个一维数组，其形状为(150,)。
该数组的每个元素代表对应样本的类别标签。
标签用整数进行编码：
0代表 Setosa（山鸢尾）；
1代表 Versicolour（变色鸢尾）；
2代表 Virginica（维吉尼亚鸢尾）。
若要查看标签的具体编码，可执行print(data.target_names)。
"""
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,  # Default value is 0.25.
    random_state=42,
    shuffle=True,   # Whether to disrupt.
    stratify=y  # Make sure that the proportion of the different samples in trained dataset and test dataset equal to the original dataset.(In practical implementation, the sample size will be rounded up (for example, 67.5 -> 68), but the ratio remains basically the same). Default value is None.
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# initial model
model = XGBClassifier(
    objective='mult:softmax',
    num_class=3,
    learning_rate=0.01, # learning steps
    max_depth=3,
    n_estimators=100,    # iteration times(number of base models)
    tree_method='hist',
    device='cuda'
)

# train model
model.fit(X_train, y_train)

# prediction and evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of test dataset: {accuracy:.4f}")

Accuracy of test dataset: 0.9667


Method 2: Native API

In [7]:
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = load_iris()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None
)

# transform to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# set hyperparameters
params = {
    'objective': 'multi:softmax',
    'learning_rate': 0.01,
    'num_class': 3,
    'max_depth': 3,
    # 'n_estimators': 100,    # n_estimators is a parameter of scikit-learing API, it cannot be used in native XGBoost API.
    'tree_method': 'hist',
    'device': 'cuda'
}

# initial and train model
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dtest, 'eval')] # evaluation dataset and dataset name
)

y_pred = model.predict(dtest)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of test dataset: {accuracy:.4f}")

[0]	eval-mlogloss:1.08469
[1]	eval-mlogloss:1.07100
[2]	eval-mlogloss:1.05755
[3]	eval-mlogloss:1.04432
[4]	eval-mlogloss:1.03132
[5]	eval-mlogloss:1.01853
[6]	eval-mlogloss:1.00596
[7]	eval-mlogloss:0.99359
[8]	eval-mlogloss:0.98143
[9]	eval-mlogloss:0.96946
[10]	eval-mlogloss:0.95768
[11]	eval-mlogloss:0.94609
[12]	eval-mlogloss:0.93469
[13]	eval-mlogloss:0.92346
[14]	eval-mlogloss:0.91241
[15]	eval-mlogloss:0.90153
[16]	eval-mlogloss:0.89081
[17]	eval-mlogloss:0.88027
[18]	eval-mlogloss:0.86988
[19]	eval-mlogloss:0.85965
[20]	eval-mlogloss:0.84957
[21]	eval-mlogloss:0.83964
[22]	eval-mlogloss:0.82986
[23]	eval-mlogloss:0.82023
[24]	eval-mlogloss:0.81074
[25]	eval-mlogloss:0.80138
[26]	eval-mlogloss:0.79216
[27]	eval-mlogloss:0.78308
[28]	eval-mlogloss:0.77412
[29]	eval-mlogloss:0.76530
[30]	eval-mlogloss:0.75660
[31]	eval-mlogloss:0.74802
[32]	eval-mlogloss:0.73956
[33]	eval-mlogloss:0.73122
[34]	eval-mlogloss:0.72313
[35]	eval-mlogloss:0.71520
[36]	eval-mlogloss:0.70733
[37]	eval-m