In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import time
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Directory that holds adult.data / adult.test. Both file paths are derived from
# this single constant, so relocating the data only requires editing one line.
DATA_DIR = 'D:/TEST2'
train_data_path = f'{DATA_DIR}/adult.data'
test_data_path = f'{DATA_DIR}/adult.test'

# Column names, in file order. The files are read with header=None, so these
# names are supplied to the reader explicitly.
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

In [3]:
# Load the train and test splits. Neither file is read with a header row, so
# the column names come from `columns`.
#
# comment='|' tells pandas to ignore any line that begins with '|'. The stock
# UCI distribution of adult.test reportedly opens with a "|1x3 Cross validator"
# banner line; without this option that line is parsed as a data row (a
# non-numeric "age" and NaN everywhere else, including the target). For files
# that contain no '|' character the option changes nothing.
train_data = pd.read_csv(train_data_path, header=None, names=columns, comment='|')
test_data = pd.read_csv(test_data_path, header=None, names=columns, comment='|')

In [4]:
# Split the columns by dtype: int64/float64 columns are treated as numeric,
# object columns as categorical. The regression target ('hours-per-week') and
# the 'income' column are taken out because neither is fed to the model.
numeric_features = list(train_data.select_dtypes(include=['int64', 'float64']).columns)
del numeric_features[numeric_features.index('hours-per-week')]

categorical_features = list(train_data.select_dtypes(include=['object']).columns)
del categorical_features[categorical_features.index('income')]

In [5]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets transform() accept category
# values that were not present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [6]:
# Chain the preprocessing step and an XGBoost regressor (default
# hyper-parameters) so that fit/predict apply both stages to the raw frame.
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
])

In [7]:
# Predictors: every column except the target and 'income'.
X_train = train_data.drop(columns=['hours-per-week', 'income'])
X_test = test_data.drop(columns=['hours-per-week', 'income'])

# Target: hours worked per week.
y_train = train_data['hours-per-week']
y_test = test_data['hours-per-week']

In [8]:
# Train the model and record how long fitting takes (in seconds).
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the fit is running.
start_time = time.perf_counter()
xgb_model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

In [9]:
# The pipeline has already been fitted by the timed training step, so fitting
# it again here only repeated the work (and the reported training time would
# no longer describe the fit that produced the model). Display the fitted
# pipeline instead of retraining it.
xgb_model

In [10]:
# Predict on the training split first, then on the test split.
y_pred_train, y_pred_test = (
    xgb_model.predict(features) for features in (X_train, X_test)
)

In [11]:
# Evaluation metrics on the test split. MAPE is left as returned by
# mean_absolute_percentage_error here (it is not multiplied by 100).
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_test)
rmse_xgb = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
mape_xgb = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_test)

print("XGBoost - MAPE:", mape_xgb)
print("XGBoost - RMSE:", rmse_xgb)
print("XGBoost - R2 Score:", r2_xgb)

XGBoost - MAPE: 0.29936242248036193
XGBoost - RMSE: 10.7720424478624
XGBoost - R2 Score: 0.25485619853839825


In [13]:
# Helper that computes MAPE, RMSE and R-squared for a set of predictions.
def calculate_metrics(y_true, y_pred):
    """Return ``(mape, rmse, r2)`` for the given targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        tuple: ``(mape, rmse, r2)`` where ``mape`` is the result of
        ``mean_absolute_percentage_error`` multiplied by 100, ``rmse`` is the
        square root of ``mean_squared_error`` and ``r2`` is the result of
        ``r2_score``.
    """
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    # Take the square root explicitly rather than passing squared=False: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
    # raises TypeError. For single-output targets the value is the same.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mape, rmse, r2

In [14]:
# Metrics on the training split.
mape_train, rmse_train, r2_train = calculate_metrics(y_true=y_train, y_pred=y_pred_train)

In [15]:
# Metrics on the held-out test split.
mape_test, rmse_test, r2_test = calculate_metrics(y_true=y_test, y_pred=y_pred_test)

In [16]:
# Report the training time followed by the train/test metrics, one per line.
print(
    f"訓練時間：{training_time:.2f} 秒",
    f"訓練集績效指標 - MAPE: {mape_train:.2f}%, RMSE: {rmse_train:.2f}, R-squared: {r2_train:.2f}",
    f"測試集績效指標 - MAPE: {mape_test:.2f}%, RMSE: {rmse_test:.2f}, R-squared: {r2_test:.2f}",
    sep="\n",
)

訓練時間：0.36 秒
訓練集績效指標 - MAPE: 24.78%, RMSE: 9.19, R-squared: 0.45
測試集績效指標 - MAPE: 29.94%, RMSE: 10.77, R-squared: 0.25
