In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from xgboost import XGBRegressor

In [2]:
# Load the Boston Housing dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
boston_data_path = 'D:/TEST2/BostonHousing.csv'
boston_data = pd.read_csv(filepath_or_buffer=boston_data_path)

In [3]:
# Separate the target column ('medv') from the remaining feature columns.
y = boston_data['medv']
X = boston_data.drop(columns='medv')

In [4]:
# XGBoost regressor with a squared-error objective and a fixed seed;
# all other hyper-parameters are left at the library defaults.
xgb_model = XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
)
# Second XGBoost regressor with an explicit tree count and tree depth.
xgb_reg = XGBRegressor(
    max_depth=3,
    n_estimators=100,
)

In [5]:
# Helper that cross-validates a model and reports its mean MAPE, RMSE and R2.
def compute_metrics(model, X, y, kf):
    """Cross-validate ``model`` and return its mean MAPE, RMSE and R2.

    For every ``(train_index, test_index)`` pair yielded by ``kf.split(X)`` the
    model is refit on the training rows and scored on the held-out rows; the
    per-fold scores are then averaged.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place on each fold, so after the call it holds the fit of the last fold.
    X
        Feature table; rows are selected positionally through ``.iloc``.
    y
        Target values; rows are selected positionally through ``.iloc``.
    kf
        Splitter whose ``split(X)`` yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    tuple
        ``(mean MAPE, mean RMSE, mean R2)`` across the folds, where MAPE is the
        value returned by ``mean_absolute_percentage_error``.
    """
    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mape_scores.append(mean_absolute_percentage_error(y_test, y_pred))
        # RMSE is derived from the per-output MSE rather than via the
        # `squared=False` flag, which recent scikit-learn releases no longer
        # accept. Averaging the per-output roots gives the same value the flag
        # used to produce.
        per_output_mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
        rmse_scores.append(np.mean(np.sqrt(per_output_mse)))
        r2_scores.append(r2_score(y_test, y_pred))

    return np.mean(mape_scores), np.mean(rmse_scores), np.mean(r2_scores)

In [6]:
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# same folds are produced every time it is used.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)


In [7]:
# Compute the per-fold MAE, MSE and R2 in a single cross-validation pass.
# cross_validate fits the model once per fold and evaluates all three scorers
# on that fit, instead of repeating the whole 5-fold training for each metric.
cv_results = cross_validate(
    xgb_reg,
    X,
    y,
    cv=kf,
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
)
# The MAE and MSE scorers are sign-flipped ("neg_"), so these two arrays hold
# negative values; they are negated again where they are reported.
mae_scores = cv_results['test_neg_mean_absolute_error']
mse_scores = cv_results['test_neg_mean_squared_error']
r2_scores = cv_results['test_r2']

In [8]:
# Average each metric over the folds; MAE and MSE are stored negated, so flip
# their sign before averaging.
avg_mae = (-mae_scores).mean()
avg_mse = (-mse_scores).mean()
avg_r2 = r2_scores.mean()

In [9]:
# Show the metrics of every fold; MAE and MSE are negated back to positive.
per_fold_report = (
    ("每個fold的MAE (Mean Absolute Error):", -mae_scores),
    ("每個fold的MSE (Mean Squared Error):", -mse_scores),
    ("每個fold的R2 分數:", r2_scores),
)
for label, fold_values in per_fold_report:
    print(label, fold_values)

每個fold的MAE (Mean Absolute Error): [1.87893812 1.88621263 2.45682633 2.26122548 2.03357905]
每個fold的MSE (Mean Squared Error): [ 6.28266553  7.16119511 13.58884962  9.00873935  8.02194322]
每個fold的R2 分數: [0.91432781 0.90880473 0.85063677 0.91374841 0.8884391 ]


In [10]:
# Show the fold-averaged metrics.
average_report = (
    ("\n平均MAE (Mean Absolute Error):", avg_mae),
    ("平均MSE (Mean Squared Error):", avg_mse),
    ("平均R2 分數:", avg_r2),
)
for label, average_value in average_report:
    print(label, average_value)


平均MAE (Mean Absolute Error): 2.1033563209445565
平均MSE (Mean Squared Error): 8.8126785650643
平均R2 分數: 0.8951913632596351


In [11]:
# Cross-validate the model using every feature column.
mape_all, rmse_all, r2_all = compute_metrics(model=xgb_model, X=X, y=y, kf=kf)

In [12]:
# Fit the model on the full dataset and read its per-feature importance scores
# (fit returns the fitted estimator, so the attribute can be read directly).
feature_importances = xgb_model.fit(X, y).feature_importances_

In [13]:
# Importance scores used for the second (top-2) feature selection.
# The model and the data are the same as for `feature_importances`, so the
# scores already computed are reused instead of training the model again.
feature_importances1 = feature_importances.copy()

In [14]:
# Pick the columns with the highest importance scores.
top_n = 10
# argsort is ascending, so flip it to get the most important features first.
indices = np.flip(np.argsort(feature_importances))[:top_n]
top_features = X.columns[indices]

In [15]:
# Cross-validate the model on the selected top-importance columns only.
# NOTE(review): the importances were taken from a fit on all rows, including
# rows that serve as test folds here, so these scores may be optimistic.
X_top = X.loc[:, top_features]
mape_top, rmse_top, r2_top = compute_metrics(model=xgb_model, X=X_top, y=y, kf=kf)

In [16]:
# Pick the two columns with the highest importance scores.
top_n1 = 2
# argsort is ascending, so flip it to get the most important features first.
indices = np.flip(np.argsort(feature_importances1))[:top_n1]
top_features1 = X.columns[indices]

In [17]:
# Cross-validate the model on the two selected columns only.
X_top1 = X.loc[:, top_features1]
mape_top1, rmse_top1, r2_top1 = compute_metrics(model=xgb_model, X=X_top1, y=y, kf=kf)

In [18]:
# Final report: each heading is printed on its own line, followed by its value.
# Metrics for all features, the 10 selected features and the 2 selected features.
print("使用所有特徵的性能指標:", f"MAPE: {mape_all}, RMSE: {rmse_all}, R2: {r2_all}", sep="\n")

print("\n使用篩選10個特徵的性能指標:", f"MAPE: {mape_top}, RMSE: {rmse_top}, R2: {r2_top}", sep="\n")

print("\n使用篩選2個特徵的性能指標:", f"MAPE: {mape_top1}, RMSE: {rmse_top1}, R2: {r2_top1}", sep="\n")

# Names of the selected columns for both selections.
print("\n篩選10個的特徵:", top_features, sep="\n")

print("\n篩選2個的特徵:", top_features1, sep="\n")

使用所有特徵的性能指標:
MAPE: 0.1080289212450403, RMSE: 3.084887602754878, R2: 0.8856603620598325

使用篩選10個特徵的性能指標:
MAPE: 0.10900486381243486, RMSE: 3.0378841419688465, R2: 0.8887885863167053

使用篩選2個特徵的性能指標:
MAPE: 0.19013125600436792, RMSE: 5.182871620785813, R2: 0.6711578193356287

篩選10個的特徵:
Index(['lstat', 'rm', 'dis', 'tax', 'nox', 'ptratio', 'crim', 'chas', 'age',
       'rad'],
      dtype='object')

篩選2個的特徵:
Index(['lstat', 'rm'], dtype='object')
