In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR

In [2]:
# Data location. The directory is named once so that pointing the notebook at
# another copy of the files is a single edit; the resulting path strings are
# the same as before.
DATA_DIR = 'D:/TEST2'
train_data_path = f'{DATA_DIR}/adult.data'
test_data_path = f'{DATA_DIR}/adult.test'

# Column names applied to the files when they are read (the files are read
# with header=None, so names must be supplied here).
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

In [3]:
# Load both splits with identical parsing options.
#  - header=None / names=columns: the files carry no header row.
#  - skipinitialspace=True: the UCI Adult files separate fields with ", ", so
#    without this every string value keeps a leading space (" Private").
#    TODO confirm against the local copies of the files.
#  - comment='|': as distributed, adult.test opens with a banner line that
#    starts with '|'; treating '|' as a comment marker skips that line and is
#    a no-op when the line has already been removed by hand.
read_options = dict(header=None, names=columns, skipinitialspace=True, comment='|')
train_data = pd.read_csv(train_data_path, **read_options)
test_data = pd.read_csv(test_data_path, **read_options)

In [4]:
# Split the training columns into numeric and categorical feature lists by dtype.
numeric_features = list(train_data.select_dtypes(include=['int64', 'float64']).columns)
numeric_features.remove('hours-per-week')  # regression target, not an input

categorical_features = list(train_data.select_dtypes(include=['object']).columns)
categorical_features.remove('income')  # not used as a feature

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes the encoder tolerate
# categories at transform time that were not present during fit.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [5]:
# SVR pipeline: preprocessing followed by the regressor (default hyper-parameters).
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
])

# Separate features from the target. 'income' is dropped along with the
# target so it is never fed to the model.
target_column = 'hours-per-week'
non_feature_columns = [target_column, 'income']

X_train, y_train = train_data.drop(columns=non_feature_columns), train_data[target_column]
X_test, y_test = test_data.drop(columns=non_feature_columns), test_data[target_column]

In [6]:
# Fit the pipeline exactly once and record how long it takes. A repeat fit on
# the same data would only redo the whole training run.
# perf_counter() is used because it is a monotonic clock meant for measuring
# elapsed intervals, unlike time.time(), which follows the wall clock.
start_time = time.perf_counter()
model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

In [7]:
# Predict on both splits; the training-set predictions are scored further down
# next to the test-set ones.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Test-set metrics. MAPE is kept as a fraction here (not multiplied by 100).
mape = mean_absolute_percentage_error(y_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2 = r2_score(y_test, y_pred_test)

In [8]:
# NOTE(review): the `_xgb` suffix looks like a leftover from another notebook;
# these values are the SVR test-set scores. The names are kept unchanged.
mape_xgb = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_test)
rmse_xgb = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_test)

# One line per metric, label first.
for label, score in (
    ("SVR - MAPE:", mape_xgb),
    ("SVR - RMSE:", rmse_xgb),
    ("SVR - R2 Score:", r2_xgb),
):
    print(label, score)

SVR - MAPE: 0.3116496491342985
SVR - RMSE: 11.111214899921158
SVR - R2 Score: 0.20719373130182683


In [9]:
def calculate_metrics(y_true, y_pred):
    """Return ``(mape, rmse, r2)`` for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        A tuple ``(mape, rmse, r2)``:
        ``mape`` is the mean absolute percentage error expressed in percent
        (scikit-learn's fractional value multiplied by 100), ``rmse`` is the
        root mean squared error, and ``r2`` is the R-squared score.
    """
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    # For a single target column the two forms give the same value.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mape, rmse, r2

In [10]:
# Score the training-set predictions (MAPE comes back in percent).
mape_train, rmse_train, r2_train = calculate_metrics(y_true=y_train, y_pred=y_pred_train)

In [11]:
# Score the test-set predictions (MAPE comes back in percent).
mape_test, rmse_test, r2_test = calculate_metrics(y_true=y_test, y_pred=y_pred_test)

In [12]:
# Report the training time, then one summary line per split.
print(f"訓練時間：{training_time:.2f} 秒")
for split_name, (mape_pct, rmse_val, r2_val) in (
    ("訓練集", (mape_train, rmse_train, r2_train)),
    ("測試集", (mape_test, rmse_test, r2_test)),
):
    print(f"{split_name}績效指標 - MAPE: {mape_pct:.2f}%, RMSE: {rmse_val:.2f}, R-squared: {r2_val:.2f}")

訓練時間：127.06 秒
訓練集績效指標 - MAPE: 30.40%, RMSE: 10.93, R-squared: 0.22
測試集績效指標 - MAPE: 31.16%, RMSE: 11.11, R-squared: 0.21
