In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import time
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Font configuration for figures containing Chinese text: list a CJK-capable
# sans-serif face first, and draw negative tick labels with an ASCII hyphen
# rather than the Unicode minus sign.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'sans-serif'],
    'axes.unicode_minus': False,
})

In [2]:
def load_adult_data(train_path, test_path):
    """Read the Adult census train/test files into cleaned DataFrames.

    Both files are parsed as header-less CSV using a fixed list of 15 column
    names; '?' marks a missing value and spaces after delimiters are ignored.
    The first line of the test file is skipped. Rows with any missing value
    are dropped and the index is renumbered from 0.

    The stock UCI ``adult.test`` file terminates every income label with a
    period ('<=50K.' / '>50K.') while the training file does not. Those labels
    are normalised here so both splits share the same two classes; otherwise
    an encoder fitted over train + test would see four income categories.
    Files whose labels carry no trailing period are left unchanged.

    Args:
        train_path: Location of the training split (anything ``pd.read_csv``
            accepts).
        test_path: Location of the test split.

    Returns:
        Tuple ``(train_df, test_df)`` of cleaned DataFrames.
    """
    columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income_bracket'
    ]
    # Period-terminated spellings mapped back onto the training spelling.
    label_fixes = {'<=50K.': '<=50K', '>50K.': '>50K'}
    read_options = {'names': columns, 'na_values': '?', 'skipinitialspace': True}

    def _clean(df):
        # Drop incomplete rows, renumber, and unify the income label spelling.
        df = df.dropna().reset_index(drop=True)
        df['income_bracket'] = df['income_bracket'].replace(label_fixes)
        return df

    train_df = pd.read_csv(train_path, **read_options)
    # skiprows=1: the first line of the test file is skipped — presumably the
    # non-data banner line found in the UCI distribution; confirm for other files.
    test_df = pd.read_csv(test_path, skiprows=1, **read_options)

    return _clean(train_df), _clean(test_df)

# Read both splits from disk, report their dimensions, and preview the training rows
train_df, test_df = load_adult_data(
    train_path='data/adult.train.txt',
    test_path='data/adult.test.txt',
)
print(f"訓練集: {train_df.shape}")
print(f"測試集: {test_df.shape}")
train_df.head()

訓練集: (30162, 15)
測試集: (15060, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
def preprocess_data(train_df, test_df):
    """Convert the cleaned Adult frames into scaled features and regression targets.

    'hours-per-week' is taken as the target; every other column is a feature.
    Categorical columns are integer-encoded with a LabelEncoder fitted on the
    train and test values together, so both splits share one mapping and a
    category that appears only in the test split cannot fail at transform time.
    All features are then scaled with a MinMaxScaler fitted on the training
    split only and applied unchanged to the test split.

    Args:
        train_df: Training DataFrame containing the target and feature columns.
        test_df: Test DataFrame with the same columns.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``: the scaled feature
        matrices and the raw target values.
    """
    target_col = 'hours-per-week'
    y_train = train_df[target_col].values
    y_test = test_df[target_col].values
    # drop() returns new frames, so the encoding below never touches the inputs.
    X_train_raw = train_df.drop(columns=target_col)
    X_test_raw = test_df.drop(columns=target_col)

    # Categorical features
    categorical_cols = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country', 'income_bracket'
    ]
    # NOTE(review): label encoding assigns nominal categories an arbitrary
    # numeric order, which a distance-based model will treat as meaningful;
    # consider one-hot encoding if the feature-matrix width may change.
    for col in categorical_cols:
        le = LabelEncoder()
        combined = pd.concat([X_train_raw[col], X_test_raw[col]])
        le.fit(combined.astype(str))
        X_train_raw[col] = le.transform(X_train_raw[col].astype(str))
        X_test_raw[col] = le.transform(X_test_raw[col].astype(str))

    # Normalize: scaling statistics come from the training split only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    return X_train, y_train, X_test, y_test

# Build the model-ready arrays and sanity-check the training shapes
X_train, y_train, X_test, y_test = preprocess_data(train_df=train_df, test_df=test_df)
print(f"X_train shape: {X_train.shape}, y_train: {y_train.shape}")

X_train shape: (30162, 14), y_train: (30162,)


In [12]:
# Inspect the spread of the target (weekly working hours) before choosing epsilon
print(f"工時範圍: {y_train.min():.1f} - {y_train.max():.1f}")
print(f"工時標準差: {np.std(y_train):.2f}")

# Width of the SVR epsilon-insensitive tube, set to 10% of the target's
# standard deviation so it scales with the data.
# Alternative: hard-code a tolerance directly in hours instead,
# e.g. epsilon = 2.0 to ignore errors smaller than 2 hours.
epsilon = y_train.std() * 0.1

svr = SVR(epsilon=epsilon, C=1.0, kernel='rbf')

print(epsilon)

工時範圍: 1.0 - 99.0
工時標準差: 11.98
1.1979785633632554
