In [10]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

# Load the training and test sets
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('test_X.csv')

# Binarise the letter label: +1 for the listed letters, -1 for every other value
train_data['lettr'] = (
    train_data['lettr']
    .isin(['B', 'H', 'P', 'W', 'R', 'M'])
    .map({True: 1, False: -1})
)

# Separate the label column from the feature columns
y_train = train_data['lettr']
X_train = train_data.drop(columns='lettr')

# Standardise the features; the scaler statistics come from the training
# features only and are then re-used unchanged on the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(test_data)

# Convert the scaled arrays to float32 tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# Build the DataLoader; only the features are wrapped, because the
# autoencoder's reconstruction target is its own input
train_loader = DataLoader(
    dataset=TensorDataset(X_train_tensor),
    batch_size=64,
    shuffle=True,
)

# Define the autoencoder model
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder.

    The encoder maps ``input_dim -> 256 -> 128 -> latent_dim`` and the decoder
    maps ``latent_dim -> 128 -> 256 -> input_dim``, with ReLU between the
    linear layers and no activation on either final layer.

    Args:
        input_dim: Number of input features. When ``None`` (the default), the
            width is taken from the module-level ``X_train.shape[1]``, so a
            bare ``AutoEncoder()`` call keeps working as before.
        latent_dim: Width of the bottleneck produced by the encoder.
    """

    def __init__(self, input_dim=None, latent_dim=64):
        super().__init__()
        if input_dim is None:
            # Backward-compatible fallback: read the feature count from the
            # training frame defined at module level.
            input_dim = X_train.shape[1]

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(True),
            nn.Linear(256, 128),
            nn.ReLU(True),
            nn.Linear(128, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.Linear(256, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Initialise the model, the reconstruction loss and the optimiser
model = AutoEncoder()
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Training schedule, named once so the loop bound and the progress message
# cannot drift apart
NUM_EPOCHS = 100
LOG_EVERY = 10

# Train the model to reconstruct its own input
model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    samples_seen = 0
    for data in train_loader:
        X = data[0]  # first element of each batch is the feature tensor
        output = model(X)
        loss = criterion(output, X)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size so a smaller final batch does
        # not skew the epoch average
        running_loss += loss.item() * X.size(0)
        samples_seen += X.size(0)
    if epoch % LOG_EVERY == 0 and samples_seen:
        # Report the epoch mean rather than the loss of whichever shuffled
        # mini-batch happened to come last
        print('epoch [{}/{}], loss:{:.4f}'.format(
            epoch + 1, NUM_EPOCHS, running_loss / samples_seen))

# Score the test data: the per-row mean squared reconstruction error is the
# anomaly score
model.eval()
with torch.no_grad():
    test_output = model(X_test_tensor)
    mse_loss = nn.MSELoss(reduction='none')  # keep element-wise errors
    losses = mse_loss(test_output, X_test_tensor)
    anomaly_scores = torch.mean(losses, dim=1)  # average over the feature axis

# Write the results: one row per test sample, with id equal to its row position
result = pd.DataFrame({
    'id': range(len(anomaly_scores)),
    'outliers': anomaly_scores.numpy(),
})
result.to_csv('submission.csv', index=False)

epoch [1/100], loss:4.1828, val_loss:3.9698
epoch [2/100], loss:1.2469, val_loss:0.9878
epoch [3/100], loss:0.7639, val_loss:0.6578
epoch [4/100], loss:0.5025, val_loss:0.3946
epoch [5/100], loss:0.3229, val_loss:0.2722
epoch [6/100], loss:0.2426, val_loss:0.1986
epoch [7/100], loss:0.1885, val_loss:0.1487
epoch [8/100], loss:0.1765, val_loss:0.1174
epoch [9/100], loss:0.1089, val_loss:0.1049
epoch [10/100], loss:0.0867, val_loss:0.0695
epoch [11/100], loss:0.0609, val_loss:0.0597
epoch [12/100], loss:0.0449, val_loss:0.0432
epoch [13/100], loss:0.0278, val_loss:0.0303
epoch [14/100], loss:0.0224, val_loss:0.0232
epoch [15/100], loss:0.0215, val_loss:0.0206
epoch [16/100], loss:0.0219, val_loss:0.0199
epoch [17/100], loss:0.0179, val_loss:0.0165
epoch [18/100], loss:0.0242, val_loss:0.0200
epoch [19/100], loss:0.0176, val_loss:0.0194
epoch [20/100], loss:0.0140, val_loss:0.0150
epoch [21/100], loss:0.0142, val_loss:0.0204
epoch [22/100], loss:0.0135, val_loss:0.0144
epoch [23/100], los