In [11]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

# Load the training and test sets from CSV files in the working directory.
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('test_X.csv')

# Binarize the letter label: +1 for the letters listed here, -1 otherwise.
positive_letters = ['B', 'H', 'P', 'W', 'R', 'M']
train_data['lettr'] = train_data['lettr'].isin(positive_letters).map({True: 1, False: -1})

# Separate the label column from the feature columns.
y_train = train_data['lettr']
X_train = train_data.drop(columns='lettr')

# Standardize the features. The scaler is fitted on the training features
# only and then applied unchanged to the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(test_data)

# Convert the scaled arrays to float32 tensors.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# Mini-batch loader over the training features only (labels are not included).
train_loader = DataLoader(
    TensorDataset(X_train_tensor),
    batch_size=256,
    shuffle=True,
)

# Autoencoder model definition.
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 256 -> 128 -> 64 -> 128 -> 256 -> input_dim.

    Args:
        input_dim: Number of features per sample. When omitted (``None``),
            the width is taken from the module-level ``X_train`` DataFrame
            (``X_train.shape[1]``), which is what the original code did.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the model from the training
            # features defined at module level.
            input_dim = X_train.shape[1]

        # Encoder compresses each sample down to a 64-dimensional code.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(True),  # True -> inplace activation
            nn.Linear(256, 128),
            nn.ReLU(True),
            nn.Linear(128, 64)
        )
        # Decoder mirrors the encoder and maps the code back to input_dim.
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.Linear(256, input_dim)
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as the input)."""
        return self.decoder(self.encoder(x))

# Build the model, the reconstruction loss and the optimizer.
model = AutoEncoder()
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

# Train the autoencoder to reconstruct its own input.
# The epoch count lives in one variable so the loop bound and the progress
# message cannot drift apart (the message previously hard-coded 100 while the
# loop ran 1000 epochs, printing lines such as "epoch [101/100]").
num_epochs = 1000
model.train()  # explicit training mode
for epoch in range(num_epochs):
    for data in train_loader:
        X = data[0]  # TensorDataset yields 1-tuples; take the feature batch
        output = model(X)
        loss = criterion(output, X)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # Reports the loss of the last mini-batch of the epoch.
        print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, loss.item()))

# Score the test set: the anomaly score of a sample is its mean squared
# reconstruction error across features.
model.eval()
with torch.no_grad():
    test_output = model(X_test_tensor)
    # Element-wise squared error, kept per feature (no reduction).
    losses = nn.functional.mse_loss(test_output, X_test_tensor, reduction='none')
    anomaly_scores = losses.mean(dim=1)

# Write the results: one row per test sample, with a 0-based id.
result = pd.DataFrame({
    'id': range(len(anomaly_scores)),
    'outliers': anomaly_scores.numpy(),
})
result.to_csv('submission.csv', index=False)

epoch [1/100], loss:0.9741
epoch [11/100], loss:0.2397
epoch [21/100], loss:0.1025
epoch [31/100], loss:0.0503
epoch [41/100], loss:0.0188
epoch [51/100], loss:0.0106
epoch [61/100], loss:0.0064
epoch [71/100], loss:0.0038
epoch [81/100], loss:0.0025
epoch [91/100], loss:0.0017
epoch [101/100], loss:0.0014
epoch [111/100], loss:0.0011
epoch [121/100], loss:0.0010
epoch [131/100], loss:0.0009
epoch [141/100], loss:0.0008
epoch [151/100], loss:0.0007
epoch [161/100], loss:0.0006
epoch [171/100], loss:0.0006
epoch [181/100], loss:0.0005
epoch [191/100], loss:0.0005
epoch [201/100], loss:0.0005
epoch [211/100], loss:0.0004
epoch [221/100], loss:0.0005
epoch [231/100], loss:0.0004
epoch [241/100], loss:0.0004
epoch [251/100], loss:0.0004
epoch [261/100], loss:0.0003
epoch [271/100], loss:0.0003
epoch [281/100], loss:0.0003
epoch [291/100], loss:0.0003
epoch [301/100], loss:0.0003
epoch [311/100], loss:0.0003
epoch [321/100], loss:0.0003
epoch [331/100], loss:0.0003
epoch [341/100], loss:0.0