In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip -qq "/content/drive/MyDrive/민호/private/task2.zip"

In [6]:
import numpy as np
import random
import torch

# Set random seed for reproducibility
seed = 42
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7bb3fa54ce10>

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
file_path = "/content/Task2_data.csv"
data = pd.read_csv(file_path)

# Replace NaN values with 0
data.fillna(0, inplace=True)

# Feature engineering: Add temporal differences
for col in data.columns:
    if "inItems" in col or "NOR" in col or "DEF" in col:
        data[f'{col}_diff'] = data[col].diff().fillna(0)

# Add aggregated features for each process stage
for process in ["P1", "P2", "P3"]:
    data[f'{process}_inItems_sum'] = data.filter(like=f"{process}_LANE").filter(like="inItems").sum(axis=1)
    data[f'{process}_NOR_sum'] = data.filter(like=f"{process}_LANE").filter(like="NOR").sum(axis=1)
    data[f'{process}_DEF_sum'] = data.filter(like=f"{process}_LANE").filter(like="DEF").sum(axis=1)

data['NOR_ratio'] = data.filter(like="NOR").sum(axis=1) / (data.filter(like="inItems").sum(axis=1) + 1e-6)
data['DEF_ratio'] = data.filter(like="DEF").sum(axis=1) / (data.filter(like="inItems").sum(axis=1) + 1e-6)
data['CON_SIG_ratio'] = data['CON_SIG'] / (data.filter(like="inItems").sum(axis=1) + 1e-6)

# Drop the Timestamp column
data_processed = data.drop(columns=["Timestamp"])

# Apply MinMaxScaler column-wise
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data_processed)

# Convert to PyTorch tensors
X = torch.tensor(scaled_data, dtype=torch.float32).to(device)


Using device: cuda


In [14]:
from torch import nn

class EnhancedAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(EnhancedAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize model
input_dim = X.shape[1]
latent_dim = 16  # Latent space dimensionality
model = EnhancedAutoencoder(input_dim, latent_dim).to(device)

# Define loss function and optimizer
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [15]:
# Training loop
epochs = 50
batch_size = 64

for epoch in range(epochs):
    permutation = torch.randperm(X.size()[0])
    epoch_loss = 0
    for i in range(0, X.size()[0], batch_size):
        indices = permutation[i:i + batch_size]
        batch_x = X[indices]

        # Forward pass
        outputs = model(batch_x)
        loss = criterion(outputs, batch_x)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(X)}")

Epoch 1/50, Loss: 9.622430072750406e-05
Epoch 2/50, Loss: 8.309422481080901e-06
Epoch 3/50, Loss: 4.875654740649494e-06
Epoch 4/50, Loss: 4.0284873539127574e-06
Epoch 5/50, Loss: 3.6014147387824414e-06
Epoch 6/50, Loss: 3.3388436454439477e-06
Epoch 7/50, Loss: 3.1765809402398163e-06
Epoch 8/50, Loss: 3.0307386151411378e-06
Epoch 9/50, Loss: 2.8999936717419384e-06
Epoch 10/50, Loss: 2.831380617677535e-06
Epoch 11/50, Loss: 2.7611504791263184e-06
Epoch 12/50, Loss: 2.6805939330245023e-06
Epoch 13/50, Loss: 2.6278373069415873e-06
Epoch 14/50, Loss: 2.5745917782440847e-06
Epoch 15/50, Loss: 2.52755558624396e-06
Epoch 16/50, Loss: 2.476515029233784e-06
Epoch 17/50, Loss: 2.4858109405956813e-06
Epoch 18/50, Loss: 2.4232097371826042e-06
Epoch 19/50, Loss: 2.385869642519045e-06
Epoch 20/50, Loss: 2.3726382674999756e-06
Epoch 21/50, Loss: 2.357703439913133e-06
Epoch 22/50, Loss: 2.3123021483904797e-06
Epoch 23/50, Loss: 2.321617929512197e-06
Epoch 24/50, Loss: 2.2809540032637e-06
Epoch 25/50, L

In [16]:
from sklearn.decomposition import PCA
# PCA 적용
pca = PCA(n_components=0.95)  # 95% of variance retained
pca_data = pca.fit_transform(X.cpu().numpy())  # Convert to CPU numpy array for PCA

# Reconstruct the data using PCA
reconstructed_data_pca = pca.inverse_transform(pca_data)

# Compute reconstruction error for PCA
reconstruction_error_pca = np.mean((X.cpu().numpy() - reconstructed_data_pca) ** 2, axis=1)

# Set threshold for PCA-based anomaly detection (95th percentile)
threshold_pca = np.percentile(reconstruction_error_pca, 95)
print(f"PCA Threshold: {threshold_pca}")

# Predict anomalies using PCA
pca_predictions = (reconstruction_error_pca > threshold_pca).astype(int)


PCA Threshold: 0.00025355956604471424


In [17]:
# Compute reconstruction errors for Autoencoder
with torch.no_grad():
    reconstructions = model(X)
    reconstruction_error_ae = torch.mean((X - reconstructions) ** 2, axis=1).cpu().numpy()

# Set threshold for Autoencoder-based anomaly detection (95th percentile)
threshold_ae = np.percentile(reconstruction_error_ae, 95)
print(f"Autoencoder Threshold: {threshold_ae}")

# Predict anomalies using Autoencoder
ae_predictions = (reconstruction_error_ae > threshold_ae).astype(int)


Autoencoder Threshold: 0.0002896377132856286


In [18]:
# Combine PCA and Autoencoder predictions (e.g., OR operation)
final_predictions = (pca_predictions + ae_predictions) > 0  # Logical OR
final_predictions = final_predictions.astype(int)

# Save predictions
answer_sample = pd.read_csv("answer_sample.csv")
answer_sample['Anomaly'] = final_predictions
answer_sample.to_csv("final_combined_answer_sample.csv", index=False)
print("Results saved to final_combined_answer_sample.csv")


Results saved to final_combined_answer_sample.csv


In [19]:
answer_sample['Anomaly'].value_counts()

Unnamed: 0_level_0,count
Anomaly,Unnamed: 1_level_1
0,97052
1,8068
