In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Step 1: Read the cleaned training data from its parquet file
df = pd.read_parquet('cleaned_data_806016.parquet')

# Step 2: Preprocess the data
# Remove the four columns that are not passed on to the scaler/model.
df = df.drop(columns=['Datetime', 'State', 'Type', 'Event'])

# Quick look at the first five remaining rows
print(df.head())

# Step 3: Normalize the data
# Fit the min-max scaler on the remaining columns, then apply it to the same
# frame. `scaler` is kept so the identical scaling can be reused later.
scaler = MinMaxScaler().fit(df)
data_normalized = scaler.transform(df)

# Step 4: Create a custom Dataset class
class FreezerDataset(Dataset):
    """Dataset that serves each row as both input and target.

    The supplied array-like is copied once into a single float32 tensor;
    indexing returns the selected row twice, which is the (input, target)
    pairing an autoencoder trains on.
    """

    def __init__(self, data):
        """Store `data` (anything `torch.tensor` accepts) as a float32 tensor."""
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        """Return the number of entries along the first dimension."""
        return len(self.data)

    def __getitem__(self, index):
        """Return `(sample, sample)` for the entry at `index`."""
        sample = self.data[index]
        # Reconstruction target is the input itself.
        return sample, sample

# Step 5: Wrap the normalized data in a Dataset and build a shuffled DataLoader
batch_size = 1024
freezer_dataset = FreezerDataset(data=data_normalized)
train_loader = DataLoader(
    dataset=freezer_dataset,
    batch_size=batch_size,
    shuffle=True,  # new sample order every epoch
)

In [None]:
import torch.nn as nn

# Number of input features = size of the last (column) axis of the normalized data
num_features = data_normalized.shape[-1]

class AutoEncoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    The encoder maps ``input_features -> hidden_units -> latent_features`` and
    the decoder mirrors that back to ``input_features``. ``forward`` applies a
    sigmoid to the decoder output, so reconstructions lie in (0, 1).

    Args:
        hidden_units: Width of the hidden layer in both encoder and decoder.
        latent_features: Size of the bottleneck representation ``z``.
        input_features: Number of input (and output) features. When ``None``
            (the default) the notebook-level ``num_features`` is used, which
            keeps existing ``AutoEncoder(hidden_units, latent_features)`` calls
            working unchanged. Passing it explicitly makes the architecture
            independent of notebook state.
    """

    def __init__(self, hidden_units, latent_features=2, input_features=None):
        super().__init__()
        if input_features is None:
            # Backward-compatible fallback to the global defined in the notebook.
            input_features = num_features

        self.encoder = nn.Sequential(
            nn.Linear(in_features=input_features, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=latent_features)
        )
        self.decoder = nn.Sequential(
            nn.Linear(in_features=latent_features, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=input_features)
        )

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            dict with ``'z'`` (the latent code) and ``'x_hat'`` (the
            sigmoid-activated reconstruction, same shape as ``x``).
        """
        z = self.encoder(x)
        x_hat = torch.sigmoid(self.decoder(z))
        return {'z': z, 'x_hat': x_hat}

# Architecture hyper-parameters for the autoencoder
hidden_units, latent_features = 64, 2

# Remember whether a CUDA device is available; later cells read this flag
cuda = torch.cuda.is_available()

# Build the model and place it on the GPU when one is present
net = AutoEncoder(hidden_units, latent_features)
if cuda:
    net = net.cuda()

print(net)

In [None]:
import torch
# Re-evaluate CUDA availability (same flag as set when the model was created)
cuda = torch.cuda.is_available()
# Report the device of the model's first parameter, i.e. where the model actually lives
print(f"Model is on GPU: {next(net.parameters()).is_cuda}")

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn

# Adam optimizer over all model parameters, with mean-squared reconstruction loss
optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_function = nn.MSELoss()

num_epochs = 10
train_loss = []  # mean batch loss, one entry per epoch

# Make sure the model sits on the GPU when one is available
if cuda:
    net = net.cuda()

for epoch in range(num_epochs):
    net.train()
    batch_loss = []

    # The dataset yields (input, target) pairs with target == input,
    # so only the first element is needed.
    for x, _ in train_loader:
        x = x.cuda() if cuda else x

        # Forward pass: reconstruct the batch and compare it with the input
        outputs = net(x)
        x_hat = outputs['x_hat']
        loss = loss_function(x_hat, x)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss.append(loss.item())

    # Summarize the epoch by the average of its batch losses
    train_loss.append(np.mean(batch_loss))
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss[-1]:.4f}")

### Gem model

In [None]:
# Persist just the learned parameters (state dict), not the whole module object
model_save_path = 'autoencoder_weights.pth'
torch.save(obj=net.state_dict(), f=model_save_path)
print(f"Model weights saved to {model_save_path}")

### Hent model

In [None]:
# Create a new model instance with the same architecture that was trained.
# Reusing hidden_units / latent_features (instead of repeating the literals)
# keeps this in step with the trained network if those settings change.
net_loaded = AutoEncoder(hidden_units=hidden_units, latent_features=latent_features)

# Move the model to GPU if available
if cuda:
    net_loaded = net_loaded.cuda()

# Load the saved weights into the model.
# map_location remaps the stored tensors onto the device in use, so weights
# saved from a GPU session can still be loaded on a CPU-only machine.
model_save_path = 'autoencoder_weights.pth'
net_loaded.load_state_dict(
    torch.load(model_save_path, map_location='cuda' if cuda else 'cpu')
)
print(f"Model weights loaded from {model_save_path}")

# Set the model to evaluation mode if you are using it for inference
net_loaded.eval()

In [None]:
import matplotlib.pyplot as plt

def detect_anomalies(data, model, threshold=0.05, batch_size=4096):
    """Flag samples whose reconstruction error exceeds `threshold`.

    Args:
        data: 2-D array-like of samples (rows) by features (columns); it is
            converted to a float32 tensor.
        model: Module whose forward pass returns a dict containing `'x_hat'`,
            the reconstruction of its input.
        threshold: Per-sample mean squared error above which a sample is
            reported as an anomaly.
        batch_size: Number of rows sent through the model per forward pass.
            Batching bounds peak memory use on large datasets.

    Returns:
        Tuple `(anomalies, reconstruction_error)` of 1-D NumPy arrays with one
        entry per sample: a boolean mask and the per-sample mean squared error.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()

    # Run on whatever device the model lives on instead of relying on a global
    # flag; this avoids a device mismatch when a CPU model is used on a machine
    # that has CUDA. A parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    data_tensor = torch.as_tensor(data, dtype=torch.float32)

    batch_errors = []
    with torch.no_grad():
        for start in range(0, len(data_tensor), batch_size):
            batch = data_tensor[start:start + batch_size].to(device)
            x_hat = model(batch)['x_hat']
            # Mean squared error over the feature axis -> one value per sample
            batch_errors.append(torch.mean((batch - x_hat) ** 2, dim=1).cpu())

    if batch_errors:
        reconstruction_error = torch.cat(batch_errors).numpy()
    else:
        # Empty input: return empty results rather than failing inside the model
        reconstruction_error = torch.empty(0, dtype=torch.float32).numpy()

    anomalies = reconstruction_error > threshold
    return anomalies, reconstruction_error

## Afprøve på træningsdata

In [None]:
# Score every training sample with the in-memory model (default threshold)
anomalies, reconstruction_errors = detect_anomalies(data_normalized, net)

# Histogram of the per-sample reconstruction error; the dashed red line marks 0.05
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(reconstruction_errors, bins=50, alpha=0.75)
ax.axvline(x=0.05, color='r', linestyle='--')
ax.set_title('Reconstruction Error Distribution')
ax.set_xlabel('Reconstruction Error')
ax.set_ylabel('Frequency')
plt.show()

# Positions of the samples flagged as anomalous, plus a short summary
anomaly_indices = np.flatnonzero(anomalies)
print(f"Detected {len(anomaly_indices):,} anomalies out of {len(data_normalized):,} samples.")

## Afprøve på valid sæt

In [None]:
# Step 1: Load the validation data
df = pd.read_parquet('around_events_data_806016.parquet')
#df = pd.read_parquet('around_events_data_806018.parquet')

# Step 2: Preprocess the data (same columns removed as for the training data)
df = df.drop(columns=['Datetime', 'State', 'Type', 'Event'])
#df.drop(columns=['Datetime', 'State', 'Type', 'Event', 'main_fault'], inplace=True)

# Step 3: Normalize the data
# Reuse the scaler that was fitted on the training data. Fitting a new scaler
# here would map the validation set onto its own min/max range, so the model
# would see differently scaled features than it was trained on and the
# reconstruction errors would not be comparable with the training run.
valid_data_normalized = scaler.transform(df)

# Detect anomalies on the validation data, using one threshold for both the
# detection and the marker line in the plot
anomaly_threshold = 0.05
anomalies, reconstruction_errors = detect_anomalies(
    valid_data_normalized, net, threshold=anomaly_threshold
)

# Plot the reconstruction error distribution
plt.figure(figsize=(10, 6))
plt.hist(reconstruction_errors, bins=50, alpha=0.75)
plt.axvline(x=anomaly_threshold, color='r', linestyle='--')
plt.title('Reconstruction Error Distribution')
plt.xlabel('Reconstruction Error')
plt.ylabel('Frequency')
plt.show()

# Print out the number of anomalies and the share of samples that were flagged.
# NOTE(review): the value printed as "Accuracy percentage" is the percentage of
# validation samples flagged as anomalous; it equals an accuracy only if every
# sample in this file is expected to be anomalous - confirm.
anomaly_indices = np.where(anomalies)[0]
percentage_accuracy = (len(anomaly_indices) / len(valid_data_normalized)) * 100
print(f"Detected {len(anomaly_indices):,} anomalies out of {len(valid_data_normalized):,} samples.")
print(f"Accuracy percentage: {percentage_accuracy:.2f}%")