In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, ConcatDataset


In [33]:
device = "cuda" if torch.cuda.is_available else "cpu"

In [4]:
train_data = pd.read_csv("../data/preprocessed/preprocessed_train.csv")
test_data = pd.read_csv("../data/preprocessed/preprocessed_test.csv")

### Data Preprocessing

In [19]:
deriver_d_path1 = train_data[(train_data['Class']=="D") & (train_data['PathOrder']==1) ]
deriver_d_path2 = train_data[(train_data['Class']=="D") & (train_data['PathOrder']==2) ]

In [21]:
deriver_d_path1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
deriver_d_path2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deriver_d_path1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deriver_d_path2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


In [22]:
one_deriver_data_path1 = deriver_d_path1.values
one_deriver_data_path2 = deriver_d_path2.values

In [2]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, sequence_length):
        self.data = data.float()
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data) - self.sequence_length

    def __getitem__(self, idx):
        return self.data[idx:idx+self.sequence_length]

In [25]:
sequence_length = 60
dataset_path1 = TimeSeriesDataset(one_deriver_data_path1, sequence_length)
dataset_path2 = TimeSeriesDataset(one_deriver_data_path2, sequence_length)
concatenated_data = ConcatDataset([dataset_path1, dataset_path2])
dataloader = DataLoader(concatenated_data, batch_size=32, shuffle=True)


### Modeling

In [40]:
class ComplexTimeSeriesAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim, sequence_lengt):
        super(ComplexTimeSeriesAutoencoder, self).__init__()
        
        # Define the CNN Encoder
        self.cnn_encoder = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU()
        )
        
        # Define the LSTM Encoder
        self.lstm_encoder = nn.LSTM(input_size=64, hidden_size=latent_dim, batch_first=True)
        
        # Define the LSTM Decoder
        self.lstm_decoder = nn.LSTM(input_size=latent_dim, hidden_size=64, batch_first=True)
        
        # Define the CNN Decoder
        self.cnn_decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=64, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=32, out_channels=input_dim, kernel_size=3, padding=1),
            nn.ReLU()
        )

    def forward(self, x):
        # Pass the input through the CNN Encoder
        x = x.permute(0, 2, 1)  # Change shape to (batch_size, input_dim, sequence_length) for CNN
        x = self.cnn_encoder(x)
        x = x.permute(0, 2, 1)  # Change shape to (batch_size, sequence_length, num_channels) for LSTM
        
        # Pass through the LSTM Encoder
        x, _ = self.lstm_encoder(x)
        
        # Pass through the LSTM Decoder
        x, _ = self.lstm_decoder(x)
        
        # Pass through the CNN Decoder
        x = x.permute(0, 2, 1)  # Change shape to (batch_size, num_channels, sequence_length) for CNN
        x = self.cnn_decoder(x)
        x = x.permute(0, 2, 1)  # Change shape back to (batch_size, sequence_length, input_dim)
        
        return x

In [27]:
one_deriver_data_path1.shape

(1726, 64)

In [41]:
input_dim = one_deriver_data_path1.shape[1]
latent_dim = 20
model = ComplexTimeSeriesAutoencoder(input_dim, latent_dim, sequence_length).to(device)

In [44]:
num_epochs = 20
learning_rate = 0.001
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for batch in dataloader:
        batch = batch.float().to(device)
        outputs = model(batch)
        loss = criterion(outputs, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/20], Loss: 5.3163
Epoch [2/20], Loss: 7.7159
Epoch [3/20], Loss: 6.0024
Epoch [4/20], Loss: 7.2494
Epoch [5/20], Loss: 6.4511
Epoch [6/20], Loss: 6.7225
Epoch [7/20], Loss: 4.8788
Epoch [8/20], Loss: 5.5105
Epoch [9/20], Loss: 5.5099
Epoch [10/20], Loss: 4.9683
Epoch [11/20], Loss: 5.9810
Epoch [12/20], Loss: 5.1820
Epoch [13/20], Loss: 6.6218
Epoch [14/20], Loss: 5.0998
Epoch [15/20], Loss: 6.5156
Epoch [16/20], Loss: 7.2608
Epoch [17/20], Loss: 4.8451
Epoch [18/20], Loss: 5.7954
Epoch [19/20], Loss: 7.2420
Epoch [20/20], Loss: 5.9700


In [84]:
test_data_one_deriver_2 = test_data[(test_data['Class']=="D") & (test_data['PathOrder']==2) ]
test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
test_data_one_deriver_1 = test_data[(test_data['Class']=="D") & (test_data['PathOrder']==1) ]
test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


In [85]:
test_dataset_path2 = TimeSeriesDataset(test_data_one_deriver_2.values, sequence_length)
test_dataset_path1 = TimeSeriesDataset(test_data_one_deriver_1.values, sequence_length)
concatenated_data = ConcatDataset([test_dataset_path1, test_dataset_path2])
test_dataloader = DataLoader(concatenated_data, batch_size=1, shuffle=True)


In [86]:
def calculate_reconstruction_error(model, dataloader):
    model.eval()
    errors = []
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.float().to(device)
            outputs = model(batch)
            loss = criterion(outputs, batch)
            errors.append(loss.item())
    return errors

validation_errors = calculate_reconstruction_error(model, test_dataloader)
threshold = np.mean(validation_errors) + 3 * np.std(validation_errors)
print(f'Reconstruction error threshold: {threshold:.4f}')

Reconstruction error threshold: 46.8258


### Evaluate the model

In [87]:
anomalies = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = batch.float().to(device)
        outputs = model(batch)
        loss = criterion(outputs, batch)
        print(loss)
        if loss.item() > threshold:
            anomalies.append(True)
        else:
            anomalies.append(False)

tensor(5.5402, device='cuda:0')
tensor(14.7570, device='cuda:0')
tensor(48.6009, device='cuda:0')
tensor(9.3668, device='cuda:0')
tensor(18.9716, device='cuda:0')
tensor(18.6518, device='cuda:0')
tensor(22.8744, device='cuda:0')
tensor(15.1353, device='cuda:0')
tensor(48.1355, device='cuda:0')
tensor(15.7007, device='cuda:0')
tensor(12.0709, device='cuda:0')
tensor(16.8051, device='cuda:0')
tensor(11.2119, device='cuda:0')
tensor(3.9423, device='cuda:0')
tensor(15.8493, device='cuda:0')
tensor(15.3346, device='cuda:0')
tensor(18.3159, device='cuda:0')
tensor(39.6163, device='cuda:0')
tensor(14.8113, device='cuda:0')
tensor(18.7510, device='cuda:0')
tensor(12.8046, device='cuda:0')
tensor(13.4714, device='cuda:0')
tensor(15.8405, device='cuda:0')
tensor(17.1328, device='cuda:0')
tensor(3.5730, device='cuda:0')
tensor(15.0925, device='cuda:0')
tensor(12.9499, device='cuda:0')
tensor(15.7605, device='cuda:0')
tensor(12.8337, device='cuda:0')
tensor(6.4118, device='cuda:0')
tensor(15.3563,

In [88]:
anomalies

[False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,


### Evalute on another driver

In [120]:
driver_cls = "A"
test_data_one_deriver_1 = train_data[(train_data['Class']==driver_cls) & (train_data['PathOrder']==1) ]
test_data_one_deriver_2 = train_data[(train_data['Class']==driver_cls) & (train_data['PathOrder']==2) ]
test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
test_dataset_path2 = TimeSeriesDataset(test_data_one_deriver_2.values, sequence_length)
test_dataset_path1 = TimeSeriesDataset(test_data_one_deriver_1.values, sequence_length)
concatenated_data = ConcatDataset([test_dataset_path1, test_dataset_path2])
test_dataloader = DataLoader(concatenated_data, batch_size=1, shuffle=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


In [121]:
anomalies = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = batch.float().to(device)
        outputs = model(batch)
        loss = criterion(outputs, batch)
        print(loss)
        if loss.item() > threshold:
            anomalies.append(True)
        else:
            anomalies.append(False)

tensor(84.8270, device='cuda:0')
tensor(77.0593, device='cuda:0')
tensor(110.7549, device='cuda:0')
tensor(897.2250, device='cuda:0')
tensor(38.4169, device='cuda:0')
tensor(897.0742, device='cuda:0')
tensor(951.0508, device='cuda:0')
tensor(957.7681, device='cuda:0')
tensor(147.2378, device='cuda:0')
tensor(897.8118, device='cuda:0')
tensor(915.7648, device='cuda:0')
tensor(74.6155, device='cuda:0')
tensor(66.3323, device='cuda:0')
tensor(82.6785, device='cuda:0')
tensor(113.0067, device='cuda:0')
tensor(889.6147, device='cuda:0')
tensor(187.0710, device='cuda:0')
tensor(41.7836, device='cuda:0')
tensor(925.2104, device='cuda:0')
tensor(103.9254, device='cuda:0')
tensor(72.2017, device='cuda:0')
tensor(56.6852, device='cuda:0')
tensor(888.8254, device='cuda:0')
tensor(880.0399, device='cuda:0')
tensor(923.7220, device='cuda:0')
tensor(932.4471, device='cuda:0')
tensor(94.1146, device='cuda:0')
tensor(97.7039, device='cuda:0')
tensor(69.5024, device='cuda:0')
tensor(927.1940, device='c

In [122]:
anomalies

[True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,


In [123]:
true_count = 0
for res in anomalies:
    if res:
        true_count += 1
print(f"acc: {true_count/len(anomalies)}")

acc: 0.9302067946824224


In [140]:
def evalute_per_class():
    deriver_classes = train_data['Class'].unique()
    for deriver_cls in deriver_classes:
        test_data_one_deriver_1 = train_data[(train_data['Class']==deriver_cls) & (train_data['PathOrder']==1) ]
        test_data_one_deriver_2 = train_data[(train_data['Class']==deriver_cls) & (train_data['PathOrder']==2) ]
        test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
        test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
        test_dataset_path2 = TimeSeriesDataset(test_data_one_deriver_2.values, sequence_length)
        test_dataset_path1 = TimeSeriesDataset(test_data_one_deriver_1.values, sequence_length)
        concatenated_data = ConcatDataset([test_dataset_path1, test_dataset_path2])
        test_dataloader = DataLoader(concatenated_data, batch_size=1, shuffle=True)
        anomalies_cnt = 0
        with torch.no_grad():
            for batch in test_dataloader:
                batch = batch.float().to(device)
                outputs = model(batch)
                loss = criterion(outputs, batch)
                if loss.item() > 11:
                    anomalies_cnt += 1
        print(f"class {deriver_cls}  acc: {anomalies_cnt/(len(test_data_one_deriver_1)+len(test_data_one_deriver_2))}")


In [141]:
evalute_per_class()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class A  acc: 0.9575671852899575


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class D  acc: 0.0023860021208907743


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class B  acc: 0.4149377593360996


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class C  acc: 0.8813216453135536


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class I  acc: 0.8969477183439105


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class J  acc: 0.9698037242073477


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class H  acc: 0.9237187127532777


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class G  acc: 0.8048200950441277


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class F  acc: 0.7366903283052352


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_1.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_one_deriver_2.drop(columns=['Time(s)', 'PathOrder', 'Class'], inplace=True)


class E  acc: 0.9681359532660648
