In [1162]:
import csv
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
torch.manual_seed(205)

<torch._C.Generator at 0x1e0ffa05310>

In [1163]:
# if torch.cuda.is_available():
#     device = torch.device("cuda")
# else:
#     device = torch.device("cpu")

device = torch.device("cpu")

In [1164]:
data = []
min_len = float('inf')

In [1165]:
with open('../data/git-data/2023-10-06-F.csv', 'r',newline='', encoding='utf-8') as csvfile:
    spikereader = csv.reader(csvfile, delimiter=';')
    for row in spikereader:
        data.append(row)
        if len(row) < min_len:
            min_len = len(row)

In [1166]:
data = [[float(item) for item in sublist[:min_len]] for sublist in data]
labels = [d[0] for d in data]

In [1167]:
data_tensor = torch.tensor(data).to(device)

In [1168]:
print(data_tensor)
print(data_tensor.shape)

tensor([[   0., 1382., 1380.,  ..., 1372., 1355., 1371.],
        [   0., 1387., 1394.,  ..., 1369., 1348., 1366.],
        [   0., 1354., 1376.,  ..., 1375., 1381., 1356.],
        ...,
        [   9., 1336., 1376.,  ..., 1380., 1369., 1381.],
        [   9., 1382., 1354.,  ..., 1370., 1361., 1388.],
        [   9., 1364., 1368.,  ..., 1358., 1335., 1287.]])
torch.Size([100, 8601])


In [1169]:
class CustomTensorDataset(Dataset):
    def __init__(self, data_tensor, transform=None):
        self.data_tensor = data_tensor
        self.transform = transform

    def __len__(self):
        return len(self.data_tensor)

    def __getitem__(self, idx):
        sample = self.data_tensor[idx]
        label = torch.tensor(int(sample[0])).to(device)
        item = sample[1:]

        # Apply transformation only if it's specified and idx is in the training subset
        if self.transform is not None:
            for transform in self.transform:
                item = transform(item)

        return item, label

In [1170]:
custom_dataset = CustomTensorDataset(data_tensor)

In [1171]:
# train_size = 0.4  # Proportion of the dataset for training
# train_dataset, test_dataset = train_test_split(custom_dataset, train_size=train_size, shuffle=True)

In [1172]:
# number_of_values = [0] * 2

In [1173]:
# print(train_dataset[5])

In [1174]:
# batch_size = 2
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [1175]:
# Create a custom transformation to add Gaussian noise
class AddGaussianNoise(object):
    def __init__(self, mean=0.0, std=1.0):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean

# Define the mean and standard deviation for Gaussian noise
mean_value = 0.0
std_value = 0.2

# Apply transformation only to the training subset
# gaussian_noise_transform = AddGaussianNoise(mean=22, std=105)
# gaussian_noise_transform2 = AddGaussianNoise(mean=11, std=72)

gaussian_noise_transform = AddGaussianNoise(mean=0.0, std=11)
gaussian_noise_transform2 = AddGaussianNoise(mean=0.0, std=11)

In [1176]:
class WindowWarp(object):
    def __init__(self, window_size=5, magnitude=0.1):
        self.window_size = window_size
        self.magnitude = magnitude

    def __call__(self, sequence):
        seq_len = sequence.size(0)
        
        # Determine the number of windows
        num_windows = max(1, seq_len // self.window_size)
        
        # Randomly select a window to warp
        selected_window = torch.randint(0, num_windows, (1,)).item()
        start_idx = selected_window * self.window_size
        end_idx = min(start_idx + self.window_size, seq_len)

        # Generate random warping values for the selected window
        warp_values = torch.normal(0, 0.1, size=(end_idx - start_idx,))  # Adjust the parameters as needed

        # Apply warping to the selected window in the sequence
        sequence[start_idx:end_idx] += self.magnitude * warp_values

        return sequence

# window_warp_transform = WindowWarp(window_size=100, magnitude=55.1)
window_warp_transform = WindowWarp(window_size=100, magnitude=1.1)
# window_warp_transform = WindowWarp(window_size=100, magnitude=500.1)

In [1177]:
class SimpleMulticlassNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleMulticlassNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 512)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(512, num_classes)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        # x = self.relu1(x)
        return x

In [1178]:
# i, (data, label) = next(enumerate(train_dataloader))
# print(data.shape)
# print(data, label)

In [1179]:
class SimpleTransformer(nn.Module):
    def __init__(self, input_size, num_classes, hidden_size=64, num_layers=2, num_heads=4):
        super(SimpleTransformer, self).__init__()
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.transformer = nn.Transformer(
            d_model=hidden_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers
        )
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # x: (batch_size, sequence_length)
        
        embedded = self.embedding(x)
        embedded = embedded.permute(1, 0, 2)  # (sequence_length, batch_size, hidden_size)
        
        # Since this is a simple classification task, we can use the same sequence as input and output
        output = self.transformer(embedded, embedded)
        output = output.permute(1, 0, 2)  # (batch_size, sequence_length, hidden_size)
        
        # Pooling or other aggregation methods can be applied here
        pooled_output = torch.mean(output, dim=1)  # Average pooling
        
        logits = self.fc(pooled_output)
        return logits

In [1180]:
num_classes = 10
# model = SimpleTransformer(min_len-1, num_classes)
# model = SimpleLSTM(min_len-1,64,2,10)

model = SimpleMulticlassNN(min_len-1, num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for multiclass classification
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# scheduler = StepLR(optimizer, step_size=40, gamma=1e-1)

In [1181]:
import numpy as np
from sklearn import metrics

In [1182]:
results = {}
k_folds = 6
kfold = StratifiedKFold(n_splits=k_folds, shuffle=True)
for fold, (train_ids, test_ids) in enumerate(kfold.split(data_tensor, labels)):
    train_ids = torch.tensor(train_ids)
    test_ids = torch.tensor(test_ids)
    train_rows = torch.index_select(data_tensor, 0, train_ids)
    test_rows = torch.index_select(data_tensor, 0, test_ids)
    train_dataset = CustomTensorDataset(data_tensor=train_rows, transform=[gaussian_noise_transform, window_warp_transform])
    test_dataset = CustomTensorDataset(data_tensor=test_rows, transform=[gaussian_noise_transform2, window_warp_transform])
    # train_dataset = CustomTensorDataset(data_tensor=train_rows, transform=[gaussian_noise_transform, window_warp_transform])
    # test_dataset = CustomTensorDataset(data_tensor=test_rows)
    # train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    # test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    # raise ValueError(train_subsampler)
    
    # Define data loaders for training and testing data in this fold
    train_dataloader = torch.utils.data.DataLoader(
                      train_dataset, 
                      batch_size=2, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(
                      test_dataset,
                      batch_size=2, shuffle=False)
    model = SimpleMulticlassNN(min_len-1, num_classes)
    model.to(device)

    criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for multiclass classification
    optimizer = optim.AdamW(model.parameters(), lr=5e-6)

    scheduler = StepLR(optimizer, step_size=50, gamma=1e-1)    
    best_acu = 0
    best_auc = 0
    best_f1 = 0
    epochs = 100
    for epoch in range(epochs):
        train_loss = 0
        model.train()  # Set the model to training mode
        for batch_data, batch_labels in train_dataloader:
            optimizer.zero_grad()  # Zero the gradients
            outputs = model(batch_data)  # Forward pass
            loss = criterion(outputs, batch_labels)  # Calculate loss
            train_loss += loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

        print(train_loss.item())
        scheduler.step()  # Adjust learning rate
        model.eval()  # Set the model to evaluation mode
        val_loss = 0.0
        correct = 0
        total = 0
        labels = 0
        preds = 0
        probs = 0
        with torch.no_grad():
            for batch_data, batch_labels in test_dataloader:
                outputs = model(batch_data)
                val_loss += criterion(outputs, batch_labels).item()
                _, predicted = outputs.max(1)
                np_labels = batch_labels.detach().cpu().numpy()
                np_predictions = predicted.detach().cpu().numpy()
                if total > 0:
                    labels = np.concatenate((labels,np_labels),axis=0)
                    preds = np.concatenate((preds,np_predictions),axis=0)
                    probs = np.concatenate((probs, torch.softmax(outputs, dim=1).detach().cpu().numpy()),axis=0)
                else:
                    labels = np_labels
                    preds = np_predictions
                    probs = torch.softmax(outputs, dim=1).detach().cpu().numpy()
                    # all_outputs = outputs.detach().cpu().numpy()
                total += batch_labels.size(0)
                correct += predicted.eq(batch_labels).sum().item()

        avg_val_loss = val_loss / len(test_dataloader)
        accuracy = 100.0 * correct / total
        acu = metrics.balanced_accuracy_score(labels, preds) * 100
        f1 = metrics.f1_score(labels, preds, average='weighted')
        if num_classes > 2:
            auc = metrics.roc_auc_score(labels, probs, multi_class='ovo')
        else:
            # raise ValueError(probs)
            auc = metrics.roc_auc_score(labels, preds)
        if acu > best_acu:
            best_acu = acu
        if auc > best_auc:
            best_auc = auc
        if f1 > best_f1:
            best_f1 = f1

        # print(f"Epoch [{epoch+1}/{epochs}] - Validation Loss: {avg_val_loss:.4f} - Validation Accuracy: {accuracy:.2f}% - sklearn Accuracy: {acu:.2f}%")
        print(f"Epoch [{epoch+1}/{epochs}] - Balanced Accuracy: {acu:.2f}% - AUC: {auc:.4f} - F1: {f1:.4f}")
    # all_acu.append(best_acu)
    # all_auc.append(best_auc)
    # all_f1.append(best_f1)
    results[fold] = (best_acu, best_auc, best_f1)
    print(f"Best Accuracy: {best_acu:.2f}% - Best AUC: {best_auc:.4f} - Best F1: {best_f1:.4f}")
print("\r\n")
# print(f"Mean Accuracy: {np.mean(all_acu):.2f}% - Mean AUC: {np.mean(all_auc):.4f} - Mean F1: {np.mean(all_f1):.4f}")
for key, value in results.items():
    print(f'Fold {key}: {value} %')

2488.950439453125
Epoch [1/100] - Balanced Accuracy: 10.00% - AUC: 0.4361 - F1: 0.0074
966.9230346679688
Epoch [2/100] - Balanced Accuracy: 5.00% - AUC: 0.3472 - F1: 0.0235
634.3021850585938
Epoch [3/100] - Balanced Accuracy: 10.00% - AUC: 0.4167 - F1: 0.0065
581.2311401367188
Epoch [4/100] - Balanced Accuracy: 10.00% - AUC: 0.4444 - F1: 0.0529
613.8017578125
Epoch [5/100] - Balanced Accuracy: 10.00% - AUC: 0.4639 - F1: 0.0248
554.2620239257812
Epoch [6/100] - Balanced Accuracy: 10.00% - AUC: 0.3944 - F1: 0.0261
460.2218933105469
Epoch [7/100] - Balanced Accuracy: 10.00% - AUC: 0.4417 - F1: 0.0065
514.5568237304688
Epoch [8/100] - Balanced Accuracy: 10.00% - AUC: 0.4917 - F1: 0.0261
585.9656982421875
Epoch [9/100] - Balanced Accuracy: 10.00% - AUC: 0.4750 - F1: 0.0490
498.2380676269531
Epoch [10/100] - Balanced Accuracy: 5.00% - AUC: 0.4639 - F1: 0.0392
526.6045532226562
Epoch [11/100] - Balanced Accuracy: 10.00% - AUC: 0.4583 - F1: 0.0277
448.672607421875
Epoch [12/100] - Balanced Acc

In [1183]:
sum_acc = 0
sum_auc = 0
sum_f1 = 0
for key, (acc, auc, f1) in results.items():
    sum_acc += acc
    sum_auc += auc
    sum_f1 += f1
sum_acc /= len(results)
sum_auc /= len(results)
sum_f1 /= len(results)

print(sum_acc, sum_auc, sum_f1)

25.0 0.6337962962962963 0.18195757469654528
