In [None]:
import pandas as pd
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import f1_score, accuracy_score

import os
# Change the working directory to the project checkout before importing the
# project-local `modules` package on the following lines.
# NOTE(review): presumably the chdir is what makes `modules` importable from
# this notebook (cwd on sys.path) — confirm. The path is absolute and
# machine-specific; os.chdir raises if the directory does not exist.
os.chdir('C:\\Users\\mathi\\SimpleSequenceClassif')
# Project-local helpers used by the cells below: categorical encoder fitting
# and the sequence pipeline, the dataset wrapper, the in-notebook cleaning
# routine, and the classifier model.
from modules.preprocessing import categories_fit_one_hot, seq_pipeline
from modules.datasets import SeqCatDataset
from modules.data_specific import cleaning
from modules.models import SimpleClassifier

# Select the compute device once: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Directory holding the CSV inputs (absolute, machine-specific path).
base_path = 'C:\\Users\\mathi\\Documents\\sequence_data\\'

# Files are read in this order: fold_0 .. fold_4, then test.
csv_names = ['fold_0.csv', 'fold_1.csv', 'fold_2.csv',
             'fold_3.csv', 'fold_4.csv', 'test.csv']

frames = []
for csv_name in csv_names:
    frames.append(pd.read_csv(base_path + csv_name))
df0, df1, df2, df3, df4, test = frames

# `cleaning` is called for its effect on each frame; its return value is
# discarded. NOTE(review): presumably it mutates the frame in place — confirm.
for df in frames:
    cleaning(df)

# Stack the five fold frames (everything except `test`); original row indices
# are kept as-is, so index labels may repeat across folds.
full_data = pd.concat(frames[:-1])

In [None]:
# Name of the column passed to SeqCatDataset as the sequence field.
sequence = 'peptide'

# Names of the categorical columns passed to categories_fit_one_hot and
# SeqCatDataset.
categories = [
    'class',
    'gene',
    'variant',
]

In [None]:
# Data Loaders setup

# Training frame: the five fold frames stacked together.
dataframe = pd.concat([df0, df1, df2, df3, df4])

# Fit the categorical encoders on the training frame only; the first returned
# value is not used here.
_, cat_encoders = categories_fit_one_hot(dataframe, categories)

# Both datasets share every constructor argument except the source frame, so
# the encoders fitted above are reused for the evaluation data.
shared_dataset_args = (sequence, seq_pipeline, categories, cat_encoders)
train_dataset = SeqCatDataset(dataframe, *shared_dataset_args)
# Note: the "validation" dataset is built from the `test` frame.
val_dataset = SeqCatDataset(test, *shared_dataset_args)

# Shuffle only the training batches; keep evaluation order deterministic.
loader_batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size, shuffle=False)

In [None]:
# Training pipeline

# Initialize the model. The input width is read from the second dimension of
# the first training sample's input tensor.
input_dim = train_dataset[0][0].shape[1]
hidden_dim = 128
model = SimpleClassifier(input_dim, hidden_dim)
model.to(device)

# Define loss and optimizer (created once; the original cell built both twice).
# nn.BCELoss expects probabilities in [0, 1].
# NOTE(review): assumes SimpleClassifier already applies a sigmoid — confirm.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Define the number of iterations
num_epochs = 1
num_iterations = len(train_loader) * num_epochs  # total optimizer steps
losses = []  # one scalar loss per training batch, in iteration order

# Train the model
for epoch in range(num_epochs):
    model.train()
    for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch + 1}", ncols=100):
        # Batches come off the loader on the CPU; move them next to the model.
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()

        # Record the batch loss as a Python float for plotting.
        losses.append(loss.item())

# Final loss plot
plt.figure(figsize=(10, 6))
plt.plot(losses, label="Training Loss")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Validation pipeline
# Set the model to evaluation mode
model.eval()

# Initialize variables to store predictions and ground truth
val_predictions = []
gt = []

# Iterate through the validation dataset without tracking gradients
with torch.no_grad():
    for batch_x, batch_y in val_loader:
        # The model was moved to `device` in the training cell; the inputs
        # must be on the same device or the forward pass fails on CUDA.
        batch_x = batch_x.to(device)
        val_output = model(batch_x)
        # Threshold the outputs at 0.5 to get hard 0/1 predictions, then bring
        # them back to the CPU as NumPy rows.
        val_predictions.extend((val_output >= 0.5).float().cpu().numpy())
        gt.extend(batch_y.cpu().numpy())

# Convert predictions and ground truth to NumPy arrays
val_predictions = np.array(val_predictions)
gt = np.array(gt)

In [None]:
# Flatten ground truth and predictions to 1-D so they can be compared
# element-wise, then compute the F1 score with scikit-learn's defaults.
y_true = gt.reshape(-1)
y_pred = val_predictions.reshape(-1)
val_f1 = f1_score(y_true, y_pred)

# Report the result
print("Validation F1 Score:", val_f1)