Import Libraries

In [45]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split, KFold
from typing import List, Union, Tuple, Any
import statistics

# Check if GPU is available, and if not, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [46]:
def verify_paths(biosignals_path, annotation_file):
    if not os.path.exists(biosignals_path):
        print(f"Biosignals path does not exist: {biosignals_path}")
        return False
    if not os.path.exists(annotation_file):
        print(f"Annotation file does not exist: {annotation_file}")
        return False
    return True


Define the Dataset Class

In [47]:
class GSRDataset(Dataset):
    def __init__(self, annotationfile_path, biosignals_path, missing_files_log, num_classes=5):
        self.biosignals_path = biosignals_path
        self.annotationfile_path = annotationfile_path
        self.missing_files_log = missing_files_log
        self.num_classes = num_classes
        self._parse_annotationfile()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        label = self.data[index]['label']
        gsr_data = self._load_biosignals(self.data[index]['path'])
        if gsr_data is None:
            return None, None
        gsr_data = torch.tensor(gsr_data.values, dtype=torch.float32)
        return gsr_data, label

    def _load_biosignals(self, filepath):
        csv_path = os.path.join(self.biosignals_path, filepath)
        try:
            physio_df = pd.read_csv(csv_path)
            gsr_data = physio_df['gsr']
            return gsr_data
        except FileNotFoundError:
            with open(self.missing_files_log, 'a') as log_file:
                log_file.write(f"File not found: {csv_path}\n")
            return None

    def _parse_annotationfile(self):
        self.data = []
        with open(self.annotationfile_path, 'r') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) >= 2:
                    path = parts[0]
                    if not path.endswith("_bio.csv"):
                        path = path + "_bio.csv"
                    try:
                        label = int(parts[3])  # Ensure correct index for label
                    except ValueError:
                        print(f"Skipping invalid label: {parts[3]} in line: {line.strip()}")
                        continue

                    if label < 0 or label >= self.num_classes:
                        print(f"Skipping invalid label: {label} in line: {line.strip()}")
                        continue

                    subdir = str(label)
                    filepath = os.path.join(subdir, path)
                    self.data.append({'path': filepath, 'label': label})
                else:
                    print(f"Skipping invalid line: {line.strip()}")


Define the Model

In [48]:
class Conv1D_model(nn.Module):
    def __init__(self, num_classes=5):
        super(Conv1D_model, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=5, stride=2)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool1d(kernel_size=2)
        
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool1d(kernel_size=2)
        
        self.fc1 = nn.Linear(22336, 512)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor to 1D
        x = self.fc1(x)
        x = self.fc2(x)
        return x


 Define Validation Function

In [49]:
def validate_physio_gsr_only(physio_model, val_dataloader, criterion, device):
    physio_model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0.0

    with torch.no_grad():
        for val_data in tqdm(val_dataloader, total=len(val_dataloader), desc='Validation'):
            val_inputs, val_labels = val_data
            if val_inputs is None:
                continue
            val_inputs = val_inputs.unsqueeze(1).to(device, dtype=torch.float)
            val_labels = val_labels.to(device)
            
            val_outputs = physio_model(val_inputs)
            loss = criterion(val_outputs, val_labels)
            val_loss += loss.item()

            _, val_predicted = torch.max(val_outputs.data, 1)
            val_total += val_labels.size(0)
            val_correct += (val_predicted == val_labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Validation accuracy: {val_accuracy}%')
    print(f'Validation loss: {avg_val_loss}')
    return val_accuracy, avg_val_loss


Define Training Function

In [50]:
def train(train_annotation, test_annotation, weight_name, missing_files_log):
    batch_size = 1024
    num_epochs = 100 
    lr = 0.0001
    num_classes = 5
    check_every = 1
    best_val_acc = 0

    biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised'
    five_fold_annotations_path = '/projets2/AS84330/Datasets/Biovid/PartA/5folds_annotations_five/'
    train_annotation_file = os.path.join(five_fold_annotations_path, train_annotation)
    val_annotation_file = os.path.join(five_fold_annotations_path, test_annotation)

    if not verify_paths(biosignals_path, train_annotation_file):
        return 0

    train_dataset = GSRDataset(train_annotation_file, biosignals_path, missing_files_log, num_classes=num_classes)
    train_data = [(data, label) for data, label in train_dataset if data is not None]
    if len(train_data) == 0:
        print(f"No valid data found for training in {train_annotation}")
        return 0
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    val_dataset = GSRDataset(val_annotation_file, biosignals_path, missing_files_log, num_classes=num_classes)
    val_data = [(data, label) for data, label in val_dataset if data is not None]
    if len(val_data) == 0:
        print(f"No valid data found for validation in {test_annotation}")
        return 0
    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    physio_model = Conv1D_model(num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    physio_optimizer = optim.SGD(physio_model.parameters(), lr=lr, momentum=0.9)

    for epoch in tqdm(range(num_epochs), desc='Epochs'):
        physio_model.train()
        
        running_loss = 0
        correct = 0
        total = 0
        
        for physio_batch, labels in train_dataloader:
            physio_optimizer.zero_grad()
            physio_batch = physio_batch.unsqueeze(1).to(device, dtype=torch.float)
            labels = labels.to(device)
            
            physio_outputs = physio_model(physio_batch)
            
            physio_loss = criterion(physio_outputs, labels)
            physio_loss.backward()
            physio_optimizer.step()
            
            running_loss += physio_loss.item()
            
            _, physio_predicted = torch.max(physio_outputs.data, 1)
            total += labels.size(0)
            correct += (physio_predicted == labels).sum().item()

        print(f"Accuracy after epoch {epoch + 1}: {100 * correct / total}%")

        if epoch % check_every == 0:
            val_acc, val_loss = validate_physio_gsr_only(physio_model, val_dataloader, criterion, device)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                model_save_path = f'/projets2/AS84330/Projets/MM_transformer/biovid_codes/Biovid_vis_phy/{weight_name}{round(best_val_acc,2)}.pth'
                torch.save(physio_model.state_dict(), model_save_path)
                print('Best model saved at epoch: ', epoch + 1)

    print("Finished Training")
    train_accuracy = 100 * correct / total
    avg_train_loss = running_loss / len(train_dataloader)
    print(f'Training accuracy: {train_accuracy}%')
    print(f'Training loss: {avg_train_loss}')
    print("Best validation accuracy: ", best_val_acc)
    
    return best_val_acc


Define Testing Function

In [51]:
def test(test_annotation, test_weights):
    batch_size = 1024
    num_classes = 5
    biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised'
    val_annotation_file = os.path.join(biosignals_path, '../../5folds_annotations_five', test_annotation)

    val_dataset = GSRDataset(val_annotation_file, biosignals_path)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    physio_model = Conv1D_model(num_classes).to(device)
    physio_model.load_state_dict(torch.load(test_weights))

    criterion = nn.CrossEntropyLoss()
    val_acc, _ = validate_physio_gsr_only(physio_model, val_dataloader, criterion, device)
    
    return val_acc


Main Execution for Training and Validation

In [52]:
if __name__ == '__main__':
    dir_name = '/home/ens/AU59350/LIVIA/resultsCNN/'
    os.makedirs(dir_name, exist_ok=True)
    missing_files_log = os.path.join(dir_name, 'missing_files.log')
    
    if os.path.exists(missing_files_log):
        os.remove(missing_files_log)

    def print_directory_structure(path, level=0):
        for item in os.listdir(path):
            item_path = os.path.join(path, item)
            print('  ' * level + f"- {item}")
            if os.path.isdir(item_path):
                print_directory_structure(item_path, level + 1)

    biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised'
    print(f"\nDirectory structure of {biosignals_path}:")
    print_directory_structure(biosignals_path)

    kfold_accuracy = []
    for i in range(1, 6):
        train_annotation = f'train_fold{i}.txt'
        test_annotation = f'test_fold{i}.txt'
        weight_name = f'model_best_gsr_fold{i}_'
        best_accuracy = train(train_annotation, test_annotation, weight_name, missing_files_log)
        if best_accuracy > 0:
            kfold_accuracy.append(round(best_accuracy, 1))
    
    if kfold_accuracy:
        with open(os.path.join(dir_name, 'accuracy.txt'), 'w') as f:
            for acc in kfold_accuracy:
                f.write(f'{acc}\n')
            f.write(f'Mean: {statistics.mean(kfold_accuracy)}\n')
    else:
        print("No valid folds were processed. Ensure the dataset files are correctly placed.")



Directory structure of /projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised:
- 2
  - 101015_w_43-PA2-037_bio.csv
  - 112009_w_43-PA2-071_bio.csv
  - 102414_w_58-PA2-068_bio.csv
  - 080614_m_24-PA2-043_bio.csv
  - 092014_m_56-PA2-039_bio.csv
  - 101609_m_36-PA2-017_bio.csv
  - 071614_m_20-PA2-022_bio.csv
  - 082208_w_45-PA2-077_bio.csv
  - 082909_m_47-PA2-061_bio.csv
  - 100117_w_36-PA2-014_bio.csv
  - 092009_m_54-PA2-034_bio.csv
  - 083109_m_60-PA2-011_bio.csv
  - 083009_w_42-PA2-003_bio.csv
  - 082109_m_53-PA2-028_bio.csv
  - 082909_m_47-PA2-003_bio.csv
  - 102309_m_61-PA2-031_bio.csv
  - 102316_w_50-PA2-015_bio.csv
  - 073109_w_28-PA2-043_bio.csv
  - 072609_w_23-PA2-045_bio.csv
  - 101514_w_36-PA2-035_bio.csv
  - 101809_m_59-PA2-049_bio.csv
  - 080209_w_26-PA2-030_bio.csv
  - 091914_m_46-PA2-023_bio.csv
  - 080614_m_24-PA2-053_bio.csv
  - 082315_w_60-PA2-034_bio.csv
  - 073109_w_28-PA2-008_bio.csv
  - 111609_m_65-PA2-046_bio.csv
  - 082714_m_22-PA2-021_bio.csv
  - 082109_