Knowledge Distillation 
===============================

**Author**: [Clara Martinez](https://github.com/moonblume/LIVIA.git)

Knowledge distillation is a technique that enables knowledge transfer
from large, computationally expensive models to smaller ones without
losing validity. This allows for deployment on less powerful hardware,
making evaluation faster and more efficient.

Librairies
================


In [23]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from scipy.signal import savgol_filter
from sklearn.metrics import mean_absolute_error, mean_squared_error

from typing import List, Union, Tuple, Any
import statistics

# Check if GPU is available, and if not, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Loading Dataset
================

The GSRDataset class is a custom dataset class that inherits from PyTorch's Dataset class. It is designed to handle and preprocess physiological data, specifically Galvanic Skin Response (GSR) signals, along with their corresponding labels from video recordings.


In [24]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset

class GSRDataset(Dataset):
    def __init__(self, annotationfile_path, biosignals_path):
        self.biosignals_path = biosignals_path
        self.annotationfile_path = annotationfile_path
        self._parse_annotationfile()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        label = self.data[index]['label']
        gsr_data = self._load_biosignals(self.data[index]['path'])
        gsr_data = torch.tensor(gsr_data.values, dtype=torch.float32)
        return gsr_data, label

    def _load_biosignals(self, filepath):
        csv_path = os.path.join(self.biosignals_path, filepath)
        physio_df = pd.read_csv(csv_path, sep='\t')
        gsr_data = physio_df['gsr']
        return gsr_data

    def _parse_annotationfile(self):
        self.data = []
        with open(self.annotationfile_path, 'r') as file:
            for line in file:
                path, label = line.strip().split()
                self.data.append({'path': path, 'label': int(label)})

    def __len__(self):
        return len(self.data)

In [25]:
import pandas as pd
import os

def load_sample_gsr_data(biosignals_path, sample_file):
    csv_path = os.path.join(biosignals_path, sample_file)
    physio_df = pd.read_csv(csv_path, sep='\t')
    gsr_data = physio_df['gsr']
    return gsr_data

# Replace these with actual paths
biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised/0'
sample_file = '071309_w_21-BL1-081_bio.csv'

gsr_data = load_sample_gsr_data(biosignals_path, sample_file)
print(gsr_data.shape)

(2816,)


In [21]:
print(f'GSR data shape: {gsr_data.shape}')
print(f'Number of samples (length of GSR signal): {len(gsr_data)}')

GSR data shape: (2816,)
Number of samples (length of GSR signal): 2816


Model
================


In [26]:
import torch.nn as nn

class Conv1D_model(nn.Module):
    def __init__(self, num_classes=2):
        super(Conv1D_model, self).__init__()
        
        # First Convolutional Layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=5, stride=2)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool1d(kernel_size=2)
        
        # Second Convolutional Layer
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool1d(kernel_size=2)
        
        # Fully Connected Layers
        self.fc1 = nn.Linear(64 * 349, 512)  # Update the input dimension based on your GSR data length
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor to 1D
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x


Validation
================


In [27]:
def validate_physio_gsr_only(physio_model, val_dataloader, criterion, device):
    # Validation phase
    physio_model.eval() 
    val_correct = 0
    val_total = 0
    val_vis_loss = 0.0
    val_physio_loss = 0.0

    with torch.no_grad():
        for val_data in tqdm(val_dataloader, total=len(val_dataloader), desc=f'Validation'):
            val_inputs, val_labels = val_data

            val_inputs = val_inputs.reshape(val_inputs.shape[0],1,val_inputs.shape[1])
            
            val_inputs = val_inputs.to(device, dtype=torch.float)
            val_labels = val_labels.to(device)
        


            val_physio_outputs = physio_model(val_inputs)
            # val_vis_outputs = vis_model(val_inputs)

            val_physio_loss += criterion(val_physio_outputs, val_labels)
            # val_vis_loss += criterion(val_vis_outputs, val_labels).item()

            # val_both_outputs = val_physio_outputs + val_vis_outputs

            _,val_predicted = torch.max(val_physio_outputs.data, 1)

            # _, val_both_predicted = torch.max(val_both_outputs.data, 1)
            
            val_total += val_labels.size(0)
            val_correct += (val_predicted == val_labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    avg_val_loss = ((val_physio_loss)) / len(val_dataloader)
    print(f'Validation accuracy: {val_accuracy}%')
    print(f'Validation loss: {avg_val_loss}')
    return val_accuracy, avg_val_loss

Training
================


In [28]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

def validate_physio_gsr_only(physio_model, val_dataloader, criterion, device):
    physio_model.eval() 
    val_correct = 0
    val_total = 0
    val_loss = 0.0

    with torch.no_grad():
        for val_data in tqdm(val_dataloader, total=len(val_dataloader), desc=f'Validation'):
            val_inputs, val_labels = val_data
            val_inputs = val_inputs.unsqueeze(1).to(device, dtype=torch.float)
            val_labels = val_labels.to(device)
            
            val_outputs = physio_model(val_inputs)
            loss = criterion(val_outputs, val_labels)
            val_loss += loss.item()

            _, val_predicted = torch.max(val_outputs.data, 1)
            val_total += val_labels.size(0)
            val_correct += (val_predicted == val_labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Validation accuracy: {val_accuracy}%')
    print(f'Validation loss: {avg_val_loss}')
    return val_accuracy, avg_val_loss

def train(train_annotation, test_annotation, weight_path):
    batch_size = 1024
    num_epochs = 200
    lr = 0.0001
    num_classes = 2
    check_every = 1
    best_val_acc = 0

    biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised'
    five_fold_annotations_path = '/projets2/AS84330/Datasets/Biovid/PartA/5folds_annotations2/'
    train_annotation_file = os.path.join(five_fold_annotations_path, train_annotation)
    val_annotation_file = os.path.join(five_fold_annotations_path, test_annotation)

    train_dataset = GSRDataset(train_annotation_file, biosignals_path)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = GSRDataset(val_annotation_file, biosignals_path)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    physio_model = Conv1D_model(num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    physio_optimizer = optim.SGD(physio_model.parameters(), lr=lr, momentum=0.9)

    for epoch in tqdm(range(num_epochs), desc='Epochs'):
        physio_model.train()
        running_loss = 0
        correct = 0
        total = 0
        
        for physio_batch, labels in train_dataloader:
            physio_optimizer.zero_grad()
            physio_batch = physio_batch.unsqueeze(1).to(device, dtype=torch.float)
            labels = labels.to(device)
            
            outputs = physio_model(physio_batch)
            loss = criterion(outputs, labels)
            loss.backward()
            physio_optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f"Accuracy after epoch {epoch + 1}: {100 * correct / total}%")

        if epoch % check_every == 0:
            val_acc, val_loss = validate_physio_gsr_only(physio_model, val_dataloader, criterion, device)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                model_save_path = f'{weight_path}{round(best_val_acc, 2)}.pth'
                torch.save(physio_model.state_dict(), model_save_path)
                print('Best model saved at epoch: ', epoch + 1)
                best_epoch = epoch + 1

    print("Finished Training")
    train_accuracy = 100 * correct / total
    avg_train_loss = running_loss / len(train_dataloader)
    print(f'Training accuracy: {train_accuracy}%')
    print(f'Training loss: {avg_train_loss}')
    print("Best model saved at epoch: ", best_epoch)
    print("Best validation accuracy: ", best_val_acc)
    
    return best_val_acc

def test(test_annotation, test_weights):
    batch_size = 1024
    num_classes = 2
    biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised'
    val_annotation_file = os.path.join(biosignals_path, '../../5folds_annotations', test_annotation)

    val_dataset = GSRDataset(val_annotation_file, biosignals_path)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    physio_model = Conv1D_model(num_classes).to(device)
    physio_model.load_state_dict(torch.load(test_weights))

    criterion = nn.CrossEntropyLoss()
    val_acc, _ = validate_physio_gsr_only(physio_model, val_dataloader, criterion, device)
    
    return val_acc

##### Train + Evaluate #####
if __name__ == '__main__':
    dir_name = '/home/ens/AU59350/LIVIA/resultsCNN/'
    os.makedirs(dir_name, exist_ok=True)
    kfold_accuracy = []
    for i in range(1, 6):
        train_annotation = f'train_fold{i}.txt'
        test_annotation = f'test_fold{i}.txt'
        weight_name = f'model_best_gsr_fold{i}_'
        weight_path = os.path.join(dir_name, weight_name)
        best_accuracy = train(train_annotation, test_annotation, weight_path)
        kfold_accuracy.append(round(best_accuracy, 1))

    with open(os.path.join(dir_name, 'accuracy.txt'), 'w') as f:
        for acc in kfold_accuracy:
            f.write(f'{acc}\n')
        f.write(f'Mean: {statistics.mean(kfold_accuracy)}\n')

##### Test#####
# if __name__ == '__main__':
#     kfold_accuracy = []
#     for i in range(1, 6):
#         test_annotation = f'test_fold{i}.txt'
#         weight_name = f'model_best_gsr_fold{i}.pth'
#         best_accuracy = test(test_annotation, weight_name)
#         kfold_accuracy.append(round(best_accuracy, 1))

#     print('Accuracy on fold 1:', kfold_accuracy[0])
#     print('Accuracy on fold 2:', kfold_accuracy[1])
#     print('Accuracy on fold 3:', kfold_accuracy[2])
#     print('Accuracy on fold 4:', kfold_accuracy[3])
#     print('Accuracy on fold 5:', kfold_accuracy[4])
#     print('Mean:', statistics.mean(kfold_accuracy))


ValueError: too many values to unpack (expected 2)

In [16]:
def train(train_annotation,test_annotation,weight_path):
    batch_size = 1024
    num_epochs = 200
    lr = 0.0001
    num_classes = 2
    check_every = 1
    best_val_acc = 0

    biosignals_path = '/projets2/AS84330/Datasets/Biovid/PartA/physio/physio_organised'
    five_fold_annotations_path = '/projets2/AS84330/Datasets/Biovid/PartA/5folds_annotations2/'
    train_annotation_file = os.path.join(five_fold_annotations_path, train_annotation)
    val_annotation_file = os.path.join(five_fold_annotations_path, test_annotation)


    train_dataset = GSRDataset(train_annotation_file, biosignals_path)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = GSRDataset(val_annotation_file, biosignals_path)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    physio_model = Conv1D_model(num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    physio_optimizer = optim.SGD(physio_model.parameters(), lr=lr, momentum=0.9)
    #scheduler = ReduceLROnPlateau(physio_optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    for epoch in tqdm(range(num_epochs), desc='Epochs'):
        physio_model.train()
        
        
        running_loss = 0
        correct = 0
        total = 0
        
        for i,(physio_batch, labels) in enumerate(train_dataloader):
            physio_optimizer.zero_grad()
            physio_batch = physio_batch.reshape(physio_batch.shape[0],1,physio_batch.shape[1])
            physio_batch = physio_batch.to(device, dtype=torch.float)
            labels = labels.to(device)
            
            physio_outputs = physio_model(physio_batch)
            
            physio_loss = criterion(physio_outputs, labels)
            
            physio_loss.backward()
            physio_optimizer.step()
            # print(physio_loss.data)
            
            running_loss += physio_loss.item()
            
            _, physio_predicted = torch.max(physio_outputs.data, 1)
            total += labels.size(0)
            # print('output: ', physio_outputs)
            # print('predicted: ', physio_predicted)
            # print('labels: ', labels)
            # print('**************************')
            correct += (physio_predicted == labels).sum().item()
            #print(physio_loss.item())

        print(f"Accuracy after epoch {epoch + 1}: {100 * correct / total}%")

        if epoch % check_every == 0:
                val_acc, val_loss = validate_physio_gsr_only(physio_model, val_dataloader, criterion, device)
                # scheduler.step(val_loss)
                # print( "Validation accuracy: ", val_acc)
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    remove_previous_files(weight_path)
                    model_save_path = f'{weight_path}{round(best_val_acc,2)}.pth'
                    torch.save(physio_model.state_dict(), model_save_path)
                    print('Best model saved at epoch: ', epoch+1)
                    best_epoch = epoch+1

    print("Finished Training")

    train_accuracy = 100 * correct / total
    avg_train_loss = running_loss / len(train_dataloader)
    print(f'Training accuracy: {train_accuracy}%')
    print(f'Training loss: {avg_train_loss}')

    print("Best model saved at epoch: ", best_epoch)
    print("Best validation accuracy: ", best_val_acc)
    
    return best_val_acc