# NameProj
The idea of this project is to implement and evaluate a multi-biometric system using modern Deep Learning techniques.


# Mount drive and load data

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Unzipping data

In [None]:
import os
import zipfile
from google.colab import drive

source_folder = '/content/drive/MyDrive/BiometricSystems/SpeakingFaces'
destination_folder = '/content/drive/MyDrive/BiometricSystems/SpeakingFacesExt'
files_to_delete = []

for filename in os.listdir(source_folder):
    if files_to_delete != []: print(f"Please delete these files from {destination_folder}\n{files_to_delete}")

    if filename.split(".")[0] in os.listdir(destination_folder):
      print( f"Unpacked {filename} already exists in the destination folder")
      files_to_delete.append(filename)
      continue

    if filename.endswith(".zip"):
        zip_filepath = os.path.join(source_folder, filename)
        try:
            with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                # Extract all contents to the destination folder
                zip_ref.extractall(destination_folder)
            print(f"Successfully extracted {filename} to {destination_folder}")

            # Delete the unzipped file from the source folder
            os.remove(zip_filepath)
            print(f"Deleted {filename} from {source_folder} (Also check manually)")

        except zipfile.BadZipFile:
            print(f"Error: {filename} is not a valid zip file.")
        except Exception as e:
            print(f"An error occurred while processing {filename}: {e}")

# Load Libraries 📚

In [6]:
#!pip install faiss-cpu
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9->speechbrain)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.9->speechbrain)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3

In [1]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torchvision import models
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchaudio

import os
import shutil
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import timm
import json
import numpy as np
from PIL import Image
import copy
import random
import itertools
import math

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(404)
np.random.seed(404)
random.seed(404)

# Data 🗃️


The dataset has a total of 142 subjects that we can use to finetune each model. The split will be virtually assigned for each model in order to add some noise.

In [187]:
data_dir= "./drive/MyDrive/BiometricSystems/SpeakingFacesExt"

IDENTITIES_NUMBERS = 141
train_dir, test_dir , val_dir = "train", "test", "val"
subjects_ids = [] #[str(i) for i in range(1,142)]

Checking the sanity of the dataset

In [188]:
for file in os.listdir(data_dir):
  if file not in ["train", "val", "test"]:

    index = file.split("_")[1]

    sub_dir = os.listdir(os.path.join(data_dir, file))
    if "trial_1" in sub_dir and "trial_2" in sub_dir:
      subj_id = index
      subjects_ids.append(int(subj_id))
    #print(file,index)
    #shutil.move(os.path.join(data_dir, file), os.path.join(data_dir, train_dir))


print(sorted(subjects_ids))
print(len(subjects_ids))


[1, 2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 43, 44, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141]
104


In [189]:
def generate_split(data_path, subjects_ids):
  shuffled_data = subjects_ids.copy()
  random.shuffle(shuffled_data)
  pivot1, pivot2, n = math.ceil(0.7*len(shuffled_data)), math.ceil(0.15*len(shuffled_data)), len(shuffled_data)
  #print(pivot1)
  return shuffled_data[:pivot1], shuffled_data[pivot1:n-pivot2+1], shuffled_data[n-pivot2+1:]

In [190]:
train,val,test = generate_split(data_dir, subjects_ids)
print(train)
print(val)
print(test)
print(len(train), len(val), len(test))

[43, 117, 139, 70, 53, 130, 107, 30, 39, 110, 113, 121, 111, 44, 127, 119, 35, 13, 51, 54, 115, 22, 2, 29, 11, 33, 59, 3, 36, 102, 116, 12, 99, 112, 52, 105, 123, 106, 141, 118, 103, 68, 124, 14, 23, 10, 101, 129, 15, 109, 1, 32, 20, 134, 58, 120, 126, 104, 73, 31, 21, 138, 61, 74, 27, 71, 40, 38, 56, 25, 140, 26, 75]
[49, 48, 4, 128, 122, 135, 69, 72, 24, 137, 37, 19, 136, 108, 76, 125]
[18, 60, 114, 47, 17, 9, 34, 132, 66, 131, 67, 57, 133, 100, 55]
73 16 15


Here's the overview of the designed architecture

# Custom Data Loader 🔃
*We need to define the class desiged to extract the training, validation and test data*

In [191]:
class CustomDataset(Dataset):
    def __init__(self, root, labels, data_limit, label_limit, transform=None):

        self.data_limit = data_limit
        self.label_limit = label_limit

        self.root = root
        self.labels = labels[:data_limit]
        self.rgb_images_names = {label: "" for label in self.labels}
        self.thrml_images_names = {label: "" for label in self.labels}
        self.audio_names = {label: "" for label in self.labels}

        self.valid_indices = list(self.rgb_images_names.keys())
        self.add_data()
        self.transform = transform

    def data_path_extractor(self, attribute, label, subpath, data_folder):
      limit_data = self.data_limit
      for data_element in os.listdir(os.path.join(self.root, subpath, data_folder)):
          if limit_data==0: break
          elif attribute == 0: self.rgb_images_names[label]= os.path.join(self.root, subpath, data_folder, data_element)
          elif attribute == 1: self.thrml_images_names[label]= os.path.join(self.root, subpath, data_folder, data_element)
          elif attribute == 2: self.audio_names[label]= os.path.join(self.root, subpath, data_folder, data_element)
          limit_data -= 1

    def add_data(self):
      limit_labels = self.label_limit
      for label in self.labels:
          if limit_labels==0: break

          trial = random.choice([1, 2])
          mic_number = random.choice([1, 2])
          subpath = os.path.join(f"sub_{label}_ia",f"trial_{trial}")

          self.data_path_extractor(0, label, subpath, "rgb_image_cmd_aligned")
          self.data_path_extractor(1, label, subpath, "thr_image_cmd")
          self.data_path_extractor(2, label, subpath, f"mic{mic_number}_audio_cmd_trim")
          '''
          for image_name in os.listdir(os.path.join(self.root, subpath, "rgb_image_cmd_aligned")):
              if not limit_images: break
              self.images_names[label]= os.path.join(self.root, subpath, image_name)
              limit_images -= 1
          '''
          limit_labels -= 1

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, id):

        idx = self.labels[id]
        '''
        print("ID: ", id)
        print("LABEL CHIESTA: ", idx, "\nLA LABEL E' PRESENTE? ", idx in self.labels, "\nLABELS: ", self.labels)
        print(self.rgb_images_names)
        print("RGB PATH: ", self.rgb_images_names[idx])
        print("THRM PATH: ", self.thrml_images_names[idx])
        print("AUDIO PATH: ", self.audio_names[idx])
        '''
        img = Image.open(self.rgb_images_names[idx])
        img_thr = Image.open(self.thrml_images_names[idx])
        audio_tensor, sample_rate = torchaudio.load(self.audio_names[idx])
        if self.transform:
            img = self.transform(img)
            img_thr = self.transform(img_thr)

        return img, img_thr, (audio_tensor, sample_rate), torch.Tensor([1 if idx==i else 0 for i in range(IDENTITIES_NUMBERS)])

In [193]:
train_labels, val_labels, test_labels = generate_split(data_dir, subjects_ids)

print(train_labels)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(), # 256 -> 0.0 ~ 1.0
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

[54, 43, 40, 73, 118, 126, 9, 135, 120, 117, 13, 27, 22, 32, 106, 113, 109, 133, 3, 102, 107, 19, 58, 104, 69, 31, 121, 139, 33, 57, 59, 15, 136, 123, 129, 137, 74, 47, 2, 35, 124, 70, 103, 105, 100, 127, 138, 141, 119, 132, 12, 51, 108, 76, 29, 14, 53, 21, 114, 61, 10, 130, 112, 26, 25, 110, 18, 48, 128, 72, 17, 39, 134]


In [194]:
DATA_LIMIT, LABEL_LIMIT = 1, 1

train_dataset = CustomDataset(data_dir, train_labels, DATA_LIMIT, LABEL_LIMIT, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = CustomDataset(data_dir, val_labels, DATA_LIMIT, LABEL_LIMIT, transform=transform)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)

In [195]:
for img, img_thr, audio, label in train_dataset:
  a,b,c,d = img, img_thr, audio, label
  print(a)
  print(b)
  print(c)
  print(d)
  print("=======================================")

tensor([[[ 0.8961,  0.8961,  0.8961,  ...,  0.8618,  0.8618,  0.8447],
         [ 0.8961,  0.8961,  0.8961,  ...,  0.8447,  0.8447,  0.8447],
         [ 0.8961,  0.8961,  0.9132,  ...,  0.8276,  0.8447,  0.8618],
         ...,
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

        [[ 1.0630,  1.0630,  1.0630,  ...,  1.0455,  1.0455,  1.0280],
         [ 1.0630,  1.0630,  1.0630,  ...,  1.0280,  1.0280,  1.0280],
         [ 1.0630,  1.0630,  1.0805,  ...,  1.0105,  1.0280,  1.0455],
         ...,
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],

        [[ 1.2980,  1.2980,  1.2980,  ...,  1.1934,  1.1934,  1.1759],
         [ 1.2980,  1.2980,  1.2980,  ...,  1

In [None]:
'''
def get_data(data_dir="/content/BiometricsSystems/data"):
  batch_size = 32

  transform_train_list = transforms.Compose([
      transforms.Resize((224,224), interpolation=3),
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor(),
      transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
      ])
  transform_val_list = transforms.Compose([
      transforms.Resize(size=(224,224),interpolation=3),
      transforms.ToTensor(),
      transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
      ])

  dataset_train = datasets.ImageFolder(os.path.join(data_dir, 'train'),transform=transform_train_list)
  dataset_val = datasets.ImageFolder(os.path.join(data_dir, 'val'),transform=transform_val_list)
  train_loader = DataLoader(dataset = dataset_train, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(dataset = dataset_val, batch_size=batch_size, shuffle=True)

  return train_loader, val_loader
  '''

# Architecture 🏛️

# Models ⚠️🚀
In this section there will be implemented the fundamental models of this project

## LA Transformer
This architecture will be used to extract features from RGB and thermal images. There will be a dedicated model for each type of image in order to have a specialized set of features, which will be merged with the audio features.

In [None]:
def weights_init_kaiming(m):
    classname = m.__class__.__name__
    # print(classname)
    if classname.find('Conv') != -1:
        init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
    elif classname.find('Linear') != -1:
        init.kaiming_normal_(m.weight.data, a=0, mode='fan_out')
        init.constant_(m.bias.data, 0.0)
    elif classname.find('BatchNorm1d') != -1:
        init.normal_(m.weight.data, 1.0, 0.02)
        init.constant_(m.bias.data, 0.0)

def weights_init_classifier(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        init.normal_(m.weight.data, std=0.001)
        init.constant_(m.bias.data, 0.0)

class ClassBlock(nn.Module):
    def __init__(self, input_dim, class_num, droprate, relu=False, bnorm=True, num_bottleneck=512, linear=True, return_f = False):
        super(ClassBlock, self).__init__()
        self.return_f = return_f
        add_block = []
        if linear:
            add_block += [nn.Linear(input_dim, num_bottleneck)]
        else:
            num_bottleneck = input_dim
        if bnorm:
            add_block += [nn.BatchNorm1d(num_bottleneck)]
        if relu:
            add_block += [nn.LeakyReLU(0.1)]
        if droprate>0:
            add_block += [nn.Dropout(p=droprate)]
        add_block = nn.Sequential(*add_block)
        add_block.apply(weights_init_kaiming)

        classifier = []
        classifier += [nn.Linear(num_bottleneck, class_num)]
        classifier = nn.Sequential(*classifier)
        classifier.apply(weights_init_classifier)

        self.add_block = add_block
        self.classifier = classifier
    def forward(self, x):
        x = self.add_block(x)
        if self.return_f:
            f = x
            x = self.classifier(x)
            return [x,f]
        else:
            x = self.classifier(x)
            return x

class LATransformer(nn.Module):
    def __init__(self, model, lmbd, print_verbose = False, test=False, pretraining=False):
        super(LATransformer, self).__init__()

        if print_verbose:
            self._print = print
        else:
            self._print = lambda *args, **kwargs: None
        self.class_num = 751
        self.part = 14 # We cut the pool5 to sqrt(N) parts
        self.num_blocks = 12
        self.model = model
        self.model.head.requires_grad_ = False
        self.cls_token = self.model.cls_token
        self.pos_embed = self.model.pos_embed
        self.avgpool = nn.AdaptiveAvgPool2d((self.part,768))
        self.dropout = nn.Dropout(p=0.5)
        self.lmbd = lmbd
        self.test = test
        self.pretraining = pretraining
        if not (self.test or self.pretraining):
          for i in range(self.part):
              name = 'classifier'+str(i)
              setattr(self, name, ClassBlock(768, self.class_num, droprate=0.5, relu=False, bnorm=True, num_bottleneck=256))

        if self.pretraining:
          self.fc = nn.Sequential(nn.Conv1d(14, 32, 3),
                                  nn.BatchNorm1d(32),
                                  nn.LeakyReLU(0.1),
                                  nn.Conv1d(32, 3, 3),
                                  nn.BatchNorm1d(3),
                                  nn.LeakyReLU(0.1),
                                  nn.Flatten(),
                                  nn.Linear(2292, 1024),
                                  nn.LeakyReLU(0.1),
                                  nn.Linear(1024,128))

          self.fc.apply(weights_init_kaiming)



    def forward(self,x):

        # Divide input image into patch embeddings and add position embeddings
        # cls token is a learnable parameter added to the start of sequence
        # It contains global info about the whole image
        # Used in classical Transformers like BERT and ViT to do classification with just itself
        # Here it is later combined to enrich local features with global features of the image
        self._print(f"x before pos embedding: {x.shape}")
        x = self.model.patch_embed(x)
        self._print(f"x after pos embedding: {x.shape}")
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)
        self._print(f"cls token: {cls_token.shape}")
        x = torch.cat((cls_token, x), dim=1)
        self._print(f"x with concatenation with cls token: {x.shape}")
        x = self.model.pos_drop(x + self.pos_embed)
        self._print(f"x after pos drop: {x.shape}")

        # Feed forward through transformer blocks
        for i in range(self.num_blocks):
            self._print(f"x before block {i}: {x.shape}")
            x = self.model.blocks[i](x)
        x = self.model.norm(x)
        self._print(f"x after blocks: {x.shape}")

        # extract the cls token
        cls_token_out = x[:, 0].unsqueeze(1)
        self._print(f"cls token out: {cls_token_out.shape}")

        # Average pool
        x = self.avgpool(x[:, 1:])
        self._print(f"x after avgpool: {x.shape}")

        if self.test:
          return x

        # Add global cls token to each local token
        for i in range(self.part):
            self._print(f"x before mul: {x.shape}")
            out = torch.mul(x[:, i, :], self.lmbd)
            x[:,i,:] = torch.div(torch.add(cls_token_out.squeeze(),out), 1+self.lmbd)

        if self.pretraining:
          x = x.reshape(x.size(0), 14, -1)
          x = self.fc(x)
          return x

        # Locally aware network
        part = {}
        predict = {}
        for i in range(self.part):
            part[i] = x[:,i,:]
            name = 'classifier'+str(i)
            c = getattr(self,name)
            predict[i] = c(part[i])
        return predict

##  Time Delay Neural Networks (TDNN)

This architecture is used in order to extract features from audio files containing human speakers. The choice is ECAPA, a pre-trained model that will be finetuned on the chosen dataset

In [202]:
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN

class CustomECAPA(ECAPA_TDNN):
    def __init__(self, num_classes=141):
        super().__init__(num_classes)
        self.output_layer = nn.Linear(192, num_classes)

    def forward(self, x):
        embeddings = super().forward(x)
        return self.output_layer(embeddings)

ecapa_model = CustomECAPA(num_classes=141)

## Multi Layer Perceptron (MLP)

The MLP will serve as a dimensionality reduction mechanism for the merged features of the 3 main models

In [None]:
class MLPReducer(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super(MLPReducer, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, output_dim)
        )

    def forward(self, x):
        return self.model(x)

Another MLP that will do the classification, so the ending layer in the bigger model

In [None]:
class MLPClassifier(nn.Module):
    def __init__(self, reduced_dim, num_classes, hidden_dim=64):
        super(MLPClassifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(reduced_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        return self.classifier(x)

# Freezing Layers ❄️
*In order to fine-tune the Visual Transformer we need to keep training the model on new data, without losing the knowledge gained from the pre-trained part. This implies freezing the initial layers and gradually unfreeze the next ones without completely overriding the gained knowledge. This requires lower epochs, dropout and a small learning rate*

In [None]:
def freeze_all_blocks(model):
    frozen_blocks = 12
    for block in model.model.blocks[:frozen_blocks]:
        for param in block.parameters():
            param.requires_grad=False

def unfreeze_blocks(model, amount= 1):
    for block in model.model.blocks[11-amount:]:
        for param in block.parameters():
            param.requires_grad=True
    return model

def get_training_objects(pretraining_path = None):
  backbone = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=751).to(device)
  lmbd = 8 #Weight in averaging with CLS token
  model = LATransformer(backbone, lmbd).to(device)

  if pretraining_path:
    model.load_state_dict(torch.load(pretraining_path), strict=False)

  freeze_all_blocks(model)

  criterion = nn.CrossEntropyLoss()

  optimizer = optim.AdamW(model.parameters(),weight_decay=5e-4, lr=3e-4)

  return model, criterion, optimizer

*We'll create the complete model that will make use of each module (which is a model) in order to classify the subjects correctly*

# Training ⏳⚙️
...

In [7]:
from speechbrain.pretrained import SpeakerRecognition

rgb_model = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=141).to(device)
thermal_model = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=141).to(device)
audio_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmp_ecapa")

  from speechbrain.pretrained import SpeakerRecognition
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


In [215]:
from speechbrain.lobes.features import Fbank


class FinalModel(nn.Module):
  def __init__(self, rgb_model, thermal_model, audio_model, mlp_reducer, classifier, num_classes=141):
    super(FinalModel, self).__init__()
    self.rgb_model = rgb_model
    self.thermal_model = thermal_model
    self.audio_model = audio_model
    self.mlp_reducer = mlp_reducer
    self.classifier = classifier

  def forward(self, rgb_input, thermal_input, audio_input):
    rgb_output = self.rgb_model(rgb_input)
    thermal_output = self.thermal_model(thermal_input)

    #fbank = Fbank(n_mels=80)
    #audio_features = fbank(audio_input)
    #audio_output = self.audio_model(audio_features.squeeze())

    combined = torch.cat((rgb_output, thermal_output), dim=1) #torch.cat((rgb_output, thermal_output, audio_output), dim=1)
    reduced = self.mlp_reducer(combined)
    return self.classifier(reduced)


## Training function for the final model

In [243]:
def train(num_epochs, model, train_loader, val_loader, optimizer, loss_fn, save_model_path, log_file):

    os.makedirs(save_model_path, exist_ok=True)
    log_data = []
    unfrozen_blocks = 0
    unfreeze_after = 2
    lr_decay = .8

    with tqdm(total=num_epochs, desc='Total Progress', unit='epoch') as epoch_pbar:
      for epoch in range(num_epochs):

          '''
          if epoch%unfreeze_after==0:
            unfrozen_blocks += 1
            model = unfreeze_blocks(model, unfrozen_blocks)
            optimizer.param_groups[0]['lr'] *= lr_decay
            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            print("Unfrozen Blocks: {}, Current lr: {}, Trainable Params: {}".format(unfrozen_blocks,
                                                                                optimizer.param_groups[0]['lr'],
                                                                                trainable_params))
          '''
          model.train()
          train_loss = 0.0
          train_accuracy = 0.0

          with tqdm(train_loader, unit="batch") as pbar:
              for rgb, thrm, audio_data, target in pbar:
                  audio,hz = audio_data
                  rgb, thrm, audio, target = rgb.to(device), thrm.to(device), audio.to(device), target.to(device)

                  optimizer.zero_grad()
                  output = model(rgb,thrm,audio)

                  score = sum(nn.Softmax(dim=0)(v) for v in output) #.values())   dim=1
                  _, preds = torch.max(score, 0)

                  loss = sum(loss_fn(v, target) for v in output.values()) #.values()
                  loss.backward()
                  optimizer.step()

                  train_loss += loss.item()
                  train_accuracy += (preds == target).float().mean().item()

                  current_loss = train_loss / (pbar.n + 1)
                  current_accuracy = train_accuracy / (pbar.n + 1)

                  pbar.set_description(f"Epoch {epoch+1}/{num_epochs}")
                  pbar.set_postfix(loss=current_loss, accuracy=current_accuracy)

          avg_train_loss = train_loss / len(train_loader)
          avg_train_accuracy = train_accuracy / len(train_loader)

          model.eval()
          val_loss = 0.0
          val_accuracy = 0.0

          with torch.no_grad():
              with tqdm(val_loader, unit="batch") as val_pbar:
                  for  rgb, thrm, audio_data, target in val_pbar:
                      rgb, thrm, audio, target = rgb.to(device), thrm.to(device), audio.to(device), target.to(device)

                      output = model(rgb,thrm,audio)

                      score = sum(nn.Softmax(dim=0)(v) for v in output)  #dim=1  .values()
                      _, preds = torch.max(score, 0) #1

                      loss = sum(loss_fn(v, target) for v in output) #values()

                      val_loss += loss.item()
                      val_accuracy += (preds == target).float().mean().item()

                      current_loss = val_loss / (val_pbar.n + 1)
                      current_accuracy = val_accuracy / (val_pbar.n + 1)

                      val_pbar.set_description(f"Epoch {epoch+1}/{num_epochs} [Val]")
                      val_pbar.set_postfix(loss=current_loss, accuracy=current_accuracy)

          avg_val_loss = val_loss / len(val_loader)
          avg_val_accuracy = val_accuracy / len(val_loader)

          model_save_path = os.path.join(save_model_path, f'model_epoch_{epoch+1}.pth')
          torch.save(model.state_dict(), model_save_path)

          epoch_log = {
              'epoch': epoch + 1,
              'training_loss': avg_train_loss,
              'training_accuracy': avg_train_accuracy,
              'val_loss': avg_val_loss,
              'val_accuracy': avg_val_accuracy
          }
          log_data.append(epoch_log)

          with open(log_file, 'w') as f:
              json.dump(log_data, f, indent=4)

          print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {avg_train_loss:.4f} - Validation Loss: {avg_val_loss:.4f} - Training Accuracy: {avg_train_accuracy:.4f} - Validation Accuracy: {avg_val_accuracy:.4f}")

          epoch_pbar.update(1)
          epoch_pbar.set_postfix(
                train_loss=avg_train_loss,
                train_accuracy=avg_train_accuracy,
                val_loss=avg_val_loss,
                val_accuracy=avg_val_accuracy
          )
          print("==================================================================================")


In [30]:
dummy_rgb = torch.randn(1, 3, 224, 224)
dummy_thermal = torch.randn(1, 3, 224, 224)
dummy_audio = torch.randn(1, 16000)

rgb_features = rgb_model(dummy_rgb)
print(f"RGB Model Output Shape: {rgb_features.shape}")  # (1, D1)

thermal_features = thermal_model(dummy_thermal)
print(f"Thermal Model Output Shape: {thermal_features.shape}")  # (1, D2)

audio_features = audio_model.encode_batch(dummy_audio)
print(f"Audio Model Output Shape: {audio_features.shape}")  # (1, 1, D3)

audio_features = audio_features.squeeze(1)
print(f"Flattened Audio Output Shape: {audio_features.shape}")  # (1, D3)

input_dim = rgb_features.shape[1] + thermal_features.shape[1] + audio_features.shape[1]
print(f"Total Feature Dimension (MLPReducer input): {input_dim}")

RGB Model Output Shape: torch.Size([1, 141])
Thermal Model Output Shape: torch.Size([1, 141])
Audio Model Output Shape: torch.Size([1, 1, 192])
Flattened Audio Output Shape: torch.Size([1, 192])
Total Feature Dimension (MLPReducer input): 474


In [244]:
def train_model(save_path = "/content/drive/MyDrive/BiometricSystems/", pretraining_path = None):
  #train_loader, val_loader = get_data()
  train_loader, val_loader = train_dataloader, val_dataloader
  dim_vit = 141
  dim_audio = 0

  #model, criterion, optimizer = get_training_objects(pretraining_path)
  model = FinalModel(rgb_model, thermal_model, audio_model, MLPReducer(input_dim=2*dim_vit+dim_audio, output_dim=128), MLPClassifier(reduced_dim=128,num_classes=141))
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
  train(30, model, train_loader, val_loader, optimizer, criterion, save_path, os.path.join(save_path, "training_results.json"))

In [245]:
train_model()

Total Progress:   0%|          | 0/30 [00:00<?, ?epoch/s]

  0%|          | 0/1 [00:00<?, ?batch/s]

RuntimeError: values expected sparse tensor layout but got Strided