In [1]:
!pip install gdown
!pip install sentence_transformers



In [2]:
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from time import time
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import os
import pandas as pd
import gc
import random
import pickle
import gdown
import json
import string
from zipfile import ZipFile
from torch.nn.functional import cosine_similarity
from tabulate import tabulate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import torch.optim.lr_scheduler as lr_scheduler
from copy import deepcopy



## Training

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
url = "https://drive.google.com/file/d/1PwoBj4Fv4qDIaEq75tydQ67XaUC36eAF/view?usp=drive_link"
output = "total_dataset_v5.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
url = 'https://drive.google.com/file/d/1kPpC2Hyy4H0adWbzbQpb6UbJZ2Au_APF/view?usp=sharing'
output = "lasp_concoat.pt"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
url = 'https://drive.google.com/file/d/15y2oFmuRX_OWvEDvmeyQMoHkAxJlx6AC/view?usp=sharing'
output = "lasp_gating.pt"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
with open('total_dataset_v5.pkl', 'rb') as f:
    total_dataset = pickle.load(f)

In [None]:
total_dataset['train'].keys()

In [4]:
url = "https://drive.google.com/file/d/1PwoBj4Fv4qDIaEq75tydQ67XaUC36eAF/view?usp=sharing"
output = "total_dataset_v11.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1PwoBj4Fv4qDIaEq75tydQ67XaUC36eAF
From (redirected): https://drive.google.com/uc?id=1PwoBj4Fv4qDIaEq75tydQ67XaUC36eAF&confirm=t&uuid=5aa2dbe2-6370-4455-bacb-2dbf5ebec878
To: /kaggle/working/total_dataset_v11.pkl
100%|██████████| 2.18G/2.18G [00:27<00:00, 78.7MB/s]


'total_dataset_v11.pkl'

In [4]:
with open('total_dataset_v11.pkl', 'rb') as f:
    total_dataset = pickle.load(f)

In [5]:
total_dataset['train'].keys()

dict_keys(['audio', 'text', 'image', 'xlmr-emb', 'hubert-emb', 'pure-text', 'id', 'source', 'audio_path'])

In [14]:
class HubertLabseConcat(nn.Module):
    def __init__(self, in_features_text, in_features_image):
        super(HubertLabseConcat, self).__init__()
        self.image_seq = nn.Sequential(
            nn.Linear(in_features_image, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        self.audio_seq = nn.Sequential(
            nn.Linear(in_features_text, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        
        self.mix_seq = nn.Sequential(
            nn.Linear(2 * 768, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(1024, 800),
            nn.LeakyReLU(),
            nn.Linear(800, 768),
        )
    
    def forward(self, x_audio, x_image):
        x1 = self.audio_seq(x_audio)
        x2 = self.image_seq(x_image)
        concats = torch.cat((x1, x2), dim=1)
        x = self.mix_seq(concats)
        return x

In [6]:
class HubertLabseConcat(nn.Module):
    def __init__(self, in_features_text, in_features_image, mode = 'joint'):
        super(HubertLabseConcat, self).__init__()
        self.mode = mode
        self.image_seq = nn.Sequential(
            nn.Linear(in_features_image, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        self.audio_seq = nn.Sequential(
            nn.Linear(in_features_text, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        
        self.mix_seq = nn.Sequential(
            nn.Linear(2 * 768, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(1024, 800),
            nn.LeakyReLU(),
            nn.Linear(800, 768),
        )
    
    def forward(self, x_audio, x_image):
        x1 = self.audio_seq(x_audio)
        if self.mode == 'audio':
            return x1
        x2 = self.image_seq(x_image)
        if self.mode == 'image':
            return x2
        concats = torch.cat((x1, x2), dim=1)
        x = self.mix_seq(concats)
        return x

In [7]:
class Wav2vecConcat(nn.Module):
    def __init__(self, in_features_text, in_features_image):
        super(Wav2vecConcat, self).__init__()
        self.image_seq = nn.Sequential(
            nn.Linear(in_features_image, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        self.audio_seq = nn.Sequential(
            nn.Linear(in_features_text, 700),
            nn.BatchNorm1d(700),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(700, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        
        self.mix_seq = nn.Sequential(
            nn.Linear(2 * in_features_text, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(1024, 800),
            nn.LeakyReLU(),
            nn.Linear(800, in_features_text),
        )
    
    def forward(self, x_audio, x_image):
        x1 = self.audio_seq(x_audio)
        x2 = self.image_seq(x_image)
        concats = torch.cat((x1, x2), dim=1)
        x = self.mix_seq(concats)
        return x

In [8]:
class HubertLabseGating(nn.Module):
    def __init__(self, in_features_text, in_features_image):
        super(HubertLabseGating, self).__init__()
        self.image_seq = nn.Sequential(
            nn.Linear(in_features_image, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        self.text_seq = nn.Sequential(
            nn.Linear(in_features_text, 768),
            nn.BatchNorm1d(768),
            nn.ReLU(),
            nn.Dropout(p=0.15),
            nn.Linear(768, 576),
            nn.BatchNorm1d(576),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(576, 768),
        )
        
        # Input gate
        self.input_gate_text = nn.Sequential(
            nn.Linear(768, 768),
            nn.Sigmoid()
        )
        self.input_gate_image = nn.Sequential(
            nn.Linear(768, 768),
            nn.Sigmoid()
        )
        
        # Forget gate
        self.forget_gate_text = nn.Sequential(
            nn.Linear(768, 768),
            nn.Sigmoid()
        )
        self.forget_gate_image = nn.Sequential(
            nn.Linear(768, 768),
            nn.Sigmoid()
        )
        
        # Tanh function for new cell state computation
        self.tanh_text = nn.Sequential(
            nn.Linear(768, 768),
            nn.Tanh()
        )
        self.tanh_image = nn.Sequential(
            nn.Linear(768, 768),
            nn.Tanh()
        )
        
        # Weighting mechanism for final embedding computation
        self.weight = nn.Sequential(
            nn.Linear(768 * 2, 768 * 2)
        )

        
    def forward(self, x_text, x_image):
        x1 = self.text_seq(x_text)
        x2 = self.image_seq(x_image)
        
        # Apply input gate
        input_gate_text = self.input_gate_text(x1)
        input_gate_image = self.input_gate_image(x2)
        
        # Apply forget gate
        forget_gate_text = self.forget_gate_text(x1)
        forget_gate_image = self.forget_gate_image(x2)
        
        # Compute new cell state using tanh function
        new_cell_state_text = self.tanh_text(x1)
        new_cell_state_image = self.tanh_image(x2)
        
        # Update cell state using input and forget gates
        x1 = input_gate_text * new_cell_state_text + forget_gate_text * x1
        x2 = input_gate_image * new_cell_state_image + forget_gate_image * x2
        
        # Compute weighted average of updated cell states
        weight = torch.softmax(self.weight(torch.cat((x1, x2), dim=-1)), dim=-1)
        x = weight[:, :768] * x1 + weight[:, 768:] * x2
        
        return x


In [7]:
def train_the_model(model, train_dataloader, val_dataloader, model_path_save, num_epochs=100, learning_rate=5e-6, delta=0.6, temperature=np.log(0.07)):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    def eval_epoch(model: nn.Module, dataloader: torch.utils.data.DataLoader, test_mode=False):
        eval_loss = 0
        model.eval()

        with torch.no_grad(), tqdm(enumerate(dataloader), total=len(dataloader)) as pbar:
            for i, (text_emb, audio_emb, image_emb) in pbar:
                text_emb = text_emb.to(device)
                audio_emb = audio_emb.to(device)
                image_emb = image_emb.to(device)

                final_emb = model(audio_emb, image_emb)

                # L2 normalize the embeddings
                final_emb = l2_normalize(final_emb)
                text_emb = l2_normalize(text_emb)

                # Compute similarity matrix
                sim_matrix = torch.matmul(final_emb, text_emb.t())
                
                # Scale similarity matrix by temperature
                temperature_tensor = torch.tensor(temperature).to(device)
                sim_matrix *= torch.exp(temperature_tensor)

                # Compute contrastive loss
                labels = torch.arange(sim_matrix.size(0)).to(device)
                loss = (nn.CrossEntropyLoss()(sim_matrix, labels) + nn.CrossEntropyLoss()(sim_matrix.t(), labels)) / 2

                eval_loss += loss.item()

                discription = 'Validation' if not test_mode else 'Test'
                pbar.set_description(f'{discription} Loss: {loss.item():.4f}')
        return eval_loss

    def l2_normalize(x, dim=-1):
        return x / x.norm(2, dim=dim, keepdim=True)
    
    def train_epoch(model: nn.Module, optimizer: torch.optim.Optimizer, dataloader: torch.utils.data.DataLoader, temperature):
        train_loss = 0
        model.train()
        with tqdm(enumerate(train_loader), total=len(train_loader)) as pbar:
            for i, (text_emb, audio_emb, image_emb) in pbar:
                
                text_emb = text_emb.to(device)
                audio_emb = audio_emb.to(device)
                image_emb = image_emb.to(device)

                final_emb = model(audio_emb, image_emb)

                # L2 normalize the embeddings
                final_emb = l2_normalize(final_emb)
                text_emb = l2_normalize(text_emb)

                # Compute similarity matrix
                sim_matrix = torch.matmul(final_emb, text_emb.t())
                
                # Scale similarity matrix by temperature
                temperature_tensor = torch.tensor(temperature).to(device)
                sim_matrix *= torch.exp(temperature_tensor)

                # Compute contrastive loss
                labels = torch.arange(sim_matrix.size(0)).to(device)
                loss = (nn.CrossEntropyLoss()(sim_matrix, labels) + nn.CrossEntropyLoss()(sim_matrix.t(), labels)) / 2

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

                pbar.set_description(f'Train Loss: {loss.item():.4f}')

        return train_loss
    

    def train(model: nn.Module, optimizer: torch.optim.Optimizer, train_dataloader: torch.utils.data.DataLoader, val_dataloader: torch.utils.data.DataLoader, model_path_save, epochs: int, temperature, patience=10):
        train_losses = []
        val_losses = []
        best_val_loss = float('inf')
        counter = 0
        epoch_num = num_epochs
        best_model = None

        # Define the learning rate scheduler
        scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.4)

        for epoch in range(epochs):
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            start_time = time()

            train_loss = train_epoch(model, optimizer, train_dataloader, temperature)
            val_loss = eval_epoch(model, val_dataloader, temperature)
            train_losses.append(train_loss)
            val_losses.append(val_loss)

            end_time = time()

            print(f'Epoch {epoch + 1} finished in {end_time - start_time:.2f}s')
            print(f"[Epoch {epoch + 1}]\t"
                f"Train Loss: {train_loss:.6f}\t"
                f"Validation Loss: {val_loss:.6f}")

            # Check if the validation loss has improved
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = deepcopy(model)
                counter = 0
            else:
                counter += 1

            # If the validation loss didn't improve for 'patience' epochs, stop the training
            if counter >= patience:
                print(f'Early stopping after {patience} epochs without improvement in validation loss.')
                with open('stopped_epoch.txt', 'w') as f:
                    print(f'Stopped at epoch: {epoch + 1}')
                    f.write(f'Stopped at epoch: {epoch + 1}\n')
                    torch.save(best_model, model_path_save)
                    epoch_num = epoch + 1
                break

            # Step the learning rate scheduler
            scheduler.step()

        return train_losses, val_losses, epoch_num, best_model

    
    def plot_loss(loss, num_epochs, label):
        ls_epoch = [_ + 1 for _ in range(epoch_num)]
        plt.plot(ls_epoch, loss, color='r', label=label)
        plt.title('Loss plot')
        plt.ylabel('Loss')
        plt.xlabel('epoch')
        plt.legend()
        plt.show()
    
    
    train_losses, val_losses, epoch_num, best_model = train(model, optimizer, train_dataloader, val_dataloader, model_path_save, num_epochs, temperature)
    plot_loss(train_losses, epoch_num, 'train')
    plot_loss(val_losses, epoch_num, 'validation')
    return best_model

In [23]:
with open('stopped_epoch.txt', 'r') as f:
    print(f.read())

Stopped at epoch: 107



In [6]:
class CusDataset(Dataset):
    def __init__(self, dataset, audio_name, text_name):
        self.dataset = dataset
        self.text_name = text_name
        self.audio_name = audio_name
        
    def __len__(self):
        return len(self.dataset[self.audio_name])

    def __getitem__(self, i):
        return self.dataset[self.text_name][i], self.dataset[self.audio_name][i], self.dataset['image'][i]

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = HubertLabseConcat(1024, 1000).to(device)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=400, temperature=float(np.log(0.07)))
torch.save(hubert_labse_model, 'es_models/hubert_labse_concat_es.pt')

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = torch.load('hubert_labse_model_v3.pt')
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, 'hubert_labse_model_es.pt', num_epochs=400, temperature=float(np.log(0.07)), learning_rate=1e-7)
torch.save(hubert_labse_model, 'hubert_labse_model_es.pt')

Train Loss: 3.3567: 100%|██████████| 2853/2853 [00:19<00:00, 147.77it/s]
Test Loss: 1.3637: 100%|██████████| 714/714 [00:02<00:00, 345.37it/s]


Epoch 1 finished in 21.38s
[Epoch 1]	Train Loss: 9762.914602	Validation Loss: 1950.041477


Train Loss: 3.3562: 100%|██████████| 2853/2853 [00:19<00:00, 146.32it/s]
Test Loss: 1.3637: 100%|██████████| 714/714 [00:02<00:00, 353.16it/s]


Epoch 2 finished in 21.53s
[Epoch 2]	Train Loss: 9762.780412	Validation Loss: 1949.998144


Train Loss: 3.3578: 100%|██████████| 2853/2853 [00:19<00:00, 146.17it/s]
Test Loss: 1.3634: 100%|██████████| 714/714 [00:02<00:00, 352.37it/s]


Epoch 3 finished in 21.55s
[Epoch 3]	Train Loss: 9762.796779	Validation Loss: 1949.952221


Train Loss: 3.3569: 100%|██████████| 2853/2853 [00:19<00:00, 145.98it/s]
Test Loss: 1.3635: 100%|██████████| 714/714 [00:02<00:00, 350.58it/s]


Epoch 4 finished in 21.59s
[Epoch 4]	Train Loss: 9762.779109	Validation Loss: 1949.970239


Train Loss: 3.4219:  44%|████▎     | 1246/2853 [00:08<00:10, 147.76it/s]

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = torch.load('es_models/hubert_labse_concat_es.pt')
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, 'es_models/hubert_labse_concat_es_v2.pt', num_epochs=400, temperature=float(np.log(0.07)), learning_rate=1e-7)
torch.save(hubert_labse_model, 'es_models/hubert_labse_concat_es_v2.pt')

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = HubertLabseConcat(1024, 1000, mode='image').to(device)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=400, temperature=float(np.log(0.07)))
torch.save(hubert_labse_model, 'es_models/image_es.pt')

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = HubertLabseConcat(1024, 1000, mode='audio').to(device)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=400, temperature=float(np.log(0.07)))
torch.save(hubert_labse_model, 'es_models/audio_es.pt')

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'audio', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'audio', 'text'), batch_size=16, shuffle=False)
wave2vec_labse_concat = Wav2vecConcat(768, 1000).to(device)
wave2vec_labse_concat = train_the_model(wave2vec_labse_concat, train_loader, val_loader, num_epochs=50, temperature=float(np.log(0.07)))
torch.save(wave2vec_labse_concat, 'wave2vec_labse_concat.pt')

In [None]:
wave2vec_labse_concat = torch.load('wave2vec_labse_concat.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'audio', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'audio', 'text'), batch_size=16, shuffle=False)
wave2vec_labse_concat = train_the_model(wave2vec_labse_concat, train_loader, val_loader, num_epochs=100)
torch.save(wave2vec_labse_concat, 'wave2vec_labse_concat_v2.pt')

In [None]:
wave2vec_labse_concat = torch.load('wave2vec_labse_concat_v2.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'audio', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'audio', 'text'), batch_size=16, shuffle=False)
wave2vec_labse_concat = train_the_model(wave2vec_labse_concat, train_loader, val_loader, num_epochs=100, learning_rate=1e-6)
torch.save(wave2vec_labse_concat, 'wave2vec_labse_concat_v3.pt')

In [None]:
wave2vec_labse_concat = torch.load('wave2vec_labse_concat_v3.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'audio', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'audio', 'text'), batch_size=16, shuffle=False)
wave2vec_labse_concat = train_the_model(wave2vec_labse_concat, train_loader, val_loader, num_epochs=50, learning_rate=5e-5)
torch.save(wave2vec_labse_concat, 'wave2vec_labse_concat_v4.pt')

In [None]:
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = HubertLabseConcat(1024, 1000).to(device)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=100, temperature=float(np.log(0.07)))
torch.save(hubert_labse_model, 'hubert_labse_model.pt')

In [None]:
hubert_labse_model = torch.load('hubert_labse_model.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=100)
torch.save(hubert_labse_model, 'hubert_labse_model_v2.pt')

In [None]:
hubert_labse_model = torch.load('hubert_labse_model_v2.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=50)
torch.save(hubert_labse_model, 'hubert_labse_model_v3.pt')

In [None]:
hubert_labse_gating = HubertLabseGating(1024, 1000).to(device)
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_gating = train_the_model(hubert_labse_gating, train_loader, val_loader, num_epochs=150)
torch.save(hubert_labse_gating, 'hubert_labse_gating.pt')

In [None]:
hubert_labse_gating = torch.load('hubert_labse_gating.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_gating = train_the_model(hubert_labse_gating, train_loader, val_loader, num_epochs=100, learning_rate=1e-6)
torch.save(hubert_labse_gating, 'hubert_labse_gating_v2.pt')

In [None]:
hubert_labse_gating = torch.load('hubert_labse_gating_v2.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_gating = train_the_model(hubert_labse_gating, train_loader, val_loader, num_epochs=50, learning_rate=5e-7)
torch.save(hubert_labse_gating, 'hubert_labse_gating_v3.pt')

In [None]:
hubert_labse_model = torch.load('hubert_labse_model_v3.pt')
train_loader = DataLoader(dataset=CusDataset(total_dataset['train'], 'hubert-emb', 'text'), batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=CusDataset(total_dataset['validation'], 'hubert-emb', 'text'), batch_size=16, shuffle=False)
hubert_labse_model = train_the_model(hubert_labse_model, train_loader, val_loader, num_epochs=50, learning_rate=2e-6)
torch.save(hubert_labse_model, 'hubert_labse_model_v4.pt')

# Soft Evaluation

In [11]:
test_len_data = len(total_dataset['test']['text'])

number_of_candidates_per_sample = 100
test_metadata = []

for index in range(test_len_data):
    candidate_indexes = random.sample([i for i in range(test_len_data) if i != index], number_of_candidates_per_sample - 1)
    candidate_indexes += [index]
    test_metadata.append(candidate_indexes)
len(test_metadata)

11411

In [37]:
class TestDataset(Dataset):
    def __init__(self, test_dataset, metadata, audio_name, text_name):
        self.data = test_dataset
        self.metadata = metadata
        self.audio_name = audio_name
        self.text_name = text_name

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        candidate_indexes = self.metadata[index]
        text_embedding = self.data[self.text_name][index]
        audio_embeddings = [self.data[self.audio_name][i] for i in candidate_indexes]
        image_embeddings = [self.data['image'][i] for i in candidate_indexes]
        label_index = len(candidate_indexes) - 1
        audio_embeddings = torch.stack(audio_embeddings)
        image_embeddings = torch.stack(image_embeddings)

        return text_embedding, audio_embeddings, image_embeddings, label_index

In [38]:
def evaluate(model_path, threshold=0.5):
    model = torch.load(model_path)
    model.eval()
    model = model.to(device)

    def compute_cosine_similarity(embedding1: torch.Tensor, embedding2: torch.Tensor) -> float:
        similarity = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0)).item()
        return similarity

    def cosine_similarity(embedding1, embedding2):
        dim = 1
        embedding1 = F.normalize(embedding1, p=2, dim=dim)
        embedding2 = F.normalize(embedding2, p=2, dim=dim)

        dot_product = torch.sum(embedding1 * embedding2, dim=dim)

        magnitude1 = torch.norm(embedding1, p=2, dim=dim)
        magnitude2 = torch.norm(embedding2, p=2, dim=dim)

        cosine_sim = dot_product / (magnitude1 * magnitude2)

        return cosine_sim
    
    def _evaluate(model, dataloader, threshold=0.5):
        total_hits_1 = 0
        total_mrr = 0
        total_instances = 0
        total_labels = []
        total_predictions = []
        number_of_golden_predictions = 0

        with torch.no_grad():
            for text_embedding, audio_candidates, image_candidates, label in tqdm(dataloader):
                label = label[0]
                text_embedding = text_embedding[0].to(device)
                label = label.to(device)

                audio_candidates = audio_candidates[0]
                audio_candidates = audio_candidates.to(device)
                
                image_candidates = image_candidates[0]
                image_candidates = image_candidates.to(device)
                final_embs = model(audio_candidates, image_candidates)
                
                text_candidate_cosine_similarities = [compute_cosine_similarity(text_embedding, item) for item in final_embs]
                predicted_idx = np.argmax(text_candidate_cosine_similarities)
                
                label_similarity = text_candidate_cosine_similarities[label.item()]

                # Compute Hits@1
                if predicted_idx == label.item():   
                    total_hits_1 += 1

                # Compute MRR
                label_rank = sum([1 for x in text_candidate_cosine_similarities if x > text_candidate_cosine_similarities[label.item()]])
                reciprocal_rank = 1 / (label_rank + 1)
                total_mrr += reciprocal_rank

                # Record predictions and labels
                predictions = [0 if sim < threshold else 1 for sim in text_candidate_cosine_similarities]
                total_labels.extend([0 if i != label.item() else 1 for i in range(len(text_candidate_cosine_similarities))])
                total_predictions.extend(predictions)
                if label_similarity >= threshold:
                    number_of_golden_predictions += 1

                total_instances += 1

        # Compute average metrics over all instances
        avg_hits_1 = total_hits_1 / total_instances
        avg_mrr = total_mrr / total_instances
        precision = precision_score(total_labels, total_predictions, average='macro')
        recall = recall_score(total_labels, total_predictions, average='macro')
        f1 = f1_score(total_labels, total_predictions, average='macro')
        precision_micro = precision_score(total_labels, total_predictions, average='micro')
        recall_micro = recall_score(total_labels, total_predictions, average='micro')
        f1_micro = f1_score(total_labels, total_predictions, average='micro')
        accuracy = accuracy_score(total_labels, total_predictions)
        golden_prediction_accuracy = number_of_golden_predictions / total_instances

        return {
            'Hits@1': avg_hits_1,
            'MRR': avg_mrr,
            'Macro Precision': precision,
            'Macro Recall': recall,
            'Macro F1': f1,
            'Micro Precision': precision_micro,
            'Micro Recall': recall_micro,
            'Micro F1': f1_micro,
            'Accuracy': accuracy,
            'Golden Accuracy': golden_prediction_accuracy,
        }
    
    results = _evaluate(model, test_final_loader, threshold=threshold)
    table = []
    for i in range(len(results)):
        table.append([list(results.keys())[i], list(results.values())[i]])
    print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))


In [39]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('es_models/hubert_labse_concat_es_v2.pt', threshold=0.4)

100%|██████████| 11411/11411 [04:22<00:00, 43.43it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.990185 |
+-----------------+----------+
| MRR             | 0.99355  |
+-----------------+----------+
| Macro Precision | 0.959362 |
+-----------------+----------+
| Macro Recall    | 0.979199 |
+-----------------+----------+
| Macro F1        | 0.969066 |
+-----------------+----------+
| Micro Precision | 0.998749 |
+-----------------+----------+
| Micro Recall    | 0.998749 |
+-----------------+----------+
| Micro F1        | 0.998749 |
+-----------------+----------+
| Accuracy        | 0.998749 |
+-----------------+----------+
| Golden Accuracy | 0.95925  |
+-----------------+----------+


In [14]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('es_models/hubert_labse_concat_es.pt', threshold=0.4)

100%|██████████| 11411/11411 [04:34<00:00, 41.52it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.990798 |
+-----------------+----------+
| MRR             | 0.993901 |
+-----------------+----------+
| Macro Precision | 0.957193 |
+-----------------+----------+
| Macro Recall    | 0.979393 |
+-----------------+----------+
| Macro F1        | 0.968024 |
+-----------------+----------+
| Micro Precision | 0.998703 |
+-----------------+----------+
| Micro Recall    | 0.998703 |
+-----------------+----------+
| Micro F1        | 0.998703 |
+-----------------+----------+
| Accuracy        | 0.998703 |
+-----------------+----------+
| Golden Accuracy | 0.959688 |
+-----------------+----------+


In [None]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'audio', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('wave2vec_labse_concat.pt', threshold=0.4)

In [None]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'audio', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('wave2vec_labse_concat_v2.pt', threshold=0.4)

In [None]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'audio', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('wave2vec_labse_concat_v3.pt', threshold=0.4)

In [None]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'audio', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('wave2vec_labse_concat_v4.pt', threshold=0.4)

In [14]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_model.pt', threshold=0.4)

100%|██████████| 11411/11411 [04:36<00:00, 41.31it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.989571 |
+-----------------+----------+
| MRR             | 0.993373 |
+-----------------+----------+
| Macro Precision | 0.958441 |
+-----------------+----------+
| Macro Recall    | 0.975774 |
+-----------------+----------+
| Macro F1        | 0.966943 |
+-----------------+----------+
| Micro Precision | 0.998666 |
+-----------------+----------+
| Micro Recall    | 0.998666 |
+-----------------+----------+
| Micro F1        | 0.998666 |
+-----------------+----------+
| Accuracy        | 0.998666 |
+-----------------+----------+
| Golden Accuracy | 0.952414 |
+-----------------+----------+


In [15]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_model_v2.pt', threshold=0.5)

100%|██████████| 11411/11411 [04:41<00:00, 40.51it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.990535 |
+-----------------+----------+
| MRR             | 0.993866 |
+-----------------+----------+
| Macro Precision | 0.986895 |
+-----------------+----------+
| Macro Recall    | 0.916422 |
+-----------------+----------+
| Macro F1        | 0.948855 |
+-----------------+----------+
| Micro Precision | 0.998121 |
+-----------------+----------+
| Micro Recall    | 0.998121 |
+-----------------+----------+
| Micro F1        | 0.998121 |
+-----------------+----------+
| Accuracy        | 0.998121 |
+-----------------+----------+
| Golden Accuracy | 0.833056 |
+-----------------+----------+


In [None]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_model_v3.pt', threshold=0.4)

In [None]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_model_v4.pt', threshold=0.4)

In [17]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('lasp_concoat.pt', threshold=0.5)

100%|██████████| 11411/11411 [04:34<00:00, 41.52it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.993603 |
+-----------------+----------+
| MRR             | 0.996226 |
+-----------------+----------+
| Macro Precision | 0.904683 |
+-----------------+----------+
| Macro Recall    | 0.983863 |
+-----------------+----------+
| Macro F1        | 0.94066  |
+-----------------+----------+
| Micro Precision | 0.99742  |
+-----------------+----------+
| Micro Recall    | 0.99742  |
+-----------------+----------+
| Micro F1        | 0.99742  |
+-----------------+----------+
| Accuracy        | 0.99742  |
+-----------------+----------+
| Golden Accuracy | 0.970029 |
+-----------------+----------+


In [18]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('lasp_gating.pt', threshold=0.5)

100%|██████████| 11411/11411 [04:35<00:00, 41.40it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.992463 |
+-----------------+----------+
| MRR             | 0.995251 |
+-----------------+----------+
| Macro Precision | 0.904098 |
+-----------------+----------+
| Macro Recall    | 0.586168 |
+-----------------+----------+
| Macro F1        | 0.640389 |
+-----------------+----------+
| Micro Precision | 0.991339 |
+-----------------+----------+
| Micro Recall    | 0.991339 |
+-----------------+----------+
| Micro F1        | 0.991339 |
+-----------------+----------+
| Accuracy        | 0.991339 |
+-----------------+----------+
| Golden Accuracy | 0.172728 |
+-----------------+----------+


In [34]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_gating.pt', threshold=0.4)

100%|██████████| 11411/11411 [04:44<00:00, 40.15it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.991149 |
+-----------------+----------+
| MRR             | 0.994257 |
+-----------------+----------+
| Macro Precision | 0.963398 |
+-----------------+----------+
| Macro Recall    | 0.981521 |
+-----------------+----------+
| Macro F1        | 0.972282 |
+-----------------+----------+
| Micro Precision | 0.998881 |
+-----------------+----------+
| Micro Recall    | 0.998881 |
+-----------------+----------+
| Micro F1        | 0.998881 |
+-----------------+----------+
| Accuracy        | 0.998881 |
+-----------------+----------+
| Golden Accuracy | 0.963807 |
+-----------------+----------+


In [35]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_gating_v2.pt', threshold=0.4)

100%|██████████| 11411/11411 [04:41<00:00, 40.58it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.991237 |
+-----------------+----------+
| MRR             | 0.994324 |
+-----------------+----------+
| Macro Precision | 0.962368 |
+-----------------+----------+
| Macro Recall    | 0.982254 |
+-----------------+----------+
| Macro F1        | 0.972097 |
+-----------------+----------+
| Micro Precision | 0.998871 |
+-----------------+----------+
| Micro Recall    | 0.998871 |
+-----------------+----------+
| Micro F1        | 0.998871 |
+-----------------+----------+
| Accuracy        | 0.998871 |
+-----------------+----------+
| Golden Accuracy | 0.965297 |
+-----------------+----------+


In [36]:
test_dataset = TestDataset(total_dataset['test'], test_metadata, 'hubert-emb', 'text')
test_final_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
evaluate('hubert_labse_gating_v3.pt', threshold=0.4)

100%|██████████| 11411/11411 [04:43<00:00, 40.26it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.991412 |
+-----------------+----------+
| MRR             | 0.994425 |
+-----------------+----------+
| Macro Precision | 0.963132 |
+-----------------+----------+
| Macro Recall    | 0.981606 |
+-----------------+----------+
| Macro F1        | 0.972184 |
+-----------------+----------+
| Micro Precision | 0.998877 |
+-----------------+----------+
| Micro Recall    | 0.998877 |
+-----------------+----------+
| Micro F1        | 0.998877 |
+-----------------+----------+
| Accuracy        | 0.998877 |
+-----------------+----------+
| Golden Accuracy | 0.963982 |
+-----------------+----------+


# Hard Evaluation

In [21]:
model_to_test = torch.load('hubert_labse_model_v3.pt')
total_dataset['test']['clasp_emb'] = []

test_dataloader = DataLoader(dataset=CusDataset(total_dataset['test'], 'hubert-emb', 'text'), batch_size=32, shuffle=False)

with torch.no_grad():
    for text_embedding, audio_candidates, image_candidates in tqdm(test_dataloader):
        text_embedding = text_embedding.to(device)
        audio_candidates = audio_candidates.to(device)
        image_candidates = image_candidates.to(device)
        final_emb = model_to_test(audio_candidates, image_candidates)
        total_dataset['test']['clasp_emb'].extend(final_emb)

100%|██████████| 357/357 [00:00<00:00, 678.36it/s]


In [27]:
for i in range(len(total_dataset['test']['clasp_emb'])):
    total_dataset['test']['clasp_emb'][i] = total_dataset['test']['clasp_emb'][i].cpu()

In [28]:
len(total_dataset['test']['clasp_emb'])

11411

In [29]:
total_dataset['test']['clasp_emb'][0].device

device(type='cpu')

In [30]:
total_dataset['test']['text'][0].device

device(type='cpu')

In [31]:
similarity_dataset = {}
similarity_dataset['clasp_emb'] = total_dataset['test']['clasp_emb']
similarity_dataset['text'] = total_dataset['test']['text']

with open('similarity_dataset.pkl', 'wb') as f:
    pickle.dump(similarity_dataset, f)

In [9]:
def cosine_similarity(embedding1, embedding2):
    dim = 1
    embedding1 = F.normalize(embedding1, p=2, dim=dim)
    embedding2 = F.normalize(embedding2, p=2, dim=dim)

    dot_product = torch.sum(embedding1 * embedding2, dim=dim)

    magnitude1 = torch.norm(embedding1, p=2, dim=dim)
    magnitude2 = torch.norm(embedding2, p=2, dim=dim)

    cosine_sim = dot_product / (magnitude1 * magnitude2)

    return cosine_sim

In [33]:
# build similarity matrix between text and clasp_emb, similarity_matrix[i][j] shows cosine similarity betewen clasp_emb[i] and text[j]
similarity_matrix = []
for i in tqdm(range(len(similarity_dataset['clasp_emb']))):
    similarity_matrix.append([])
    for j in range(len(similarity_dataset['text'])):
        similarity_matrix[i].append(cosine_similarity(similarity_dataset['clasp_emb'][i].unsqueeze(0), similarity_dataset['text'][j].unsqueeze(0)).item())

100%|██████████| 11411/11411 [3:12:39<00:00,  1.01s/it] 


In [35]:
with open('similarity_matrix.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

In [18]:
def evaluate_(similarity_matrix, threshold=0.5):

    def _evaluate(similarity_matrix, threshold=0.5):
        total_hits_1 = 0
        total_mrr = 0
        total_instances = 0
        total_labels = []
        total_predictions = []
        number_of_golden_predictions = 0

        for i in tqdm(range(len(similarity_matrix))):
            predicted_idx = np.argmax(similarity_matrix[i])
            label_similarity = similarity_matrix[i][i]

            # Compute Hits@1
            if predicted_idx == i:   
                total_hits_1 += 1

            # Compute MRR
            label_rank = sum([1 for x in similarity_matrix[i] if x > similarity_matrix[i][i]])
            reciprocal_rank = 1 / (label_rank + 1)
            total_mrr += reciprocal_rank

            # Record predictions and labels
            predictions = [0 if sim < threshold else 1 for sim in similarity_matrix[i]]
            total_labels.extend([0 if k != i else 1 for k in range(len(similarity_matrix[i]))])
            total_predictions.extend(predictions)
            if label_similarity >= threshold:
                number_of_golden_predictions += 1

            total_instances += 1


        # Compute average metrics over all instances
        avg_hits_1 = total_hits_1 / total_instances
        avg_mrr = total_mrr / total_instances
        precision = precision_score(total_labels, total_predictions, average='macro')
        recall = recall_score(total_labels, total_predictions, average='macro')
        f1 = f1_score(total_labels, total_predictions, average='macro')
        precision_micro = precision_score(total_labels, total_predictions, average='micro')
        recall_micro = recall_score(total_labels, total_predictions, average='micro')
        f1_micro = f1_score(total_labels, total_predictions, average='micro')
        accuracy = accuracy_score(total_labels, total_predictions)
        golden_prediction_accuracy = number_of_golden_predictions / total_instances

        return {
            'Hits@1': avg_hits_1,
            'MRR': avg_mrr,
            'Macro Precision': precision,
            'Macro Recall': recall,
            'Macro F1': f1,
            'Micro Precision': precision_micro,
            'Micro Recall': recall_micro,
            'Micro F1': f1_micro,
            'Accuracy': accuracy,
            'Golden Accuracy': golden_prediction_accuracy,
        }
    
    results = _evaluate(similarity_matrix, threshold=threshold)
    table = []
    for i in range(len(results)):
        table.append([list(results.keys())[i], list(results.values())[i]])
    print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))


In [39]:
evaluate(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:49<00:00, 229.05it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.910525 |
+-----------------+----------+
| MRR             | 0.955484 |
+-----------------+----------+
| Macro Precision | 0.634033 |
+-----------------+----------+
| Macro Recall    | 0.918312 |
+-----------------+----------+
| Macro F1        | 0.702983 |
+-----------------+----------+
| Micro Precision | 0.999785 |
+-----------------+----------+
| Micro Recall    | 0.999785 |
+-----------------+----------+
| Micro F1        | 0.999785 |
+-----------------+----------+
| Accuracy        | 0.999785 |
+-----------------+----------+
| Golden Accuracy | 0.836824 |
+-----------------+----------+


In [41]:
model_name = 'lasp_concat'
model_to_test = torch.load(f'lasp_concoat.pt')
total_dataset['test'][f'{model_name}_emb'] = []

test_dataloader = DataLoader(dataset=CusDataset(total_dataset['test'], 'hubert-emb', 'text'), batch_size=32, shuffle=False)

with torch.no_grad():
    for text_embedding, audio_candidates, image_candidates in tqdm(test_dataloader):
        text_embedding = text_embedding.to(device)
        audio_candidates = audio_candidates.to(device)
        image_candidates = image_candidates.to(device)
        final_emb = model_to_test(audio_candidates, image_candidates)
        total_dataset['test'][f'{model_name}_emb'].extend(final_emb)

100%|██████████| 357/357 [00:00<00:00, 359.08it/s]


In [42]:
for i in range(len(total_dataset['test'][f'{model_name}_emb'])):
    total_dataset['test'][f'{model_name}_emb'][i] = total_dataset['test'][f'{model_name}_emb'][i].cpu()

In [43]:
similarity_dataset = {}
similarity_dataset[f'{model_name}_emb'] = total_dataset['test'][f'{model_name}_emb']
similarity_dataset['text'] = total_dataset['test']['text']

similarity_matrix = []
for i in tqdm(range(len(similarity_dataset[f'{model_name}_emb']))):
    similarity_matrix.append([])
    for j in range(len(similarity_dataset['text'])):
        similarity_matrix[i].append(cosine_similarity(similarity_dataset[f'{model_name}_emb'][i].unsqueeze(0), similarity_dataset['text'][j].unsqueeze(0)).item())
        
with open(f'similarity_matrix_{model_name}.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

100%|██████████| 11411/11411 [3:13:28<00:00,  1.02s/it] 


In [44]:
evaluate(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:49<00:00, 230.02it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.853562 |
+-----------------+----------+
| MRR             | 0.911378 |
+-----------------+----------+
| Macro Precision | 0.517825 |
+-----------------+----------+
| Macro Recall    | 0.983865 |
+-----------------+----------+
| Macro F1        | 0.533813 |
+-----------------+----------+
| Micro Precision | 0.997698 |
+-----------------+----------+
| Micro Recall    | 0.997698 |
+-----------------+----------+
| Micro F1        | 0.997698 |
+-----------------+----------+
| Accuracy        | 0.997698 |
+-----------------+----------+
| Golden Accuracy | 0.970029 |
+-----------------+----------+


In [45]:
total_dataset['test'].keys()

dict_keys(['audio', 'text', 'image', 'xlmr-emb', 'hubert-emb', 'pure-text', 'id', 'source', 'audio_path', 'clasp_emb', 'lasp_concat_emb'])

In [None]:
model_name = 'hubert_labse_gating_v3'
model_to_test = torch.load(f'{model_name}.pt')
total_dataset['test'][f'{model_name}_emb'] = []

test_dataloader = DataLoader(dataset=CusDataset(total_dataset['test'], 'hubert-emb', 'text'), batch_size=32, shuffle=False)

with torch.no_grad():
    for text_embedding, audio_candidates, image_candidates in tqdm(test_dataloader):
        text_embedding = text_embedding.to(device)
        audio_candidates = audio_candidates.to(device)
        image_candidates = image_candidates.to(device)
        final_emb = model_to_test(audio_candidates, image_candidates)
        total_dataset['test'][f'{model_name}_emb'].extend(final_emb)

100%|██████████| 357/357 [00:00<00:00, 563.85it/s]


In [None]:
for i in range(len(total_dataset['test'][f'{model_name}_emb'])):
    total_dataset['test'][f'{model_name}_emb'][i] = total_dataset['test'][f'{model_name}_emb'][i].cpu()

In [None]:
similarity_dataset = {}
similarity_dataset[f'{model_name}_emb'] = total_dataset['test'][f'{model_name}_emb']
similarity_dataset['text'] = total_dataset['test']['text']

similarity_matrix = []
for i in tqdm(range(len(similarity_dataset[f'{model_name}_emb']))):
    similarity_matrix.append([])
    for j in range(len(similarity_dataset['text'])):
        similarity_matrix[i].append(cosine_similarity(similarity_dataset[f'{model_name}_emb'][i].unsqueeze(0), similarity_dataset['text'][j].unsqueeze(0)).item())
        
with open(f'similarity_matrix_{model_name}.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

100%|██████████| 11411/11411 [1:10:33<00:00,  2.70it/s]


In [None]:
evaluate(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:51<00:00, 222.30it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.908159 |
+-----------------+----------+
| MRR             | 0.954224 |
+-----------------+----------+
| Macro Precision | 0.635481 |
+-----------------+----------+
| Macro Recall    | 0.912093 |
+-----------------+----------+
| Macro F1        | 0.703889 |
+-----------------+----------+
| Micro Precision | 0.99979  |
+-----------------+----------+
| Micro Recall    | 0.99979  |
+-----------------+----------+
| Micro F1        | 0.99979  |
+-----------------+----------+
| Accuracy        | 0.99979  |
+-----------------+----------+
| Golden Accuracy | 0.82438  |
+-----------------+----------+


In [None]:
def evaluate_matrix(similarity_matrix, threshold=0.5):

    def _evaluate(similarity_matrix, threshold=0.5):
        total_hits_1 = 0
        total_mrr = 0
        total_instances = 0
        total_labels = []
        total_predictions = []
        number_of_golden_predictions = 0

        for i in tqdm(range(len(similarity_matrix))):
            predicted_idx = np.argmax(similarity_matrix[i])
            label_similarity = similarity_matrix[i][i]

            # Compute Hits@1
            if predicted_idx == i:   
                total_hits_1 += 1

            # Compute MRR
            label_rank = sum([1 for x in similarity_matrix[i] if x > similarity_matrix[i][i]])
            reciprocal_rank = 1 / (label_rank + 1)
            total_mrr += reciprocal_rank

            # Record predictions and labels
            predictions = [0 if sim < threshold else 1 for sim in similarity_matrix[i]]
            total_labels.extend([0 if k != i else 1 for k in range(len(similarity_matrix[i]))])
            total_predictions.extend(predictions)
            if label_similarity >= threshold:
                number_of_golden_predictions += 1

            total_instances += 1


        # Compute average metrics over all instances
        avg_hits_1 = total_hits_1 / total_instances
        avg_mrr = total_mrr / total_instances
        precision = precision_score(total_labels, total_predictions, average='macro')
        recall = recall_score(total_labels, total_predictions, average='macro')
        f1 = f1_score(total_labels, total_predictions, average='macro')
        precision_micro = precision_score(total_labels, total_predictions, average='micro')
        recall_micro = recall_score(total_labels, total_predictions, average='micro')
        f1_micro = f1_score(total_labels, total_predictions, average='micro')
        accuracy = accuracy_score(total_labels, total_predictions)
        golden_prediction_accuracy = number_of_golden_predictions / total_instances

        return {
            'Hits@1': avg_hits_1,
            'MRR': avg_mrr,
            'Macro Precision': precision,
            'Macro Recall': recall,
            'Macro F1': f1,
            'Micro Precision': precision_micro,
            'Micro Recall': recall_micro,
            'Micro F1': f1_micro,
            'Accuracy': accuracy,
            'Golden Accuracy': golden_prediction_accuracy,
        }
    
    results = _evaluate(similarity_matrix, threshold=threshold)
    table = []
    for i in range(len(results)):
        table.append([list(results.keys())[i], list(results.values())[i]])
    print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))


In [None]:
def hard_evaluate(model_name, directory=''):
    if directory == '':
        model_to_test = torch.load(f'{model_name}.pt')
    else:
        model_to_test = torch.load(f'{directory}/{model_name}.pt')
    total_dataset['test'][f'{model_name}_emb'] = []

    test_dataloader = DataLoader(dataset=CusDataset(total_dataset['test'], 'hubert-emb', 'text'), batch_size=32, shuffle=False)

    with torch.no_grad():
        for text_embedding, audio_candidates, image_candidates in tqdm(test_dataloader):
            text_embedding = text_embedding.to(device)
            audio_candidates = audio_candidates.to(device)
            image_candidates = image_candidates.to(device)
            final_emb = model_to_test(audio_candidates, image_candidates)
            total_dataset['test'][f'{model_name}_emb'].extend(final_emb)

    for i in range(len(total_dataset['test'][f'{model_name}_emb'])):
        total_dataset['test'][f'{model_name}_emb'][i] = total_dataset['test'][f'{model_name}_emb'][i].cpu()

    similarity_dataset = {}
    similarity_dataset[f'{model_name}_emb'] = total_dataset['test'][f'{model_name}_emb']
    similarity_dataset['text'] = total_dataset['test']['text']

    similarity_matrix = []
    for i in tqdm(range(len(similarity_dataset[f'{model_name}_emb']))):
        similarity_matrix.append([])
        for j in range(len(similarity_dataset['text'])):
            similarity_matrix[i].append(cosine_similarity(similarity_dataset[f'{model_name}_emb'][i].unsqueeze(0), similarity_dataset['text'][j].unsqueeze(0)).item())

    with open(f'similarity_matrix_{model_name}.pkl', 'wb') as f:
        pickle.dump(similarity_matrix, f)
    
    with open('model_name.txt', 'w') as f:
        f.write(f'{model_name}')
    
    evaluate_matrix(similarity_matrix, threshold=0.5)

In [None]:
hard_evaluate('hubert_labse_model_es')

100%|██████████| 357/357 [00:02<00:00, 167.71it/s]
100%|██████████| 11411/11411 [3:09:34<00:00,  1.00it/s] 
100%|██████████| 11411/11411 [00:50<00:00, 226.21it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.911489 |
+-----------------+----------+
| MRR             | 0.9567   |
+-----------------+----------+
| Macro Precision | 0.63786  |
+-----------------+----------+
| Macro Recall    | 0.92423  |
+-----------------+----------+
| Macro F1        | 0.708063 |
+-----------------+----------+
| Micro Precision | 0.999791 |
+-----------------+----------+
| Micro Recall    | 0.999791 |
+-----------------+----------+
| Micro F1        | 0.999791 |
+-----------------+----------+
| Accuracy        | 0.999791 |
+-----------------+----------+
| Golden Accuracy | 0.848655 |
+-----------------+----------+


# Hubert ASR Hard Evaluation

In [46]:
url = "https://drive.google.com/file/d/1UChSPa_Uv6levN9pWN9jkNkBgkJWi6nL/view?usp=sharing"
output = "total_dataset_hubert_asr_v2.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1UChSPa_Uv6levN9pWN9jkNkBgkJWi6nL
From (redirected): https://drive.google.com/uc?id=1UChSPa_Uv6levN9pWN9jkNkBgkJWi6nL&confirm=t&uuid=abbf98d0-cc33-44f7-907a-6ed13364a030
To: /kaggle/working/total_dataset_hubert_asr_v2.pkl
100%|██████████| 1.50G/1.50G [00:14<00:00, 106MB/s] 


'total_dataset_hubert_asr_v2.pkl'

In [6]:
with open('total_dataset_hubert_asr_v2.pkl', 'rb') as f:
    total_dataset_test_asr = pickle.load(f)

In [7]:
total_dataset_test_asr.keys()

dict_keys(['audio', 'image', 'text', 'pure-text', 'audio_path', 'id', 'source', 'asr-text', 'asr-text-embedding', 'hubert-emb', 'hubert-emb-768', 'hubert-asr-text'])

In [8]:
total_dataset_test_asr['hubert-asr-text'][0]

'this area has to provide some insight into these farming adaptations'

In [9]:
total_dataset_test_asr['pure-text'][0]

'this area helps to provide some insight into these farming adaptations'

In [10]:
len(total_dataset_test_asr['hubert-asr-text'])

11411

In [11]:
len(total_dataset_test_asr['pure-text'])

11411

In [None]:
from sentence_transformers import SentenceTransformer

text_model = SentenceTransformer('sentence-transformers/LaBSE').to(device)

In [13]:
embeddings = text_model.encode(total_dataset_test_asr['hubert-asr-text'])
total_dataset_test_asr['hubert-asr-emb'] = [torch.from_numpy(emb) for emb in embeddings]

Batches:   0%|          | 0/357 [00:00<?, ?it/s]

In [14]:
similarity_dataset = {}
similarity_dataset['hubert-asr-emb'] = total_dataset_test_asr['hubert-asr-emb']
similarity_dataset['text'] = total_dataset_test_asr['text']

similarity_matrix = []
for i in tqdm(range(len(similarity_dataset['hubert-asr-emb']))):
    similarity_matrix.append([])
    for j in range(len(similarity_dataset['text'])):
        similarity_matrix[i].append(cosine_similarity(similarity_dataset['hubert-asr-emb'][i].unsqueeze(0), similarity_dataset['text'][j].unsqueeze(0)).item())
        
with open(f'similarity_matrix_hubert_asr.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

100%|██████████| 11411/11411 [1:07:59<00:00,  2.80it/s]


In [17]:
evaluate(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:48<00:00, 235.18it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.926562 |
+-----------------+----------+
| MRR             | 0.963825 |
+-----------------+----------+
| Macro Precision | 0.611018 |
+-----------------+----------+
| Macro Recall    | 0.986004 |
+-----------------+----------+
| Macro F1        | 0.680684 |
+-----------------+----------+
| Micro Precision | 0.999699 |
+-----------------+----------+
| Micro Recall    | 0.999699 |
+-----------------+----------+
| Micro F1        | 0.999699 |
+-----------------+----------+
| Accuracy        | 0.999699 |
+-----------------+----------+
| Golden Accuracy | 0.972307 |
+-----------------+----------+


# Wav2vec2 ASR Hard Evaluation

In [7]:
with open('total_dataset_hubert_asr_v2.pkl', 'rb') as f:
    total_dataset_test_asr = pickle.load(f)

In [8]:
total_dataset_test_asr.keys()

dict_keys(['audio', 'image', 'text', 'pure-text', 'audio_path', 'id', 'source', 'asr-text', 'asr-text-embedding', 'hubert-emb', 'hubert-emb-768', 'hubert-asr-text'])

In [11]:
len(total_dataset_test_asr['asr-text-embedding'])

11411

In [10]:
total_dataset_test_asr['asr-text-embedding'][0] != total_dataset_test_asr['asr-text-embedding'][0]

torch.Size([768])

In [12]:
embeddings = text_model.encode(total_dataset_test_asr['asr-text'])
total_dataset_test_asr['asr-text-embedding'] = [torch.from_numpy(emb) for emb in embeddings]

Batches:   0%|          | 0/357 [00:00<?, ?it/s]

In [13]:
def cosine_similarity(embedding1, embedding2):
    dim = 1
    embedding1 = F.normalize(embedding1, p=2, dim=dim)
    embedding2 = F.normalize(embedding2, p=2, dim=dim)

    dot_product = torch.sum(embedding1 * embedding2, dim=dim)

    magnitude1 = torch.norm(embedding1, p=2, dim=dim)
    magnitude2 = torch.norm(embedding2, p=2, dim=dim)

    cosine_sim = dot_product / (magnitude1 * magnitude2)

    return cosine_sim

In [14]:
similarity_dataset = {}
similarity_dataset['asr-text-embedding'] = total_dataset_test_asr['asr-text-embedding']
similarity_dataset['text'] = total_dataset_test_asr['text']

similarity_matrix = []
for i in tqdm(range(len(similarity_dataset['asr-text-embedding']))):
    similarity_matrix.append([])
    for j in range(len(similarity_dataset['text'])):
        similarity_matrix[i].append(cosine_similarity(similarity_dataset['asr-text-embedding'][i].unsqueeze(0), similarity_dataset['text'][j].unsqueeze(0)).item())
        
with open(f'similarity_matrix_wav2vec2_asr.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

100%|██████████| 11411/11411 [3:19:03<00:00,  1.05s/it] 


In [19]:
def evaluate(similarity_matrix, threshold=0.5):

    def _evaluate(similarity_matrix, threshold=0.5):
        total_hits_1 = 0
        total_mrr = 0
        total_instances = 0
        total_labels = []
        total_predictions = []
        number_of_golden_predictions = 0

        for i in tqdm(range(len(similarity_matrix))):
            predicted_idx = np.argmax(similarity_matrix[i])
            label_similarity = similarity_matrix[i][i]

            # Compute Hits@1
            if predicted_idx == i:   
                total_hits_1 += 1

            # Compute MRR
            label_rank = sum([1 for x in similarity_matrix[i] if x > similarity_matrix[i][i]])
            reciprocal_rank = 1 / (label_rank + 1)
            total_mrr += reciprocal_rank

            # Record predictions and labels
            predictions = [0 if sim < threshold else 1 for sim in similarity_matrix[i]]
            total_labels.extend([0 if k != i else 1 for k in range(len(similarity_matrix[i]))])
            total_predictions.extend(predictions)
            if label_similarity >= threshold:
                number_of_golden_predictions += 1

            total_instances += 1


        # Compute average metrics over all instances
        avg_hits_1 = total_hits_1 / total_instances
        avg_mrr = total_mrr / total_instances
        precision = precision_score(total_labels, total_predictions, average='macro')
        recall = recall_score(total_labels, total_predictions, average='macro')
        f1 = f1_score(total_labels, total_predictions, average='macro')
        precision_micro = precision_score(total_labels, total_predictions, average='micro')
        recall_micro = recall_score(total_labels, total_predictions, average='micro')
        f1_micro = f1_score(total_labels, total_predictions, average='micro')
        accuracy = accuracy_score(total_labels, total_predictions)
        golden_prediction_accuracy = number_of_golden_predictions / total_instances

        return {
            'Hits@1': avg_hits_1,
            'MRR': avg_mrr,
            'Macro Precision': precision,
            'Macro Recall': recall,
            'Macro F1': f1,
            'Micro Precision': precision_micro,
            'Micro Recall': recall_micro,
            'Micro F1': f1_micro,
            'Accuracy': accuracy,
            'Golden Accuracy': golden_prediction_accuracy,
        }
    
    results = _evaluate(similarity_matrix, threshold=threshold)
    table = []
    for i in range(len(results)):
        table.append([list(results.keys())[i], list(results.values())[i]])
    print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))


In [20]:
evaluate(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:50<00:00, 225.33it/s]


+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.901937 |
+-----------------+----------+
| MRR             | 0.940338 |
+-----------------+----------+
| Macro Precision | 0.610473 |
+-----------------+----------+
| Macro Recall    | 0.972293 |
+-----------------+----------+
| Macro F1        | 0.679002 |
+-----------------+----------+
| Micro Precision | 0.999703 |
+-----------------+----------+
| Micro Recall    | 0.999703 |
+-----------------+----------+
| Micro F1        | 0.999703 |
+-----------------+----------+
| Accuracy        | 0.999703 |
+-----------------+----------+
| Golden Accuracy | 0.944878 |
+-----------------+----------+


In [None]:
from IPython.display import FileLink
FileLink('hubert_labse_model_es.pt')

# Hard Evaluation based on sources

In [16]:
class CusDataset(Dataset):
    def __init__(self, dataset, audio_name, text_name):
        self.dataset = dataset
        self.text_name = text_name
        self.audio_name = audio_name
        
    def __len__(self):
        return len(self.dataset[self.audio_name])

    def __getitem__(self, i):
        return self.dataset[self.text_name][i], self.dataset[self.audio_name][i], self.dataset['image'][i]

In [19]:
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score
from tabulate import tabulate

def evaluate_matrix(similarity_matrix, sources, threshold=0.5):
    
    def _evaluate(similarity_matrix, indices, threshold=0.5):
        total_hits_1 = 0
        total_mrr = 0
        total_instances = 0
        total_labels = []
        total_predictions = []
        number_of_golden_predictions = 0
        total_ap = 0
        total_recall_5 = 0
        total_rank = 0

        for i in tqdm(indices):
            predicted_idx = np.argmax(similarity_matrix[i])
            label_similarity = similarity_matrix[i][i]

            if predicted_idx != i and similarity_matrix[i][predicted_idx] == label_similarity: 
                continue

            # Compute Hits@1
            if predicted_idx == i:   
                total_hits_1 += 1 

            total_instances += 1

            # Compute MRR
            label_rank = sum([1 for x in similarity_matrix[i] if x > similarity_matrix[i][i]])
            reciprocal_rank = 1 / (label_rank + 1)
            total_mrr += reciprocal_rank

            # Compute meanR
            total_rank += (label_rank + 1)
            

            # Record predictions and labels
            predictions = [0 if sim < threshold else 1 for sim in similarity_matrix[i]]
            total_labels.extend([0 if k != i else 1 for k in range(len(similarity_matrix[i]))])
            total_predictions.extend(predictions)
            if label_similarity >= threshold:
                number_of_golden_predictions += 1
            
            
        print(total_instances)
        # Compute average metrics over all instances
        avg_hits_1 = total_hits_1 / total_instances
        avg_mrr = total_mrr / total_instances
        avg_rank = total_rank / total_instances
        f1 = f1_score(total_labels, total_predictions, average='macro')
        golden_prediction_accuracy = number_of_golden_predictions / total_instances


        return {
            'Hits@1': avg_hits_1,
            'MRR': avg_mrr,
            'meanR': avg_rank,
            'Macro F1': f1,
            'Golden Accuracy': golden_prediction_accuracy,
        }

    # Create a dictionary to store metrics for each source
    source_metrics = defaultdict(dict)
    
    # Get unique sources
    unique_sources = set(sources)
    
    # Evaluate metrics for each source
    for source in unique_sources:
        print(source)
        indices = [i for i, s in enumerate(sources) if s == source]
        source_metrics[source] = _evaluate(similarity_matrix, indices, threshold=threshold)
    
    # Print the metrics for each source
    for source, metrics in source_metrics.items():
        print(f"Metrics for source {source}:")
        table = []
        for metric, value in metrics.items():
            table.append([metric, value])
        print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))
        print("\n")

# Example usage
# similarity_matrix = ... (your similarity matrix)
# sources = ... (your list of sources)
# evaluate_matrix(similarity_matrix, sources, threshold=0.5)


In [5]:
model_name = 'hubert_labse_model_es'  
with open(f'similarity_matrix_{model_name}.pkl', 'rb') as f:
    similarity_matrix = pickle.load(f)

In [24]:
total_dataset['test']['source'][0]

'common_voice'

In [20]:
evaluate_matrix(similarity_matrix, total_dataset['test']['source'])

brown


100%|██████████| 8567/8567 [00:36<00:00, 237.74it/s]


8519
fleurs


100%|██████████| 647/647 [00:01<00:00, 355.45it/s]


353
common_voice


100%|██████████| 2197/2197 [00:09<00:00, 229.95it/s]


2193
Metrics for source brown:
+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.944829 |
+-----------------+----------+
| MRR             | 0.959491 |
+-----------------+----------+
| meanR           | 3.69292  |
+-----------------+----------+
| Macro F1        | 0.713267 |
+-----------------+----------+
| Golden Accuracy | 0.854795 |
+-----------------+----------+


Metrics for source fleurs:
+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.974504 |
+-----------------+----------+
| MRR             | 0.982082 |
+-----------------+----------+
| meanR           | 1.16431  |
+-----------------+----------+
| Macro F1        | 0.80359  |
+-----------------+----------+
| Golden Accuracy | 0.76204  |
+-----------------+----------+


Metrics for source common_voice:
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.915641 |
+-----------------+-----------+
| MRR             |  0.934944 |
+

In [9]:
with open('similarity_matrix_hubert_asr.pkl', 'rb') as f:
    similarity_matrix2 = pickle.load(f)

In [22]:
evaluate_matrix(similarity_matrix2, total_dataset['test']['source'])

brown


100%|██████████| 8567/8567 [00:36<00:00, 234.85it/s]


8547
fleurs


100%|██████████| 647/647 [00:01<00:00, 329.95it/s]


350
common_voice


100%|██████████| 2197/2197 [00:09<00:00, 230.56it/s]


2194
Metrics for source brown:
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.947584 |
+-----------------+-----------+
| MRR             |  0.958399 |
+-----------------+-----------+
| meanR           | 20.4336   |
+-----------------+-----------+
| Macro F1        |  0.704701 |
+-----------------+-----------+
| Golden Accuracy |  0.970867 |
+-----------------+-----------+


Metrics for source fleurs:
+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 1        |
+-----------------+----------+
| MRR             | 1        |
+-----------------+----------+
| meanR           | 1        |
+-----------------+----------+
| Macro F1        | 0.781316 |
+-----------------+----------+
| Golden Accuracy | 1        |
+-----------------+----------+


Metrics for source common_voice:
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.968095 |
+-----------------+-----------+
| MRR             |  

In [12]:
with open('similarity_matrix_wav2vec2_asr.pkl', 'rb') as f:
    similarity_matrix3 = pickle.load(f)

In [24]:
evaluate_matrix(similarity_matrix3, total_dataset['test']['source'])

brown


100%|██████████| 8567/8567 [00:36<00:00, 232.81it/s]


8552
fleurs


100%|██████████| 647/647 [00:01<00:00, 330.25it/s]


351
common_voice


100%|██████████| 2197/2197 [00:09<00:00, 230.22it/s]


2194
Metrics for source brown:
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.934167 |
+-----------------+-----------+
| MRR             |  0.944619 |
+-----------------+-----------+
| meanR           | 37.2572   |
+-----------------+-----------+
| Macro F1        |  0.696978 |
+-----------------+-----------+
| Golden Accuracy |  0.955215 |
+-----------------+-----------+


Metrics for source fleurs:
+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.994302 |
+-----------------+----------+
| MRR             | 0.994338 |
+-----------------+----------+
| meanR           | 1.90028  |
+-----------------+----------+
| Macro F1        | 0.793493 |
+-----------------+----------+
| Golden Accuracy | 0.994302 |
+-----------------+----------+


Metrics for source common_voice:
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.890611 |
+-----------------+-----------+
| MRR             |  

# Evaluation after removing duplicates

In [4]:
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score
from tabulate import tabulate

def evaluate_matrix_full(similarity_matrix, threshold=0.5):
    
    def _evaluate(similarity_matrix, threshold=0.5):
        total_hits_1 = 0
        total_mrr = 0
        total_instances = 0
        total_labels = []
        total_predictions = []
        number_of_golden_predictions = 0
        total_ap = 0
        total_recall_5 = 0
        total_rank = 0

        for i in tqdm(range(len(similarity_matrix))):
            predicted_idx = np.argmax(similarity_matrix[i])
            label_similarity = similarity_matrix[i][i]

            if predicted_idx != i and similarity_matrix[i][predicted_idx] == label_similarity: 
                continue

            # Compute Hits@1
            if predicted_idx == i:   
                total_hits_1 += 1 

            total_instances += 1

            # Compute MRR
            label_rank = sum([1 for x in similarity_matrix[i] if x > similarity_matrix[i][i]])
            reciprocal_rank = 1 / (label_rank + 1)
            total_mrr += reciprocal_rank

            # Compute meanR
            total_rank += (label_rank + 1)
            

            # Record predictions and labels
            predictions = [0 if sim < threshold else 1 for sim in similarity_matrix[i]]
            total_labels.extend([0 if k != i else 1 for k in range(len(similarity_matrix[i]))])
            total_predictions.extend(predictions)
            if label_similarity >= threshold:
                number_of_golden_predictions += 1
            
            
        print(total_instances)
        # Compute average metrics over all instances
        avg_hits_1 = total_hits_1 / total_instances
        avg_mrr = total_mrr / total_instances
        avg_rank = total_rank / total_instances
        f1 = f1_score(total_labels, total_predictions, average='macro')
        golden_prediction_accuracy = number_of_golden_predictions / total_instances


        return {
            'Hits@1': avg_hits_1,
            'MRR': avg_mrr,
            'meanR': avg_rank,
            'Macro F1': f1,
            'Golden Accuracy': golden_prediction_accuracy,
        }

    results = _evaluate(similarity_matrix, threshold=threshold)
    table = []
    for i in range(len(results)):
        table.append([list(results.keys())[i], list(results.values())[i]])
    print(tabulate(table, ['Metrics', 'Values'], tablefmt="grid"))

In [7]:
evaluate_matrix_full(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:47<00:00, 242.00it/s]


11065
+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.939991 |
+-----------------+----------+
| MRR             | 0.955346 |
+-----------------+----------+
| meanR           | 7.70981  |
+-----------------+----------+
| Macro F1        | 0.710431 |
+-----------------+----------+
| Golden Accuracy | 0.849616 |
+-----------------+----------+


In [10]:
evaluate_matrix_full(similarity_matrix2, threshold=0.5)

100%|██████████| 11411/11411 [00:47<00:00, 241.66it/s]


11091
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.953295 |
+-----------------+-----------+
| MRR             |  0.962781 |
+-----------------+-----------+
| meanR           | 17.8398   |
+-----------------+-----------+
| Macro F1        |  0.679489 |
+-----------------+-----------+
| Golden Accuracy |  0.971508 |
+-----------------+-----------+


In [13]:
evaluate_matrix_full(similarity_matrix3, threshold=0.5)

100%|██████████| 11411/11411 [00:48<00:00, 233.96it/s]


11097
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.927458 |
+-----------------+-----------+
| MRR             |  0.93865  |
+-----------------+-----------+
| meanR           | 38.3      |
+-----------------+-----------+
| Macro F1        |  0.677658 |
+-----------------+-----------+
| Golden Accuracy |  0.943408 |
+-----------------+-----------+


In [None]:
model_name = 'lasp_concat'  
with open(f'similarity_matrix_{model_name}.pkl', 'rb') as f:
    similarity_matrix = pickle.load(f)

In [None]:
evaluate_matrix_full(similarity_matrix, threshold=0.5)

100%|██████████| 11411/11411 [00:48<00:00, 236.77it/s]


11097
+-----------------+----------+
| Metrics         |   Values |
| Hits@1          | 0.877715 |
+-----------------+----------+
| MRR             | 0.90887  |
+-----------------+----------+
| meanR           | 9.09228  |
+-----------------+----------+
| Macro F1        | 0.533584 |
+-----------------+----------+
| Golden Accuracy | 0.969451 |
+-----------------+----------+


In [16]:
total_dataset['test'].keys()

dict_keys(['audio', 'text', 'image', 'xlmr-emb', 'hubert-emb', 'pure-text', 'id', 'source', 'audio_path'])

In [18]:
len(set(total_dataset['test']['pure-text']))

11056

In [21]:
from collections import defaultdict

# Assuming total_dataset is a dictionary and 'test' is a key in that dictionary
# and 'pure_text' and 'sources' are keys in the 'test' dataset

# Extract the 'pure_text' and 'sources' values
pure_text_values = total_dataset['test']['pure-text']
sources_values = total_dataset['test']['source']

# Create a dictionary to store unique counts for each source
unique_counts_by_source = defaultdict(set)

# Iterate through the dataset and count unique 'pure_text' values for each source
for text, source in zip(pure_text_values, sources_values):
    unique_counts_by_source[source].add(text)

# Convert sets to counts
unique_counts_by_source = {source: len(texts) for source, texts in unique_counts_by_source.items()}

# Print the unique counts for each source
for source, count in unique_counts_by_source.items():
    print(f"Source: {source}, Unique 'pure_text' count: {count}")


Source: common_voice, Unique 'pure_text' count: 2193
Source: brown, Unique 'pure_text' count: 8513
Source: fleurs, Unique 'pure_text' count: 350


In [22]:
11411 - 11056

355

In [23]:
from IPython.display import FileLink
FileLink('total_dataset_v11.pkl')

# Multilingaul Evaluation

In [7]:
url = "https://drive.google.com/file/d/1-3-mpLSKuUaYjstN4ZE88pg06rEisEoh/view?usp=sharing"
output = "translated-text-embeddings-test-fa.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (original): https://drive.google.com/uc?id=1-3-mpLSKuUaYjstN4ZE88pg06rEisEoh
From (redirected): https://drive.google.com/uc?id=1-3-mpLSKuUaYjstN4ZE88pg06rEisEoh&confirm=t&uuid=3799f763-b2f9-4fb9-96b5-9ed714fddd74
To: /kaggle/working/translated-text-embeddings-test-fa.pkl
100%|██████████| 1.12G/1.12G [00:07<00:00, 143MB/s] 


'translated-text-embeddings-test-fa.pkl'

In [9]:
dataset = total_dataset['test']

In [17]:
model_to_test = torch.load(f'hubert_labse_model_es.pt', map_location=device)
model_to_test = model_to_test.to(device)
test_dataloader = DataLoader(dataset=CusDataset(dataset, 'hubert-emb', 'text'), batch_size=32, shuffle=False)
dataset['clasp_emb'] = []

with torch.no_grad():
    for text_embedding, audio_candidates, image_candidates in tqdm(test_dataloader):
        audio_candidates = audio_candidates.to(device)
        image_candidates = image_candidates.to(device)
        final_emb = model_to_test(audio_candidates, image_candidates)
        dataset['clasp_emb'].extend(final_emb)

for i in range(len(dataset['clasp_emb'])):
    dataset['clasp_emb'][i] = dataset['clasp_emb'][i].cpu()

100%|██████████| 357/357 [00:00<00:00, 462.05it/s]


In [15]:
def create_similarity_matrix(dataset, target_lang):
    with open(f'translated-text-embeddings-test-{target_lang}.pkl', 'rb') as f:
        dataset['translated-text-embeddings'] = pickle.load(f)

    similarity_dataset = {}
    similarity_dataset['clasp_emb'] = dataset['clasp_emb']
    similarity_dataset['text'] = dataset['translated-text-embeddings']

    similarity_matrix = []
    for i in tqdm(range(len(similarity_dataset['text']))):
        similarity_matrix.append([])
        for j in range(len(similarity_dataset['clasp_emb'])):
            similarity_matrix[i].append(cosine_similarity(similarity_dataset['text'][i].unsqueeze(0), similarity_dataset['clasp_emb'][j].unsqueeze(0)).item())

    with open(f'similarity_matrix_{target_lang}.pkl', 'wb') as f:
        pickle.dump(similarity_matrix, f)
    
    with open('target_lang.txt', 'w') as f:
        f.write(f'{target_lang}')

In [16]:
create_similarity_matrix(dataset, 'fa')

100%|██████████| 11411/11411 [3:05:27<00:00,  1.03it/s] 


In [17]:
with open(f'similarity_matrix_fa.pkl', 'rb') as f:
    similarity_matrix_lg = pickle.load(f)
evaluate_matrix_full(similarity_matrix_lg, threshold=0.5)

100%|██████████| 11411/11411 [00:49<00:00, 232.19it/s]


11411
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.794409 |
+-----------------+-----------+
| MRR             |  0.848206 |
+-----------------+-----------+
| meanR           | 26.9598   |
+-----------------+-----------+
| Macro F1        |  0.670136 |
+-----------------+-----------+
| Golden Accuracy |  0.322671 |
+-----------------+-----------+


In [18]:
url = "https://drive.google.com/file/d/1jHNsQh1-QwRevXbw7G4JH0BT7rVeOD6t/view?usp=sharing"
output = "translated-text-embeddings-test-de.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (original): https://drive.google.com/uc?id=1jHNsQh1-QwRevXbw7G4JH0BT7rVeOD6t
From (redirected): https://drive.google.com/uc?id=1jHNsQh1-QwRevXbw7G4JH0BT7rVeOD6t&confirm=t&uuid=39a1786e-9262-4a07-b4c6-8f77ac7f8bc1
To: /kaggle/working/translated-text-embeddings-test-de.pkl
100%|██████████| 1.12G/1.12G [00:10<00:00, 105MB/s] 


'translated-text-embeddings-test-de.pkl'

In [20]:
import pickle
import torch
import torch.nn.functional as F
from tqdm import tqdm

def create_similarity_matrix(dataset, target_lang):
    # Load embeddings from file
    with open(f'translated-text-embeddings-test-{target_lang}.pkl', 'rb') as f:
        dataset['translated-text-embeddings'] = pickle.load(f)

    # Extract embeddings into tensors for efficient batch processing
    clasp_emb = torch.stack(dataset['clasp_emb'])  # Shape: (num_clasp, embedding_dim)
    text_emb = torch.stack(dataset['translated-text-embeddings'])  # Shape: (num_text, embedding_dim)

    # Normalize embeddings once to avoid repeated computation
    clasp_emb = F.normalize(clasp_emb, p=2, dim=1)
    text_emb = F.normalize(text_emb, p=2, dim=1)

    # Compute cosine similarity using matrix multiplication
    similarity_matrix = torch.mm(text_emb, clasp_emb.T).cpu().tolist()

    # Save similarity matrix
    with open(f'similarity_matrix_{target_lang}.pkl', 'wb') as f:
        pickle.dump(similarity_matrix, f)

    # Save target language information
    with open('target_lang.txt', 'w') as f:
        f.write(f'{target_lang}')


In [20]:
create_similarity_matrix(dataset, 'de')

In [21]:
with open(f'similarity_matrix_de.pkl', 'rb') as f:
    similarity_matrix_lg = pickle.load(f)
evaluate_matrix_full(similarity_matrix_lg, threshold=0.5)

100%|██████████| 11411/11411 [00:49<00:00, 231.22it/s]


11411
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.811761 |
+-----------------+-----------+
| MRR             |  0.863102 |
+-----------------+-----------+
| meanR           | 21.637    |
+-----------------+-----------+
| Macro F1        |  0.670901 |
+-----------------+-----------+
| Golden Accuracy |  0.286653 |
+-----------------+-----------+


In [22]:
!rm translated-text-embeddings-test-de.pkl
!rm translated-text-embeddings-test-fa.pkl

In [23]:
url = "https://drive.google.com/file/d/1--dkivt5LH-6eSxXcmlVWfwzOHCv9l-t/view?usp=sharing"
output = "translated-text-embeddings-test-fr.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (original): https://drive.google.com/uc?id=1--dkivt5LH-6eSxXcmlVWfwzOHCv9l-t
From (redirected): https://drive.google.com/uc?id=1--dkivt5LH-6eSxXcmlVWfwzOHCv9l-t&confirm=t&uuid=a64094c1-c803-42ad-ab2b-c82f9d498921
To: /kaggle/working/translated-text-embeddings-test-fr.pkl
100%|██████████| 1.12G/1.12G [00:15<00:00, 73.9MB/s]


'translated-text-embeddings-test-fr.pkl'

In [26]:
url = "https://drive.google.com/file/d/1-2x8KsEkPVFJuzWEzJRcuNZvmxSrK3gS/view?usp=sharing"
output = "translated-text-embeddings-test-zh.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (original): https://drive.google.com/uc?id=1-2x8KsEkPVFJuzWEzJRcuNZvmxSrK3gS
From (redirected): https://drive.google.com/uc?id=1-2x8KsEkPVFJuzWEzJRcuNZvmxSrK3gS&confirm=t&uuid=c3fc2526-88a1-4bbb-b8cc-1aa3855b1f7a
To: /kaggle/working/translated-text-embeddings-test-zh.pkl
100%|██████████| 1.12G/1.12G [00:11<00:00, 93.7MB/s]


'translated-text-embeddings-test-zh.pkl'

In [24]:
create_similarity_matrix(dataset, 'fr')
with open(f'similarity_matrix_fr.pkl', 'rb') as f:
    similarity_matrix_lg = pickle.load(f)
evaluate_matrix_full(similarity_matrix_lg, threshold=0.5)

100%|██████████| 11411/11411 [00:51<00:00, 223.17it/s]


11411
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.848655 |
+-----------------+-----------+
| MRR             |  0.891647 |
+-----------------+-----------+
| meanR           | 16.0997   |
+-----------------+-----------+
| Macro F1        |  0.717795 |
+-----------------+-----------+
| Golden Accuracy |  0.420997 |
+-----------------+-----------+


In [25]:
!rm translated-text-embeddings-test-fr.pkl

In [27]:
create_similarity_matrix(dataset, 'zh')
with open(f'similarity_matrix_zh.pkl', 'rb') as f:
    similarity_matrix_lg = pickle.load(f)
evaluate_matrix_full(similarity_matrix_lg, threshold=0.5)

100%|██████████| 11411/11411 [00:49<00:00, 230.32it/s]


11411
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.822189 |
+-----------------+-----------+
| MRR             |  0.870778 |
+-----------------+-----------+
| meanR           | 27.1251   |
+-----------------+-----------+
| Macro F1        |  0.743045 |
+-----------------+-----------+
| Golden Accuracy |  0.518622 |
+-----------------+-----------+


In [28]:
!rm translated-text-embeddings-test-zh.pkl

In [4]:
!rm similarity_matrix_fa.pkl

In [21]:
create_similarity_matrix(dataset, 'fa')
with open(f'similarity_matrix_fa.pkl', 'rb') as f:
    similarity_matrix_lg = pickle.load(f)
evaluate_matrix_full(similarity_matrix_lg, threshold=0.5)

100%|██████████| 11411/11411 [00:48<00:00, 232.94it/s]


11411
+-----------------+-----------+
| Metrics         |    Values |
| Hits@1          |  0.794409 |
+-----------------+-----------+
| MRR             |  0.848206 |
+-----------------+-----------+
| meanR           | 26.9598   |
+-----------------+-----------+
| Macro F1        |  0.670136 |
+-----------------+-----------+
| Golden Accuracy |  0.322671 |
+-----------------+-----------+
