# Data Processing (Done)

In [None]:
import csv
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
!git clone https://github.com/mokcho/HALLM.git

Cloning into 'HALLM'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 85 (delta 30), reused 29 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (85/85), 1.33 MiB | 18.35 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [None]:
%cd /content/HALLM
%ls

# NOTE: Soham's data needs to be put in the prepare folder (presumably data/prepare) before continuing.

/content/HALLM
[0m[01;34mdata[0m/  LICENSE  README.md  requirements.txt


## Tackle the AudioCaps data

In [None]:
!./data/prepare/audiocaps.sh


In [None]:
def download_audio(youtube_id, start_time, output_directory, duration=10):
    """Download one YouTube video's audio and save a fixed-length excerpt as WAV.

    The best available audio stream is downloaded into ``./tmp``, resampled to
    44.1 kHz, cut to ``[start_time, start_time + duration)`` seconds and
    exported to ``{output_directory}/{youtube_id}.wav``. If that file already
    exists nothing is downloaded. Any failure is printed and swallowed so that
    a batch run can continue with the next clip.

    NOTE(review): ``YoutubeDL`` and ``AudioSegment`` are not imported anywhere
    in the visible notebook; presumably they come from yt-dlp/youtube-dl and
    pydub -- confirm and import them before calling this function.

    Args:
        youtube_id: Video id appended to ``https://www.youtube.com/watch?v=``.
        start_time: Offset of the excerpt in seconds.
        output_directory: Directory receiving the WAV file (created if missing).
        duration: Length of the excerpt in seconds.

    Returns:
        None.
    """
    # Remembered outside the try block so the temporary download can be
    # removed even when decoding or exporting fails.
    original_file = None
    try:
        os.makedirs(output_directory, exist_ok=True)

        output_file = f'{output_directory}/{youtube_id}.wav'
        if os.path.exists(output_file):
            print(f"File already exists, skipping: {output_file}")
            return  # Skip downloading and processing if the file already exists

        ydl_opts = {
            'format': 'bestaudio/best',
            'noplaylist': True,
            'outtmpl': f'./tmp/{youtube_id}.%(ext)s',
            'quiet': True
        }
        url = f'https://www.youtube.com/watch?v={youtube_id}'

        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            original_file = ydl.prepare_filename(info)

            # Load the downloaded audio from tmp and resample it to 44.1 kHz
            audio = AudioSegment.from_file(original_file).set_frame_rate(44100)

            # Slice the requested window (the slice bounds are in milliseconds)
            start_ms = start_time * 1000
            end_ms = start_ms + (duration * 1000)
            audio_clip = audio[start_ms:end_ms]

            # Save the clipped file as a .wav file in the output directory
            audio_clip.export(output_file, format='wav')
    except Exception as e:
        print(f"Failed to download or process {youtube_id}: {e}")
    finally:
        # Clean up the original file in tmp on success AND on failure;
        # previously a failed decode/export left the download behind.
        if original_file is not None and os.path.exists(original_file):
            try:
                os.remove(original_file)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {original_file}: {cleanup_error}")

def process_dataset(csv_file, output_directory) :
    """Download the audio clip for every row of an AudioCaps-style annotation CSV.

    The first line of the file is treated as a header and skipped. Each
    remaining row is expected to hold exactly four fields
    ``(audiocaps_id, youtube_id, start_time, caption)``; ``start_time`` is
    parsed as an integer number of seconds and passed to ``download_audio``
    together with ``youtube_id``. Rows that are blank, have a different number
    of fields, or carry a non-integer start time are reported and skipped so a
    single bad line does not abort a long download run.

    Args:
        csv_file: Path of the annotation CSV.
        output_directory: Directory handed to ``download_audio`` for the clips.

    Returns:
        None.
    """
    with open(csv_file, newline='') as file :
        # download_audio writes its temporary downloads into ./tmp
        tmp_directory = "./tmp"
        os.makedirs(tmp_directory, exist_ok=True)
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; the default avoids StopIteration on an empty file
        for row in tqdm(reader, desc=f"Processing {os.path.basename(csv_file)}", unit="clip"):
            try:
                audiocaps_id, youtube_id, start_time, caption = row
                start_seconds = int(start_time)
            except ValueError:
                # Raised both by a wrong field count and by a non-integer start time
                print(f"Skipping malformed row: {row}")
                continue
            download_audio(youtube_id, start_seconds, output_directory)



In [None]:
# Download the audio used to construct ACE from the AudioCaps test annotations.
# Both paths are relative, so this relies on the earlier `%cd /content/HALLM`.
csv_file = "./data/AudioCaps/annotation/test.csv"
output_directory = "./data/AudioCaps/audio"
process_dataset(csv_file, output_directory)

## Tackle Clotho data

In [None]:
# Run the Clotho preparation script (path is relative to the current working directory).
# NOTE(review): the recorded output of this cell is "No such file or directory",
# i.e. the script was not present in the clone when this was last run.
!./data/prepare/clotho.sh

/bin/bash: line 1: ./data/prepare/clotho.sh: No such file or directory


In [None]:
# Paths of the Clotho caption CSVs (CLO_*) and the matching entailment CSVs (CLOE_*),
# one pair per split.
CLO_dev, CLOE_dev = (
    "/content/HALLM/data/Clotho/caption/dev.csv",
    "/content/HALLM/data/Clotho/entailment/clotho_development_gpt4.csv",
)
CLO_val, CLOE_val = (
    "/content/HALLM/data/Clotho/caption/val.csv",
    "/content/HALLM/data/Clotho/entailment/clotho_validation_gpt4.csv",
)
CLO_eval, CLOE_eval = (
    "/content/HALLM/data/Clotho/caption/eval.csv",
    "/content/HALLM/data/Clotho/entailment/clotho_evaluation_gpt4.csv",
)

# Read each pair into DataFrames (caption frame first, entailment frame second).
df_CLO_dev, df_CLOE_dev = pd.read_csv(CLO_dev), pd.read_csv(CLOE_dev)
df_CLO_val, df_CLOE_val = pd.read_csv(CLO_val), pd.read_csv(CLOE_val)
df_CLO_eval, df_CLOE_eval = pd.read_csv(CLO_eval), pd.read_csv(CLOE_eval)

# Report the row count of every frame so the pairs can be compared by eye.
print(f"Row number of CLotho development dataset is {len(df_CLO_dev)}")
print(f"Row number of CLotho development caption dataset is {len(df_CLOE_dev)}")
print(f"Row number of CLotho validation dataset is {len(df_CLO_val)}")
print(f"Row number of CLotho validation caption dataset is {len(df_CLOE_val)}")
print(f"Row number of CLotho evaluation dataset is {len(df_CLO_eval)}")
print(f"Row number of CLotho evaluation caption dataset is {len(df_CLOE_eval)}")


Row number of CLotho development dataset is 3839
Row number of CLotho development caption dataset is 3839
Row number of CLotho validation dataset is 1045
Row number of CLotho validation caption dataset is 1045
Row number of CLotho evaluation dataset is 1045
Row number of CLotho evaluation caption dataset is 1045


In [None]:
# List each audio directory once; the listings are reused for both CSV families below.
Clotho_dev_audio_files = os.listdir("/content/HALLM/data/Clotho/audio/development")
Clotho_val_audio_files = os.listdir("/content/HALLM/data/Clotho/audio/validation")
Clotho_eval_audio_files = os.listdir("/content/HALLM/data/Clotho/audio/evaluation")

# check for Clotho dataset: does every caption-CSV file name exist on disk?
print(df_CLO_dev["file_name"].isin(Clotho_dev_audio_files).value_counts())
print(df_CLO_val["file_name"].isin(Clotho_val_audio_files).value_counts())
print(df_CLO_eval["file_name"].isin(Clotho_eval_audio_files).value_counts())
# In the recorded run every caption-CSV name was found in the audio directory.

print("=========Start to check for CLOE===========")
# check for CLOE data set: does every entailment-CSV file name exist on disk?
print(df_CLOE_dev["Audio file"].isin(Clotho_dev_audio_files).value_counts())
print(df_CLOE_val["Audio file"].isin(Clotho_val_audio_files).value_counts())
print(df_CLOE_eval["Audio file"].isin(Clotho_eval_audio_files).value_counts())
# In the recorded run a few CLOE names (2 in dev, 1 in val) were not found on disk.

print("=========Start to check for CLOE and CLO relationship===========")
# Compare the two CSV families row by row for every split (previously only the
# development split was actually checked). Names are printed with repr() so
# that leading/trailing whitespace is visible.
for split_name, df_cloe, df_clo in (
    ("dev", df_CLOE_dev, df_CLO_dev),
    ("val", df_CLOE_val, df_CLO_val),
    ("eval", df_CLOE_eval, df_CLO_eval),
):
    same_name = df_cloe["Audio file"] == df_clo["file_name"]
    print(f"[{split_name}] CLOE 'Audio file' equals CLO 'file_name' (row by row):")
    print(same_name.value_counts())
    if not same_name.all():
        # Show the first differing pair; guarded so a fully matching split
        # does not raise IndexError.
        print("CLOE:", repr(df_cloe.loc[~same_name, "Audio file"].iloc[0]))
        print("CLO :", repr(df_clo.loc[~same_name, "file_name"].iloc[0]))
# In the recorded run the differing pair was the same name with and without a
# leading space: the CLO caption CSV (which matches the files on disk) has the
# space, the CLOE CSV does not. Apart from such pairs the rows line up, i.e.
# CLO and CLOE are stored in the same order.
print("Rows are compared position by position, so equal names mean CLO and CLOE share the same order.")


file_name
True    3839
Name: count, dtype: int64
file_name
True    1045
Name: count, dtype: int64
file_name
True    1045
Name: count, dtype: int64
Audio file
True     3837
False       2
Name: count, dtype: int64
Audio file
True     1044
False       1
Name: count, dtype: int64
Audio file
True    1045
Name: count, dtype: int64
True     3837
False       2
Name: count, dtype: int64
typical neighborhood in Porto.wav
 typical neighborhood in Porto.wav
We can find here the difference is the space in the start of file name in CLOE dataset 
We can also find the data order of CLO and CLOE is aligned!
Same for valid and eval data


In [None]:
# Replace the CLOE file-name column with the CLO one, because the CLO names are
# the ones that match the audio files on disk. The assignment pairs rows purely
# by position, so verify that assumption for every split before overwriting.
for _cloe, _clo in (
    (df_CLOE_dev, df_CLO_dev),
    (df_CLOE_val, df_CLO_val),
    (df_CLOE_eval, df_CLO_eval),
):
    assert len(_cloe) == len(_clo), "CLO and CLOE must have the same number of rows"
    # Ignore surrounding whitespace: that is the only difference seen in the checks above.
    _same = _cloe["Audio file"].str.strip() == _clo["file_name"].str.strip()
    if not _same.all():
        print(f"WARNING: {int((~_same).sum())} rows differ between CLOE and CLO beyond whitespace; "
              f"first CLOE name: {_cloe.loc[~_same, 'Audio file'].iloc[0]!r}")
    _cloe["Audio file"] = _clo["file_name"]

## Load Model

In [None]:
!pip install msclap


In [None]:
from msclap import CLAP

# Load model (Choose between versions '2022' or '2023')
# The model weight will be downloaded automatically if `model_fp` is not specified
# use_cuda=False is passed here even though a later cell selects 'cuda' for the classifier.
clap_model = CLAP(version = '2022', use_cuda=False)

# Usage reference (kept commented out; the embedding helpers below make these calls):
#
# # Extract text embeddings
# text_embeddings = clap_model.get_text_embeddings(class_labels)   # class_labels: List[str]
#
# # Extract audio embeddings
# audio_embeddings = clap_model.get_audio_embeddings(file_paths)   # file_paths: List[str]
#
# # Compute similarity between audio and text embeddings
# similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)

CLAP_weights_2022.pth:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## audio embedding

In [None]:
import torch

In [None]:
def process_audio_data(df, root, batch_size = 200):
    """Embed every audio file listed in ``df["Audio file"]`` with the global ``clap_model``.

    Each file name is joined onto ``root`` and the resulting paths are handed to
    ``clap_model.get_audio_embeddings`` in slices of at most ``batch_size``
    paths. The per-file results are collected in input order and stacked into
    one tensor, which is returned.
    """
    paths = [os.path.join(root, name) for name in df["Audio file"].tolist()]
    # Iterating a returned batch yields one embedding per file.
    per_file = [
        embedding
        for offset in range(0, len(paths), batch_size)
        for embedding in clap_model.get_audio_embeddings(paths[offset:offset + batch_size])
    ]
    return torch.stack(per_file)


In [None]:
# Embed the development-split audio and persist the tensor to Google Drive.
# NOTE(review): the target is under /content/drive, which this notebook only mounts
# further down (drive.mount in the classifier section); on a fresh kernel, mount
# Drive before running this cell.
dev_audio_embedding = process_audio_data(df_CLOE_dev, "/content/HALLM/data/Clotho/audio/development")
torch.save(dev_audio_embedding, '/content/drive/MyDrive/Dataset/11785 Project/dev_audio_embeddings')

In [None]:
dev_audio_embedding.shape

torch.Size([3839, 1024])

In [None]:
# Embed the validation-split audio and persist the tensor to Google Drive
# (requires /content/drive to be mounted already).
val_audio_embedding = process_audio_data(df_CLOE_val, "/content/HALLM/data/Clotho/audio/validation")
torch.save(val_audio_embedding, '/content/drive/MyDrive/Dataset/11785 Project/val_audio_embeddings')

In [None]:
# Embed the evaluation-split audio and persist the tensor to Google Drive
# (requires /content/drive to be mounted already).
eval_audio_embedding = process_audio_data(df_CLOE_eval, "/content/HALLM/data/Clotho/audio/evaluation")
torch.save(eval_audio_embedding, '/content/drive/MyDrive/Dataset/11785 Project/eval_audio_embeddings')

## text embedding

In [None]:
def process_entailment_data(df):
    """Reshape a wide entailment table into one (audio, caption, hypothesis, label) row per hypothesis.

    ``df`` must contain the columns ``"Audio file"``, ``"Caption"``,
    ``"Entailment"``, ``"Neutral"`` and ``"Contradiction"``. For each of the
    three hypothesis columns a copy of ``["Audio file", "Caption", <column>]``
    is taken, the hypothesis column is renamed to ``"Hypothese"`` and an
    integer ``"label"`` column is added (0 = Entailment, 1 = Neutral,
    2 = Contradiction). The three copies are concatenated in that order with a
    fresh 0..n-1 index, so the result holds all Entailment rows, then all
    Neutral rows, then all Contradiction rows. This block order is what the
    classifier section relies on when it tiles the audio embeddings with
    ``repeat(3, 1)``.

    The input frame is not modified.

    Args:
        df: Wide entailment DataFrame as described above.

    Returns:
        DataFrame with columns ``["Audio file", "Caption", "Hypothese", "label"]``
        and ``3 * len(df)`` rows.
    """
    labels = ["Entailment", "Neutral", "Contradiction"]
    # Build each block without in-place edits on a slice (which triggered
    # SettingWithCopyWarning) and concatenate once instead of growing a frame
    # that started out empty.
    frames = [
        df[["Audio file", "Caption", label]]
        .rename(columns={label: 'Hypothese'})
        .assign(label=i)
        for i, label in enumerate(labels)
    ]
    return pd.concat(frames, ignore_index=True)

def batch_process_text_data(df, batch_size = 500):
    """Embed every string in ``df["Hypothese"]`` with the global ``clap_model``.

    The hypotheses are sent to ``clap_model.get_text_embeddings`` in slices of
    at most ``batch_size`` strings; the per-string results are gathered in
    input order and stacked into one tensor, which is returned.
    """
    hypotheses = df["Hypothese"].tolist()
    stacked_rows = []
    for start in range(0, len(df), batch_size):
        stop = start + batch_size
        # Iterating the returned batch yields one embedding per hypothesis.
        stacked_rows += list(clap_model.get_text_embeddings(hypotheses[start:stop]))
    return torch.stack(stacked_rows)

In [None]:
# Development split: expand to one row per hypothesis, embed the hypotheses and
# persist the tensor to Google Drive (requires /content/drive to be mounted already).
dev_entailment = process_entailment_data(df_CLOE_dev)
dev_text_embedding = batch_process_text_data(dev_entailment)
torch.save(dev_text_embedding, '/content/drive/MyDrive/Dataset/11785 Project/dev_text_embeddings')

In [None]:
# Validation split: expand to one row per hypothesis, embed the hypotheses and
# persist the tensor to Google Drive (requires /content/drive to be mounted already).
val_entailment = process_entailment_data(df_CLOE_val)
val_text_embedding = batch_process_text_data(val_entailment)
torch.save(val_text_embedding, '/content/drive/MyDrive/Dataset/11785 Project/val_text_embeddings')

In [None]:
# Evaluation split: expand to one row per hypothesis, embed the hypotheses and
# persist the tensor to Google Drive (requires /content/drive to be mounted already).
eval_entailment = process_entailment_data(df_CLOE_eval)
eval_text_embedding = batch_process_text_data(eval_entailment)
torch.save(eval_text_embedding, '/content/drive/MyDrive/Dataset/11785 Project/eval_text_embeddings')

## label

In [None]:
# Turn the integer "label" column of each expanded entailment frame into a tensor
# (0 = Entailment, 1 = Neutral, 2 = Contradiction, as assigned by process_entailment_data).
dev_label = torch.tensor(dev_entailment["label"].tolist())
val_label = torch.tensor(val_entailment["label"].tolist())
eval_label = torch.tensor(eval_entailment["label"].tolist())

# Persist the label tensors next to the embeddings on Google Drive.
torch.save(dev_label, '/content/drive/MyDrive/Dataset/11785 Project/dev_label')
torch.save(val_label, '/content/drive/MyDrive/Dataset/11785 Project/val_label')
torch.save(eval_label, '/content/drive/MyDrive/Dataset/11785 Project/eval_label')

# Classifier Model

In [None]:
!pip install torchsummaryX==1.1.0 wandb --quiet

In [None]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
from torch.optim import lr_scheduler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
# `device` is read as a global by train() and eval() below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [None]:
# Mount Google Drive at /content/drive (Colab only). The embeddings, labels and
# checkpoints used by this notebook are read from / written to paths below it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the saved embeddings and labels for each split.
# The audio tensor holds one row per audio file, while the text/label tensors hold
# three rows per file (all Entailment rows, then all Neutral, then all Contradiction,
# as produced by process_entailment_data). repeat(3, 1) tiles the whole audio tensor
# three times along dim 0 so that row i of audio, text and label belong together.
dev_audio = torch.load("/content/drive/MyDrive/Dataset/11785 Project/dev_audio_embeddings", weights_only=True)
dev_audio = dev_audio.repeat(3,1)
dev_text = torch.load("/content/drive/MyDrive/Dataset/11785 Project/dev_text_embeddings", weights_only=True)
dev_label = torch.load("/content/drive/MyDrive/Dataset/11785 Project/dev_label", weights_only=True)

val_audio = torch.load("/content/drive/MyDrive/Dataset/11785 Project/val_audio_embeddings", weights_only=True)
val_audio = val_audio.repeat(3,1)
val_text = torch.load("/content/drive/MyDrive/Dataset/11785 Project/val_text_embeddings", weights_only=True)
val_label = torch.load("/content/drive/MyDrive/Dataset/11785 Project/val_label", weights_only=True)

eval_audio = torch.load("/content/drive/MyDrive/Dataset/11785 Project/eval_audio_embeddings", weights_only=True)
eval_audio = eval_audio.repeat(3,1)
eval_text = torch.load("/content/drive/MyDrive/Dataset/11785 Project/eval_text_embeddings", weights_only=True)
eval_label = torch.load("/content/drive/MyDrive/Dataset/11785 Project/eval_label", weights_only=True)

In [None]:
# Show the shapes of the audio / text / label tensors of every split; within a
# split the first dimension of the three tensors should agree.
print(f"The shape of dev aduio is:  {dev_audio.shape}")
print(f"The shape of dev text is:  {dev_text.shape}")
print(f"The shape of dev label is:  {dev_label.shape}")

print(f"The shape of val aduio is:  {val_audio.shape}")
print(f"The shape of val text is:  {val_text.shape}")
print(f"The shape of val label is:  {val_label.shape}")

print(f"The shape of eval aduio is:  {eval_audio.shape}")
print(f"The shape of eval text is:  {eval_text.shape}")
print(f"The shape of eval label is:  {eval_label.shape}")



The shape of dev aduio is:  torch.Size([11517, 1024])
The shape of dev text is:  torch.Size([11517, 1024])
The shape of dev label is:  torch.Size([11517])
The shape of val aduio is:  torch.Size([3135, 1024])
The shape of val text is:  torch.Size([3135, 1024])
The shape of val label is:  torch.Size([3135])
The shape of eval aduio is:  torch.Size([3135, 1024])
The shape of eval text is:  torch.Size([3135, 1024])
The shape of eval label is:  torch.Size([3135])


In [None]:
# Hyper-parameters of the run; also passed to wandb as the run configuration.
# Add more as you need them - e.g dropout values, weight decay, scheduler parameters
config = dict(
    epochs=50,
    batch_size=32,
    init_lr=1e-4,
    weight_decay=1e-5,
    drop_out=0.2,
)

In [None]:
class Embedding_Dataset(torch.utils.data.Dataset):
  """Dataset of pre-computed (audio embedding, text embedding, label) triples.

  The three containers are indexed with the same position, so item ``idx`` is
  ``(audio[idx], text[idx], label[idx])``. Because samples are paired purely by
  position, the constructor rejects containers of different lengths instead of
  silently yielding mismatched or truncated data.

  Args:
    audio: Indexable container of audio embeddings.
    text: Indexable container of text embeddings.
    label: Indexable container of class labels; its length is the dataset length.

  Raises:
    ValueError: If ``audio``, ``text`` and ``label`` do not all have the same length.
  """
  def __init__(self, audio, text, label):
    if not (len(audio) == len(text) == len(label)):
      raise ValueError(
          "audio, text and label must have the same length, got "
          f"{len(audio)}, {len(text)} and {len(label)}")
    self.audio = audio
    self.text = text
    self.label = label
  def __len__(self):
    return len(self.label)
  def __getitem__(self, idx):
    return self.audio[idx], self.text[idx], self.label[idx]

In [None]:
# Wrap each split's tensors in a dataset: "dev" is the training split,
# "val" is used during training and "eval" is held out for the final report.
dev_dataset = Embedding_Dataset(audio=dev_audio, text=dev_text, label=dev_label)
val_dataset = Embedding_Dataset(audio=val_audio, text=val_text, label=val_label)
eval_dataset = Embedding_Dataset(audio=eval_audio, text=eval_text, label=eval_label)

# Only the training loader shuffles; the other two keep the stored order.
dev_dataloader = torch.utils.data.DataLoader(
    dev_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)
eval_dataloader = torch.utils.data.DataLoader(
    eval_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

print("Batch size     : ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(len(dev_dataset), len(dev_dataloader)))
print("Validation dataset samples = {}, batches = {}".format(len(val_dataset), len(val_dataloader)))
print("Evaluation dataset samples = {}, batches = {}".format(len(eval_dataset), len(eval_dataloader)))

Batch size     :  32
Train dataset samples = 11517, batches = 360
Validation dataset samples = 3135, batches = 98
Evaluation dataset samples = 3135, batches = 98


In [None]:
# test code
for i,j,k in dev_dataloader:
  print(i.shape)
  print(j.shape)
  print(k.shape)
  break


torch.Size([32, 1024])
torch.Size([32, 1024])
torch.Size([32])


In [None]:
# Baseline
class Baseline_network(torch.nn.Module):

    def __init__(self, input_size, output_size):

        super(Baseline_network, self).__init__()

        self.fc1 = torch.nn.Linear(input_size, output_size)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        return x

In [None]:
# The classifier input is the audio and text embeddings concatenated along the
# feature axis (the shapes printed earlier show 1024 features each, hence 2048);
# the three outputs correspond to labels 0/1/2.
INPUT_SIZE  = 2048
OUTPUT_SIZE = 3
model = Baseline_network(INPUT_SIZE, OUTPUT_SIZE)
# Print a layer summary using a dummy single-sample input, then move the model to the device.
summary(model, torch.zeros(1, INPUT_SIZE))
model = model.to(device)


----------------------------------------------------------------------------------------------------
Layer                   Kernel Shape         Output Shape         # Params (K)      # Mult-Adds (M)
0_Linear                   [2048, 3]               [1, 3]                 6.15                 0.01
1_ReLU                             -               [1, 3]                    -                    -
# Params:    6.15K
# Mult-Adds: 0.01M
----------------------------------------------------------------------------------------------------


In [None]:
criterion = torch.nn.CrossEntropyLoss() # Defining Loss function.
# We use CE because the task is multi-class classification

# Adam with the learning rate and weight decay taken from `config`.
optimizer = torch.optim.Adam(model.parameters(), lr= config['init_lr'], weight_decay = config["weight_decay"]) #Defining Optimizer
# Cosine annealing over config["epochs"] scheduler steps; the training loop calls scheduler.step() once per epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config["epochs"])

In [None]:
torch.cuda.empty_cache()
gc.collect()

619

In [None]:
def train(model, dataloader, optimizer, criterion):
    """Run one training epoch and return the mean loss and accuracy over all samples.

    Every batch of ``dataloader`` must be an ``(audio, text, label)`` triple.
    Audio and text are concatenated along dim 1, passed through ``model``, and
    the parameters are updated with ``optimizer`` after back-propagating
    ``criterion(logits, label)``. Tensors are moved to the global ``device``.

    Loss and accuracy are weighted by batch size, so a smaller final batch no
    longer counts as much as a full one (the previous version averaged the
    per-batch values). The loss weighting assumes ``criterion`` returns the
    batch mean, which is the default reduction of the ``CrossEntropyLoss``
    created in this notebook.

    Args:
        model: Module mapping the concatenated embeddings to class logits.
        dataloader: Sized iterable of ``(audio, text, label)`` batches.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Loss callable taking ``(logits, label)``.

    Returns:
        ``(loss, accuracy)`` as floats; accuracy is a fraction in ``[0, 1]``.
        ``(0.0, 0.0)`` is returned when the dataloader yields no samples.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0  # running totals over samples
    batch_bar   = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    try:
        for audio, text, label in dataloader:

            ### Initialize Gradients
            optimizer.zero_grad()

            ### Move Data to Device (Ideally GPU)
            audio = audio.to(device)
            text = text.to(device)
            label = label.to(device)

            ### Forward Propagation on the concatenated [audio | text] features
            logits  = model(torch.cat((audio, text), dim=1))

            ### Loss Calculation
            loss    = criterion(logits, label)

            ### Backward Propagation
            loss.backward()

            ### Gradient Descent
            optimizer.step()

            batch_size  = logits.shape[0]
            loss_sum   += loss.item() * batch_size
            n_correct  += torch.sum(torch.argmax(logits, dim= 1) == label).item()
            n_seen     += batch_size

            batch_bar.set_postfix(loss="{:.04f}".format(loss_sum / max(n_seen, 1)),
                                  acc="{:.04f}%".format(100 * n_correct / max(n_seen, 1)))
            batch_bar.update()
            # No per-batch torch.cuda.empty_cache(): it only hands cached blocks
            # back to the driver and slows every step down.
    finally:
        # Close the progress bar even if a batch raises.
        batch_bar.close()

    if n_seen == 0:
        return 0.0, 0.0
    return loss_sum / n_seen, n_correct / n_seen

In [None]:
def eval(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` and return the mean loss and accuracy over all samples.

    Every batch must be an ``(audio, text, label)`` triple. Audio and text are
    concatenated along dim 1 and passed through ``model`` in evaluation mode
    under ``torch.inference_mode()``; no parameters are updated. Tensors are
    moved to the global ``device``.

    Loss and accuracy are weighted by batch size, so a smaller final batch no
    longer counts as much as a full one (the previous version averaged the
    per-batch values). The loss weighting assumes ``criterion`` returns the
    batch mean, which is the default reduction of the ``CrossEntropyLoss``
    created in this notebook.

    NOTE: this function shadows Python's built-in ``eval``; the name is kept
    because the training loop calls it.

    Args:
        model: Module mapping the concatenated embeddings to class logits.
        dataloader: Sized iterable of ``(audio, text, label)`` batches.
        criterion: Loss callable taking ``(logits, label)``.

    Returns:
        ``(loss, accuracy)`` as floats; accuracy is a fraction in ``[0, 1]``.
        ``(0.0, 0.0)`` is returned when the dataloader yields no samples.
    """
    model.eval() # set model in evaluation mode
    loss_sum, n_correct, n_seen = 0.0, 0, 0  # running totals over samples
    batch_bar   = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    try:
        for audio, text, label in dataloader:

            ### Move data to device (ideally GPU)
            audio = audio.to(device)
            text = text.to(device)
            label = label.to(device)

            concat = torch.cat((audio, text), dim=1)

            # makes sure that there are no gradients computed as we are not training the model now
            with torch.inference_mode():
                ### Forward Propagation
                logits  = model(concat)
                ### Loss Calculation
                loss    = criterion(logits, label)

            batch_size  = logits.shape[0]
            loss_sum   += loss.item() * batch_size
            n_correct  += torch.sum(torch.argmax(logits, dim= 1) == label).item()
            n_seen     += batch_size

            batch_bar.set_postfix(loss="{:.04f}".format(loss_sum / max(n_seen, 1)),
                                  acc="{:.04f}%".format(100 * n_correct / max(n_seen, 1)))
            batch_bar.update()
            # No per-batch torch.cuda.empty_cache(): it only hands cached blocks
            # back to the driver and slows every step down.
    finally:
        # Close the progress bar even if a batch raises.
        batch_bar.close()

    if n_seen == 0:
        return 0.0, 0.0
    return loss_sum / n_seen, n_correct / n_seen

In [None]:
# Never embed the API key in the notebook: read it from the WANDB_API_KEY
# environment variable instead. When the variable is unset, key=None is passed,
# which is wandb.login's default and leaves credential lookup to wandb itself.
# SECURITY: the key that used to be hard-coded on this line was stored in plain
# text and must be treated as leaked -- revoke it at wandb.ai/settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mzijinc[0m ([33mzijinc-carnegie-mellon-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Create your wandb run
# `version` tags both the wandb run name and the checkpoint file names written by the training loop.
version = "three"
run = wandb.init(
    name    = f"{version}-run-baseline", ### Wandb creates random run names if you skip this field, so give a meaningful one
    reinit  = True, ### Allows reinitializing the run when this cell is re-run
    #id     = "y28t31uz", ### Insert a specific run id here if you want to resume a previous run
    #resume = "must", ### Needed to resume a previous run; comment out reinit = True when using this
    project = "ALM_course_project", ### Project should be created in your wandb account
    config  = config ### Hyper-parameter dictionary recorded with the run
)

In [None]:
from datetime import datetime
import pytz
timezone = pytz.timezone('America/New_York')

In [None]:
# Free cached GPU memory / garbage once before training and let wandb track the model.
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")

for epoch in range(config["epochs"]):

    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    # Learning rate in effect for this epoch (read before the scheduler steps).
    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc   = train(model, dev_dataloader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_dataloader, criterion)

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    ### Log metrics at each epoch in your run
    # Optionally, you can log at each batch inside train/eval functions
    # (explore wandb documentation/wandb recitation)
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'valid_loss': val_loss, 'lr': curr_lr})

    # The cosine schedule advances once per epoch.
    scheduler.step()

    # Checkpoint every epoch whose validation accuracy (a fraction) exceeds 0.75.
    if val_acc > 0.75:
      current_time = datetime.now(timezone)
      formatted_time = current_time.strftime("%Y%m%d_%H%M")
      MODEL_SAVE_PATH = f"/content/drive/MyDrive/checkpoints__/project/{np.round(val_acc, 3)}_{formatted_time}_{version}.pt"

      # torch.save does not create missing directories; without this the first
      # qualifying epoch would abort the whole training run.
      os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

      torch.save({
        'epochs': config["epochs"],
        "batch_size": config["batch_size"],
        "init_lr": config["init_lr"],
        'model_state_dict': model.state_dict(),
      }, MODEL_SAVE_PATH)
      print(f"Model saved at {MODEL_SAVE_PATH}")


In [None]:
def evaluate_model(model, data_loader, device):
    """Print accuracy plus per-class and macro precision / recall / F1 of ``model`` on ``data_loader``.

    Every batch must be an ``(audio, text, label)`` triple; audio and text are
    concatenated along dim 1 before the forward pass, and the predicted class
    is the index of the largest output. Nothing is returned -- the metrics are
    only printed.
    """
    model.eval()  # evaluation mode

    predicted, expected = [], []

    with torch.no_grad():  # no gradients are needed for scoring
        for audio, text, label in data_loader:
            features = torch.cat((audio.to(device), text.to(device)), dim=1)
            label = label.to(device)

            # Index of the largest output per row is the predicted class
            batch_pred = torch.max(model(features), 1)[1]

            predicted.extend(batch_pred.cpu().numpy())
            expected.extend(label.cpu().numpy())

    # sklearn metrics operate on numpy arrays
    y_pred = np.array(predicted)
    y_true = np.array(expected)

    scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    # Compute everything first, then print, so a failing metric prints nothing.
    accuracy = accuracy_score(y_true, y_pred)
    per_class = [(title, scorer(y_true, y_pred, average=None)) for title, scorer in scorers]
    macro = [(title, scorer(y_true, y_pred, average='macro')) for title, scorer in scorers]

    print(f'Accuracy: {accuracy:.4f}')
    for title, values in per_class:
        print(f'{title} (per class):', values)
    for title, value in macro:
        print(f'Macro {title}: {value:.4f}')

In [None]:
evaluate_model(model, eval_dataloader, device)

Accuracy: 0.7378
Precision (per class): [0.69230769 0.68073879 0.86423841]
Recall (per class): [0.72344498 0.74066986 0.7492823 ]
F1 Score (per class): [0.70753393 0.70944088 0.8026653 ]
Macro Precision: 0.7458
Macro Recall: 0.7378
Macro F1 Score: 0.7399
