In [1]:
import os
import pandas as pd
import numpy as np
import glob
import torch
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score
import itertools
import random
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, ConcatDataset, Subset, random_split

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#We load in the previously extracted and saved features (last four vs final)
import pickle
with open('/home/chwu/nonlayered_mustard_updated_text.pkl', 'rb') as f:
    data = pickle.load(f)
df = pd.DataFrame(data)

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
new_df = df.dropna(subset=['text_embeddings']).reset_index(drop=True)

In [5]:
new_df = new_df.rename(columns={
    'text_embeddings': 'uText',
    'audio_embeddings': 'uAudio',
    'keyframe_embeddings': 'uVideo',
    'Sarcasm':'SAR'
})

In [6]:
new_df['cAudio'] = None
for i in range(0, len(new_df) - 1, 2):
    new_df.at[i + 1, 'cAudio'] = new_df.at[i, 'uAudio']

new_df['cVideo'] = None

for i in range(0, len(new_df) - 1, 2):
    new_df.at[i + 1, 'cVideo'] = new_df.at[i, 'uVideo']

new_df['cText'] = None

for i in range(0, len(new_df) - 1, 2):
    new_df.at[i + 1, 'cText'] = new_df.at[i, 'uText']



In [7]:
new_df = new_df.dropna(subset=['cAudio']).reset_index(drop=True)

In [None]:
#We select a 450 video sample to conduct hyperparameter search
hyp_pos = new_df[new_df['SAR']==1].sample(225).reset_index()
hyp_neg = new_df[new_df['SAR']==0].sample(225).reset_index()
h_train = pd.concat([hyp_pos[75:], hyp_neg[75:]], ignore_index=True)
h_val = pd.concat([hyp_pos[:74], hyp_neg[:74]], ignore_index=True)

In [28]:
#we map the train and validation set for hyperparameter search into appropriate formats so the Custom ContentDataset class can process them properly.
dataset1 = {}

# Iterate over each row in the DataFrame
dataset1 = {
    row['SCENE']: {
        'uText': row['uText'],
        'cText': row['cText'],
        'uAudio': row['uAudio'],
        'cAudio': row['cAudio'],
        'uVideo': row['uVideo'],
        'cVideo': row['cVideo']
    }
    for _, row in h_train.iterrows()
}

dataset2 = {
    row['SCENE']: {
        'uText': row['uText'],
        'cText': row['cText'],
        'uAudio': row['uAudio'],
        'cAudio': row['cAudio'],
        'uVideo': row['uVideo'],
        'cVideo': row['cVideo']
    }
    for _, row in h_val.iterrows()
}

map1 = h_train[['SCENE','SAR','SPEAKER']]
map2 = h_val[['SCENE','SAR','SPEAKER']]


In [29]:
#we load in the speaker list from the full dataset
speaker_list = sorted(list(df.SPEAKER.value_counts().keys()))

In [30]:
def apply_attention(tensor):
    # Compute attention scores
        attention_layer = torch.nn.Linear(768, 1)
        attention_scores = attention_layer(tensor)  # [batch_size, seq_len, 1]
        attention_weights = F.softmax(attention_scores, dim=1)  # [batch_size, seq_len, 1]
    
    # Weighted sum of the input tensor
        attended_tensor = (tensor * attention_weights).sum(dim=-2)  # [batch_size, hidden_dim]
    
        return attended_tensor

In [31]:
#defining the custom Content Dataset class used to load in extracted features, speaker info, and sarcasm label.
#this is adapted from MUStARD++'s code
#commented out section is the version used for mean of last four layer
class ContentDataset(Dataset):

    def __init__(self, mapping, dataset, speaker_list):
        self.mapping = mapping
        self.dataset = dataset
        self.speakers_mapping = speaker_list

    def __len__(self):
        return len(self.mapping)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        index = self.mapping.loc[idx, 'SCENE']
        data = self.dataset[index]
        label = int(self.mapping.loc[idx, 'SAR'])
        spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index(
            self.mapping.loc[idx, 'SPEAKER'])]
        uText = data['uText'].squeeze()
        cText = data['cText'].squeeze()
        uAudio = apply_attention(data['uAudio']).squeeze()
        cAudio = apply_attention(data['cAudio']).squeeze()
        uVideo = apply_attention(data['uVideo']).squeeze()
        cVideo = apply_attention(data['cVideo']).squeeze()

        return uText, cText, uAudio, cAudio, uVideo, cVideo, spkr, label

# class ContentDataset(Dataset):

#     def __init__(self, mapping, dataset, speaker_list):
#         self.mapping = mapping
#         self.dataset = dataset
#         self.speakers_mapping = speaker_list

#     def __len__(self):
#         return len(self.mapping)

#     def __getitem__(self, idx):
#         if torch.is_tensor(idx):
#             idx = idx.tolist()

#         index = self.mapping.loc[idx, 'SCENE']
#         data = self.dataset[index]
#         label = int(self.mapping.loc[idx, 'SAR'])
#         spkr = np.eye(len(self.speakers_mapping))[self.speakers_mapping.index(
#             self.mapping.loc[idx, 'SPEAKER'])]
#         uText = data['uText'].squeeze()
#         cText = data['cText'].squeeze()
#         uAudio = data['uAudio'].squeeze()
#         cAudio = data['cAudio'].squeeze()
#         uVideo = data['uVideo'].squeeze()
#         cVideo = data['cVideo'].squeeze()

#         return uText, cText, uAudio, cAudio, uVideo, cVideo, spkr, label

In [33]:
#we load the features from the datafram into the ContentDataset class with its respective fold
CD1 = ContentDataset(mapping=map1, dataset=dataset1,speaker_list=speaker_list)
CD2 = ContentDataset(mapping=map2, dataset=dataset2,speaker_list=speaker_list)


In [34]:
#we inspect whether a sample of the ContentDataset class is in the right shape
index = 9  # Example index
entry = CD2[index]

# Print the entry
print("uText:", entry[0].shape)
print("cText:", entry[1].shape)
print("uAudio:", entry[2].shape)
print("cAudio:", entry[3].shape)
print("uVideo:", entry[4].shape)
print("cVideo:", entry[5].shape)
print("spkr:", entry[6].shape)
print("label:", entry[7])

uText: torch.Size([768])
cText: torch.Size([768])
uAudio: torch.Size([768])
cAudio: torch.Size([768])
uVideo: torch.Size([768])
cVideo: torch.Size([768])
spkr: (33,)
label: 0


The following code blocks (seed, evaluation, training, get_command, get_models_and_parameters, seed_worker) is largely replicated from MUStARD++'s training and evaluation code.

In [35]:
import random
def seed():
    """ This method is used for seeding the code and different points"""
    np.random.seed(42)
    random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True

In [36]:
def evaluation(loader, mod, call, report=False, flag=False):
    """Args:
            loader:
                It is the validation dataloader
            mod:
                It is the best model, which we have to evaluate
            call:
                call is the COMMAND to be excuted to run the forward method of the model
                it changed as per the modality and other possible input
            report:
                If True then the classification report for the validation set is printed
            flag:
                if True the instead of evaluation metrics, method returns the class labels (predictions)
    """
    with torch.no_grad():
        pred = []
        true = []
        total_loss = []
        criterion = nn.CrossEntropyLoss()
        criterion.to(device)
        seed()
        for batch in loader:
            uText = batch[0].float().to(device)
            cText = batch[1].float().to(device)
            uAudio = batch[2].float().to(device)
            cAudio = batch[3].float().to(device)
            uVideo = batch[4].float().to(device)
            cVideo = batch[5].float().to(device)
            speaker = batch[6].float().to(device)
            y_true = batch[7].long().to(device)
            del batch
            output = torch.softmax(eval(call), dim=1)
            loss = criterion(output, y_true)
            del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker
            # with torch.cuda.device(device):
            #     torch.cuda.empty_cache()
            total_loss.append(loss)
            pred.extend(output.detach().cpu().tolist())
            true.extend(y_true.tolist())
        if flag:
            return true, np.argmax(pred, axis=1)
        if report:
            print(classification_report(true, np.argmax(pred, axis=1), digits=3))

        
        return f1_score(true, np.argmax(pred, axis=1), average='macro'), sum(total_loss)/len(total_loss)

In [37]:
def training(mod, criterion, optimizer, call, train_loader, valid_loader, fold, e=500, patience=5, report=False,save=True):
    """Args:
            mod :
                It is the mod we have to train
            criterion :
                Loss function, here we have Cross entropy loss
            optimizer :
              object of torch.optim class
            call:
                call is the COMMAND to be excuted to run the forward method of the model
                it changed as per the modality and other possible input
            train_loader:
                It is a instance of train dataloader
            valid_loader:
                It is a instance of validation dataloader, it is given as a input to evaluation class
            fold:
                5 FOLD {0,1,2,3,4}
            e:
                maximum epoch
            patience:
                how many epoch to wait after the early stopping condition in satisfied
            report:
                It True then the classification report for the validation set is printed, it is given as a input to evaluation class
            save:
                If true then best model for each fold is saved

    """
    print('-'*100)
    train_losses = [0]
    valid_losses = [0]
    max_f1 = 0
    patience_flag = 1
    best_epoch = 0
    print(fold, e, patience)

    while e > 0:
        total_loss = []
        seed()
        for batch_data in train_loader:
            uText = batch_data[0].float().to(device)
            cText = batch_data[1].float().to(device)
            uAudio = batch_data[2].float().to(device)
            cAudio = batch_data[3].float().to(device)
            uVideo = batch_data[4].float().to(device)
            cVideo = batch_data[5].float().to(device)
            speaker = batch_data[6].float().to(device)
            y_true = batch_data[7].long().to(device)
            del batch_data
            output = eval(call)
            loss = criterion(output, y_true)
            del uText, cText, uAudio, cAudio, uVideo, cVideo, speaker
            # with torch.cuda.device(device):
            #     torch.cuda.empty_cache()
            optimizer.zero_grad()
            total_loss.append(loss.detach().item())
            loss.backward()
            optimizer.step()
        with torch.no_grad():
            valid_f1, valid_loss = evaluation(
                valid_loader, mod, call, report, False)
            train_losses.append(sum(total_loss)/len(total_loss))
            valid_losses.append(valid_loss)

            e = e-1
            if max_f1 < valid_f1:
                max_f1 = valid_f1
                best_model = mod
                best_epoch = 500-e
                print(
                    f'Epoch:{best_epoch} | Train Loss: {loss.detach().item():.3f} | Valid loss: { valid_loss.detach().item():7.3f} | Valid F1: { valid_f1:7.3f}')

            if abs(train_losses[-2]-train_losses[-1]) < 0.0001:
                if patience_flag == 1:
                    e = patience
                    patience_flag = 0
            else:
                patience_flag = 1

    # if save:
    #     best_model.to(device)
    #     torch.save(best_model.state_dict(), 'MPP_Code/saved_models/sarc/' +
    #                         filename+'_'+str(fold)+'.pth')
                
    return evaluation(valid_loader, best_model, call, report, True), best_epoch

In [38]:
def get_command(input_modes, context_flag, speaker_flag):
    """
        This method is used to create the COMMAND to execute the forward methof of particular model,
        Depending upon the input combination
        Args:
            input_modes:
                Input Modality {VTA, VT, VA, TA, V, T, A}
            context_flag :
                If true then "with context" else "without context" 
            speaker_flag:
                if true then Speaker dependent else Speaker INdependent
    """
    if input_modes == 'VTA':
        COMMAND = "mod(**{'uA':uVideo, 'uB':uText, 'uC':uAudio"
        if context_flag == 'y':
            COMMAND += ",'cA':cVideo, 'cB':cText, 'cC':cAudio"

    elif input_modes == 'VT':
        COMMAND = "mod(**{'uA':uVideo, 'uB':uText"
        if context_flag == 'y':
            COMMAND += ",'cA':cVideo, 'cB':cText"

    elif input_modes == 'VA':
        COMMAND = "mod(**{'uA':uVideo, 'uB':uAudio"
        if context_flag == 'y':
            COMMAND += ",'cA':cVideo, 'cB':cAudio"

    elif input_modes == 'TA':
        COMMAND = "mod(**{'uA':uText, 'uB':uAudio"
        if context_flag == 'y':
            COMMAND += ",'cA':cText, 'cB':cAudio"

    elif input_modes == 'T':
        COMMAND = "mod(**{'uA':uText"
        if context_flag == 'y':
            COMMAND += ",'cA':cText"

    elif input_modes == 'V':
        COMMAND = "mod(**{'uA':uVideo"
        if context_flag == 'y':
            COMMAND += ",'cA':cVideo"

    elif input_modes == 'A':
        COMMAND = "mod(**{'uA':uAudio"
        if context_flag == 'y':
            COMMAND += ",'cA':cAudio"
    if speaker_flag == 'y':
        COMMAND += ",'speaker_embedding':speaker})"
    else:
        COMMAND += "})"

    return COMMAND


In [39]:
#num_classes has been changed to 2 as we are performing binary sarcasm detection
#all tensors shape have been changed to 768 to correspond to the tensor shape of Data2Vec after reshaping with ContentDataset
def get_model_and_parameters(mode, speaker, context):
    """
    Parameters:
    - mode: A string representing the mode.
    - speaker: A string ('y' or 'n') indicating whether the speaker is dependent or not.
    - context: A string ('y' or 'n') indicating whether context is considered or not.
    """
    # Here we are sorting VTA in descending order, in order to have consistency in the model
    input_modes = ''.join(reversed(sorted(list(mode.upper()))))

    parameters = {}
    MODEL_NAME = 'Speaker_'

    parameters['num_classes'] = 2

    if speaker.lower() == 'y':
        MODEL_NAME += 'Dependent_'
        parameters['n_speaker'] = len(speaker_list)  # Make sure speaker_list is defined elsewhere in your code
    else:
        MODEL_NAME += 'Independent_'

    if len(input_modes) == 3:
        MODEL_NAME += 'Triple_'
        parameters['input_embedding_A'] = 768
        parameters['input_embedding_B'] = 768
        parameters['input_embedding_C'] = 768

    elif len(input_modes) == 2:
        MODEL_NAME += 'Dual_'
        parameters['input_embedding_A'] = 768
        parameters['input_embedding_B'] = 768 
    else:
        MODEL_NAME += 'Single_'
        parameters['input_embedding_A'] = 768 

    MODEL_NAME += 'Mode_with'
    MODEL_NAME += 'out' if context.lower() == 'n' else ''
    MODEL_NAME += '_Context'

    MODEL_NAME = 'emotion_classification_model.' + MODEL_NAME

    COMMAND = get_command(input_modes, context.lower(), speaker.lower())  # Ensure get_command is defined
    return MODEL_NAME, parameters, COMMAND


In [40]:
def seed_worker(worker_id):
    """ This method is used for seeding the worker in the dataloader"""
    worker_seed = 42
    np.random.seed(worker_seed)
    random.seed(worker_seed)


Here only one model is included. Models of different ablation setting are identical to that of MUStARD++.

In [41]:

class Speaker_Dependent_Triple_Mode_with_Context(nn.Module):
    def __init__(self, n_speaker=len(speaker_list), input_embedding_A=768, input_embedding_B=768, input_embedding_C=768, shared_embedding =768, projection_embedding=768, dropout=0.5, num_classes=2):
        super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__()

        self.n_speaker = n_speaker

        self.input_embedding_A = input_embedding_A
        self.input_embedding_B = input_embedding_B
        self.input_embedding_C = input_embedding_C

        self.shared_embedding = shared_embedding
        self.projection_embedding = projection_embedding
        self.num_classes = num_classes
        self.dropout = dropout

        self.A_context_share = nn.Linear(self.input_embedding_A, self.shared_embedding)
        self.A_utterance_share = nn.Linear(self.input_embedding_A, self.shared_embedding)

        self.C_context_share = nn.Linear(self.input_embedding_C, self.shared_embedding)
        self.C_utterance_share = nn.Linear(self.input_embedding_C, self.shared_embedding)

        self.B_context_share = nn.Linear(self.input_embedding_B, self.shared_embedding)
        self.B_utterance_share = nn.Linear(self.input_embedding_B, self.shared_embedding)

        self.norm_A_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.norm_C_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.norm_B_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.collaborative_gate_1 = nn.Linear(2 * self.shared_embedding, self.projection_embedding)
        self.collaborative_gate_2 = nn.Linear(self.projection_embedding, self.shared_embedding)

        self.pred_module = nn.Sequential(
            nn.Linear(self.n_speaker + 3 * self.shared_embedding, 2 * self.shared_embedding),
            nn.BatchNorm1d(2 * self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(2 * self.shared_embedding, self.shared_embedding),
            nn.BatchNorm1d(self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(self.shared_embedding, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, self.num_classes)
        )

    def attention(self, featureA, featureB):
        """ This method takes two features and calculates the attention """
        input = torch.cat((featureA, featureB), dim=1)
        return nn.functional.softmax(self.collaborative_gate_1(input), dim=1)
    

    def attention_aggregator(self, feA, feB, feC, feD, feE, feF):
        """ This method calculates the attention for feA with respect to others"""
        input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention(feA, feD) + self.attention(feA, feE) + self.attention(feA, feF)
        return nn.functional.softmax(self.collaborative_gate_2(input), dim=1)

    def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding):
        """
        Args:
            uA: Utterance Video
            uB: Utterance Text
            uC: Utterance Audio
            cA: Context Video
            cB: Context Text
            cC: Context Audio

        Returns:
            probability of emotion classes
        """
        # Pooling or averaging the sequences to match the expected input dimensions for linear layers
        
        
        shared_A_context = self.norm_A_context(nn.functional.relu(self.A_context_share(cA)))
        shared_A_utterance = self.norm_A_utterance(nn.functional.relu(self.A_utterance_share(uA)))

        shared_C_context = self.norm_C_context(nn.functional.relu(self.C_context_share(cC)))
        shared_C_utterance = self.norm_C_utterance(nn.functional.relu(self.C_utterance_share(uC)))

        shared_B_context = self.norm_B_context(nn.functional.relu(self.B_context_share(cB)))
        shared_B_utterance = self.norm_B_utterance(nn.functional.relu(self.B_utterance_share(uB)))

        updated_shared_A = shared_A_utterance * self.attention_aggregator(
            shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance)
        updated_shared_C = shared_C_utterance * self.attention_aggregator(
            shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance)
        updated_shared_B = shared_B_utterance * self.attention_aggregator(
            shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance)

        temp = torch.cat((updated_shared_A, updated_shared_C), dim=1)
        input = torch.cat((temp, updated_shared_B), dim=1)

        input = torch.cat((input, speaker_embedding), dim=1)

        return self.pred_module(input)

In [None]:
speaker = "Y"  # or "Y" for Speaker Dependent else "n" or "N"
mode = "VTA"  # "V" for Video, "T" for Text, "A" for Audio respectively
context = "Y"  # "y" or "Y" for Context Dependent else "n" or "N"

MODEL_NAME, parameters, COMMAND = get_model_and_parameters(mode, speaker, context)


# Define ranges for hyper-parameters
dropout_values = [0.2, 0.3, 0.4]
learning_rate_values = [0.001, 0.0001]
batch_size_values = [32, 64, 128]
shared_emb_values = [2048, 1024]
proj_emb_values = [1024, 256]

# Function to initialize and train the model, change model according to ablation settings
def train_and_evaluate_model(dropout, lr, batch_size, shared_emb_size, proj_emb_size):
    seed()
    train_loader = DataLoader(CD1, batch_size=batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker)
    seed()
    val_loader = DataLoader(CD2, batch_size=batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker)

    seed()
    mod = Speaker_Dependent_Triple_Mode_with_Context(num_classes=2, dropout=dropout, shared_embedding=shared_emb_size, projection_embedding=proj_emb_size)
    
    mod.to(device)

    seed()
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    
    seed()
    optimizer = optim.Adam(params=mod.parameters(), betas=(0.5, 0.99), lr=lr)
    
    (true, pred), epo = training(
        mod=mod, 
        criterion=criterion, 
        optimizer=optimizer, 
        call=COMMAND, 
        train_loader=train_loader, 
        valid_loader=val_loader, 
        fold=0, 
        e=epoch, 
        patience=patience
    )
    test_loader = DataLoader(CD3, batch_size, num_workers=0, pin_memory=False, worker_init_fn=seed_worker)
    test_accuracy, test_loss = evaluation(test_loader, mod, call=COMMAND, report=True)
    return test_accuracy, test_loss
    

# Grid search
best_params = None
best_accuracy = 0
best_loss = float('inf')


for dropout, lr, batch_size, shared_emb_size, proj_emb_size in itertools.product(
    dropout_values, learning_rate_values, batch_size_values, shared_emb_values, proj_emb_values
):
    print(f'Training with dropout={dropout}, lr={lr}, batch_size={batch_size}, shared_emb_size={shared_emb_size}, proj_emb_size={proj_emb_size}')
    accuracy, loss = train_and_evaluate_model(dropout, lr, batch_size, shared_emb_size, proj_emb_size)
    if loss < best_loss:
        best_loss = loss
        best_accuracy = accuracy
        best_params = (dropout, lr, batch_size, shared_emb_size, proj_emb_size)
        print(f'New best params: {best_params} with loss: {best_loss} and F1 score of :{best_accuracy}')

print(f'Best hyper-parameters: {best_params} with loss: {best_loss} and F1 score of {best_accuracy}')


Training with dropout=0.2, lr=0.001, batch_size=16, shared_emb_size=2048, proj_emb_size=1024
----------------------------------------------------------------------------------------------------
0 500 5
Epoch:1 | Train Loss: 0.631 | Valid loss:   0.677 | Valid F1:   0.558
Epoch:2 | Train Loss: 0.422 | Valid loss:   0.678 | Valid F1:   0.576
Epoch:3 | Train Loss: 0.150 | Valid loss:   0.689 | Valid F1:   0.633
              precision    recall  f1-score   support

           0      0.634     0.656     0.645        90
           1      0.644     0.622     0.633        90

    accuracy                          0.639       180
   macro avg      0.639     0.639     0.639       180
weighted avg      0.639     0.639     0.639       180

New best params: (0.2, 0.001, 16, 2048, 1024) with loss: 0.6468186974525452 and F1 score of :0.63878855237566
Training with dropout=0.2, lr=0.001, batch_size=16, shared_emb_size=2048, proj_emb_size=256
-----------------------------------------------------------