In [10]:
import pandas as pd

# Load the CSV file into a DataFrame.
# `df` is a notebook-level name: the helper functions defined in later cells
# read it directly instead of taking the frame as an argument.
# The path is relative to the notebook's working directory.
pathname = '../datasets/MELD.Raw/dev_sent_emo.csv'
df = pd.read_csv(pathname)


In [11]:
df.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,"Oh my God, hes lost it. Hes totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049"
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261"
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915"
3,4,Youre a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960"
4,5,"Aww, man, now we wont be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505"


In [12]:
# Report how many rows were loaded from the CSV.
num_rows = df.shape[0]
print(f'The DataFrame contains {num_rows} rows.')

The DataFrame contains 1109 rows.


In [3]:
df.iloc[0]

Sr No.                                                       1
Utterance       Oh my God, hes lost it. Hes totally lost it.
Speaker                                                 Phoebe
Emotion                                                sadness
Sentiment                                             negative
Dialogue_ID                                                  0
Utterance_ID                                                 0
Season                                                       4
Episode                                                      7
StartTime                                         00:20:57,256
EndTime                                           00:21:00,049
Name: 0, dtype: object

In [4]:
def get_transcription_audio_file(row):
    """Print the transcript and the clip file name for one row of ``df``.

    Args:
        row: Integer position handed to ``df.iloc``.

    The clip name is assembled as ``dia<Dialogue_ID>_utt<Utterance_ID>.mp4``.
    Nothing is returned; both values are only printed.
    """
    record = df.iloc[row]
    transcript = record['Utterance']
    clip_name = f"dia{record['Dialogue_ID']!s}_utt{record['Utterance_ID']!s}.mp4"
    print(f'Transcripción: {transcript}')
    print(f'Archivo de audio: {clip_name}')
   

In [5]:
get_transcription_audio_file(0)

Transcripción: Oh my God, hes lost it. Hes totally lost it.
Archivo de audio: dia0_utt0.mp4


In [1]:
import torch
import torchaudio
from pydub import AudioSegment


In [67]:
import os

# Function to extract audio from mp4 and convert to wav
def extract_audio(mp4_file, wav_file):
    """Decode the audio track of an MP4 file and write it out as WAV.

    Args:
        mp4_file: Path of the input MP4 file.
        wav_file: Path the WAV data is exported to.

    Raises:
        FileNotFoundError: If ``mp4_file`` is not an existing regular file.
    """
    if not os.path.isfile(mp4_file):
        raise FileNotFoundError(f"The file {mp4_file} does not exist.")
    audio = AudioSegment.from_file(mp4_file, format="mp4")
    # export() hands back the output file object still open; close it
    # explicitly so converting many clips in a loop does not rely on garbage
    # collection to release the handles.
    exported = audio.export(wav_file, format="wav")
    exported.close()

# Function to get embeddings using wav2Vec
# Holds the constructed wav2vec model between calls. Building it loads the
# pretrained checkpoint, which is far too slow to repeat for every utterance.
_WAV2VEC2_CACHE = {}


def _get_wav2vec2_base():
    """Return ``(bundle, model)`` for WAV2VEC2_BASE, building the model once."""
    bundle = torchaudio.pipelines.WAV2VEC2_BASE
    if "model" not in _WAV2VEC2_CACHE:
        _WAV2VEC2_CACHE["model"] = bundle.get_model()
    return bundle, _WAV2VEC2_CACHE["model"]


def get_wav2vec_embeddings(wav_file):
    """Run the pretrained WAV2VEC2_BASE model on one audio file.

    The audio is averaged down to a single channel and resampled to the
    bundle's sample rate before it is passed to the model. Diagnostic shapes
    and sample rates are printed along the way.

    Args:
        wav_file: Path of the audio file handed to ``torchaudio.load``.

    Returns:
        The value returned by calling the model on the waveform. In this
        notebook that is a tuple whose first element is the feature tensor
        (printed shapes are ``(1, frames, 768)``) and whose second is ``None``.
    """
    # Reuse the already-built model instead of reloading it on every call.
    bundle, model = _get_wav2vec2_base()

    # Load the audio file
    waveform, sample_rate = torchaudio.load(wav_file)
    print(f'Sample rate: {sample_rate}')
    print(f'Waveform shape before: {waveform.shape}')
    # Ensure the audio is mono: average the channels, keeping the channel axis
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if necessary
    print(f'Bundle sample rate: {bundle.sample_rate}')
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.transforms.Resample(sample_rate, bundle.sample_rate)(waveform)

    # Get the embeddings; no autograd bookkeeping is needed for extraction
    with torch.inference_mode():
        print(f'Waveform shape: {waveform.shape}')
        embeddings = model(waveform)

    return embeddings


In [3]:

# Example usage: convert one clip to WAV, then embed it with wav2vec.
# The WAV is written under a relative name, i.e. into the working directory.
mp4_file = '../datasets/MELD.Raw/dev_splits_complete/dia0_utt0.mp4'
wav_file = 'extracted_audio.wav'
extract_audio(mp4_file, wav_file)
# `embeddings` is the raw return value of the model call (a tuple, see output).
embeddings = get_wav2vec_embeddings(wav_file)
print(embeddings)

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960.pth" to /Users/marcofura/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth
100%|██████████| 360M/360M [13:15<00:00, 475kB/s]  


(tensor([[[ 0.3743,  0.2285,  0.3895,  ...,  0.6266,  0.1548,  0.0335],
         [ 0.0541, -0.0548, -0.3280,  ...,  0.2684,  0.5357, -0.0107],
         [ 0.0563, -0.0471, -0.3366,  ...,  0.2579,  0.5326, -0.0014],
         ...,
         [ 0.3783,  0.2390,  0.3014,  ..., -0.4249,  0.1131, -0.1660],
         [ 0.3739,  0.2372,  0.3494,  ..., -0.5008,  0.1228, -0.1781],
         [ 0.3632,  0.1879,  0.3222,  ..., -0.4441,  0.0179, -0.3082]]]), None)


In [8]:
embeddings[0].shape

torch.Size([1, 139, 768])

In [13]:
def get_embedding_for_row(row):
    """Extract the audio for one row of ``df`` and embed it with wav2vec.

    Args:
        row: Integer position handed to ``df.iloc``.

    Returns:
        Whatever ``get_wav2vec_embeddings`` returns for the extracted WAV.

    The WAV is always exported to ``extracted_audio.wav``, so every call
    reuses the same output path.
    """
    record = df.iloc[row]
    dialogue_id, utterance_id = record['Dialogue_ID'], record['Utterance_ID']
    source_clip = f"../datasets/MELD.Raw/dev_splits_complete/dia{dialogue_id}_utt{utterance_id}.mp4"
    scratch_wav = "extracted_audio.wav"
    extract_audio(source_clip, scratch_wav)
    return get_wav2vec_embeddings(scratch_wav)

In [14]:
embeddings=get_embedding_for_row(1)

In [15]:
embeddings[0].shape

torch.Size([1, 66, 768])

In [16]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [17]:

# Load pre-trained model tokenizer.
# `tokenizer` is a notebook-level name that get_bert_embeddings reads directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model.
# `model` is likewise read as a global by get_bert_embeddings, so it must keep
# pointing at this BERT encoder for that function to work.
model = BertModel.from_pretrained('bert-base-uncased')


In [63]:


# Function to get BERT embeddings for a sentence
def get_bert_embeddings(sentence):
    """Return the BERT token embeddings for ``sentence``.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sentence: Text handed to ``tokenizer``.

    Returns:
        The ``last_hidden_state`` attribute of the model output.
    """
    encoded = tokenizer(sentence, return_tensors='pt')

    # Feature extraction only, so gradient tracking is switched off.
    with torch.no_grad():
        print(f'Input type: {type(encoded)}')
        print(f'Inputs Shape: {encoded["input_ids"].shape}')
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states



In [23]:
# Example usage.
# Note: this rebinds `embeddings`, which earlier cells used for the wav2vec
# output; from here on it holds the BERT last hidden state instead.
sentence = "Hello world, how are you?"
embeddings = get_bert_embeddings(sentence)
print(embeddings)

tensor([[[-1.0088e-01,  7.2336e-02, -6.6387e-02,  ..., -4.1218e-01,
           2.0131e-01,  5.0706e-01],
         [-8.0426e-02,  1.9865e-01,  9.1673e-01,  ..., -7.8568e-02,
           1.0137e+00,  2.1467e-01],
         [-3.9301e-02, -8.2062e-02,  1.3620e+00,  ..., -4.4073e-01,
           7.9244e-01, -6.5663e-02],
         ...,
         [-3.8392e-01, -9.5377e-01,  9.9188e-01,  ..., -2.4426e-01,
           4.6690e-01, -5.5713e-01],
         [-3.6248e-01, -8.8275e-01, -6.6190e-01,  ...,  1.6279e-01,
           8.7157e-02, -3.6306e-04],
         [ 5.5324e-01, -7.4988e-02, -1.8088e-01,  ...,  1.9566e-01,
          -4.2207e-01, -1.1609e-01]]])


In [24]:
embeddings.shape

torch.Size([1, 9, 768])

In [46]:
def generate_wavfiles_from_df(df, audio_dir="../datasets/MELD.Raw/dev_splits_complete"):
    """Extract a WAV file for every row and record its path in ``df["wav_file"]``.

    Rows whose ``wav_file`` cell already holds a string are skipped, so the
    function can be re-run to resume after a partial pass. Rows whose
    extraction fails keep a missing ``wav_file`` value and the error is
    printed; processing continues with the next row.

    Args:
        df: DataFrame with ``Dialogue_ID`` and ``Utterance_ID`` columns. It is
            modified in place; a ``wav_file`` column is added if absent.
        audio_dir: Directory holding the ``dia<D>_utt<U>.mp4`` clips; the WAV
            files are written next to them.

    Returns:
        None.
    """
    # A freshly loaded frame has no "wav_file" column yet; create it (object
    # dtype, all missing) instead of failing with KeyError on the first row.
    if "wav_file" not in df.columns:
        df["wav_file"] = None
    wav_col = df.columns.get_loc("wav_file")

    for i in range(df.shape[0]):
        row_data = df.iloc[i]
        clip_stem = f"{audio_dir}/dia{row_data['Dialogue_ID']}_utt{row_data['Utterance_ID']}"
        mp4_file = f"{clip_stem}.mp4"
        wav_file = f"{clip_stem}.wav"
        if isinstance(row_data["wav_file"], str):
            print(f"Skipping row {i} {type(row_data['wav_file'])}")
            continue
        try:
            extract_audio(mp4_file, wav_file)
            print(f"Extracted audio for row {i}")
            # Rows are read by position (iloc), so write by position as well.
            # Writing by label would hit the wrong row, or append a new one,
            # whenever the index is not 0..n-1 (e.g. after dropna).
            df.iat[i, wav_col] = wav_file
        except Exception as e:
            # Best effort: report the failure and move on to the next clip.
            print(f"Error extracting audio for row {i} {e}")
        

In [34]:
df.iloc[1100]

Sr No.                  1170
Utterance             And it
Speaker               Monica
Emotion              neutral
Sentiment            neutral
Dialogue_ID              113
Utterance_ID               5
Season                     6
Episode                    2
StartTime       00:19:11,525
EndTime         00:19:14,068
wav_file                 NaN
Name: 1100, dtype: object

In [47]:
generate_wavfiles_from_df(df)

Skipping row 0 <class 'str'>
Skipping row 1 <class 'str'>
Skipping row 2 <class 'str'>
Skipping row 3 <class 'str'>
Skipping row 4 <class 'str'>
Skipping row 5 <class 'str'>
Skipping row 6 <class 'str'>
Skipping row 7 <class 'str'>
Skipping row 8 <class 'str'>
Skipping row 9 <class 'str'>
Skipping row 10 <class 'str'>
Skipping row 11 <class 'str'>
Skipping row 12 <class 'str'>
Skipping row 13 <class 'str'>
Skipping row 14 <class 'str'>
Skipping row 15 <class 'str'>
Skipping row 16 <class 'str'>
Skipping row 17 <class 'str'>
Skipping row 18 <class 'str'>
Skipping row 19 <class 'str'>
Skipping row 20 <class 'str'>
Skipping row 21 <class 'str'>
Skipping row 22 <class 'str'>
Skipping row 23 <class 'str'>
Skipping row 24 <class 'str'>
Skipping row 25 <class 'str'>
Skipping row 26 <class 'str'>
Skipping row 27 <class 'str'>
Skipping row 28 <class 'str'>
Skipping row 29 <class 'str'>
Skipping row 30 <class 'str'>
Skipping row 31 <class 'str'>
Skipping row 32 <class 'str'>
Skipping row 33 <cla

In [48]:
# Keep only the rows whose "wav_file" cell is set.
# dropna keeps the original index labels (they are not renumbered); the
# helpers in this notebook address rows by position via df.iloc.
df = df.dropna(subset=['wav_file'])
df.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,wav_file
0,1,"Oh my God, hes lost it. Hes totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049",../datasets/MELD.Raw/dev_splits_complete/dia0_...
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261",../datasets/MELD.Raw/dev_splits_complete/dia0_...
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915",../datasets/MELD.Raw/dev_splits_complete/dia1_...
3,4,Youre a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960",../datasets/MELD.Raw/dev_splits_complete/dia1_...
4,5,"Aww, man, now we wont be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505",../datasets/MELD.Raw/dev_splits_complete/dia1_...


In [68]:
def generate_both_embeddings_from_row(row):
    """Embed one row of ``df`` in both modalities.

    Args:
        row: Integer position handed to ``df.iloc``.

    Returns:
        A 2-tuple: the result of ``get_wav2vec_embeddings`` for the row's
        ``wav_file`` and the result of ``get_bert_embeddings`` for its
        ``Utterance``, in that order.
    """
    record = df.iloc[row]
    audio_part = get_wav2vec_embeddings(record["wav_file"])
    text_part = get_bert_embeddings(record["Utterance"])
    return audio_part, text_part

In [69]:
wav_embeddings,word_embeddings=generate_both_embeddings_from_row(0)

Sample rate: 48000
Waveform shape before: torch.Size([6, 134144])
Bundle sample rate: 16000
Waveform shape: torch.Size([1, 44715])
Input type: <class 'transformers.tokenization_utils_base.BatchEncoding'>
Inputs Shape: torch.Size([1, 17])


In [55]:
wav_embeddings[0].shape,word_embeddings.shape

(torch.Size([1, 139, 768]), torch.Size([1, 17, 768]))

In [58]:
wav_embeddings[0][0].shape,word_embeddings[0].shape

(torch.Size([139, 768]), torch.Size([17, 768]))

In [None]:
import torch

import torch.nn as nn
import torch.nn.functional as F

class CrossAttentionModel(nn.Module):
    """Classify an utterance from text embeddings enriched with audio.

    Each text position attends over the audio sequence (text is the query,
    audio supplies keys and values). The attended audio vector is concatenated
    to the text embedding, the sequence is run through an LSTM, and the last
    LSTM output is mapped to class probabilities.

    Args:
        bert_embedding_dim: Feature size of the text embeddings.
        audio_embedding_dim: Feature size of the audio embeddings; must be
            divisible by ``num_heads``.
        lstm_hidden_dim: Hidden size of the LSTM.
        num_classes: Number of output classes.
        num_heads: Number of attention heads (default 8).
    """

    def __init__(self, bert_embedding_dim, audio_embedding_dim, lstm_hidden_dim, num_classes, num_heads=8):
        super().__init__()
        self.bert_embedding_dim = bert_embedding_dim
        self.audio_embedding_dim = audio_embedding_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.num_classes = num_classes

        # Maps text embeddings into the audio feature space so they can act as
        # attention queries against the audio keys/values.
        self.query_proj = nn.Linear(bert_embedding_dim, audio_embedding_dim)

        # Attention layer. batch_first=True because every tensor in this model
        # is laid out as (batch, seq, dim).
        self.attention = nn.MultiheadAttention(
            embed_dim=audio_embedding_dim, num_heads=num_heads, batch_first=True
        )

        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=bert_embedding_dim + audio_embedding_dim,
            hidden_size=lstm_hidden_dim,
            batch_first=True,
        )

        # Fully connected layer
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)

    def forward(self, bert_embeddings, audio_embeddings):
        """Return class probabilities of shape ``(batch_size, num_classes)``.

        Args:
            bert_embeddings: ``(batch_size, seq_len, bert_embedding_dim)``.
            audio_embeddings: ``(batch_size, audio_seq_len, audio_embedding_dim)``.
                ``audio_seq_len`` does not have to equal ``seq_len``.
        """
        # Cross-attention: one attended audio vector per text position, so the
        # result has the text sequence length regardless of the audio length.
        queries = self.query_proj(bert_embeddings)
        attn_output, _ = self.attention(
            queries, audio_embeddings, audio_embeddings, need_weights=False
        )

        # Concatenate BERT embeddings with attended audio embeddings
        enriched_embeddings = torch.cat((bert_embeddings, attn_output), dim=-1)

        # Pass through LSTM
        lstm_out, _ = self.lstm(enriched_embeddings)

        # Pass through fully connected layer
        logits = self.fc(lstm_out[:, -1, :])  # Use the last output of LSTM

        # Apply softmax
        probs = F.softmax(logits, dim=-1)

        return probs

# Example usage
bert_embedding_dim = 768
audio_embedding_dim = 512
lstm_hidden_dim = 256
num_classes = 7  # Assuming 7 emotion classes

# Bound to its own name so the notebook-level `model` (the BERT encoder that
# get_bert_embeddings reads as a global) is not overwritten by this demo.
emotion_model = CrossAttentionModel(bert_embedding_dim, audio_embedding_dim, lstm_hidden_dim, num_classes)

# Example input tensors
bert_embeddings = torch.randn(1, 10, bert_embedding_dim)  # (batch_size, seq_len, bert_embedding_dim)
audio_embeddings = torch.randn(1, 20, audio_embedding_dim)  # (batch_size, audio_seq_len, audio_embedding_dim)

# Forward pass
output = emotion_model(bert_embeddings, audio_embeddings)
print(output)

ModuleNotFoundError: No module named 'keras'

In [75]:
import torch
import torch.nn as nn

class CrossAttentionTextEnrichmentModel(nn.Module):
    """Enrich text embeddings by cross-attending over audio embeddings.

    Args:
        bert_embedding_dim: Feature size of the text embeddings; must be
            divisible by ``num_heads``.
        audio_embedding_dim: Feature size of the audio embeddings. It may
            differ from ``bert_embedding_dim``.
        num_heads: Number of attention heads (default 8).
    """

    def __init__(self, bert_embedding_dim, audio_embedding_dim, num_heads=8):
        super().__init__()
        self.bert_embedding_dim = bert_embedding_dim
        self.audio_embedding_dim = audio_embedding_dim

        # Cross-attention layer. kdim/vdim tell it the width of the audio
        # keys/values; without them the layer only accepts audio whose feature
        # size equals bert_embedding_dim.
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=bert_embedding_dim,
            num_heads=num_heads,
            kdim=audio_embedding_dim,
            vdim=audio_embedding_dim,
        )

    def forward(self, bert_embeddings, audio_embeddings):
        """
        Args:
        - bert_embeddings: (batch_size, seq_len_text, bert_embedding_dim)
        - audio_embeddings: (batch_size, seq_len_audio, audio_embedding_dim)

        Returns:
        - enriched_text_embeddings: (batch_size, seq_len_text, bert_embedding_dim)
        """
        # Transpose embeddings to match MultiheadAttention input (seq_len, batch_size, embed_dim)
        bert_embeddings = bert_embeddings.permute(1, 0, 2)  # (seq_len_text, batch_size, bert_embedding_dim)
        audio_embeddings = audio_embeddings.permute(1, 0, 2)  # (seq_len_audio, batch_size, audio_embedding_dim)

        # Apply cross-attention
        enriched_text_embeddings, _ = self.cross_attention(
            query=bert_embeddings,  # Text as query
            key=audio_embeddings,   # Audio as key
            value=audio_embeddings  # Audio as value
        )

        # Back to (batch_size, seq_len_text, bert_embedding_dim)
        enriched_text_embeddings = enriched_text_embeddings.permute(1, 0, 2)

        return enriched_text_embeddings

# Configuration parameters
bert_embedding_dim = 768
audio_embedding_dim = 768

# Create the model under its own name so the notebook-level `model` (the BERT
# encoder that get_bert_embeddings reads as a global) is not overwritten.
text_enrichment_model = CrossAttentionTextEnrichmentModel(bert_embedding_dim, audio_embedding_dim)

# Example data
bert_embeddings = torch.randn(1, 10, bert_embedding_dim)  # (batch_size, seq_len_text, bert_embedding_dim)
audio_embeddings = torch.randn(1, 20, audio_embedding_dim)  # (batch_size, seq_len_audio, audio_embedding_dim)

# Forward pass
enriched_embeddings = text_enrichment_model(bert_embeddings, audio_embeddings)
print(enriched_embeddings.shape)  # Output: (batch_size, seq_len_text, bert_embedding_dim)

torch.Size([1, 10, 768])


In [78]:
bert_embeddings, audio_embeddings

(tensor([[[ 0.2515, -0.9427,  0.0573,  ...,  1.3649, -0.1262,  0.9242],
          [-0.3486, -0.1323,  0.0358,  ..., -2.1587,  0.2745,  1.4524],
          [ 0.4158,  0.2633, -0.0359,  ...,  0.0894, -1.6831, -1.4784],
          ...,
          [ 0.3845, -1.0703, -0.3469,  ...,  2.3600, -1.2870,  0.4869],
          [-0.7951,  0.1243,  0.3119,  ...,  0.6870, -0.7423,  0.7859],
          [ 0.4837,  1.0681,  0.8321,  ...,  0.7648, -0.1264, -0.3856]]]),
 tensor([[[-0.5143, -0.0514,  1.1506,  ..., -0.5607, -0.1458, -0.5993],
          [ 1.1445,  2.1469,  2.0252,  ..., -1.4377, -1.1171,  1.6386],
          [ 1.7648,  2.1115,  1.3746,  ..., -0.7915,  0.0996, -1.1341],
          ...,
          [-3.2153, -0.2478, -0.6051,  ...,  1.5727,  0.2580,  0.1949],
          [ 2.7828,  0.9934, -1.7951,  ..., -0.7847,  1.4050, -0.4075],
          [-0.1633, -0.5315,  1.3117,  ...,  0.4514,  0.2261,  1.3461]]]))

In [77]:
enriched_embeddings

tensor([[[ 0.0021,  0.0380, -0.1424,  ...,  0.0590,  0.0146, -0.1437],
         [ 0.0243,  0.0559, -0.1613,  ..., -0.0317, -0.0443, -0.0903],
         [-0.0268, -0.0032, -0.1862,  ...,  0.0754,  0.0207, -0.0848],
         ...,
         [ 0.0347,  0.0077, -0.1528,  ...,  0.0706,  0.0097, -0.1511],
         [ 0.1063, -0.0165, -0.0823,  ..., -0.0742, -0.0009, -0.1309],
         [ 0.0268,  0.0768, -0.2915,  ..., -0.0332, -0.0047, -0.1444]]],
       grad_fn=<PermuteBackward0>)