In [5]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import ast
from IPython.display import Video, Audio
from collections import OrderedDict
import random
import matplotlib.pyplot as plt

In [6]:
seed = 2121

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

## Exploring Data

In [7]:
with open('data/train.txt', 'r') as f:
    train_data = f.readlines()

dia_id, utt_len = train_data[0].strip().split()
dia_id = int(dia_id)
utt_len = int(utt_len)

for x in train_data[:utt_len+2]:
    print(x.strip())

1 8
(3,1),(3,3),(4,1),(4,3),(4,4),(5,1),(5,3),(5,4)
1 | Chandler | neutral | Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked . | Friends_S1E1: 00:01:19.037 - 00:01:25.417
2 | All | neutral | Oh , yeah . Had that dream . | Friends_S1E1: 00:01:25.627 - 00:01:27.169
3 | Chandler | surprise | Then I look down , and I realize there is a phone ... there . | Friends_S1E1: 00:01:27.378 - 00:01:33.343
4 | Joey | surprise | Instead of ... ? | Friends_S1E1: 00:01:34.928 - 00:01:35.600
5 | Chandler | anger | That is right . | Friends_S1E1: 00:01:35.600 - 00:01:36.892
6 | Joey | neutral | Never had that dream . | Friends_S1E1: 00:01:37.055 - 00:01:37.973
7 | Phoebe | neutral | No . | Friends_S1E1: 00:01:37.973 - 00:01:38.629
8 | Chandler | neutral | All of a sudden , the phone starts to ring . | Friends_S1E1: 00:01:38.723 - 00:01:42.851


In [8]:
print('Utt ID: | Speaker  | Emotion   | Utterance')
print('-------------------------------------------')
for utt in train_data[2: utt_len+2]:
    utt = utt.strip().split(' | ')
    utt_id = int(utt[0])
    spk = utt[1]
    emo = utt[2]
    text = utt[3]
    print(f"{utt_id:7d} | {spk:8s} | {emo:9s} | {text}")

Utt ID: | Speaker  | Emotion   | Utterance
-------------------------------------------
      1 | Chandler | neutral   | Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked .
      2 | All      | neutral   | Oh , yeah . Had that dream .
      3 | Chandler | surprise  | Then I look down , and I realize there is a phone ... there .
      4 | Joey     | surprise  | Instead of ... ?
      5 | Chandler | anger     | That is right .
      6 | Joey     | neutral   | Never had that dream .
      7 | Phoebe   | neutral   | No .
      8 | Chandler | neutral   | All of a sudden , the phone starts to ring .


In [9]:
video_path = os.path.join('data/all_videos_and_audios/train', f'dia{dia_id}utt{utt_id}.mp4')
audio_path = os.path.join('data/all_videos_and_audios/train', f'dia{dia_id}utt{utt_id}.wav')

print(f"Video path: {video_path}")
Video(video_path, width=640, height=480)

Video path: data/all_videos_and_audios/train/dia1utt8.mp4


In [10]:
print(f"Audio path: {audio_path}")
print(f"Speaker: {spk}\nEmotion: {emo}\nUtterance: {text}")
Audio(audio_path)

Audio path: data/all_videos_and_audios/train/dia1utt8.wav
Speaker: Chandler
Emotion: neutral
Utterance: All of a sudden , the phone starts to ring .


## Text

In [11]:
all_data_file = 'data/all_data_pair.txt'

In [12]:
vocab = set()
with open(all_data_file, 'r') as file:
    for line in file:
        contents = line.strip().split()
        if len(contents)==0 or len(contents) == 2 or contents[0][0] == '(':
            continue
        line = line.strip().split(' | ')
        emotion, clause = line[2], line[3]
        vocab.update([emotion] + clause.split())

In [13]:
print(f"Number of unique words: {len(vocab)}")

Number of unique words: 7346


In [14]:
word_to_index = OrderedDict()
index_to_word = OrderedDict()

for i, word in enumerate(vocab):
    word_to_index[word] = i+1
    index_to_word[i+1] = word


In [15]:
speakers = set()
speaker_dict = OrderedDict()
with open(all_data_file, 'r') as file:
    for line in file:
        contents = line.strip().split()
        if len(contents)==0 or len(contents) == 2 or contents[0][0] == '(':
            continue
        line = line.strip().split(' | ')
        speaker = line[1]
        speakers.add(speaker)

        if speaker in speaker_dict:
            speaker_dict[speaker] += 1
        else:
            speaker_dict[speaker] = 1

In [16]:
print(f"Number of unique speakers: {len(speakers)}")

Number of unique speakers: 312


In [17]:
speaker_to_index = OrderedDict()
index_to_speaker = OrderedDict()

for i, speaker in enumerate(speakers):
    speaker_to_index[speaker] = i+1
    index_to_speaker[i+1] = speaker
    

In [18]:
print("Speaker-wise distribution:")
for speaker, count in list(speaker_dict.items())[:20]:
    print(f"{speaker}: {count}")

Speaker-wise distribution:
Chandler: 1737
All: 52
Joey: 2044
Phoebe: 1779
Ross: 2026
Monica: 1767
Paul: 31
Rachel: 1950
Mrs. Geller: 24
Mr. Geller: 26
Susan: 34
Carol: 59
Barry: 16
Robbie: 1
Paula: 3
Alan: 3
Kiki: 2
Joanne: 3
Receptionist: 10
Pizza Guy: 7


In [19]:
glove = {}
with open('data/features/ECF_glove_300.txt', 'r') as file:
    num_words, embedding_dim = map(int, file.readline().split())
    for line in file:
        line = line.strip().split()
        word = line[0]
        embedding = np.array(line[1:], dtype=np.float32)
        glove[word] = embedding


In [20]:
print(f"Number of words in GloVe: {num_words}")
print(f"Dimension of word embeddings: {embedding_dim}")

print("Example word embeddings:")
for word, embedding in list(glove.items())[5:6]:
    print(f"{word}: {embedding}")

Number of words in GloVe: 7087
Dimension of word embeddings: 300
Example word embeddings:
prices: [-0.82765    0.30205    0.282     -0.18721    0.49644   -0.40895
  0.062102   0.052711  -0.045717   1.8077    -0.44887    0.16328
  0.011212  -0.076777   0.18669   -0.54094   -0.29376    1.4595
 -0.093523  -0.083263  -0.42288    0.4276     0.23469   -0.46337
 -0.57357    0.053304  -0.61531   -0.46576    0.11459    0.5076
 -0.30123    0.30445    0.19201    0.34598    0.01977    0.30309
  0.036973  -0.11748    0.11945   -0.051341  -0.53364   -0.091979
  0.55728   -0.0084541  0.031024   0.052872  -0.099646  -0.39833
  0.0083812  0.22286   -0.11101    0.13712    0.089677   0.55723
  0.58576   -0.75677   -0.45634   -0.63121   -0.53755   -0.60519
 -0.21867   -0.89607   -0.2874     0.37736   -0.090579   0.18957
 -0.33006    0.10786    0.20543   -0.077149   0.58829    0.34077
  0.089441  -0.030981   0.27187    0.13792   -0.093708  -0.17529
  0.099991   0.50514    0.14737    0.49083    0.091299  -0

In [21]:
embedding_matrix = [np.zeros(embedding_dim)]
for word in vocab:
    if word in glove:
        embedding_matrix.append(glove[word])
    else:
        embedding_matrix.append(np.random.uniform(-0.1, 0.1, embedding_dim))

embedding_matrix = np.array(embedding_matrix)

In [22]:
print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Shape of embedding matrix: (7347, 300)


## Audio + Video

In [55]:
vid_maps = np.load('data/video_id_mapping.npy', allow_pickle=True)
vid_maps = vid_maps.item()
print(f"Number of videos: {len(vid_maps)}")

Number of videos: 13619


In [56]:
video_embeddings = np.load('data/features/video_embs.npy')
audio_embeddings = np.load('data/features/audio_embs.npy')

In [57]:
print(f"Shape of video embeddings: {video_embeddings.shape}")
print(f"Shape of audio embeddings: {audio_embeddings.shape}")

Shape of video embeddings: (13620, 4096)
Shape of audio embeddings: (13620, 6373)


In [58]:
print(f"Sample Video Embeds: {video_embeddings[1][:5]}")
print(f"Sample Audio Embeds: {audio_embeddings[1][:5]}")

Sample Video Embeds: [-1.20914364 -1.45758319 -1.38567138 -2.08530951 -1.30417991]
Sample Audio Embeds: [1.186063   0.03486529 0.8050713  0.3181443  0.4891213 ]


In [59]:
def normalize_embeddings(embeddings):
    data = embeddings[1:, :] #Ignore first row
    min_vals = np.min(data, keepdims=True)
    max_vals = np.max(data, keepdims=True)
    data = (data - min_vals) / (max_vals - min_vals + 1e-8)
    embeddings[1:, :] = data
    return embeddings

In [60]:
video_embeddings = normalize_embeddings(video_embeddings)
audio_embeddings = normalize_embeddings(audio_embeddings)

print(f"Sample Video Embeds: {video_embeddings[1][:5]}")
print(f"Sample Audio Embeds: {audio_embeddings[1][:5]}")

Sample Video Embeds: [0.95323268 0.93913186 0.94321339 0.90350366 0.94783865]
Sample Audio Embeds: [1.03875928e-06 1.03875928e-06 1.03875928e-06 1.03875928e-06
 1.03875928e-06]


## Data Prep

In [29]:
train_file = 'data/train.txt'
test_file = 'data/test.txt'
dev_file = 'data/dev.txt'

In [30]:
MAX_UTT_LEN = 35
MAX_SENT_LEN = 35

emotion_map = {
    'neutral': 0,
    'anger': 1,
    'disgust': 2,
    'fear': 3,
    'joy': 4,
    'sadness': 5,
    'surprise': 6
}

In [47]:
class TextDataset(Dataset):
    def __init__(self, file, vid_maps, speaker_to_index, emotion_map, word_to_index, MAX_UTT_LEN, MAX_SENT_LEN):
        self.file = file
        self.vid_maps = vid_maps
        self.speaker_to_index = speaker_to_index
        self.emotion_map = emotion_map
        self.word_to_index = word_to_index
        self.MAX_UTT_LEN = MAX_UTT_LEN
        self.MAX_SENT_LEN = MAX_SENT_LEN
        
        self.data = []
        self.doc_id = []
        self.doc_len = []
        self.sen_len = []
        self.target_pairs = []
        self.target_emotion = []
        self.target_cause = []
        self.X = []
        self.X_v = []
        self.speaker = []

        self.load_data()

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]
    
    def load_data(self):
        with open(self.file, 'r') as file:
            while True:
                line = file.readline()
                if line == '':
                    break
                #print(line)

                line = line.strip().split()
                dialog_id, num_utterances = map(int, line)

                self.doc_id.append(dialog_id)
                self.doc_len.append(num_utterances)

                # Read and parse pair_line
                pair_line = file.readline().strip()
                #print(pair_line)

                pair_line = ast.literal_eval('['+pair_line+']')
                #pair_line = list(map(tuple, pair_line))

                pairs = []
                emotions, causes = [], []

                if pair_line:
                    if len(pair_line[0]) > 2:
                        pairs = sorted(set((int(p[0]), int(p[1])) for p in pair_line))
                    else:
                        pairs = pair_line

                    for p in pairs:
                        emotions.append(p[0])
                        causes.append(p[1])
                # else:
                #     file.seek(1, 1)

                self.target_pairs.append(pairs)

                # Initialize placeholders for utterance data
                y_emotion_tmp = np.zeros((self.MAX_UTT_LEN, 2))
                y_cause_tmp = np.zeros((self.MAX_UTT_LEN, 2))
                x_tmp = np.zeros((self.MAX_UTT_LEN, self.MAX_SENT_LEN), dtype=np.int32)
                x_v_tmp = np.zeros((self.MAX_UTT_LEN, 300), dtype=np.float32)
                sen_len_temp = np.zeros(self.MAX_UTT_LEN, dtype=np.int32)
                spe_tmp = np.zeros(self.MAX_UTT_LEN, dtype=np.int32)

                for i in range(num_utterances):
                    x_v_tmp[i] = self.vid_maps[f'dia{dialog_id}utt{i + 1}']
                    line = file.readline().strip()
                    #print(line)

                    speaker, emotion, clause = line.split(' | ')[1:4]

                    # Speaker and emotion mapping
                    if speaker in self.speaker_to_index:
                        spe_tmp[i] = self.speaker_to_index[speaker]
                    else:
                        spe_tmp[i] = -1
                        #raise ValueError(f"Speaker {speaker} not found in speaker_to_index")

                    emo_id = self.emotion_map[emotion]
                    y_emotion_tmp[i] = [1, 0] if emo_id == 0 else [0, 1]

                    # Cause annotation
                    if i + 1 in causes:
                        y_cause_tmp[i][1] = 1
                    else:
                        y_cause_tmp[i][0] = 1

                    # Sentence processing
                    words = clause.split()
                    sen_len_temp[i] = min(len(words), self.MAX_SENT_LEN)
                    for j, word in enumerate(words[:self.MAX_SENT_LEN]):
                        x_tmp[i][j] = self.word_to_index.get(word, 0)  # Use 0 for unknown words

                # Append processed data
                self.target_emotion.append(y_emotion_tmp)
                self.target_cause.append(y_cause_tmp)
                self.X.append(x_tmp)
                self.X_v.append(x_v_tmp)
                self.speaker.append(spe_tmp)
                self.sen_len.append(sen_len_temp)

            # Convert lists to numpy arrays and zip into self.data
            X = np.array(self.X)
            X_v = np.array(self.X_v)
            speaker = np.array(self.speaker)
            target_emotion = np.array(self.target_emotion)
            target_cause = np.array(self.target_cause)
            sen_len = np.array(self.sen_len)
            doc_len = np.array(self.doc_len)
            #target_pairs = np.array(self.target_pairs)
            doc_id = np.array(self.doc_id)


            self.data = list(zip(X, X_v, speaker, target_emotion, target_cause, sen_len, doc_len, self.target_pairs, doc_id))

In [48]:
train_data = TextDataset(train_file, vid_maps, speaker_to_index, emotion_map, word_to_index, MAX_UTT_LEN, MAX_SENT_LEN)

In [51]:
for x in train_data[0]:
    print(x.shape if isinstance(x, np.ndarray) else x)

(35, 35)
(35, 300)
(35,)
(35, 2)
(35, 2)
(35,)
8
[(3, 1), (3, 3), (4, 1), (4, 3), (4, 4), (5, 1), (5, 3), (5, 4)]
1


In [52]:
test_data = TextDataset(test_file, vid_maps, speaker_to_index, emotion_map, word_to_index, MAX_UTT_LEN, MAX_SENT_LEN)
dev_data = TextDataset(dev_file, vid_maps, speaker_to_index, emotion_map, word_to_index, MAX_UTT_LEN, MAX_SENT_LEN)

In [61]:
word_embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), padding_idx=0)
video_embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(video_embeddings), padding_idx=0)
audio_embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(audio_embeddings), padding_idx=0)

In [62]:
embeddings = [word_embeddings, video_embeddings, audio_embeddings]

In [66]:
X = torch.zeros((1, MAX_UTT_LEN, MAX_SENT_LEN), dtype=torch.int32)
X_v = torch.zeros((1, MAX_UTT_LEN), dtype=torch.float32)
doc_len = torch.zeros((1,), dtype=torch.int32)
sen_len = torch.zeros((1, MAX_UTT_LEN), dtype=torch.int32)
speaker = torch.zeros((1, MAX_UTT_LEN), dtype=torch.int32)

In [67]:
PREDS = 2

In [68]:
y_emotion = torch.zeros((1, MAX_UTT_LEN, PREDS), dtype=torch.int32)
y_cause = torch.zeros((1, MAX_UTT_LEN, 7), dtype=torch.int32)

In [70]:
placeholders = [X, X_v, speaker, y_emotion, y_cause, sen_len, doc_len]

In [72]:
EMBEDDING_DIM = 300

In [None]:
def model(embeddings, placeholders):
    word_embeddings, video_embeddings, audio_embeddings = embeddings
    X, X_v, speaker, y_emotion, y_cause, sen_len, doc_len = placeholders

    sen_len = sen_len.view(-1)
    X = np.reshape(X, (-1, MAX_SENT_LEN, EMBEDDING_DIM))
    X = 

In [None]:
pred_emo, pred_x_v, pred_x_a, pred_cause, reg = model(*placeholders)