In [None]:
from mmsdk import mmdatasdk as md
import numpy as np
import os
import re
import torch
from torch.optim import Adam
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from copy import deepcopy

# Directory that holds the feature .csd files; each feature name is joined onto it when the recipe is built.
DATA_DIR = "cmumosei_highlevel"
# Directory that holds the label .csd file.
LABEL_DIR = "cmumosei_labels"
# True when PyTorch reports an available CUDA device.
CUDA = torch.cuda.is_available()
# When True, the pose and hand feature sequences are loaded and used next to text/visual/acoustic.
POSE_FEATURES = True

In [None]:
# Names of the computational sequences; each one is also the stem of its .csd file in DATA_DIR.
text_field = 'CMU_MOSEI_TimestampedWordVectors'
visual_field = 'CMU_MOSEI_VisualFacet42'
acoustic_field = 'CMU_MOSEI_COVAREP'
pose_features = "pose_features_top10"
hand_l_features = "hand_l_features_top10"
hand_r_features = "hand_r_features_top10"

# The three baseline modalities.
features = [text_field, visual_field, acoustic_field]

# Pose and both hands first, followed by the baseline modalities.
open_features = [pose_features, hand_l_features, hand_r_features] + features

# POSE_FEATURES decides which set of sequences is loaded.
selected_features = open_features if POSE_FEATURES else features
recipe = {
    name: os.path.join(DATA_DIR, name) + '.csd'
    for name in selected_features
}
dataset = md.mmdataset(recipe)


In [None]:
# Upper bound on how many entries (in key order of the text sequence) survive the filtering cell below.
total_videos = 1500

In [None]:
# Remove videos not in Pose
if POSE_FEATURES:
    # Build the pose id set once; the original rebuilt it for every candidate id.
    pose_ids = set(dataset[pose_features].keys())
    candidate_ids = (list(dataset[visual_field].keys())
                     + list(dataset[acoustic_field].keys())
                     + list(dataset[text_field].keys()))
    # dict.fromkeys de-duplicates while keeping first-seen order, so each id is removed once.
    remove = list(dict.fromkeys(
        name for name in candidate_ids if name not in pose_ids))
    for v in remove:
        dataset.remove_id(v)

# Cap the dataset: only the first `total_videos` ids of the text sequence are kept.
keep_ids = set(list(dataset[text_field].keys())[:total_videos])
candidate_ids = (list(dataset[visual_field].keys())
                 + list(dataset[acoustic_field].keys())
                 + list(dataset[text_field].keys()))
remove = list(dict.fromkeys(
    name for name in candidate_ids if name not in keep_ids))
for v in remove:
    dataset.remove_id(v)

In [None]:
# Collapse
# we define a simple averaging function that does not depend on intervals
def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
    """Collapse function for ``dataset.align``: average ``features`` over axis 0.

    Args:
        intervals: accepted because the collapse-function signature passes it;
            it is not used by the averaging.
        features: the feature rows to collapse.

    Returns:
        ``np.average(features, axis=0)``, or ``features`` unchanged when the
        averaging raises (best-effort fallback kept from the original).
    """
    try:
        return np.average(features, axis=0)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
        # must still be able to stop a long alignment run.
        return features
        
# first we align to words with averaging, collapse_function receives a list of functions
# (alternative kept for reference: align to the pose sequence instead)
#dataset.align(pose_features, collapse_functions=[avg])
# Align the dataset to the text sequence, collapsing with `avg`.
dataset.align(text_field, collapse_functions=[avg])
# (alternative kept for reference: keep the result in a separate variable via replace=False)
#aligned = dataset.align(text_field, replace=False)


In [None]:
# Name of the label computational sequence (also the stem of its .csd file in LABEL_DIR).
label_field = 'CMU_MOSEI_Labels'
# we add and align to labels to obtain labeled segments
# this time we don't apply collapse functions so that the temporal sequences are preserved
label_recipe = {label_field: os.path.join(LABEL_DIR, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
# From here on the dataset keys are the label segments (see the "<video>[<n>]" parsing below).
dataset.align(label_field)

In [78]:
# Show the label sequence metadata; 'dimension names' lists the label columns in order.
dataset[label_field].metadata

{'alignment compatible': True,
 'computational sequence description': 'Labels for CMU-MOSEI Dataset',
 'computational sequence version': 1.0,
 'contact': 'abagherz@andrew.cmu.edu',
 'creator': 'Amir Zadeh',
 'dataset bib citation': '@inproceedings{cmumoseiacl2018, title={Multimodal Language Analysis in the Wild: {CMU-MOSEI} Dataset and Interpretable Dynamic Fusion Graph}, author={Zadeh, Amir and Liang, Paul Pu and Vanbriesen, Jon and Poria, Soujanya and Cambria, Erik and Chen, Minghai and Morency, Louis-Philippe},booktitle={Association for Computational Linguistics (ACL)},year={2018}}',
 'dataset name': 'CMU-MOSEI',
 'dataset version': 1.0,
 'dimension names': ['sentiment',
  'happy',
  'sad',
  'anger',
  'surprise',
  'disgust',
  'fear'],
 'featureset bib citation': '@online{amt, author = {Amazon},title = {Amazon Mechanical Turk},year = {2017},url = {https://www.mturk.com}}',
 'md5': None,
 'root name': 'All Labels',
 'uuid': 'bbce9ca9-e556-46f4-823e-7c5e0147afab'}

## Determining the Highest Sentiment Videos

In [None]:
import re
import heapq
# Collect the distinct video ids (merging segments): label keys are expected to look
# like "<video_id>[<segment>]" and the regex captures the part before the bracket.
# The pattern is a raw string so that "\[" is not an invalid string escape.
pattern = re.compile(r'(.*)\[.*\]')
videos = {
    pattern.search(segment_key).group(1)
    for segment_key in dataset[label_field].keys()
}

In [None]:
# Map every video id to the first label row of each of its segments:
# a 1-D row for single-segment videos, a stacked 2-D array otherwise.
rows_per_video = {}
for segment_key in dataset[label_field].keys():
    first_row = dataset[label_field][segment_key]['features'][0]
    rows_per_video.setdefault(segment_key.split("[")[0], []).append(first_row)

dict_average_sentiment = {
    video_name: rows[0] if len(rows) == 1 else np.vstack(rows)
    for video_name, rows in rows_per_video.items()
}


In [None]:
# Heap of (mean sentiment, video id): column 0 of the per-video mean label row.
dict_avg_sentiment_q = []
for k, v in dict_average_sentiment.items():
    single_segment = v.ndim < 2
    # A 1-D entry is one segment's label row; a 2-D entry has one row per segment.
    n_segements = 1 if single_segment else v.shape[0]
    avg_sentiments = v if single_segment else v.mean(axis=0)
    heapq.heappush(dict_avg_sentiment_q, (avg_sentiments[0], k))


In [None]:
# Top 25 Positive Sentiment Videos
# (entries are (mean sentiment, video id) tuples; only the id is kept)
pos_videos = [k for v, k in heapq.nlargest(25, dict_avg_sentiment_q)]
# Top 50 Negative Videos
neg_videos = [k for v, k in heapq.nsmallest(50, dict_avg_sentiment_q)]
import random
# NOTE(review): the shuffle is in place and unseeded, so dict_avg_sentiment_q is reordered
# for every later cell and the neutral selection below differs between runs.
random.shuffle(dict_avg_sentiment_q)
# 15 Neutral videos (sentiment average is -0.05 < v < 0.05)
# Takes the first 15 matches in the shuffled order.
neutral_videos = [k for v, k in dict_avg_sentiment_q if np.abs(v) < 0.05][:15]

In [None]:
#np.savetxt("aligned_videos.csv", np.array(
#    list(videos)), fmt="%s", delimiter=",")

In [None]:
from sklearn.model_selection import train_test_split

# Reproducible 80/20 split into train+dev / test, then 80/20 of the remainder into train / dev.
# `videos` is a set, so its iteration order can change between kernel restarts: sort the
# ids and fix the seed so that Restart & Run All always yields the same split.
SPLIT_SEED = 42
train_split, test_split = train_test_split(
    sorted(videos), test_size=0.2, random_state=SPLIT_SEED)
train_split, dev_split = train_test_split(
    train_split, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Display the video ids assigned to each split (this prints the full lists).
train_split, dev_split, test_split

In [None]:
def get_train_dev_test_splits(dataset, train_videos, test_videos, dev_videos, label_n=0, n_classes=7, verbose=False):
    """Turn the label-aligned dataset into per-segment train/dev/test examples.

    Every key of ``dataset[label_field]`` (a segment such as ``"<video>[<n>]"``)
    becomes one example ``(features, label, segment)`` where ``features`` is
    ``(words, visual, acoustic)`` or, when the module-level ``POSE_FEATURES``
    flag is set, ``(words, visual, acoustic, pose, hand_r, hand_l)``. Visual,
    acoustic, pose and hand features are z-normalised per segment.

    Segments are skipped when
      * ``POSE_FEATURES`` is set and the segment has no pose features, or
      * the modalities do not all have the same number of time steps. With
        ``POSE_FEATURES`` set, pose/hand sequences that are 1-3 steps shorter
        than the visual sequence are first padded (see the note in the code).

    Args:
        dataset: mapping ``field -> segment -> {'features': array}``.
        train_videos: ids of the videos whose segments go to the train list.
        test_videos: ids of the videos whose segments go to the test list.
        dev_videos: ids of the videos whose segments go to the dev list.
            NOTE: the positional order is train, *test*, dev - pass by keyword.
        label_n: label column to predict. 0 is the sentiment column; 1-6 select
            one of the remaining columns, which is rounded to the nearest int.
        n_classes: only used when ``label_n == 0``: ``-1`` keeps the raw
            sentiment (regression), ``2`` maps it to 1 (>= 0) / 0 (< 0) and
            ``7`` rounds it and adds 3 so the classes are 0..6.
        verbose: print which segments were added, skipped or dropped.

    Returns:
        ``(train, dev, test)``: three lists of ``(features, label, segment)``.

    Raises:
        ValueError: for an unsupported ``label_n`` / ``n_classes`` combination
            (raised when the first segment is processed).
    """
    # Kept at 0 as before: zero-variance columns divide by zero and the
    # resulting NaN/inf values are replaced by np.nan_to_num.
    EPS = 0
    train = []
    dev = []
    test = []
    # Sets give O(1) membership tests for every segment.
    train_ids, dev_ids, test_ids = set(train_videos), set(dev_videos), set(test_videos)

    # Extracts the video id out of a "<video>[<n>]" segment key.
    pattern = re.compile(r'(.*)\[.*\]')
    num_drop = 0  # segments dropped because their modalities disagree in length

    def _select_label(raw):
        """Pick/transform the requested label column from a copy of the label matrix."""
        if label_n == 0 and n_classes == -1:
            return raw[:, 0]
        if label_n == 0 and n_classes == 2:
            binary = raw[:, 0]
            binary[binary >= 0] = 1
            binary[binary < 0] = 0
            return binary
        if label_n == 0 and n_classes == 7:
            return np.round(raw[:, 0]) + 3
        if 1 <= label_n <= 6:
            return np.round(raw[:, label_n])
        raise ValueError(
            f"ERROR: This is not a valid configuration. Parameters: label_n: {label_n} & n_classes: {n_classes}")

    def _znorm(values):
        """Z-normalise each column over the segment and replace NaN/inf."""
        return np.nan_to_num((values - values.mean(0, keepdims=True)) /
                             (EPS + np.std(values, axis=0, keepdims=True)))

    for segment in dataset[label_field].keys():
        vid = re.search(pattern, segment).group(1)
        # deepcopy so that the in-place binarisation never touches the dataset
        label = _select_label(deepcopy(dataset[label_field][segment]['features']))

        _words = dataset[text_field][segment]['features']
        _visual = dataset[visual_field][segment]['features']
        _acoustic = dataset[acoustic_field][segment]['features']
        streams = [_visual, _acoustic]

        if POSE_FEATURES:
            # Need this because some segments do not exist in pose
            if segment not in dataset[pose_features].keys():
                if verbose:
                    print(f"Havent found features for {segment}. Continouing")
                continue
            _pose = dataset[pose_features][segment]['features']
            _hand_r = dataset[hand_r_features][segment]['features']
            _hand_l = dataset[hand_l_features][segment]['features']

            # Pose/hand sequences that are up to 3 steps shorter than the visual
            # one are padded by appending their own first rows (the effective
            # behaviour of the previous code, whose interval-based repair was
            # computed but never used and could crash).
            # NOTE(review): repeating the *first* rows at the end looks
            # arbitrary - confirm whether edge padding was intended.
            missing = _visual.shape[0] - _pose.shape[0]
            if 0 < missing <= 3:
                _pose = np.concatenate((_pose, _pose[:missing]))
                _hand_l = np.concatenate((_hand_l, _hand_l[:missing]))
                _hand_r = np.concatenate((_hand_r, _hand_r[:missing]))
            streams += [_pose, _hand_r, _hand_l]

        # if the sequences are not same length after alignment (and padding),
        # there must be some problem with some modalities: drop the segment
        if len({_words.shape[0]} | {stream.shape[0] for stream in streams}) != 1:
            if verbose:
                if POSE_FEATURES:
                    print(f"DROPPED: {vid},{segment} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}, pose shape {_pose.shape}")
                else:
                    print(f"DROPPED: {vid},{segment} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
            num_drop += 1
            continue
        if verbose:
            print(f"Added: {segment}")

        # remove nan values
        label = np.nan_to_num(label)
        streams = [np.nan_to_num(stream) for stream in streams]

        # remove speech pause tokens together with the matching rows of every
        # other modality, otherwise the modalities would no longer be aligned
        keep = [i for i, word in enumerate(_words) if word[0] != b'sp']
        words = np.asarray([_words[i, :] for i in keep])
        streams = [np.asarray([stream[i, :] for i in keep]) for stream in streams]

        # z-normalization per instance and remove nan/infs (words are left as is)
        features = (words, *[_znorm(stream) for stream in streams])

        example = (features, label, segment)
        if vid in train_ids:
            train.append(example)
        elif vid in dev_ids:
            dev.append(example)
        elif vid in test_ids:
            test.append(example)
        else:
            print(f"Found video that doesn't belong to any splits: {vid}")

    if verbose:
        print(f"Dropped {num_drop} segments with inconsistent sequence lengths")
    return train, dev, test


In [None]:
# let's see the size of each set and shape of data
# The splits are passed by keyword: the function's positional order is
# (train_videos, test_videos, dev_videos), so passing train/dev/test positionally
# silently swaps the dev and test sets.
train, dev, test = get_train_dev_test_splits(
    dataset,
    train_videos=train_split,
    dev_videos=dev_split,
    test_videos=test_split,
    label_n=0,
    n_classes=-1,
)
print(len(train))
print(len(dev))
print(len(test))


In [None]:
# Number of examples per split. Each example is a nested tuple of differently shaped
# arrays, so the sizes are read with len() instead of building a ragged ndarray
# (recent NumPy releases refuse to infer such arrays).
len(train), len(dev), len(test)


In [None]:
# Shape of the word features of the first training example, and that example's label.
train[0][0][0].shape, train[0][1]

In [None]:
# Modelling
def multi_collate_reg(batch):
    '''
    Collate function for regression targets (labels become a FloatTensor).

    ``batch`` is a list of ``(features, label, segment)`` examples. The batch is
    sorted by decreasing sequence length, every modality is padded to the
    longest sequence and the original lengths are returned as the last item:
    ``(sentences, visual, acoustic[, pose, hand_r, hand_l], labels, lengths)``;
    the pose/hand tensors are included only when POSE_FEATURES is set.
    '''
    def seq_len(sample):
        return sample[0][0].shape[0]

    def padded(position, **pad_kwargs):
        # pad one modality (selected by its position in the feature tuple)
        return pad_sequence(
            [torch.FloatTensor(sample[0][position]) for sample in ordered],
            **pad_kwargs)

    # longest first, which the RNN packing utilities rely on later
    ordered = sorted(batch, key=seq_len, reverse=True)
    labels = torch.cat([torch.FloatTensor(sample[1]) for sample in ordered], dim=0)

    n_modalities = 6 if POSE_FEATURES else 3
    # the word features are padded with 1, every other modality with the default value
    modalities = [padded(0, padding_value=1)]
    modalities += [padded(position) for position in range(1, n_modalities)]

    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([seq_len(sample) for sample in ordered])
    return (*modalities, labels, lengths)

In [None]:
# Modelling
def multi_collate_class(batch):
    '''
    Collate function for classification targets (labels become a LongTensor).

    ``batch`` is a list of ``(features, label, segment)`` examples. The batch is
    sorted by decreasing sequence length, every modality is padded to the
    longest sequence and the original lengths are returned as the last item:
    ``(sentences, visual, acoustic[, pose, hand_r, hand_l], labels, lengths)``;
    the pose/hand tensors are included only when POSE_FEATURES is set.
    '''
    def seq_len(sample):
        return sample[0][0].shape[0]

    def padded(position, **pad_kwargs):
        # pad one modality (selected by its position in the feature tuple)
        return pad_sequence(
            [torch.FloatTensor(sample[0][position]) for sample in ordered],
            **pad_kwargs)

    # longest first, which the RNN packing utilities rely on later
    ordered = sorted(batch, key=seq_len, reverse=True)
    labels = torch.cat([torch.LongTensor(sample[1]) for sample in ordered], dim=0)

    n_modalities = 6 if POSE_FEATURES else 3
    # the word features are padded with 1, every other modality with the default value
    modalities = [padded(0, padding_value=1)]
    modalities += [padded(position) for position in range(1, n_modalities)]

    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([seq_len(sample) for sample in ordered])
    return (*modalities, labels, lengths)

In [None]:
# All three loaders share the same settings: fixed order, batches of 32 and the
# regression collate function (float labels).
# NOTE(review): the training loader is not shuffled either - confirm this is intended.
loader_options = dict(shuffle=False, batch_size=32, collate_fn=multi_collate_reg)
train_loader = DataLoader(train, **loader_options)
dev_loader = DataLoader(dev, **loader_options)
test_loader = DataLoader(test, **loader_options)

In [None]:
class LangLSTM(nn.Module):
    """Text-only baseline: a stacked bidirectional LSTM over the word features plus a linear head.

    ``forward`` takes the same arguments as the multimodal models in this
    notebook, but only ``sentences`` and ``lengths`` are used.
    """

    def __init__(self, input_sizes, hidden_sizes, output_size, dropout_rate, n_layers=2, classification=False):
        """
        Args:
            input_sizes: feature sizes; their sum is the LSTM input size, so it
                has to equal the width of ``sentences`` (the only LSTM input).
            hidden_sizes: hidden sizes; their sum is the LSTM hidden size.
            output_size: number of outputs of the linear head.
            dropout_rate: dropout between the stacked LSTM layers.
            n_layers: number of stacked LSTM layers.
            classification: when True ``forward`` returns softmax probabilities.
        """
        super().__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.output_size = output_size
        self.dropout_rate = dropout_rate
        self.classification = classification

        self.total_inputsize = sum(self.input_size)
        self.total_hiddensize = sum(self.hidden_size)
        self.biLSTM = nn.LSTM(
            self.total_inputsize, self.total_hiddensize, bidirectional=True, num_layers=n_layers, dropout=dropout_rate)

        # final hidden state of both directions of every layer, flattened per sample
        self.output = nn.Linear(self.total_hiddensize * 2 * n_layers, output_size)
        if self.classification:
            # NOTE(review): probabilities, not logits, are returned - confirm the loss expects that.
            self.softmax = nn.Softmax(dim=1)

    def forward(self, sentences, visual, acoustic, pose, hand_r, hand_l, lengths):
        """Run the model on a padded, length-sorted (time, batch, feature) batch.

        Returns:
            ``(batch, output_size)`` probabilities when ``classification`` is
            set, otherwise the head output with the last dimension squeezed
            away if it has size 1 (``(batch,)`` for a single output).
        """
        batch_size = lengths.size(0)
        packed_sequence = pack_padded_sequence(sentences, lengths)
        _, (final_h, _) = self.biLSTM(packed_sequence)
        # final_h is (layers * directions, batch, hidden). The batch axis has to
        # be moved to the front before flattening; a plain view(batch, -1)
        # would mix the hidden states of different samples.
        h = final_h.permute(1, 0, 2).reshape(batch_size, -1)
        o = self.output(h)
        if self.classification:
            return self.softmax(o)
        # squeeze only the output dimension so a batch of one keeps its batch axis
        return o.squeeze(-1)

In [None]:
class EFLSTMP(nn.Module):
    """Early-fusion model with pose: all six modalities are concatenated per time step
    and fed to one stacked bidirectional LSTM followed by a linear head."""

    def __init__(self, input_sizes, hidden_sizes, output_size, dropout_rate, n_layers=2, classification=False):
        """
        Args:
            input_sizes: feature size of each modality; their sum is the LSTM input size.
            hidden_sizes: hidden sizes; their sum is the LSTM hidden size.
            output_size: number of outputs of the linear head.
            dropout_rate: dropout between the stacked LSTM layers.
            n_layers: number of stacked LSTM layers.
            classification: when True ``forward`` returns softmax probabilities.
        """
        super().__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.output_size = output_size
        self.dropout_rate = dropout_rate
        self.classification = classification

        self.total_inputsize = sum(self.input_size)
        self.total_hiddensize = sum(self.hidden_size)
        self.biLSTM = nn.LSTM(
            self.total_inputsize, self.total_hiddensize, bidirectional=True, num_layers=n_layers, dropout=dropout_rate)

        # final hidden state of both directions of every layer, flattened per sample
        self.output = nn.Linear(self.total_hiddensize * 2 * n_layers, output_size)
        if self.classification:
            # NOTE(review): probabilities, not logits, are returned - confirm the loss expects that.
            self.softmax = nn.Softmax(dim=1)

    def forward(self, sentences, visual, acoustic, pose, hand_r, hand_l, lengths):
        """Run the model on padded, length-sorted (time, batch, feature) tensors.

        Returns:
            ``(batch, output_size)`` probabilities when ``classification`` is
            set, otherwise the head output with the last dimension squeezed
            away if it has size 1 (``(batch,)`` for a single output).
        """
        batch_size = lengths.size(0)
        # early fusion: concatenate along the feature axis
        merge_features = torch.cat((sentences, visual, acoustic, pose, hand_r, hand_l), dim=2)
        packed_sequence = pack_padded_sequence(merge_features, lengths)
        _, (final_h, _) = self.biLSTM(packed_sequence)
        # final_h is (layers * directions, batch, hidden). The batch axis has to
        # be moved to the front before flattening; a plain view(batch, -1)
        # would mix the hidden states of different samples.
        h = final_h.permute(1, 0, 2).reshape(batch_size, -1)
        o = self.output(h)
        if self.classification:
            return self.softmax(o)
        # squeeze only the output dimension so a batch of one keeps its batch axis
        return o.squeeze(-1)

In [None]:
class EFLSTM(nn.Module):
    """Early-fusion baseline: text, visual and acoustic features are concatenated per
    time step and fed to one stacked bidirectional LSTM followed by a linear head.

    ``forward`` also accepts the pose/hand tensors so that it is call-compatible
    with the pose models, but it ignores them.
    """

    def __init__(self, input_sizes, hidden_sizes, output_size, dropout_rate, n_layers=2, classification=False):
        """
        Args:
            input_sizes: feature size of each modality; only the first three are used.
            hidden_sizes: hidden sizes; only the first three are summed.
            output_size: number of outputs of the linear head.
            dropout_rate: dropout between the stacked LSTM layers.
            n_layers: number of stacked LSTM layers.
            classification: when True ``forward`` returns softmax probabilities.
        """
        super().__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.output_size = output_size
        self.dropout_rate = dropout_rate
        self.classification = classification

        # size lists may also carry the pose/hand entries: use text/visual/acoustic only
        self.total_inputsize = sum(self.input_size[:3])
        self.total_hiddensize = sum(self.hidden_size[:3])
        self.biLSTM = nn.LSTM(
            self.total_inputsize, self.total_hiddensize, bidirectional=True, num_layers=n_layers, dropout=dropout_rate)

        if self.classification:
            # NOTE(review): probabilities, not logits, are returned - confirm the loss expects that.
            self.softmax = nn.Softmax(dim=1)

        # final hidden state of both directions of every layer, flattened per sample
        self.output = nn.Linear(self.total_hiddensize * 2 * n_layers, output_size)

    def forward(self, sentences, visual, acoustic, pose, hand_r, hand_l, lengths):
        """Run the model on padded, length-sorted (time, batch, feature) tensors.

        Returns:
            ``(batch, output_size)`` probabilities when ``classification`` is
            set, otherwise the head output with the last dimension squeezed
            away if it has size 1 (``(batch,)`` for a single output).
        """
        batch_size = lengths.size(0)
        # early fusion: concatenate along the feature axis
        merge_features = torch.cat((sentences, visual, acoustic), dim=2)
        packed_sequence = pack_padded_sequence(merge_features, lengths)
        _, (final_h, _) = self.biLSTM(packed_sequence)
        # final_h is (layers * directions, batch, hidden). The batch axis has to
        # be moved to the front before flattening; a plain view(batch, -1)
        # would mix the hidden states of different samples.
        h = final_h.permute(1, 0, 2).reshape(batch_size, -1)
        o = self.output(h)
        if self.classification:
            return self.softmax(o)
        # squeeze only the output dimension so a batch of one keeps its batch axis
        return o.squeeze(-1)


In [None]:
# From https://github.com/Justin1904/CMU-MultimodalSDK-Tutorials/blob/master/tutorial_interactive.ipynb
class LFLSTM(nn.Module):
    """Late-fusion baseline over text, visual and acoustic features.

    Each modality is encoded by two bidirectional LSTMs with a layer norm in
    between; the final hidden states of all six LSTMs are concatenated, batch
    normalised and passed through a two-layer head. ``forward`` also accepts
    the pose/hand tensors so that it is call-compatible with the pose models,
    but it ignores them.
    """

    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate, classification=False):
        """
        Args:
            input_sizes: feature sizes; entries 0-2 are text, visual, acoustic.
            hidden_sizes: LSTM hidden sizes; entries 0-2 are text, visual, acoustic.
                Extra entries (e.g. the pose/hand sizes) are ignored.
            fc1_size: width of the hidden layer of the head.
            output_size: number of outputs.
            dropout_rate: dropout applied after the first head layer.
            classification: when True ``forward`` returns softmax probabilities.
        """
        super().__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate
        self.classification = classification
        # defining modules - two layer bidirectional LSTM with layer norm in between
        self.trnn1 = nn.LSTM(
            input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(
            2*hidden_sizes[0], hidden_sizes[0], bidirectional=True)

        self.vrnn1 = nn.LSTM(
            input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(
            2*hidden_sizes[1], hidden_sizes[1], bidirectional=True)

        self.arnn1 = nn.LSTM(
            input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(
            2*hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        # Only three modalities are fused (2 LSTMs x 2 directions each), so only the
        # first three hidden sizes count; summing a 6-entry list would not match
        # the fused vector produced by `fusion`.
        fused_size = sum(hidden_sizes[:3]) * 4
        self.fc1 = nn.Linear(fused_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0]*2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1]*2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2]*2,))
        self.bn = nn.BatchNorm1d(fused_size)
        self.relu = nn.ReLU()
        if self.classification:
            # NOTE(review): probabilities, not logits, are returned - confirm the loss expects that.
            self.softmax = nn.Softmax(dim=1)

    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        """Encode one modality; returns the final hidden states of both LSTMs."""
        packed_sequence = pack_padded_sequence(sequence, lengths)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        # unpack to apply the layer norm, then pack again for the second LSTM
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2

    def fusion(self, sentences, visual, acoustic, lengths):
        """Return the batch-normalised concatenation of all final hidden states, one row per sample."""
        batch_size = lengths.size(0)
        # extract features from text modality
        final_h1t, final_h2t = self.extract_features(
            sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)

        # extract features from visual modality
        final_h1v, final_h2v = self.extract_features(
            visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)

        # extract features from acoustic modality
        final_h1a, final_h2a = self.extract_features(
            acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)

        # simple late fusion -- concatenation + normalization
        # (batch axis moved to the front before flattening)
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a),
                      dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        return self.bn(h)

    def forward(self, sentences, visual, acoustic, pose, hand_r, hand_l, lengths):
        """Run the model on padded, length-sorted (time, batch, feature) tensors.

        Returns:
            ``(batch, output_size)`` probabilities when ``classification`` is
            set, otherwise the head output with the last dimension squeezed
            away if it has size 1 (``(batch,)`` for a single output).
        """
        h = self.fusion(sentences, visual, acoustic, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)
        if self.classification:
            return self.softmax(o)
        # squeeze only the output dimension so a batch of one keeps its batch axis
        return o.squeeze(-1)


In [None]:
# From https://github.com/Justin1904/CMU-MultimodalSDK-Tutorials/blob/master/tutorial_interactive.ipynb
class LFLSTMP(nn.Module):
    """Late-fusion model over text, visual, acoustic, pose, right-hand and left-hand features.

    Each modality is encoded by two bidirectional LSTMs with a layer norm in
    between; the final hidden states of all twelve LSTMs are concatenated,
    batch normalised and passed through a two-layer head.
    """

    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate, classification=False):
        """
        Args:
            input_sizes: feature sizes in the order text, visual, acoustic,
                pose, right hand, left hand.
            hidden_sizes: LSTM hidden sizes, same order (six entries).
            fc1_size: width of the hidden layer of the head.
            output_size: number of outputs.
            dropout_rate: dropout applied after the first head layer.
            classification: when True ``forward`` returns softmax probabilities.
        """
        super().__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate
        self.classification = classification

        # defining modules - two layer bidirectional LSTM with layer norm in between
        self.trnn1 = nn.LSTM(
            input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(
            2*hidden_sizes[0], hidden_sizes[0], bidirectional=True)

        self.vrnn1 = nn.LSTM(
            input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(
            2*hidden_sizes[1], hidden_sizes[1], bidirectional=True)

        self.arnn1 = nn.LSTM(
            input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(
            2*hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        self.prnn1 = nn.LSTM(
            input_sizes[3], hidden_sizes[3], bidirectional=True)
        self.prnn2 = nn.LSTM(
            2*hidden_sizes[3], hidden_sizes[3], bidirectional=True)

        self.hrrnn1 = nn.LSTM(
            input_sizes[4], hidden_sizes[4], bidirectional=True)
        self.hrrnn2 = nn.LSTM(
            2*hidden_sizes[4], hidden_sizes[4], bidirectional=True)

        self.hlrnn1 = nn.LSTM(
            input_sizes[5], hidden_sizes[5], bidirectional=True)
        self.hlrnn2 = nn.LSTM(
            2*hidden_sizes[5], hidden_sizes[5], bidirectional=True)

        # every modality contributes 2 LSTMs x 2 directions of its hidden size
        self.fc1 = nn.Linear(sum(hidden_sizes)*4, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0]*2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1]*2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2]*2,))
        self.player_norm = nn.LayerNorm((hidden_sizes[3]*2,))
        self.hrlayer_norm = nn.LayerNorm((hidden_sizes[4]*2,))
        self.hllayer_norm = nn.LayerNorm((hidden_sizes[5]*2,))
        self.bn = nn.BatchNorm1d(sum(hidden_sizes)*4)
        self.relu = nn.ReLU()
        if self.classification:
            # NOTE(review): probabilities, not logits, are returned - confirm the loss expects that.
            self.softmax = nn.Softmax(dim=1)

    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        """Encode one modality; returns the final hidden states of both LSTMs."""
        packed_sequence = pack_padded_sequence(sequence, lengths)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        # unpack to apply the layer norm, then pack again for the second LSTM
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2

    def fusion(self, sentences, visual, acoustic, pose, hand_r, hand_l, lengths):
        """Return the batch-normalised concatenation of all final hidden states, one row per sample."""
        batch_size = lengths.size(0)
        # extract features from text modality
        final_h1t, final_h2t = self.extract_features(
            sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)

        # extract features from visual modality
        final_h1v, final_h2v = self.extract_features(
            visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)

        # extract features from acoustic modality
        final_h1a, final_h2a = self.extract_features(
            acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)

        # extract features from pose modality
        final_h1p, final_h2p = self.extract_features(
            pose, lengths, self.prnn1, self.prnn2, self.player_norm)

        # extract features from the right-hand modality
        final_h1hr, final_h2hr = self.extract_features(
            hand_r, lengths, self.hrrnn1, self.hrrnn2, self.hrlayer_norm)

        # extract features from the left-hand modality
        final_h1hl, final_h2hl = self.extract_features(
            hand_l, lengths, self.hlrnn1, self.hlrnn2, self.hllayer_norm)

        # simple late fusion -- concatenation + normalization
        # (batch axis moved to the front before flattening)
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a, final_h1p, final_h2p, final_h1hr, final_h2hr, final_h1hl, final_h2hl),
                      dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        return self.bn(h)

    def forward(self, sentences, visual, acoustic, pose, hand_r, hand_l, lengths):
        """Run the model on padded, length-sorted (time, batch, feature) tensors.

        Returns:
            ``(batch, output_size)`` probabilities when ``classification`` is
            set, otherwise the head output with the last dimension squeezed
            away if it has size 1 (``(batch,)`` for a single output).
        """
        h = self.fusion(sentences, visual, acoustic, pose, hand_r, hand_l, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)
        if self.classification:
            return self.softmax(o)
        # squeeze only the output dimension so a batch of one keeps its batch axis
        return o.squeeze(-1)


In [None]:
def _forward_batch(model, batch):
    """Unpack one collated batch, move its tensors to the GPU when ``CUDA`` is set, and run ``model``.

    With ``POSE_FEATURES`` a batch is ``(t, v, a, p, hr, hl, y, l)``, otherwise
    ``(t, v, a, y, l)``. ``l`` is handed to the model as-is (it is never moved
    to the GPU).

    Returns:
        ``(y_tilde, y)`` -- the model output and the target tensor of the batch.
    """
    if POSE_FEATURES:
        t, v, a, p, hr, hl, y, l = batch
        if CUDA:
            t, v, a = t.cuda(), v.cuda(), a.cuda()
            p, hr, hl = p.cuda(), hr.cuda(), hl.cuda()
            y = y.cuda()
        return model(t, v, a, p, hr, hl, l), y
    t, v, a, y, l = batch
    if CUDA:
        t, v, a = t.cuda(), v.cuda(), a.cuda()
        y = y.cuda()
    return model(t, v, a, l), y


def train_model(model, train_set, dev_set, num_epochs, criterion_func, optimizer, patience_n=15, update_to_save=0.01, verbose=True):
    """Train ``model`` with early stopping and learning-rate decay driven by the dev loss.

    Every epoch runs one pass over ``train_set`` (with gradient value clipping)
    and one evaluation pass over ``dev_set``. Whenever the dev loss improves on
    the best value by more than ``update_to_save`` (as a fraction of that best
    value), the model and optimizer states are written to ``'model.std'`` and
    ``'optim.std'`` in the current working directory. After more than
    ``patience_n`` consecutive epochs without such an improvement, the saved
    states are reloaded, the learning rate is stepped down (x0.1) and one of
    three trials is consumed; training stops once all trials are used.

    Args:
        model: module called as ``model(t, v, a, p, hr, hl, l)`` when
            ``POSE_FEATURES`` is set, else ``model(t, v, a, l)``.
        train_set, dev_set: sized iterables of collated batches.
        num_epochs: maximum number of epochs.
        criterion_func: loss called as ``criterion_func(y_tilde, y)``.
        optimizer: optimizer over the model parameters.
        patience_n: epochs without improvement tolerated before a restore.
        update_to_save: minimum relative improvement that triggers a checkpoint.
        verbose: print progress information.

    Returns:
        ``(train_losses, valid_losses)``: per-epoch means of the batch losses.
    """
    # Train parameters
    curr_patience = patience = patience_n
    num_trials = 3
    grad_clip_value = 1.0

    if CUDA:
        model.cuda()
    criterion = criterion_func
    best_valid_loss = float('inf')
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=1, gamma=0.1)
    # NOTE(review): stepping the scheduler before any optimizer.step() already
    # applies one decay, so training starts at 0.1x the optimizer's configured
    # learning rate (recent PyTorch also warns about this call order). Kept as
    # in the original -- confirm that this is intended.
    lr_scheduler.step()

    # Checkpoint the starting state so that the restore path below (and callers
    # that load 'model.std' afterwards) always read files written by THIS run,
    # even if the dev loss never becomes finite. Previously a stale checkpoint
    # from an earlier run could be loaded, or the load failed on a missing file.
    torch.save(model.state_dict(), 'model.std')
    torch.save(optimizer.state_dict(), 'optim.std')

    train_losses = []
    valid_losses = []
    for e in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_set:
            model.zero_grad()
            y_tilde, y = _forward_batch(model, batch)
            loss = criterion(y_tilde, y)
            loss.backward()
            # Clip gradient values to guard against exploding gradients
            torch.nn.utils.clip_grad_value_(
                [param for param in model.parameters() if param.requires_grad], grad_clip_value)
            # Update weights
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_set)
        train_losses.append(train_loss)
        if verbose and e % 5 == 0:
            print(f"E ({e+1}): Training loss: {round(train_loss, 4)}")

        # Perform validation
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in dev_set:
                y_tilde, y = _forward_batch(model, batch)
                valid_loss += criterion(y_tilde, y).item()

        valid_loss = valid_loss/len(dev_set)
        valid_losses.append(valid_loss)
        if verbose:
            print(f"Validation loss: {round(valid_loss, 4)}")
        improve = (valid_loss - best_valid_loss)
        if improve == -float('inf'):
            # First finite dev loss: use 0 as reference so the threshold below
            # is -0.0 and the -inf "improvement" always triggers a checkpoint.
            best_valid_loss = 0
        if verbose:
            print(improve, (best_valid_loss * -update_to_save))
        # Checkpoint only if the loss dropped by more than update_to_save
        # (1% by default) of the best loss so far
        if improve < (best_valid_loss * -update_to_save) :
            best_valid_loss = valid_loss
            if verbose:
                print("Found new best model on dev set! Saving current params")
            torch.save(model.state_dict(), 'model.std')
            torch.save(optimizer.state_dict(), 'optim.std')
            curr_patience = patience
        else:
            curr_patience -= 1
            if curr_patience < 0:
                if verbose:
                    print("Running out of patience, loading previous best model.")
                num_trials -= 1
                curr_patience = patience
                model.load_state_dict(torch.load('model.std'))
                optimizer.load_state_dict(torch.load('optim.std'))
                lr_scheduler.step()
                if verbose:
                    print(f"Current learning rate: {optimizer.state_dict()['param_groups'][0]['lr']}")
        if num_trials <= 0:
            if verbose:
                print("Running out of patience, early stopping.")
            break
    return train_losses, valid_losses

In [None]:
from sklearn.metrics import f1_score

def test_model_classification(model, test_loader, criterion_func):
    y_true = []
    y_pred = []
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for batch in test_loader:
            model.zero_grad()
            if POSE_FEATURES:
                t, v, a, p, hr, hl, y, l = batch
                batch_size = t.size(0)
                if CUDA:
                    t, v, a = t.cuda(), v.cuda(), a.cuda()
                    p, hr, hl = p.cuda(), hr.cuda(), hl.cuda()
                    y = y.cuda()
                y_tilde = model(t, v, a, p, hr, hl, l)
            else:
                t, v, a, y, l = batch
                batch_size = t.size(0)
                if CUDA:
                    t, v, a = t.cuda(), v.cuda(), a.cuda()
                    y = y.cuda()
                y_tilde = model(t, v, a, l)
            loss = criterion_func(y_tilde, y)
            y_tilde_numpy = y_tilde.detach().cpu().numpy()
            if len(y_tilde_numpy.shape) == 0:
                y_pred += [y_tilde_numpy]
            else:
                y_pred += [e for e in y_tilde_numpy]
            y_true += [e for e in y.detach().cpu().numpy()]
            test_loss += loss.item()
    
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_pred =y_pred.argmax(axis=1)

    if len(y_pred.shape) > 1: 
        #y_pred[:,1:][y_pred[:,1:] < 0] = 0
        f1_s = f1_score(y_true, y_pred, average='weighted')
        acc_s = (y_true == y_pred).sum()/len(y_pred)
        print(f"Test set F1S is {f1_s}")
        #print("--Average Error per dimension--")
        #print(np.abs(y_true - y_pred).mean(axis=0))
        return y_pred, y_true, (f1_s, acc_s)
    else:
        #y_pred[1:][y_pred[1:] < 0] = 0
        #y_true = y_true[0]
        f1_s = f1_score(y_true, y_pred, average='weighted')
        acc_s = (y_true == y_pred).sum()/len(y_pred)
        print(f"Test set F1S is {f1_s}")
        #print(np.abs(y_true - y_pred))
        return y_pred, y_true, (f1_s, acc_s)
    


In [None]:
from sklearn.metrics import mean_squared_error

def test_model_regression(model, test_loader, criterion_func):
    y_true = []
    y_pred = []
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for batch in test_loader:
            model.zero_grad()
            if POSE_FEATURES:
                t, v, a, p, hr, hl, y, l = batch
                batch_size = t.size(0)
                if CUDA:
                    t, v, a = t.cuda(), v.cuda(), a.cuda()
                    p, hr, hl = p.cuda(), hr.cuda(), hl.cuda()
                    y = y.cuda()
                y_tilde = model(t, v, a, p, hr, hl, l)
            else:
                t, v, a, y, l = batch
                batch_size = t.size(0)
                if CUDA:
                    t, v, a = t.cuda(), v.cuda(), a.cuda()
                    y = y.cuda()
                y_tilde = model(t, v, a, l)
            loss = criterion_func(y_tilde, y)
            y_tilde_numpy = y_tilde.detach().cpu().numpy()
            if len(y_tilde_numpy.shape) == 0:
                y_pred += [y_tilde_numpy]
            else:
                y_pred += [e for e in y_tilde_numpy]
            y_true += [e for e in y.detach().cpu().numpy()]
            test_loss += loss.item()
    
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)


    if len(y_pred.shape) > 1: 
        #y_pred[:,1:][y_pred[:,1:] < 0] = 0
        #error_test = criterion_func(y_true, y_pred)
        print(f"Test set MAE is {test_loss}")
        #print("--Average Error per dimension--")
        #print(np.abs(y_true - y_pred).mean(axis=0))
        return y_pred, y_true, test_loss#, np.abs(y_true - y_pred).mean(axis=0) 
    else:
        #y_pred[1:][y_pred[1:] < 0] = 0
        #y_true = y_true[0]
        #error_test = criterion_func(y_true, y_pred)
        print(f"Test set MAE is {test_loss}")
        #print(np.abs(y_true - y_pred))
        return y_pred, y_true, test_loss#, np.abs(y_true - y_pred)
    


In [None]:
def LFLSTMP_run(train_loader, dev_loader, test_loader,
    input_sizes, hidden_sizes, fc1_size, output_size, dropout, epoch_max, weight_decay, classification):
    """Train an ``LFLSTMP`` model and evaluate the checkpointed weights on the test split.

    Args:
        train_loader, dev_loader, test_loader: batch iterables for the three splits.
        input_sizes, hidden_sizes, fc1_size, output_size, dropout: forwarded to
            the ``LFLSTMP`` constructor.
        epoch_max: maximum number of epochs given to ``train_model``.
        weight_decay: weight decay of the Adam optimizer.
        classification: train with cross-entropy and evaluate with
            ``test_model_classification`` when true; otherwise train with MSE
            and evaluate with ``test_model_regression`` using L1 loss.

    Returns:
        ``(train_losses, valid_losses, results)`` where ``results`` is whatever
        the evaluation function returned.
    """
    # Progress banner, matching the other *_run helpers (this one had none).
    print("Performing Late Fusion LSTM with pose Run")

    model = LFLSTMP(input_sizes, hidden_sizes, fc1_size, output_size, dropout, classification)
    optimizer = Adam([param for param in model.parameters()
                    if param.requires_grad], weight_decay=weight_decay)
    if classification:
        train_losses, valid_losses = train_model(model, train_loader, dev_loader, epoch_max, nn.CrossEntropyLoss(), optimizer, patience_n=25)
    else:
        train_losses, valid_losses = train_model(model, train_loader, dev_loader, epoch_max, nn.MSELoss(), optimizer, patience_n=25)
    # Evaluate the weights checkpointed in 'model.std' by train_model.
    model.load_state_dict(torch.load('model.std'))
    if classification:
        results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
    else:
        results = test_model_regression(model, test_loader, nn.L1Loss())
    return train_losses, valid_losses, results
    

In [None]:
# NOTE(review): this definition is shadowed by the `LFLSTM_run` defined again in
# the following cell; only the later one is in effect after Run All.
def LFLSTM_run(train_loader, dev_loader, test_loader,
    input_sizes, hidden_sizes, fc1_size, output_size, dropout, epoch_max, weight_decay, classification):
    """Train an ``LFLSTM`` model and evaluate the checkpointed weights on the test split.

    Args:
        train_loader, dev_loader, test_loader: batch iterables for the three splits.
        input_sizes, hidden_sizes, fc1_size, output_size, dropout: forwarded to
            the ``LFLSTM`` constructor.
        epoch_max: maximum number of epochs given to ``train_model``.
        weight_decay: weight decay of the Adam optimizer.
        classification: train with cross-entropy and evaluate with
            ``test_model_classification`` when true; otherwise train with MSE
            and evaluate with ``test_model_regression`` using L1 loss.

    Returns:
        ``(train_losses, valid_losses, results)``.
    """
    # The banner used to say "with pose", but LFLSTM is the variant that is
    # configured without the pose/hand inputs.
    print("Performing Late Fusion LSTM NO pose Run")

    model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout, classification)
    optimizer = Adam([param for param in model.parameters()
                    if param.requires_grad], weight_decay=weight_decay)
    if classification:
        train_losses, valid_losses = train_model(model, train_loader, dev_loader, epoch_max, nn.CrossEntropyLoss(), optimizer, patience_n=25)
    else:
        train_losses, valid_losses = train_model(model, train_loader, dev_loader, epoch_max, nn.MSELoss(), optimizer, patience_n=25)
    # Evaluate the weights checkpointed in 'model.std' by train_model.
    model.load_state_dict(torch.load('model.std'))
    if classification:
        results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
    else:
        results = test_model_regression(model, test_loader, nn.L1Loss())
    return train_losses, valid_losses, results
    

In [None]:
def LFLSTM_run(train_loader, dev_loader, test_loader,
    input_sizes, hidden_sizes, fc1_size, output_size, dropout, epoch_max, weight_decay, classification):
    """Fit an ``LFLSTM`` model, then score the checkpointed weights on the test split.

    The model is trained with Adam using cross-entropy (classification) or MSE
    (regression). Afterwards the state stored in ``'model.std'`` is loaded and
    evaluated with ``test_model_classification`` or, for regression, with
    ``test_model_regression`` and an L1 criterion.

    Returns:
        ``(train_losses, valid_losses, results)``.
    """
    print("Performing Late Fusion LSTM NO pose Run")

    model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout, classification)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, weight_decay=weight_decay)

    # Training objective depends on the task.
    train_criterion = nn.CrossEntropyLoss() if classification else nn.MSELoss()
    train_losses, valid_losses = train_model(
        model, train_loader, dev_loader, epoch_max, train_criterion, optimizer, patience_n=25)

    # Score the checkpointed weights, not the last-epoch ones.
    model.load_state_dict(torch.load('model.std'))
    if classification:
        results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
    else:
        results = test_model_regression(model, test_loader, nn.L1Loss())
    return train_losses, valid_losses, results
    

In [None]:
def EFLSTMP_run(train_loader, dev_loader, test_loader,
    input_sizes, hidden_sizes, output_size, dropout, epoch_max, lstm_dim, weight_decay, classification):
    """Fit an ``EFLSTMP`` model, then score the checkpointed weights on the test split.

    ``lstm_dim`` is passed to the model as its ``n_layers`` argument. Training
    uses Adam with cross-entropy (classification) or MSE (regression);
    evaluation loads ``'model.std'`` and uses ``test_model_classification`` or
    ``test_model_regression`` with an L1 criterion.

    Returns:
        ``(train_losses, valid_losses, results)``.
    """
    print("Performing Early Fusion LSTM with pose Run")

    model = EFLSTMP(input_sizes, hidden_sizes, output_size, dropout, n_layers=lstm_dim, classification=classification)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, weight_decay=weight_decay)

    # Training objective depends on the task.
    train_criterion = nn.CrossEntropyLoss() if classification else nn.MSELoss()
    train_losses, valid_losses = train_model(
        model, train_loader, dev_loader, epoch_max, train_criterion, optimizer, patience_n=25)

    # Score the checkpointed weights, not the last-epoch ones.
    model.load_state_dict(torch.load('model.std'))
    if classification:
        results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
    else:
        results = test_model_regression(model, test_loader, nn.L1Loss())
    return train_losses, valid_losses, results

In [None]:
def EFLSTM_run(train_loader, dev_loader, test_loader,
    input_sizes, hidden_sizes, output_size, dropout, epoch_max, lstm_dim, weight_decay, classification):
    """Fit an ``EFLSTM`` model, then score the checkpointed weights on the test split.

    ``lstm_dim`` is passed to the model as its ``n_layers`` argument. Training
    uses Adam with cross-entropy (classification) or MSE (regression);
    evaluation loads ``'model.std'`` and uses ``test_model_classification`` or
    ``test_model_regression`` with an L1 criterion.

    Returns:
        ``(train_losses, valid_losses, results)``.
    """
    print("Performing Early Fusion LSTM NO pose Run")

    model = EFLSTM(input_sizes, hidden_sizes, output_size, dropout, n_layers=lstm_dim, classification=classification)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, weight_decay=weight_decay)

    # Training objective depends on the task.
    train_criterion = nn.CrossEntropyLoss() if classification else nn.MSELoss()
    train_losses, valid_losses = train_model(
        model, train_loader, dev_loader, epoch_max, train_criterion, optimizer, patience_n=25)

    # Score the checkpointed weights, not the last-epoch ones.
    model.load_state_dict(torch.load('model.std'))
    if classification:
        results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
    else:
        results = test_model_regression(model, test_loader, nn.L1Loss())
    return train_losses, valid_losses, results

In [None]:
def LangLSTM_run(train_loader, dev_loader, test_loader,
    input_sizes, hidden_sizes, output_size, dropout, epoch_max, lstm_dim, weight_decay, classification):
    """Train a ``LangLSTM`` model and evaluate the checkpointed weights on the test split.

    Args:
        train_loader, dev_loader, test_loader: batch iterables for the three splits.
        input_sizes, hidden_sizes, output_size, dropout: forwarded to the
            ``LangLSTM`` constructor.
        epoch_max: maximum number of epochs given to ``train_model``.
        lstm_dim: passed to the model as its ``n_layers`` argument.
        weight_decay: weight decay of the Adam optimizer.
        classification: train with cross-entropy and evaluate with
            ``test_model_classification`` when true; otherwise train with MSE
            and evaluate with ``test_model_regression`` using L1 loss.

    Returns:
        ``(train_losses, valid_losses, results)``.
    """
    # The banner was copy-pasted from EFLSTM_run and named the wrong model.
    print("Performing LangLSTM Run")

    model = LangLSTM(input_sizes, hidden_sizes, output_size, dropout, n_layers=lstm_dim, classification=classification)
    optimizer = Adam([param for param in model.parameters()
                    if param.requires_grad], weight_decay=weight_decay)
    if classification:
        train_losses, valid_losses = train_model(model, train_loader, dev_loader, epoch_max, nn.CrossEntropyLoss(), optimizer, patience_n=25)
    else:
        train_losses, valid_losses = train_model(model, train_loader, dev_loader, epoch_max, nn.MSELoss(), optimizer, patience_n=25)
    # Evaluate the weights checkpointed in 'model.std' by train_model.
    model.load_state_dict(torch.load('model.std'))
    if classification:
        results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
    else:
        results = test_model_regression(model, test_loader, nn.L1Loss())
    return train_losses, valid_losses, results

In [None]:
# ONE OUT TESTING
def leave_one_out(dataset, dict_results=dict(), save_plots=True, label_n=0, output_size=1,  num_classes=-1, classification_task=False, graph_path="."):
    all_videos = list(videos)

    CUDA = torch.cuda.is_available()
    print("CUDA: ", CUDA)
    MAX_EPOCH = 1000

    text_size = 300
    visual_size = 35
    acoustic_size = 74
    pose_size = 25
    hand_r_size = 21
    hand_l_size = 21
    num_classes = num_classes
    output_size = output_size
    classification = classification_task
    dropout_lflstm = 0.15
    dropout_eflstm = 0.05
    n_layer_eflstm = 2
    weight_decay = 0.1

    dict_results["videos"] = all_videos
    for test_i in range(0, len(all_videos)):
        test_v = [all_videos[test_i]]
        for dev_i in range(0, len(all_videos)):
            if dev_i != test_i:
                print("Dev i:", dev_i, " || Test i: ", test_i,)
                dev_v = [all_videos[dev_i]]
                train_v = [v for v in all_videos if v not in test_v and v not in dev_v]
                train_s, dev_s, test_s = get_train_dev_test_splits(dataset, train_v, test_v, dev_v, label_n=label_n, n_classes=num_classes)
                if len(dev_s) == 0 or len(test_s) == 0:
                    print("WARNING 0 SEGMENTS in TEST OR DEV, skipping...")
                    continue
                if classification_task:
                    train_loader = DataLoader(train_s, shuffle=False, batch_size=32, collate_fn=multi_collate_class)
                    dev_loader = DataLoader(dev_s, shuffle=False, batch_size=32, collate_fn=multi_collate_class)
                    test_loader = DataLoader(test_s, shuffle=False, batch_size=32, collate_fn=multi_collate_class)
                else:
                    train_loader = DataLoader(train_s, shuffle=False, batch_size=32, collate_fn=multi_collate_reg)
                    dev_loader = DataLoader(dev_s, shuffle=False, batch_size=32, collate_fn=multi_collate_reg)
                    test_loader = DataLoader(test_s, shuffle=False, batch_size=32, collate_fn=multi_collate_reg)
                print(np.array(train_s).shape, np.array(dev_s).shape, np.array(test_s).shape)
                lflstmp_settings = ( [text_size, visual_size, acoustic_size, pose_size, hand_r_size, hand_l_size], 
                                    [int(text_size * 1.25), int(visual_size * 1.25), int(acoustic_size * 1.25), int(pose_size*1.), int(hand_r_size*1.), int(hand_l_size*1.)],
                                    sum([int(text_size * 1.25), int(visual_size * 1.25), int(acoustic_size * 1.25), int(pose_size*1.), int(hand_r_size*1.), int(hand_l_size*1.)]) // 2,
                                    output_size,
                                    dropout_lflstm,
                                    MAX_EPOCH,
                                    weight_decay,
                                    classification
                                )

                dict_results[("LFLSTMP",test_i,dev_i)] = LFLSTMP_run(train_loader, dev_loader, test_loader, *lflstmp_settings)
                train_losses, valid_losses, _ = dict_results[("LFLSTMP",test_i,dev_i)]
                if save_plots:
                    plt.plot(np.arange(0, len(train_losses)), train_losses, label="Train Loss")
                    plt.plot(np.arange(0, len(valid_losses)), valid_losses, label="Dev Loss")
                    plt.legend()
                    plt.title(f"Training LFLSTM w/P, Dev: {dev_i} | Test: {test_i}")
                    plt.xlabel("N Epochs")
                    plt.ylabel("MSE")
                    plt.savefig(os.path.join(graph_path, "LFLSTMP", f"LFLSTMP_t{test_i}_d{dev_i}.png"))
                    plt.clf()

                lflstm_settings = ( [text_size, visual_size, acoustic_size,], 
                                    [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5),],
                                    sum([int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5),]) // 2,
                                    output_size,
                                    dropout_lflstm,
                                    MAX_EPOCH,
                                    weight_decay,
                                    classification
                                )

                dict_results[("LFLSTM",test_i,dev_i)] = LFLSTM_run(train_loader, dev_loader, test_loader, *lflstm_settings)
                train_losses, valid_losses, _ = dict_results[("LFLSTM",test_i,dev_i)]
                if save_plots:
                    plt.plot(np.arange(0, len(train_losses)), train_losses, label="Train Loss")
                    plt.plot(np.arange(0, len(valid_losses)), valid_losses, label="Dev Loss")
                    plt.legend()
                    plt.title(f"Training LFLSTM, Dev: {dev_i} | Test: {test_i}")
                    plt.xlabel("N Epochs")
                    plt.ylabel("MSE")
                    plt.savefig(os.path.join(graph_path, "LFLSTM", f"LFLSTM_t{test_i}_d{dev_i}.png"))
                    plt.clf()

                eflstmp_settings = ([text_size, visual_size, acoustic_size, pose_size, hand_r_size, hand_l_size], 
                                    [int(text_size * 1.25), int(visual_size * 1.25), int(acoustic_size * 1.25), int(pose_size*1.), int(hand_r_size*1.), int(hand_l_size*1.)],
                                    output_size,
                                    dropout_eflstm,
                                    MAX_EPOCH,
                                    n_layer_eflstm,
                                    weight_decay,
                                    classification
                                )

                dict_results[("EFLSTMP",test_i,dev_i)] = EFLSTMP_run(train_loader, dev_loader, test_loader, *eflstmp_settings)
                train_losses, valid_losses, _ = dict_results[("EFLSTMP",test_i,dev_i)]
                if save_plots:
                    plt.plot(np.arange(0, len(train_losses)), train_losses, label="Train Loss")
                    plt.plot(np.arange(0, len(valid_losses)), valid_losses, label="Dev Loss")
                    plt.legend()
                    plt.title(f"Training EFLSTMP w/P, Dev: {dev_i} | Test: {test_i}")
                    plt.xlabel("N Epochs")
                    plt.ylabel("MSE")
                    plt.savefig(os.path.join(graph_path, "EFLSTMP", f"EFLSTMP_t{test_i}_d{dev_i}.png"))
                    plt.clf()

                eflstm_settings = ( [text_size, visual_size, acoustic_size,], 
                                    [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5),],
                                    output_size,
                                    dropout_eflstm,
                                    MAX_EPOCH,
                                    n_layer_eflstm,
                                    weight_decay,
                                    classification
                                )

                dict_results[("EFLSTM",test_i,dev_i)] = EFLSTM_run(train_loader, dev_loader, test_loader, *eflstm_settings)
                train_losses, valid_losses, _ = dict_results[("EFLSTM",test_i,dev_i)]
                if save_plots:
                    plt.plot(np.arange(0, len(train_losses)), train_losses, label="Train Loss")
                    plt.plot(np.arange(0, len(valid_losses)), valid_losses, label="Dev Loss")
                    plt.legend()
                    plt.title(f"Training EFLSTM, Dev: {dev_i} | Test: {test_i}")
                    plt.xlabel("N Epochs")
                    plt.ylabel("MSE")
                    plt.savefig(os.path.join(graph_path, "EFLSTM", f"EFLSTM_t{test_i}_d{dev_i}.png"))
                    plt.clf()
                
    return dict_results                

In [None]:
#saved_results = dict()
#CUDA = False

In [None]:
#results_dictionary_top10_mae = leave_one_out(dataset, saved_results, True, label_n=0, output_size=1,
#                                 num_classes=-1, classification_task=False, graph_path=os.path.join("train_graphs","Sentiment_MSE"))

In [None]:
import pickle

def test_models(dataset, saved_results, save_graph, label_n=0, output_size=1,
                                 num_classes=-1, classification_task=False, graph_path=os.path.join("train_graphs","Sentiment_MSE"),
                                 dictionary_name=""):
    resulting_dict = leave_one_out(dataset, saved_results, save_plots=save_graph, label_n=label_n, output_size=output_size,
                                    num_classes=num_classes, classification_task=classification_task, graph_path=graph_path)
    with open(dictionary_name+".pkl","wb") as f:
        pickle.dump(resulting_dict,f)
        
    if classification_task:
        LFLSTMP_average_acc = [resulting_dict[key][2][2][1] for key in resulting_dict.keys() if key[0] == "LFLSTMP"]
        LFLSTM_average_acc = [resulting_dict[key][2][2][1] for key in resulting_dict.keys() if key[0] == "LFLSTM"]
        EFLSTM_average_acc = [resulting_dict[key][2][2][1] for key in resulting_dict.keys() if key[0] == "EFLSTMP"]
        EFLSTMNP_average_acc = [resulting_dict[key][2][2][1] for key in resulting_dict.keys() if key[0] == "EFLSTM"]
        LFLSTMP_average_f1 = [resulting_dict[key][2][2][0] for key in resulting_dict.keys() if key[0] == "LFLSTMP"]
        LFLSTM_average_f1 = [resulting_dict[key][2][2][0] for key in resulting_dict.keys() if key[0] == "LFLSTM"]
        EFLSTM_average_f1 = [resulting_dict[key][2][2][0] for key in resulting_dict.keys() if key[0] == "EFLSTMP"]
        EFLSTMNP_average_f1 = [resulting_dict[key][2][2][0] for key in resulting_dict.keys() if key[0] == "EFLSTM"]
        print(dictionary_name, "RESULTS: ")
        print("Average Acc:")
        print(np.array(LFLSTMP_average_acc).mean(),np.array(LFLSTM_average_acc).mean(),
        np.array(EFLSTM_average_acc).mean(),np.array(EFLSTMNP_average_acc).mean())
        print("Average F1:")
        print(np.array(LFLSTMP_average_f1).mean(),np.array(LFLSTM_average_f1).mean(),
        np.array(EFLSTM_average_f1).mean(),np.array(EFLSTMNP_average_f1).mean())
    else:
        LFLSTMP_average_mae = [resulting_dict[key][2][2] for key in resulting_dict.keys() if key[0] == "LFLSTMP"]
        LFLSTM_average_mae = [resulting_dict[key][2][2] for key in resulting_dict.keys() if key[0] == "LFLSTM"]
        EFLSTM_average_mae = [resulting_dict[key][2][2] for key in resulting_dict.keys() if key[0] == "EFLSTMP"]
        EFLSTMNP_average_mae = [resulting_dict[key][2][2] for key in resulting_dict.keys() if key[0] == "EFLSTM"]
        print(dictionary_name, "RESULTS: ")
        print("MAE: ")
        print(np.array(LFLSTMP_average_mae).mean(),np.array(LFLSTM_average_mae).mean(),
        np.array(EFLSTM_average_mae).mean(),np.array(EFLSTMNP_average_mae).mean())
    return resulting_dict

        

In [None]:
# Happy (label index 1): four-class leave-one-out evaluation of all models.
saved_dictionary = {}
results_dictionary_top10_h4c = test_models(
    dataset,
    saved_dictionary,
    save_graph=True,
    label_n=1,
    output_size=4,
    num_classes=4,
    classification_task=True,
    graph_path=os.path.join("train_graphs", "Happy_4c"),
    dictionary_name="results_dictionary_top10_h4c",
)

In [None]:
# Per-fold test metrics of the run stored in `saved_dictionary`. Every model
# entry is (train_losses, valid_losses, (y_pred, y_true, (f1, accuracy))).
# Naming caveat: the EFLSTM_* lists hold the "EFLSTMP" (with pose) folds and the
# EFLSTMNP_* lists hold the "EFLSTM" (no pose) folds.
LFLSTMP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
LFLSTMP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
# The "EFLSTMP" / "EFLSTM" labels were swapped relative to the lists they printed.
print("Average Acc:")
print("LFLSTMP: ", np.array(LFLSTMP_average_acc).mean())
print("LFLSTM: ", np.array(LFLSTM_average_acc).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_acc).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_acc).mean())
print("Average F1:")
print("LFLSTMP: ", np.array(LFLSTMP_average_f1).mean())
print("LFLSTM: ", np.array(LFLSTM_average_f1).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_f1).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_f1).mean())

In [None]:
# Label indices passed as `label_n`:
# 0: 'sentiment'
# 1: 'happy'
# 2: 'sad'
# 3: 'anger'
# 4: 'surprise'
# 5: 'disgust'
# 6: 'fear'

In [None]:
# Sad (label index 2): four-class leave-one-out evaluation of all models.
# The result gets its own name (matching the pickle name) so that it no longer
# overwrites `results_dictionary_top10_h4c` from the Happy run.
saved_dictionary = dict()
results_dictionary_top10_s4c = test_models(dataset, saved_dictionary, save_graph=True, label_n=2, output_size=4,
                                    num_classes=4, classification_task=True, graph_path=os.path.join("train_graphs","Sad_4c"),
                                    dictionary_name="results_dictionary_top10_s4c")

In [None]:
# Per-fold test metrics of the run stored in `saved_dictionary`. Every model
# entry is (train_losses, valid_losses, (y_pred, y_true, (f1, accuracy))).
# Naming caveat: the EFLSTM_* lists hold the "EFLSTMP" (with pose) folds and the
# EFLSTMNP_* lists hold the "EFLSTM" (no pose) folds.
LFLSTMP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
LFLSTMP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
# The "EFLSTMP" / "EFLSTM" labels were swapped relative to the lists they printed.
print("Average Acc:")
print("LFLSTMP: ", np.array(LFLSTMP_average_acc).mean())
print("LFLSTM: ", np.array(LFLSTM_average_acc).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_acc).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_acc).mean())
print("Average F1:")
print("LFLSTMP: ", np.array(LFLSTMP_average_f1).mean())
print("LFLSTM: ", np.array(LFLSTM_average_f1).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_f1).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_f1).mean())

In [None]:
# Label indices: 0 sentiment, 1 happy, 2 sad, 3 anger, 4 surprise, 5 disgust, 6 fear.
# Anger (label index 3): four-class leave-one-out evaluation of all models.
saved_dictionary = {}
results_dictionary_top10_a7c = test_models(
    dataset,
    saved_dictionary,
    save_graph=True,
    label_n=3,
    output_size=4,
    num_classes=4,
    classification_task=True,
    graph_path=os.path.join("train_graphs", "Anger_4c"),
    dictionary_name="results_dictionary_top10_a4c",
)

In [None]:
# Label indices: 0 sentiment, 1 happy, 2 sad, 3 anger, 4 surprise, 5 disgust, 6 fear.
# Fear (label index 6): four-class leave-one-out evaluation of all models.
saved_dictionary = dict()
results_dictionary_top10_f7c = test_models(dataset, saved_dictionary, save_graph=True, label_n=6, output_size=4,
                                    num_classes=4, classification_task=True, graph_path=os.path.join("train_graphs","Fear_4c"),
                                    dictionary_name="results_dictionary_top10_f4c")
# Per-fold test metrics; every model entry is
# (train_losses, valid_losses, (y_pred, y_true, (f1, accuracy))).
# Naming caveat: the EFLSTM_* lists hold the "EFLSTMP" (with pose) folds and the
# EFLSTMNP_* lists hold the "EFLSTM" (no pose) folds.
LFLSTMP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
LFLSTMP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
# The "EFLSTMP" / "EFLSTM" labels were swapped relative to the lists they printed.
print("Average Acc:")
print("LFLSTMP: ", np.array(LFLSTMP_average_acc).mean())
print("LFLSTM: ", np.array(LFLSTM_average_acc).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_acc).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_acc).mean())
print("Average F1:")
print("LFLSTMP: ", np.array(LFLSTMP_average_f1).mean())
print("LFLSTM: ", np.array(LFLSTM_average_f1).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_f1).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_f1).mean())

In [None]:
# Label indices: 0 sentiment, 1 happy, 2 sad, 3 anger, 4 surprise, 5 disgust, 6 fear.
# Disgust (label index 5): four-class leave-one-out evaluation of all models.
saved_dictionary = dict()
results_dictionary_top10_d7c = test_models(dataset, saved_dictionary, save_graph=True, label_n=5, output_size=4,
                                    num_classes=4, classification_task=True, graph_path=os.path.join("train_graphs","Disgust_4c"),
                                    dictionary_name="results_dictionary_top10_d4c")
# Per-fold test metrics; every model entry is
# (train_losses, valid_losses, (y_pred, y_true, (f1, accuracy))).
# Naming caveat: the EFLSTM_* lists hold the "EFLSTMP" (with pose) folds and the
# EFLSTMNP_* lists hold the "EFLSTM" (no pose) folds.
LFLSTMP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_acc = [saved_dictionary[key][2][2][1] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
LFLSTMP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTMP"]
LFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "LFLSTM"]
EFLSTM_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTMP"]
EFLSTMNP_average_f1 = [saved_dictionary[key][2][2][0] for key in saved_dictionary.keys() if key[0] == "EFLSTM"]
# The "EFLSTMP" / "EFLSTM" labels were swapped relative to the lists they printed.
print("Average Acc:")
print("LFLSTMP: ", np.array(LFLSTMP_average_acc).mean())
print("LFLSTM: ", np.array(LFLSTM_average_acc).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_acc).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_acc).mean())
print("Average F1:")
print("LFLSTMP: ", np.array(LFLSTMP_average_f1).mean())
print("LFLSTM: ", np.array(LFLSTM_average_f1).mean())
print("EFLSTMP: ", np.array(EFLSTM_average_f1).mean())
print("EFLSTM: ", np.array(EFLSTMNP_average_f1).mean())

In [None]:
# Label indices: 0 sentiment, 1 happy, 2 sad, 3 anger, 4 surprise, 5 disgust, 6 fear.
# Surprise (label index 4): four-class leave-one-out evaluation of all models.
saved_dictionary = {}
results_dictionary_top10_sur7c = test_models(
    dataset,
    saved_dictionary,
    save_graph=True,
    label_n=4,
    output_size=4,
    num_classes=4,
    classification_task=True,
    graph_path=os.path.join("train_graphs", "Surprise_4c"),
    dictionary_name="results_dictionary_top10_surprise4c",
)

In [None]:
# Load the results dictionary that was pickled as "results_dictionary_top10_mae.pkl".
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load
# files that were produced by this notebook.
saved_dictionary_test = None
with open("results_dictionary_top10_mae.pkl",'rb') as f:
    saved_dictionary_test = pickle.load(f)

In [None]:
# Test loss of every fold in the reloaded dictionary, grouped by model key.
# Naming caveat: EFLSTM_average_mae holds the "EFLSTMP" folds and
# EFLSTMNP_average_mae holds the "EFLSTM" folds.
LFLSTMP_average_mae = [entry[2][2] for key, entry in saved_dictionary_test.items() if key[0] == "LFLSTMP"]
LFLSTM_average_mae = [entry[2][2] for key, entry in saved_dictionary_test.items() if key[0] == "LFLSTM"]
EFLSTM_average_mae = [entry[2][2] for key, entry in saved_dictionary_test.items() if key[0] == "EFLSTMP"]
EFLSTMNP_average_mae = [entry[2][2] for key, entry in saved_dictionary_test.items() if key[0] == "EFLSTM"]
print("MAE: ")
# Printed in the order LFLSTMP, LFLSTM, EFLSTMP, EFLSTM.
print(
    np.array(LFLSTMP_average_mae).mean(),
    np.array(LFLSTM_average_mae).mean(),
    np.array(EFLSTM_average_mae).mean(),
    np.array(EFLSTMNP_average_mae).mean(),
)

In [None]:
# Mean fold accuracy of LFLSTMP and LFLSTM, taken from the lists filled by the
# most recently executed report cell.
np.mean(LFLSTMP_average_acc), np.mean(LFLSTM_average_acc)

In [None]:
import pickle

# Serialize the MAE results dictionary to disk so it can be read back with pickle.load.
with open("results_dictionary_top10_mae.pkl", mode="wb") as pkl_file:
    pickle.dump(results_dictionary_top10_mae, pkl_file)


In [None]:
# Entry [2][2] of every result in results_dictionary_top10, grouped by the model name in key[0].
# Note the naming: EFLSTM_average_mse holds the "EFLSTMP" runs and
# EFLSTMNP_average_mse holds the "EFLSTM" runs.
LFLSTMP_average_mse = [entry[2][2] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "LFLSTMP"]
LFLSTM_average_mse = [entry[2][2] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "LFLSTM"]
EFLSTM_average_mse = [entry[2][2] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "EFLSTMP"]
EFLSTMNP_average_mse = [entry[2][2] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "EFLSTM"]

In [None]:
# Mean of each list, displayed as a 4-tuple in the order LFLSTMP, LFLSTM, EFLSTMP, EFLSTM keys.
tuple(
    np.array(values).mean()
    for values in (LFLSTMP_average_mse, LFLSTM_average_mse, EFLSTM_average_mse, EFLSTMNP_average_mse)
)

In [None]:
# Entry [2][3] of every result in results_dictionary_top10, grouped by the model name in key[0].
# Note the naming: EFLSTM_average_error_dim holds the "EFLSTMP" runs and
# EFLSTMNP_average_error_dim holds the "EFLSTM" runs.
LFLSTMP_average_error_dim = [entry[2][3] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "LFLSTMP"]
LFLSTM_average_error_dim = [entry[2][3] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "LFLSTM"]
EFLSTM_average_error_dim = [entry[2][3] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "EFLSTMP"]
EFLSTMNP_average_error_dim = [entry[2][3] for run_key, entry in results_dictionary_top10.items() if run_key[0] == "EFLSTM"]

In [None]:
# Mean over runs (axis 0) of each list, displayed as a 4-tuple in the order
# LFLSTMP, LFLSTM, EFLSTMP, EFLSTM keys.
tuple(
    np.array(values).mean(axis=0)
    for values in (
        LFLSTMP_average_error_dim,
        LFLSTM_average_error_dim,
        EFLSTM_average_error_dim,
        EFLSTMNP_average_error_dim,
    )
)

In [None]:
CUDA = False  # torch.cuda.is_available()
print("CUDA: ", CUDA)
MAX_EPOCH = 1000

# Per-modality input feature sizes.
text_size, visual_size, acoustic_size = 300, 35, 74
pose_size, hand_r_size, hand_l_size = 25, 21, 21

# define some model settings and hyper-parameters
input_sizes = [text_size, visual_size, acoustic_size, pose_size, hand_r_size, hand_l_size]
# Hidden sizes: 1.25x the input size for text/visual/acoustic, 1x for pose and both hands.
hidden_sizes = [
    int(size * scale)
    for size, scale in zip(input_sizes, (1.25, 1.25, 1.25, 1., 1., 1.))
]
fc1_size = sum(hidden_sizes) // 2
dropout = 0.15
output_size = 7
weight_decay = 0.1


model = LFLSTMP(input_sizes, hidden_sizes, fc1_size, output_size, dropout, True)

# Only parameters that require gradients are handed to the optimizer.
optimizer = Adam(
    [weight for weight in model.parameters() if weight.requires_grad],
    weight_decay=weight_decay,
)

train_losses, valid_losses = train_model(
    model, train_loader, dev_loader, MAX_EPOCH, nn.CrossEntropyLoss(), optimizer,
    patience_n=25, update_to_save=0.05,
)
# Reload the weights stored in 'model.std' (presumably the checkpoint written by train_model - confirm).
model.load_state_dict(torch.load('model.std'))

In [None]:
# Evaluate the current `model` on `test_loader` with a cross-entropy criterion.
# The comment lines below appear to be output pasted from an earlier run; they
# are kept for reference and are not produced by this cell as written.
# NOTE(review): the label order listed in the pasted output below differs from
# the index map in the `test_models` cell (4: surprise, 5: disgust, 6: fear) -
# confirm which order the label tensor actually uses.
results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
# 1.18910813331604
# [1.4783462  0.07662236 1.4068763  1.1306256  0.02685845 0.5496703 0.36040738]
# [1 2 2 0 0]
# [[0.24364449 0.25448358 0.24080864 0.26106322]
#  [0.23234913 0.25409198 0.25244874 0.2611101 ]
#  [0.22957    0.25405973 0.2547654  0.2616048 ]
#  [0.22653975 0.27216277 0.24622498 0.25507247]
#  [0.23375641 0.27675375 0.23467144 0.25481838]]
# [3 3 3 1 1]
# Test set F1S is 0.0
# [Sentiment, Happiness, Sadness, Anger, Fear, Disgust, Surprise]
# True (n=5):  [1 2 2 0 0]
# Pred (n=5):  [3 3 3 1 1]
# --Average Error per dimension--

In [None]:
# Draw the train and dev loss curves against their epoch index.
for _curve_label, _curve in (("Train Loss", train_losses), ("Dev Loss", valid_losses)):
    plt.plot(np.arange(len(_curve)), _curve, label=_curve_label)
plt.legend()

In [None]:
from tqdm import tqdm_notebook
from torch.optim import Adam, SGD

CUDA = False  # torch.cuda.is_available()
print("CUDA: ", CUDA)
MAX_EPOCH = 1000

# Per-modality input feature sizes (the pose/hand sizes are defined but not fed to this model).
text_size, visual_size, acoustic_size = 300, 35, 74
pose_size, hand_r_size, hand_l_size = 25, 21, 21

# define some model settings and hyper-parameters
input_sizes = [text_size, visual_size, acoustic_size]
# Every hidden state is 1.5x its modality's input size.
hidden_sizes = [int(size * 1.5) for size in input_sizes]
fc1_size = sum(hidden_sizes) // 2
dropout = 0.1
output_size = 7
weight_decay = 0.01

model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
# Only parameters that require gradients are handed to the optimizer.
trainable_params = [weight for weight in model.parameters() if weight.requires_grad]
optimizer = Adam(trainable_params, weight_decay=weight_decay)

train_losses, valid_losses = train_model(
    model, train_loader, dev_loader, MAX_EPOCH, nn.CrossEntropyLoss(), optimizer,
    patience_n=25,
)
# Reload the weights stored in 'model.std' (presumably the checkpoint written by train_model - confirm).
model.load_state_dict(torch.load('model.std'))

In [None]:
# Evaluate the current `model` on `test_loader` with a cross-entropy criterion.
results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())

In [None]:
# Train and dev loss per epoch index on one set of axes.
train_epochs = np.arange(len(train_losses))
dev_epochs = np.arange(len(valid_losses))
plt.plot(train_epochs, train_losses, label="Train Loss")
plt.plot(dev_epochs, valid_losses, label="Dev Loss")
plt.legend()

In [None]:
from tqdm import tqdm_notebook
from torch.optim import Adam, SGD

CUDA = torch.cuda.is_available()
print("CUDA: ", CUDA)
MAX_EPOCH = 1000

# Per-modality input feature sizes.
text_size, visual_size, acoustic_size = 300, 35, 74
pose_size, hand_r_size, hand_l_size = 25, 21, 21

# define some model settings and hyper-parameters
input_sizes = [text_size, visual_size, acoustic_size, pose_size, hand_r_size, hand_l_size]
# Hidden sizes: 1.25x the input size for text/visual/acoustic, 1x for pose and both hands.
hidden_sizes = (
    [int(size * 1.25) for size in (text_size, visual_size, acoustic_size)]
    + [int(size * 1.) for size in (pose_size, hand_r_size, hand_l_size)]
)
fc1_size = sum(hidden_sizes) // 2
dropout = 0.05
output_size = 4
weight_decay = 0.1

model = EFLSTM(input_sizes, hidden_sizes, output_size, dropout, n_layers=2, classification=True)
# Only parameters that require gradients are handed to the optimizer.
optimizer = Adam(
    [weight for weight in model.parameters() if weight.requires_grad],
    weight_decay=weight_decay,
)

train_losses, valid_losses = train_model(
    model, train_loader, dev_loader, MAX_EPOCH, nn.CrossEntropyLoss(), optimizer,
    patience_n=50,
)
# Reload the weights stored in 'model.std' (presumably the checkpoint written by train_model - confirm).
model.load_state_dict(torch.load('model.std'))


In [None]:
# Evaluate the current `model` on `test_loader`, then plot and save the loss curves.
results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
plt.plot(np.arange(0, len(train_losses)), train_losses, label="Train Loss")
plt.plot(np.arange(0, len(valid_losses)), valid_losses, label="Dev Loss")
plt.legend()
plt.title("Training EFLSTM w/P")
plt.xlabel("N Epochs")
# The curves come from a cross-entropy criterion, so the axis is labelled
# generically instead of "MSE".
plt.ylabel("Loss")
# savefig does not create missing directories; make sure the target exists.
os.makedirs("train_graphs", exist_ok=True)
plt.savefig("train_graphs/plot.png")
# Output recorded from an earlier run (the "MSE" wording suggests a regression
# run - verify before relying on these numbers):
# Test set MSE is 1.3169214725494385
# [1.7539093e+00 4.1389942e-01 1.4151512e+00 1.1141731e+00 7.1338250e-04 4.9885786e-01 3.6192438e-01]


In [None]:
from tqdm import tqdm_notebook
from torch.optim import Adam, SGD

CUDA = torch.cuda.is_available()
print("CUDA: ", CUDA)
MAX_EPOCH = 1000

# Per-modality input feature sizes (the pose/hand sizes are defined but not fed to this model).
text_size, visual_size, acoustic_size = 300, 35, 74
pose_size, hand_r_size, hand_l_size = 25, 21, 21

# define some model settings and hyper-parameters
input_sizes = [text_size, visual_size, acoustic_size]
# Every hidden state is 1.5x its modality's input size.
hidden_sizes = [int(size * 1.5) for size in input_sizes]
fc1_size = sum(hidden_sizes) // 2
dropout = 0.05
output_size = 4
weight_decay = 0.1

model = EFLSTM(input_sizes, hidden_sizes, output_size, dropout, n_layers=2, classification=True)

# Only parameters that require gradients are handed to the optimizer.
trainable_params = [weight for weight in model.parameters() if weight.requires_grad]
optimizer = Adam(trainable_params, weight_decay=weight_decay)

train_losses, valid_losses = train_model(
    model, train_loader, dev_loader, MAX_EPOCH, nn.CrossEntropyLoss(), optimizer,
    patience_n=50,
)
# Reload the weights stored in 'model.std' (presumably the checkpoint written by train_model - confirm).
model.load_state_dict(torch.load('model.std'))

In [None]:
# Evaluate the current `model` on `test_loader`, then draw the train/dev loss curves.
results = test_model_classification(model, test_loader, nn.CrossEntropyLoss())
for _curve_label, _curve in (("Train Loss", train_losses), ("Dev Loss", valid_losses)):
    plt.plot(np.arange(len(_curve)), _curve, label=_curve_label)
plt.legend()

In [None]:
# ONE OUT TESTING
def lang_leave_one_out(dataset, dict_results=None, save_plots=True, label_n=0, output_size=1,  num_classes=-1, classification_task=False, graph_path="."):
    """Run LangLSTM for every ordered (test video, dev video) pair of distinct videos.

    The video ids come from the module-level ``videos`` collection. For each
    pair, one video is the test set, one is the dev set and all remaining
    videos form the training set. Pairs whose dev or test split has no
    segments are skipped with a warning.

    Args:
        dataset: Passed through to ``get_train_dev_test_splits``.
        dict_results: Dictionary to store results in. When omitted, a fresh
            dictionary is created for each call (the previous ``dict()``
            default was a single object shared by all calls).
        save_plots: If true, save one train/dev loss plot per pair under
            ``<graph_path>/LangLSTM/``.
        label_n: Passed to ``get_train_dev_test_splits`` as ``label_n``.
        output_size: Output size forwarded to ``LangLSTM_run``.
        num_classes: Passed to ``get_train_dev_test_splits`` as ``n_classes``.
        classification_task: Selects ``multi_collate_class`` (true) or
            ``multi_collate_reg`` (false) as the collate function and is
            forwarded to ``LangLSTM_run``.
        graph_path: Base directory for the saved plots.

    Returns:
        ``dict_results`` with the key ``"videos"`` (the list of video ids) and
        one ``("LangLSTM", test_i, dev_i)`` key per evaluated pair holding the
        3-tuple returned by ``LangLSTM_run`` (train losses, dev losses, and a
        third element).
    """
    if dict_results is None:
        dict_results = {}

    all_videos = list(videos)

    print("CUDA: ", torch.cuda.is_available())
    MAX_EPOCH = 1000

    text_size = 300
    dropout_eflstm = 0.05
    n_layer_eflstm = 2
    weight_decay = 0.1

    # Positional arguments for LangLSTM_run after the three loaders; none of
    # them depend on the split, so the tuple is built once.
    eflstm_settings = (
        [text_size,],
        [int(text_size * 1.5),],
        output_size,
        dropout_eflstm,
        MAX_EPOCH,
        n_layer_eflstm,
        weight_decay,
        classification_task,
    )
    plot_dir = os.path.join(graph_path, "LangLSTM")

    dict_results["videos"] = all_videos
    for test_i, test_video in enumerate(all_videos):
        test_v = [test_video]
        for dev_i, dev_video in enumerate(all_videos):
            if dev_i == test_i:
                continue
            print("Dev i:", dev_i, " || Test i: ", test_i,)
            dev_v = [dev_video]
            train_v = [v for v in all_videos if v not in test_v and v not in dev_v]
            # Argument order is (train, test, dev) while the unpacked result is
            # (train, dev, test), exactly as in the original call - verify
            # against the signature of get_train_dev_test_splits.
            train_s, dev_s, test_s = get_train_dev_test_splits(dataset, train_v, test_v, dev_v, label_n=label_n, n_classes=num_classes)
            if len(dev_s) == 0 or len(test_s) == 0:
                print("WARNING 0 SEGMENTS in TEST OR DEV, skipping...")
                continue

            # The classification and regression paths differ only in the collate function.
            collate_fn = multi_collate_class if classification_task else multi_collate_reg
            train_loader, dev_loader, test_loader = (
                DataLoader(split, shuffle=False, batch_size=32, collate_fn=collate_fn)
                for split in (train_s, dev_s, test_s)
            )
            print(np.array(train_s).shape, np.array(dev_s).shape, np.array(test_s).shape)

            result_key = ("LangLSTM", test_i, dev_i)
            dict_results[result_key] = LangLSTM_run(train_loader, dev_loader, test_loader, *eflstm_settings)
            train_losses, valid_losses, _ = dict_results[result_key]
            if save_plots:
                plt.plot(np.arange(0, len(train_losses)), train_losses, label="Train Loss")
                plt.plot(np.arange(0, len(valid_losses)), valid_losses, label="Dev Loss")
                plt.legend()
                plt.title(f"Training LangLSTM, Dev: {dev_i} | Test: {test_i}")
                plt.xlabel("N Epochs")
                plt.ylabel("Loss")
                # savefig does not create missing directories.
                os.makedirs(plot_dir, exist_ok=True)
                plt.savefig(os.path.join(plot_dir, f"LangLSTM_t{test_i}_d{dev_i}.png"))
                # Clear the figure so the next pair starts from empty axes.
                plt.clf()

    return dict_results

In [None]:
# Leave-one-out LangLSTM run as a 4-class classification on label index 1.
# NOTE(review): the variable name ends in "2c" although num_classes is 4 - confirm the intended name.
saved_dictionary = {}
results_dictionary_top10_langs2c = lang_leave_one_out(
    dataset,
    saved_dictionary,
    save_plots=False,
    label_n=1,
    output_size=4,
    num_classes=4,
    classification_task=True,
    graph_path=os.path.join("train_graphs", "Happy_4c"),
)

In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
def to_percent(n, decimals=2):
    """Scale a fraction to a percentage and round it with NumPy.

    Args:
        n: Value supporting ``n * 100`` (e.g. a float or a NumPy array).
        decimals: Number of decimal places passed to ``np.round``.

    Returns:
        ``n * 100`` rounded to ``decimals`` places.
    """
    scaled = n * 100
    return np.round(scaled, decimals=decimals)

In [None]:
def report_f1_acc(results):
    """Print and return per-run balanced accuracy and F1 for a results dictionary.

    Every entry except the one stored under the key ``"videos"`` is treated as
    a run. For each run, ``entry[2][1]`` and ``entry[2][0]`` are passed (in
    that order) to ``balanced_accuracy_score`` and ``entry[2][2][0]`` is taken
    as the run's F1 value.

    Args:
        results: Mapping from run keys to run entries.

    Returns:
        A tuple ``(accuracies, f1_scores)`` of two lists with one value per run.
    """
    run_entries = [entry for run_key, entry in results.items() if run_key != "videos"]
    single_weighted_acc = [
        balanced_accuracy_score(entry[2][1], entry[2][0]) for entry in run_entries
    ]
    single_weighted_f1 = [entry[2][2][0] for entry in run_entries]

    mean_acc = np.array(single_weighted_acc).mean()
    mean_f1 = np.array(single_weighted_f1).mean()
    print("Weighted Acc:", to_percent(mean_acc))
    print("Average F1:", to_percent(mean_f1))

    return single_weighted_acc, single_weighted_f1
    

In [None]:
# Print the mean balanced accuracy / F1 of the LangLSTM leave-one-out runs and
# display the per-run lists returned by report_f1_acc.
report_f1_acc(results_dictionary_top10_langs2c)