In [116]:
import sys
import os
import re
import pickle
import pandas as pd
import numpy as np

DATA_PATH = '../data/MOSEI/'

def to_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_length(x):
    return x.shape[1]-(np.sum(x, axis=-1) == 0).sum(1)

# first we align to words with averaging, collapse_function receives a list of functions
# dataset.align(text_field, collapse_functions=[avg])
# load pickle file for unaligned acoustic and visual source
pickle_filename = DATA_PATH+'mosei_senti_data_noalign.pkl'
csv_filename = DATA_PATH+'MOSEI-label.csv'

with open(pickle_filename, 'rb') as f:
    d = pickle.load(f)

# read csv file for label and text
df = pd.read_csv(csv_filename)
text = df['text']
vid = df['video_id']
cid = df['clip_id']

train_split_noalign = d['train']
dev_split_noalign = d['valid']
test_split_noalign = d['test']

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 1e-6

# place holders for the final train/dev/test dataset
train = train = []
dev = dev = []
test = test = []

# define a regular expression to extract the video ID out of the keys
# pattern = re.compile('(.*)\[.*\]')
pattern = re.compile('(.*)_([.*])')
num_drop = 0 # a counter to count how many data points went into some processing issues

v = np.concatenate((train_split_noalign['vision'],dev_split_noalign['vision'], test_split_noalign['vision']),axis=0)
vlens = get_length(v)

a = np.concatenate((train_split_noalign['audio'],dev_split_noalign['audio'], test_split_noalign['audio']),axis=0)
alens = get_length(a)

label = np.concatenate((train_split_noalign['labels'],dev_split_noalign['labels'], test_split_noalign['labels']),axis=0)

L_V = v.shape[1]
L_A = a.shape[1]


all_id = np.concatenate((train_split_noalign['id'], dev_split_noalign['id'], test_split_noalign['id']),axis=0)[:,0]
all_id_list = all_id.tolist()

train_size = len(train_split_noalign['id'])
dev_size = len(dev_split_noalign['id'])
test_size = len(test_split_noalign['id'])

dev_start = train_size
test_start = train_size + dev_size

all_csv_id = [(vid[i], str(cid[i])) for i in range(len(vid))]

for i, idd in enumerate(all_id_list):
    # get the video ID and the features out of the aligned dataset

    # matching process
    try:
        index = i
    except:
        import ipdb; ipdb.set_trace()

    _words = text[index].split()
    _label = label[i].astype(np.float32)
    _visual = v[i]
    _acoustic = a[i]
    _vlen = vlens[i]
    _alen = alens[i]
    _id = '{}[{}]'.format(all_csv_id[0], all_csv_id[1])           

    # remove nan values
    # label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    actual_words = []
    words = []
    visual = []
    acoustic = []

    for word in _words:
        actual_words.append(word)

    visual = _visual[L_V - _vlen:,:]
    acoustic = _acoustic[L_A - _alen:,:]

    if i < dev_start:
        train.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= dev_start and i < test_start:
        dev.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= test_start:
        test.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    else:
        print(f"Found video that doesn't belong to any splits: {idd}")


# print(f"Total number of {num_drop} datapoints have been dropped.")
print(f"Total number of {num_drop} datapoints have been dropped.")
print("Dataset split")
print("Train Set: {}".format(len(train)))
print("Validation Set: {}".format(len(dev)))
print("Test Set: {}".format(len(test)))

# Save glove embeddings cache too
# self.pretrained_emb = pretrained_emb = load_emb(word2id, config.word_emb_path)
# torch.save((pretrained_emb, word2id), CACHE_PATH)
pretrained_emb = None

# Save pickles
to_pickle(train, DATA_PATH + '/dftrain.pkl')
to_pickle(dev, DATA_PATH + '/dfdev.pkl')
to_pickle(test, DATA_PATH + '/dftest.pkl')

Total number of 0 datapoints have been dropped.
Dataset split
Train Set: 16326
Validation Set: 1871
Test Set: 4659


In [117]:
train_df = pd.DataFrame(train,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])

In [118]:
train_df

Unnamed: 0,words,visual,acoustic,actual_words,_vlen,_alen,_label,idd
0,[],"[[-1.2108299732208252, -0.46178698539733887, -...","[[194.5, 0.0, 0.07899338752031326, 0.412973552...","[Key, is, part, of, the, people, that, we, use...",267,356,[[1.0]],-3g5yACwYnA
1,[],"[[-1.7858200073242188, -0.6380839943885803, 0....","[[105.5, 1.0, 0.021022265776991844, 0.08599962...","[They've, been, able, to, find, solutions, or,...",81,107,[[0.6666667]],-3g5yACwYnA
2,[],"[[-1.7642099857330322, -0.7958599925041199, -0...","[[106.5, 1.0, 0.11148008704185486, 0.726543664...","[We're, a, huge, user, of, adhesives, for, our...",215,286,[[0.0]],-3g5yACwYnA
3,[],"[[-1.2986199855804443, -0.2510870099067688, -0...","[[115.5, 1.0, 0.057251088321208954, 0.33875322...","[Key, Polymer, brings, a, technical, aspect, t...",138,184,[[0.0]],-3g5yACwYnA
4,[],"[[-1.6502399444580078, -0.3371959924697876, -0...","[[100.5, 1.0, 0.14517584443092346, 0.675116181...","[Key, brings, those, types, of, aspects, to, a...",221,295,[[1.0]],-3g5yACwYnA
...,...,...,...,...,...,...,...,...
16321,[],"[[-1.8842400312423706, -0.6028929948806763, 0....","[[127.5, 1.0, 0.11853820085525513, 0.720579862...","[I, read, other, articles,, what, other, train...",72,103,[[0.0]],zwTrXwi54us
16322,[],"[[-2.153140068054199, -0.04792049899697304, -0...","[[129.5, 1.0, 0.1991356760263443, 0.6945900321...","[I, do, all, of, that]",28,37,[[0.0]],zwTrXwi54us
16323,[],"[[-4.401090145111084, -1.0127899646759033, -1....","[[188.5, 0.0, 0.05910159647464752, 0.331484466...","[Now,, if, this, sounds, like, something, you'...",163,227,[[0.6666667]],zwTrXwi54us
16324,[],"[[-2.3272500038146973, -1.1171799898147583, 0....","[[133.0, 0.0, 0.027281710878014565, 0.07664652...","[I, actually, speak, to, the, experts, myself,...",80,106,[[1.0]],zwTrXwi54us


In [119]:
train_df['acoustic'][0]

array([[ 1.94500000e+02,  0.00000000e+00,  7.89933875e-02, ...,
        -3.43945742e-01, -3.39845806e-01, -2.55919456e-01],
       [ 1.20500000e+02,  0.00000000e+00,  6.85892552e-02, ...,
        -3.29696000e-01, -3.68389398e-01, -2.19704941e-01],
       [ 1.68000000e+02,  0.00000000e+00,  5.81851192e-02, ...,
        -3.20934534e-01, -3.78141046e-01, -2.37061188e-01],
       ...,
       [ 8.55000000e+01,  0.00000000e+00,  5.52437678e-02, ...,
        -3.50019425e-01, -2.89983302e-01, -2.81043440e-01],
       [ 2.06000000e+02,  0.00000000e+00,  4.93893474e-02, ...,
        -3.87713164e-01, -3.68427217e-01, -2.66391695e-01],
       [ 1.01500000e+02,  0.00000000e+00,  4.35349271e-02, ...,
        -4.55446243e-01, -4.16983306e-01, -2.69667059e-01]])

In [120]:
np.shape(train_df['acoustic'][0])

(356, 74)

In [121]:
np.shape(train_df['visual'][15])

(64, 35)

In [122]:

np.shape(train_df['actual_words'][0])

(42,)

In [123]:
# pd.DataFrame(train_split_noalign,columns=['a'])
d

{'train': {'vision': array([[[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          ...,
          [-1.71436000e+00,  4.13351990e-02, -9.11785007e-01, ...,
            2.26524997e+00,  1.35622997e+01, -7.79251993e-01],
          [-1.46472001e+00,  1.42652005e-01, -9.68686998e-01, ...,
            1.66319001e+00,  1.43842001e+01, -1.30941999e+00],
          [-1.61250997e+00, -1.18235998e-01, -8.67457986e-01, ...,
            2.01573992e+00,  1.48933001e+01, -1.63349998e+00]],
  
         [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          [ 0.00000000e+00,  0.00000000e+00,  0.0000

In [124]:
df

Unnamed: 0,video_id,clip_id,text,label,annotation,mode,label_by
0,-3g5yACwYnA,10,Key is part of the people that we use to solve...,1.000000,Positive,train,0
1,-3g5yACwYnA,13,They've been able to find solutions or at leas...,0.666667,Positive,train,0
2,-3g5yACwYnA,3,We're a huge user of adhesives for our operati...,0.000000,Neutral,train,0
3,-3g5yACwYnA,2,Key Polymer brings a technical aspect to our o...,0.000000,Neutral,train,0
4,-3g5yACwYnA,4,Key brings those types of aspects to a busines...,1.000000,Positive,train,0
...,...,...,...,...,...,...,...
22851,zhNksSReaQk,35,"And yet, it's like, how a/Autistic people defe...",0.000000,Neutral,test,0
22852,zhNksSReaQk,34,But the thing is that intelligence [scoffs] is...,-2.000000,Negative,test,0
22853,zhNksSReaQk,33,"They're like, ""Oh, they have a high IQ, they a...",0.000000,Neutral,test,0
22854,zvZd3V5D5Ik,3,"If you're ready to strengthen your skills, whi...",1.000000,Positive,test,0


In [125]:
dev

[([],
  array([[-1.34214997, -0.66886699, -0.92878699, ...,  0.308211  ,
          -1.96506   ,  2.62928009],
         [-1.26037002, -0.35704899, -0.86147302, ..., -2.42943001,
          -2.64622998,  2.88656998],
         [-1.28139997, -0.0797786 , -1.09742999, ..., -2.15464997,
          -2.16053009,  2.08038998],
         ...,
         [ 0.280287  , -0.70146602, -0.76364201, ..., -3.09418988,
          -0.29675201, -2.08999991],
         [ 0.44858301, -0.62257099, -0.75444698, ..., -2.58513999,
           0.37447199, -2.08825994],
         [ 0.56670898, -0.67319298, -0.88724601, ..., -1.04074001,
           0.78969997, -1.81952   ]]),
  array([[ 1.14000000e+02,  1.00000000e+00,  4.57898416e-02, ...,
          -3.57514054e-01, -3.33211869e-01, -2.78034717e-01],
         [ 1.20000000e+02,  1.00000000e+00,  7.00449124e-02, ...,
          -2.78207064e-01, -3.11943054e-01, -2.90412426e-01],
         [ 1.32500000e+02,  1.00000000e+00,  5.28569371e-02, ...,
          -2.35187754e-01, -1.91

In [126]:
test

[([],
  array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.47424001, -0.40938199, -0.589284  , ..., 11.28579998,
           1.62586999, -5.32987022],
         [ 0.60675502,  0.054957  ,  0.169245  , ...,  9.32586002,
           2.4591701 , -5.83106995],
         ...,
         [-1.08104002,  1.41233003, -0.93670303, ...,  1.20792997,
           7.39161015, -7.18501997],
         [-0.29260001,  0.39868599, -0.470195  , ...,  2.53096008,
           7.47227001, -6.66331005],
         [-0.54625899,  0.68057501, -1.01171005, ...,  1.46886003,
           7.99183989, -5.81487989]]),
  array([[ 1.60000000e+02,  0.00000000e+00,  1.58676822e-02, ...,
          -2.89231271e-01, -2.83395469e-01, -2.22432852e-01],
         [ 1.19000000e+02,  0.00000000e+00,  1.38829816e-02, ...,
          -3.64206821e-01, -2.99161464e-01, -2.41426721e-01],
         [ 1.12000000e+02,  0.00000000e+00,  1.18982811e-02, ...,
          -4.24055427e-01, -2.98

In [127]:
# import tensorflow as tf

# def create_acoustic_model(input_shape):
#     inputs = tf.keras.layers.Input(shape=input_shape)
#     # Your acoustic CNN layers go here
#     # Example:
#     x = tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu')(inputs)
#     x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
#     x = tf.keras.layers.Flatten()(x)
#     outputs = tf.keras.layers.Dense(32, activation='relu')(x)
#     return tf.keras.models.Model(inputs=inputs, outputs=outputs)


In [128]:
import tensorflow as tf

def create_acoustic_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape(input_shape)(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(32, activation='relu')(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


In [129]:
def create_video_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape(input_shape)(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(32, activation='relu')(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


In [130]:
import tensorflow_hub as hub

def create_bert_model():
    bert_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
    bert_layer = hub.KerasLayer(bert_url, trainable=True)
    input_text = tf.keras.layers.Input(shape=(), dtype=tf.string)
    bert_output = bert_layer(input_text)
    return tf.keras.models.Model(inputs=input_text, outputs=bert_output)


In [131]:
def create_combined_model(acoustic_input_shape, video_input_shape):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model()
    video_model = create_video_model(video_input_shape)

    acoustic_input = tf.keras.layers.Input(shape=acoustic_input_shape)
    bert_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    video_input = tf.keras.layers.Input(shape=video_input_shape)

    acoustic_output = acoustic_model(acoustic_input)
    bert_output = bert_model(bert_input)
    video_output = video_model(video_input)

    combined_output = tf.keras.layers.concatenate([acoustic_output, bert_output, video_output])
    predictions = tf.keras.layers.Dense(1, activation='sigmoid')(combined_output)

    model = tf.keras.models.Model(inputs=[acoustic_input, bert_input, video_input], outputs=predictions)

    return model


In [132]:
# def create_combined_model(acoustic_input_shape, video_input_shape, text_input_shape):
#     acoustic_model = create_acoustic_model(acoustic_input_shape)
#     bert_model = create_bert_model()
#     video_model = create_video_model(video_input_shape)

#     # Modify the next lines according to your actual requirements for combining the models
#     combined_input = tf.keras.layers.Concatenate()([acoustic_model.output, bert_model.output, video_model.output])
#     x = tf.keras.layers.Dense(64, activation='relu')(combined_input)
#     outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

#     model = tf.keras.models.Model(inputs=[acoustic_model.input, bert_model.input, video_model.input], outputs=outputs)
#     return model

In [133]:

# acoustic_shape = (74,)
# video_shape = (35,)


# # Create the combined model
# combined_model = create_combined_model(acoustic_shape, video_shape)
# combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# combined_model.fit([train_df['acoustic'], train_df['actual_words'], train_df['visual']], train_df['__label'], epochs=10, batch_size=32)

In [134]:
# model = create_combined_model(acoustic_input_shape=(74,), video_input_shape=(35,), text_input_shape=None)
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit([train_df['acoustic'], train_df['actual_words'], train_df['visual']], train_df['__label'], epochs=10, batch_size=32)

In [135]:
def data_generator(acoustic_data, video_data, text_data, labels, batch_size=32):
    num_samples = len(labels)
    indices = np.arange(num_samples)
    np.random.shuffle(indices)

    start_idx = 0
    while True:
        if start_idx >= num_samples:
            start_idx = 0
            np.random.shuffle(indices)

        batch_indices = indices[start_idx : start_idx + batch_size]

        batch_acoustic = acoustic_data[batch_indices]
        batch_video = video_data[batch_indices]
        batch_text = [text_data[i] for i in batch_indices]
        batch_labels = labels[batch_indices]

        start_idx += batch_size

        yield [batch_acoustic, batch_text, batch_video], batch_labels

# Assuming your model is created using the 'create_combined_model' function
model = create_combined_model(acoustic_input_shape=(74,), video_input_shape=(35,))

# Train the model using the generator
batch_size = 32
num_samples = len(train_df['__label'])
steps_per_epoch = num_samples // batch_size
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(data_generator(train_df['acoustic'], train_df['actual_words'], train_df['visual'], train_df['__label'], batch_size),
          epochs=10,
          steps_per_epoch=steps_per_epoch)

ValueError: Input 0 of layer "conv2d_2" is incompatible with the layer: expected min_ndim=4, found ndim=2. Full shape received: (None, 74)