In [1]:
import sys
import os
import re
import pickle
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from transformers import TFBertModel,BertTokenizer
from sklearn.preprocessing import StandardScaler
import sys
import os
import re
import pickle


DATA_PATH = '../data/MOSEI/'

def to_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_length(x):
    return x.shape[1]-(np.sum(x, axis=-1) == 0).sum(1)

# first we align to words with averaging, collapse_function receives a list of functions
# dataset.align(text_field, collapse_functions=[avg])
# load pickle file for unaligned acoustic and visual source
pickle_filename = DATA_PATH+'mosei_senti_data_noalign.pkl'
csv_filename = DATA_PATH+'MOSEI-label.csv'

with open(pickle_filename, 'rb') as f:
    d = pickle.load(f)

# read csv file for label and text
df = pd.read_csv(csv_filename)
text = df['text']
vid = df['video_id']
cid = df['clip_id']

train_split_noalign = d['train']
dev_split_noalign = d['valid']
test_split_noalign = d['test']

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 1e-6

# place holders for the final train/dev/test dataset
train = train = []
dev = dev = []
test = test = []

# define a regular expression to extract the video ID out of the keys
# pattern = re.compile('(.*)\[.*\]')
pattern = re.compile('(.*)_([.*])')
num_drop = 0 # a counter to count how many data points went into some processing issues

v = np.concatenate((train_split_noalign['vision'],dev_split_noalign['vision'], test_split_noalign['vision']),axis=0)
vlens = get_length(v)

a = np.concatenate((train_split_noalign['audio'],dev_split_noalign['audio'], test_split_noalign['audio']),axis=0)
alens = get_length(a)

label = np.concatenate((train_split_noalign['labels'],dev_split_noalign['labels'], test_split_noalign['labels']),axis=0)

L_V = v.shape[1]
L_A = a.shape[1]


all_id = np.concatenate((train_split_noalign['id'], dev_split_noalign['id'], test_split_noalign['id']),axis=0)[:,0]
all_id_list = all_id.tolist()

train_size = len(train_split_noalign['id'])
dev_size = len(dev_split_noalign['id'])
test_size = len(test_split_noalign['id'])

dev_start = train_size
test_start = train_size + dev_size

all_csv_id = [(vid[i], str(cid[i])) for i in range(len(vid))]

for i, idd in enumerate(all_id_list):
    # get the video ID and the features out of the aligned dataset

    # matching process
    try:
        index = i
    except:
        import ipdb; ipdb.set_trace()

    _words = text[index].split()
    _label = label[i].astype(np.float32)
    _visual = v[i]
    _acoustic = a[i]
    _vlen = vlens[i]
    _alen = alens[i]
    _id = '{}[{}]'.format(all_csv_id[0], all_csv_id[1])           

    # remove nan values
    # label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    actual_words = []
    words = []
    visual = []
    acoustic = []

    for word in _words:
        actual_words.append(word)

    visual = _visual[L_V - _vlen:,:]
    acoustic = _acoustic[L_A - _alen:,:]

    if i < dev_start:
        train.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= dev_start and i < test_start:
        dev.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= test_start:
        test.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    else:
        print(f"Found video that doesn't belong to any splits: {idd}")


# print(f"Total number of {num_drop} datapoints have been dropped.")
print(f"Total number of {num_drop} datapoints have been dropped.")
print("Dataset split")
print("Train Set: {}".format(len(train)))
print("Validation Set: {}".format(len(dev)))
print("Test Set: {}".format(len(test)))

# Save glove embeddings cache too
# self.pretrained_emb = pretrained_emb = load_emb(word2id, config.word_emb_path)
# torch.save((pretrained_emb, word2id), CACHE_PATH)
pretrained_emb = None

# Save pickles
to_pickle(train, DATA_PATH + '/dftrain.pkl')
to_pickle(dev, DATA_PATH + '/dfdev.pkl')
to_pickle(test, DATA_PATH + '/dftest.pkl')

  from .autonotebook import tqdm as notebook_tqdm


Total number of 0 datapoints have been dropped.
Dataset split
Train Set: 16326
Validation Set: 1871
Test Set: 4659


In [2]:
train_df = pd.DataFrame(train,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])
dev_df = pd.DataFrame(dev,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])
test_df = pd.DataFrame(test,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])


In [3]:
def addPadding(df, colName, dim):
    max_length_x = df[colName].apply(lambda x: x.shape[0] if isinstance(x, np.ndarray) else 0).max()

    # Perform padding within the DataFrame
    df[colName] = df[colName].apply(lambda x: np.vstack([x, np.zeros((max_length_x - x.shape[0], dim))]) if isinstance(x, np.ndarray) else x)


In [4]:
addPadding(train_df,'visual',35)
addPadding(train_df,'acoustic',74)

In [5]:
def create_acoustic_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(x)  # Add an additional Conv2D layer
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    return model

def create_video_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(x)  # Add an additional Conv2D layer
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(32, activation='relu')(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)



In [6]:
def create_bert_model(max_length):
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)

    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    bert_output = bert_model(input_ids=input_ids, attention_mask=attention_mask)[0]
    return tf.keras.Model(inputs=[input_ids, attention_mask], outputs=bert_output)

# Modify the create_combined_model function
def create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model(text_max_length)
    video_model = create_video_model(visual_input_shape)

    input_ids = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    acoustic_input = tf.keras.layers.Input(shape=acoustic_input_shape)
    visual_input = tf.keras.layers.Input(shape=visual_input_shape)

    # Get the BERT embeddings sequence (output shape: batch_size, max_length, bert_output_dim)
    bert_output = bert_model([input_ids, attention_mask])

    # Assuming the output shape of your video_model is (batch_size, video_output_dim)
    visual_output = video_model(visual_input)

    # Assuming the output shape of your acoustic_model is (batch_size, acoustic_output_dim)
    acoustic_output = acoustic_model(acoustic_input)

    # Modify the next lines according to your actual requirements for combining the models
    # For example, flatten the video_output if needed
    visual_output = tf.keras.layers.Flatten()(visual_output)

    # Flatten the BERT embeddings sequence to match the visual_output's shape
    bert_output = tf.keras.layers.Flatten()(bert_output)

    # Concatenate the outputs of the three models
    combined_input = tf.keras.layers.Concatenate()([acoustic_output, bert_output, visual_output])
    x = tf.keras.layers.Dense(128, activation='relu')(combined_input)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[acoustic_input, input_ids, attention_mask, visual_input], outputs=outputs)
    return model


In [15]:
def processText(text):
    return ' '.join(text)

train_df['words'] = train_df['actual_words'].apply(processText)
test_df['words'] = test_df['actual_words'].apply(processText)

In [8]:
np.shape(train_df['acoustic'][15])

(500, 74)

In [9]:
np.shape(train_df['visual'][15])

(500, 35)

In [10]:

model = create_combined_model(acoustic_input_shape=(500,74), visual_input_shape=(500,35), text_max_length=500)

# Encode text data using BERT tokenizer
def encode_text_data(text_data, max_length=500):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_ids = []
    attention_masks = []

    for text in text_data:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,  # Set max_length to 500 for BERT base model
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)

    return input_ids, attention_masks
    
# ... (Previous code before preparing the data remains unchanged)

# Preprocess text data and create a tf.data.Dataset
text_data = encode_text_data(train_df['words'].tolist())

# Prepare the rest of the data
acoustic_data = np.stack(train_df['acoustic'])
visual_data = np.stack(train_df['visual'])
label_data = np.array([label[0] for label in train_df['_label']])

# Unpack the text_data into input_ids_data and attention_mask_data
input_ids_data, attention_mask_data = text_data
input_ids_data = input_ids_data.numpy().astype(np.int32)
attention_mask_data = attention_mask_data.numpy().astype(np.int32)

# Convert the label_data to float32 as it is expected for regression
label_data = label_data.astype(np.float32)


model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
model.fit(
    [acoustic_data, input_ids_data, attention_mask_data, visual_data],
    label_data,
    epochs=3
)



Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB



2023-07-28 15:33:04.134864: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-07-28 15:33:04.135055: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identi

Epoch 1/3


2023-07-28 15:33:18.127442: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




2023-07-28 15:33:24.539441: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1730f4f40>

In [11]:
np.shape(attention_mask_data[0])

(500,)

In [12]:
acoustic_data.shape

(16326, 500, 74)

In [13]:
visual_data.shape

(16326, 500, 35)

In [14]:
np.shape(label_data)

(16326, 1)

In [17]:
addPadding(test_df,'visual',35)
addPadding(test_df,'acoustic',74)

In [18]:
def encode_text_data(text_data, max_length=500):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_ids = []
    attention_masks = []

    for text in text_data:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,  # Set max_length to 500 for BERT base model
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)

    return input_ids, attention_masks
    
# ... (Previous code before preparing the data remains unchanged)

# Preprocess text data and create a tf.data.Dataset
text_data = encode_text_data(test_df['words'].tolist())

# Prepare the rest of the data
acoustic_data = np.stack(test_df['acoustic'])
visual_data = np.stack(test_df['visual'])
label_data = np.array([label[0] for label in test_df['_label']])

# Unpack the text_data into input_ids_data and attention_mask_data
input_ids_data, attention_mask_data = text_data
input_ids_data = input_ids_data.numpy().astype(np.int32)
attention_mask_data = attention_mask_data.numpy().astype(np.int32)

# Convert the label_data to float32 as it is expected for regression
label_data = label_data.astype(np.float32)
predictions = model.predict([acoustic_data, input_ids_data, attention_mask_data, visual_data])

2023-07-28 20:01:02.250810: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(label_data, predictions)
mae = mean_absolute_error(label_data, predictions)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 1.2492965
Mean Absolute Error: 0.83841306


In [20]:

# Save the model to a directory
model.save('./savedModels')

# Optionally, you can save the tokenizer used for encoding the text data
# tokenizer.save_pretrained('./savedModels')



INFO:tensorflow:Assets written to: ./savedModels/assets


INFO:tensorflow:Assets written to: ./savedModels/assets
