In [145]:
import sys
import os
import re
import pickle
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from transformers import TFBertModel,BertTokenizer
from sklearn.preprocessing import StandardScaler
import sys
import os
import re
import pickle


DATA_PATH = '../data/MOSEI/'

def to_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_length(x):
    return x.shape[1]-(np.sum(x, axis=-1) == 0).sum(1)

# first we align to words with averaging, collapse_function receives a list of functions
# dataset.align(text_field, collapse_functions=[avg])
# load pickle file for unaligned acoustic and visual source
pickle_filename = DATA_PATH+'mosei_senti_data_noalign.pkl'
csv_filename = DATA_PATH+'MOSEI-label.csv'

with open(pickle_filename, 'rb') as f:
    d = pickle.load(f)

# read csv file for label and text
df = pd.read_csv(csv_filename)
text = df['text']
vid = df['video_id']
cid = df['clip_id']

train_split_noalign = d['train']
dev_split_noalign = d['valid']
test_split_noalign = d['test']

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 1e-6

# place holders for the final train/dev/test dataset
train = train = []
dev = dev = []
test = test = []

# define a regular expression to extract the video ID out of the keys
# pattern = re.compile('(.*)\[.*\]')
pattern = re.compile('(.*)_([.*])')
num_drop = 0 # a counter to count how many data points went into some processing issues

v = np.concatenate((train_split_noalign['vision'],dev_split_noalign['vision'], test_split_noalign['vision']),axis=0)
vlens = get_length(v)

a = np.concatenate((train_split_noalign['audio'],dev_split_noalign['audio'], test_split_noalign['audio']),axis=0)
alens = get_length(a)

label = np.concatenate((train_split_noalign['labels'],dev_split_noalign['labels'], test_split_noalign['labels']),axis=0)

L_V = v.shape[1]
L_A = a.shape[1]


all_id = np.concatenate((train_split_noalign['id'], dev_split_noalign['id'], test_split_noalign['id']),axis=0)[:,0]
all_id_list = all_id.tolist()

train_size = len(train_split_noalign['id'])
dev_size = len(dev_split_noalign['id'])
test_size = len(test_split_noalign['id'])

dev_start = train_size
test_start = train_size + dev_size

all_csv_id = [(vid[i], str(cid[i])) for i in range(len(vid))]

for i, idd in enumerate(all_id_list):
    # get the video ID and the features out of the aligned dataset

    # matching process
    try:
        index = i
    except:
        import ipdb; ipdb.set_trace()

    _words = text[index].split()
    _label = label[i].astype(np.float32)
    _visual = v[i]
    _acoustic = a[i]
    _vlen = vlens[i]
    _alen = alens[i]
    _id = '{}[{}]'.format(all_csv_id[0], all_csv_id[1])           

    # remove nan values
    # label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    actual_words = []
    words = []
    visual = []
    acoustic = []

    for word in _words:
        actual_words.append(word)

    visual = _visual[L_V - _vlen:,:]
    acoustic = _acoustic[L_A - _alen:,:]

    if i < dev_start:
        train.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= dev_start and i < test_start:
        dev.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= test_start:
        test.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    else:
        print(f"Found video that doesn't belong to any splits: {idd}")


# print(f"Total number of {num_drop} datapoints have been dropped.")
print(f"Total number of {num_drop} datapoints have been dropped.")
print("Dataset split")
print("Train Set: {}".format(len(train)))
print("Validation Set: {}".format(len(dev)))
print("Test Set: {}".format(len(test)))

# Save glove embeddings cache too
# self.pretrained_emb = pretrained_emb = load_emb(word2id, config.word_emb_path)
# torch.save((pretrained_emb, word2id), CACHE_PATH)
pretrained_emb = None

# Save pickles
to_pickle(train, DATA_PATH + '/dftrain.pkl')
to_pickle(dev, DATA_PATH + '/dfdev.pkl')
to_pickle(test, DATA_PATH + '/dftest.pkl')

Total number of 0 datapoints have been dropped.
Dataset split
Train Set: 16326
Validation Set: 1871
Test Set: 4659


In [146]:
train_df = pd.DataFrame(train,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])
dev_df = pd.DataFrame(dev,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])
test_df = pd.DataFrame(test,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])


In [147]:
def addPadding(df, colName, dim):
    max_length_x = df[colName].apply(lambda x: x.shape[0] if isinstance(x, np.ndarray) else 0).max()

    # Perform padding within the DataFrame
    df[colName] = df[colName].apply(lambda x: np.vstack([x, np.zeros((max_length_x - x.shape[0], dim))]) if isinstance(x, np.ndarray) else x)


In [148]:
addPadding(train_df,'visual',35)
addPadding(train_df,'acoustic',74)

In [149]:
np.shape(train_df['visual'][15])

(500, 35)

In [150]:
np.shape(train_df['acoustic'][74])

(500, 74)

In [151]:
max_length_x = train_df['actual_words'].apply(len).max()

In [152]:
max_length_x

307

In [153]:
def create_acoustic_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    return model

def create_video_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    # Add a channel dimension to the input data
    x = tf.keras.layers.Reshape((*input_shape, 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(32, activation='relu')(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


def create_bert_model(max_length):
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)

    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    bert_output = bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
    cls_token = bert_output[:, 0, :]

    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=cls_token)
    return model

In [154]:
def create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model(text_max_length)
    video_model = create_video_model(visual_input_shape)


    # Combine the models
    combined_input = tf.keras.layers.Concatenate()([acoustic_model.output, bert_model.output, video_model.output])
    x = tf.keras.layers.Dense(128, activation='relu')(combined_input)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=[acoustic_model.input, bert_model.input, video_model.input], outputs=output)
    return model


In [155]:
train_df['actual_words']

0        [Key, is, part, of, the, people, that, we, use...
1        [They've, been, able, to, find, solutions, or,...
2        [We're, a, huge, user, of, adhesives, for, our...
3        [Key, Polymer, brings, a, technical, aspect, t...
4        [Key, brings, those, types, of, aspects, to, a...
                               ...                        
16321    [I, read, other, articles,, what, other, train...
16322                               [I, do, all, of, that]
16323    [Now,, if, this, sounds, like, something, you'...
16324    [I, actually, speak, to, the, experts, myself,...
16325    [And, we’ve, seen, some, programs,, we’ve, see...
Name: actual_words, Length: 16326, dtype: object

In [156]:
# model = create_combined_model(acoustic_input_shape=(74,500), visual_input_shape=(35,500), text_max_length=307)

# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Encode text data using BERT tokenizer
# @tf.function
# def encode_text_data(text_data, max_length=307):
#     encoded_texts = bert_tokenizer(text_data, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
#     return encoded_texts

# # Preprocess text data
# train_df['encode_text_data'] = train_df['actual_words'].apply(encode_text_data)

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit([train_df['acoustic'], train_df['encode_text_data'], train_df['visual']], train_df['_label'], epochs=10)

In [157]:
# model = create_combined_model(acoustic_input_shape=(74,500), visual_input_shape=(35,500), text_max_length=307)

# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Encode text data using BERT tokenizer
# def encode_text_data(text_data, max_length=307):
#     encoded_texts = bert_tokenizer(text_data, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
#     return encoded_texts

# # Preprocess text data
# train_df['encode_text_data'] = train_df['actual_words'].apply(encode_text_data)

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit([train_df['acoustic'], train_df['encode_text_data'], train_df['visual']], train_df['_label'], epochs=10)


In [161]:
train_df['_label']

0              [[1.0]]
1        [[0.6666667]]
2              [[0.0]]
3              [[0.0]]
4              [[1.0]]
             ...      
16321          [[0.0]]
16322          [[0.0]]
16323    [[0.6666667]]
16324          [[1.0]]
16325          [[1.0]]
Name: _label, Length: 16326, dtype: object

In [164]:
# import tensorflow as tf

# model = create_combined_model(acoustic_input_shape=(74, 500), visual_input_shape=(35, 500), text_max_length=500)

# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Encode text data using BERT tokenizer
# def encode_text_data(text_data, max_length=307):
#     # Join the list of words into a single string
#     text = ' '.join(text_data)
#     encoded_texts = bert_tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
#     # Return the required BERT input keys as separate tensors
#     return (
#         tf.convert_to_tensor(encoded_texts['input_ids']),
#         tf.convert_to_tensor(encoded_texts['attention_mask']),
#         tf.convert_to_tensor(encoded_texts['token_type_ids']),
#     )

# # Preprocess text data and create a tf.data.Dataset
# text_data = train_df['actual_words'].apply(encode_text_data, max_length=500)

# # Prepare the rest of the data
# acoustic_data = tf.convert_to_tensor(np.stack(train_df['acoustic']))
# visual_data = tf.convert_to_tensor(np.stack(train_df['visual']))
# # label_data = tf.convert_to_tensor(np.array(train_df['_label']))
# label_data = np.array([int(label[0][0]) for label in train_df['_label']])

# input_ids_list, attention_mask_list, token_type_ids_list = zip(*text_data)

# # Convert the lists of tensors to numpy arrays
# input_ids_data = np.array(input_ids_list)
# attention_mask_data = np.array(attention_mask_list)
# token_type_ids_data = np.array(token_type_ids_list)

# # Train the model using the prepared data
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit([acoustic_data, input_ids_data, attention_mask_data, token_type_ids_data, visual_data], label_data, epochs=10)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/10


ValueError: in user code:

    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file0vg8_j1w.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_filejqo70sky.py", line 30, in tf__call
        outputs = ag__.converted_call(ag__.ld(self).bert, (), dict(input_ids=ag__.ld(input_ids), attention_mask=ag__.ld(attention_mask), token_type_ids=ag__.ld(token_type_ids), position_ids=ag__.ld(position_ids), head_mask=ag__.ld(head_mask), inputs_embeds=ag__.ld(inputs_embeds), encoder_hidden_states=ag__.ld(encoder_hidden_states), encoder_attention_mask=ag__.ld(encoder_attention_mask), past_key_values=ag__.ld(past_key_values), use_cache=ag__.ld(use_cache), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file0vg8_j1w.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file8oc0f77c.py", line 75, in tf__call
        (batch_size, seq_length) = ag__.ld(input_shape)

    ValueError: Exception encountered when calling layer "tf_bert_model_18" (type TFBertModel).
    
    in user code:
    
        File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1088, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 1115, in call  *
            outputs = self.bert(
        File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file0vg8_j1w.py", line 36, in tf__run_call_with_unpacked_inputs
            retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
        File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file8oc0f77c.py", line 75, in tf__call
            (batch_size, seq_length) = ag__.ld(input_shape)
    
        ValueError: Exception encountered when calling layer "bert" (type TFBertMainLayer).
        
        in user code:
        
            File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1088, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 775, in call  *
                batch_size, seq_length = input_shape
        
            ValueError: too many values to unpack (expected 2)
        
        
        Call arguments received by layer "bert" (type TFBertMainLayer):
          • self=tf.Tensor(shape=(None, 1, 500), dtype=int32)
          • input_ids=None
          • attention_mask=tf.Tensor(shape=(None, 1, 500), dtype=int32)
          • token_type_ids=tf.Tensor(shape=(None, 1, 500), dtype=int32)
          • position_ids=None
          • head_mask=None
          • inputs_embeds=None
          • encoder_hidden_states=None
          • encoder_attention_mask=None
          • past_key_values=None
          • use_cache=True
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer "tf_bert_model_18" (type TFBertModel):
      • self=tf.Tensor(shape=(None, 1, 500), dtype=int32)
      • input_ids=None
      • attention_mask=tf.Tensor(shape=(None, 1, 500), dtype=int32)
      • token_type_ids=tf.Tensor(shape=(None, 1, 500), dtype=int32)
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • encoder_hidden_states=None
      • encoder_attention_mask=None
      • past_key_values=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=True


In [165]:
# ... (Previous code before model creation remains unchanged)

import tensorflow as tf
from keras.utils import to_categorical

model = create_combined_model(acoustic_input_shape=(74, 500), visual_input_shape=(35, 500), text_max_length=500)

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode text data using BERT tokenizer
def encode_text_data(text_data, max_length=307):
    # Join the list of words into a single string
    text = ' '.join(text_data)
    encoded_texts = bert_tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
    # Return the required BERT input keys as separate tensors
    return (
        tf.convert_to_tensor(encoded_texts['input_ids']),
        tf.convert_to_tensor(encoded_texts['attention_mask']),
        tf.convert_to_tensor(encoded_texts['token_type_ids']),
    )

# Filter out rows with missing 'actual_words'
train_df = train_df[train_df['actual_words'].apply(lambda x: len(x) > 0)]

# Preprocess text data and create a tf.data.Dataset
text_data = train_df['actual_words'].apply(encode_text_data, max_length=500)

# Prepare the rest of the data
acoustic_data = np.stack(train_df['acoustic'])
visual_data = np.stack(train_df['visual'])
label_data = np.array([int(label[0]) for label in train_df['_label']])

input_ids_list, attention_mask_list, token_type_ids_list = zip(*text_data)

# Convert the lists of tensors to numpy arrays
input_ids_data = np.array(input_ids_list)
attention_mask_data = np.array(attention_mask_list)
token_type_ids_data = np.array(token_type_ids_list)


# Train the model using the prepared data
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit([acoustic_data, input_ids_data, attention_mask_data, token_type_ids_data, visual_data], label_data, epochs=10)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
  label_data = np.array([int(label[0]) for label in train_df['_label']])


Epoch 1/10


ValueError: in user code:

    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file0vg8_j1w.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_filejqo70sky.py", line 30, in tf__call
        outputs = ag__.converted_call(ag__.ld(self).bert, (), dict(input_ids=ag__.ld(input_ids), attention_mask=ag__.ld(attention_mask), token_type_ids=ag__.ld(token_type_ids), position_ids=ag__.ld(position_ids), head_mask=ag__.ld(head_mask), inputs_embeds=ag__.ld(inputs_embeds), encoder_hidden_states=ag__.ld(encoder_hidden_states), encoder_attention_mask=ag__.ld(encoder_attention_mask), past_key_values=ag__.ld(past_key_values), use_cache=ag__.ld(use_cache), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file0vg8_j1w.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file8oc0f77c.py", line 75, in tf__call
        (batch_size, seq_length) = ag__.ld(input_shape)

    ValueError: Exception encountered when calling layer "tf_bert_model_19" (type TFBertModel).
    
    in user code:
    
        File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1088, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 1115, in call  *
            outputs = self.bert(
        File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file0vg8_j1w.py", line 36, in tf__run_call_with_unpacked_inputs
            retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
        File "/var/folders/zj/hd8_xvbd3fvbp9wj5412cw_c0000gn/T/__autograph_generated_file8oc0f77c.py", line 75, in tf__call
            (batch_size, seq_length) = ag__.ld(input_shape)
    
        ValueError: Exception encountered when calling layer "bert" (type TFBertMainLayer).
        
        in user code:
        
            File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1088, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/Users/deepaknandula/miniforge3/envs/env_tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 775, in call  *
                batch_size, seq_length = input_shape
        
            ValueError: too many values to unpack (expected 2)
        
        
        Call arguments received by layer "bert" (type TFBertMainLayer):
          • self=tf.Tensor(shape=(None, 1, 500), dtype=int32)
          • input_ids=None
          • attention_mask=tf.Tensor(shape=(None, 1, 500), dtype=int32)
          • token_type_ids=tf.Tensor(shape=(None, 1, 500), dtype=int32)
          • position_ids=None
          • head_mask=None
          • inputs_embeds=None
          • encoder_hidden_states=None
          • encoder_attention_mask=None
          • past_key_values=None
          • use_cache=True
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer "tf_bert_model_19" (type TFBertModel):
      • self=tf.Tensor(shape=(None, 1, 500), dtype=int32)
      • input_ids=None
      • attention_mask=tf.Tensor(shape=(None, 1, 500), dtype=int32)
      • token_type_ids=tf.Tensor(shape=(None, 1, 500), dtype=int32)
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • encoder_hidden_states=None
      • encoder_attention_mask=None
      • past_key_values=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=True


In [198]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer

def create_acoustic_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    return model

def create_video_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(32, activation='relu')(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)

def create_bert_model(max_length):
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)

    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    bert_output = bert_model({"input_ids": input_ids, "attention_mask": attention_mask})[1]
    cls_token = bert_output[0]

    model = tf.keras.Model(inputs={"input_ids": input_ids, "attention_mask": attention_mask}, outputs=cls_token)
    return model
    
def create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model(text_max_length)
    video_model = create_video_model(visual_input_shape)


    input_ids = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    acoustic_input = tf.keras.layers.Input(shape=acoustic_input_shape)
    visual_input = tf.keras.layers.Input(shape=visual_input_shape)

    # Assuming the output shape of your bert_model is (batch_size, bert_output_dim)
    bert_output = bert_model({'input_ids': input_ids, 'attention_mask': attention_mask})

    # Assuming the output shape of your video_model is (batch_size, video_output_dim)
    visual_output = video_model(visual_input)

    # Assuming the output shape of your acoustic_model is (batch_size, acoustic_output_dim)
    acoustic_output = acoustic_model(acoustic_input)

    # Modify the next lines according to your actual requirements for combining the models
    combined_input = tf.keras.layers.Concatenate()([acoustic_output, bert_output, visual_output])
    x = tf.keras.layers.Dense(128, activation='relu')(combined_input)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[acoustic_input, {'input_ids': input_ids, 'attention_mask': attention_mask}, visual_input], outputs=outputs)
    return model
# Create the model

# Compile the model for regression


In [178]:
def create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model(text_max_length)
    video_model = create_video_model(visual_input_shape)

    acoustic_input = tf.keras.layers.Input(shape=acoustic_input_shape)
    bert_input_ids = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    bert_attention_mask = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    bert_token_type_ids = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    visual_input = tf.keras.layers.Input(shape=visual_input_shape)

    # Assuming the output shape of your acoustic_model is (batch_size, acoustic_output_dim)
    acoustic_output = acoustic_model(acoustic_input)

    # Assuming the output shape of your bert_model is (batch_size, bert_output_dim)
    bert_output = bert_model([bert_input_ids, bert_attention_mask, bert_token_type_ids])

    # Assuming the output shape of your video_model is (batch_size, video_output_dim)
    visual_output = video_model(visual_input)

    # Combine the outputs from the individual models
    combined_input = tf.keras.layers.Concatenate()([acoustic_output, bert_output, visual_output])
    x = tf.keras.layers.Dense(128, activation='relu')(combined_input)
    output = tf.keras.layers.Dense(1, activation='linear')(x)  # Using 'linear' activation for regression

    model = tf.keras.Model(inputs=[acoustic_input, bert_input_ids, bert_attention_mask, bert_token_type_ids, visual_input], outputs=output)
    return model

def create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model(text_max_length)
    video_model = create_video_model(visual_input_shape)

    acoustic_input = tf.keras.layers.Input(shape=acoustic_input_shape)
    bert_input_ids = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    bert_attention_mask = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    bert_token_type_ids = tf.keras.layers.Input(shape=(text_max_length,), dtype=tf.int32)
    visual_input = tf.keras.layers.Input(shape=visual_input_shape)

    acoustic_output = acoustic_model(acoustic_input)
    bert_output = bert_model([bert_input_ids, bert_attention_mask, bert_token_type_ids])
    visual_output = video_model(visual_input)

    # Combine the outputs from the individual models
    combined_input = tf.keras.layers.Concatenate()([acoustic_output, bert_output, visual_output])
    x = tf.keras.layers.Dense(128, activation='relu')(combined_input)
    output = tf.keras.layers.Dense(1, activation='linear')(x)  # Using 'linear' activation for regression

    model = tf.keras.Model(inputs=[acoustic_input, bert_input_ids, bert_attention_mask, bert_token_type_ids, visual_input], outputs=output)
    return model



In [190]:
def processText(text):
    return ' '.join(text)

train_df['words'] = train_df['actual_words'].apply(processText)

In [196]:
train_df['_label']

0              [[1.0]]
1        [[0.6666667]]
2              [[0.0]]
3              [[0.0]]
4              [[1.0]]
             ...      
16321          [[0.0]]
16322          [[0.0]]
16323    [[0.6666667]]
16324          [[1.0]]
16325          [[1.0]]
Name: _label, Length: 16326, dtype: object

In [199]:
# ... (Previous code before model creation remains unchanged)

import tensorflow as tf
from keras.utils import to_categorical

model = create_combined_model(acoustic_input_shape=(74, 500), visual_input_shape=(35, 500), text_max_length=500)

# Encode text data using BERT tokenizer
def encode_text_data(text_data, max_length=500):
    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Define the maximum sequence length for padding/truncating
    max_length = 512

    # Tokenize the list of strings and convert them to input IDs, attention masks, and token type IDs
    input_ids = []
    attention_masks = []

    for text in text_data:
        # Tokenize the text and add the special [CLS] and [SEP] tokens
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Text to encode
                            add_special_tokens = True, # Add [CLS] and [SEP] tokens
                            max_length = max_length,   # Pad/truncate to a maximum length
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Generate attention masks
                            return_token_type_ids = False,   # Do not generate token type IDs
                            truncation=True,
                            )
        
        # Add the encoded sequence and attention mask to the lists
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists to tensors
    input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    attention_masks = tf.convert_to_tensor(attention_masks, dtype=tf.int32)
    
    # Return a tuple of input IDs and attention masks
    return input_ids, attention_masks

# Filter out rows with missing 'actual_words'
# train_df = train_df[train_df['actual_words'].apply(lambda x: len(x) > 0)]



# Preprocess text data and create a tf.data.Dataset
text_data = encode_text_data(train_df['words'].tolist())


# Prepare the rest of the data
acoustic_data = np.stack(train_df['acoustic'])
visual_data = np.stack(train_df['visual'])
label_data = np.array([label[0] for label in train_df['_label']])

input_ids_list, attention_mask_list, token_type_ids_list = zip(*text_data)

# Convert the lists of tensors to numpy arrays
input_ids_data = np.array(input_ids_list)
attention_mask_data = np.array(attention_mask_list)
token_type_ids_data = np.array(token_type_ids_list)

# Convert the label_data to float32 as it is expected for regression
label_data = label_data.astype(np.float32)

# Train the model using the prepared data for regression
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
# acoustic_data = acoustic_data.transpose(0, 2, 1)
model.fit(
    [acoustic_data, {'input_ids': text_data[0], 'attention_mask': text_data[1]}, visual_data],
    label_data,
    epochs=10
)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concatenation axis. Received: input_shape=[(None, 128), (768,), (None, 32)]