In [1]:
import random
import numpy as np
import tensorflow as tf
from transformers import TFBertModel,BertTokenizer
from sklearn.preprocessing import StandardScaler
import sys
import os
import re
import pickle
import pandas as pd

DATA_PATH = '../data/MOSEI/'

def to_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_length(x):
    return x.shape[1] - (np.sum(x, axis=-1) == 0).sum(1)

# Load pickle file for unaligned acoustic and visual source
pickle_filename = DATA_PATH + 'mosei_senti_data_noalign.pkl'
csv_filename = DATA_PATH + 'MOSEI-label.csv'

with open(pickle_filename, 'rb') as f:
    d = pickle.load(f)

# Read csv file for label and text
df = pd.read_csv(csv_filename)
text = df['text']
vid = df['video_id']
cid = df['clip_id']

train_split_noalign = d['train']
dev_split_noalign = d['valid']
test_split_noalign = d['test']

# Define a regular expression to extract the video ID out of the keys
# pattern = re.compile('(.*)\[.*\]')
pattern = re.compile('(.*)_([.*])')
num_drop = 0 # a counter to count how many data points went into some processing issues

v = np.concatenate((train_split_noalign['vision'], dev_split_noalign['vision'], test_split_noalign['vision']), axis=0)
vlens = get_length(v)

a = np.concatenate((train_split_noalign['audio'], dev_split_noalign['audio'], test_split_noalign['audio']), axis=0)
alens = get_length(a)

label = np.concatenate((train_split_noalign['labels'], dev_split_noalign['labels'], test_split_noalign['labels']), axis=0)

L_V = v.shape[1]
L_A = a.shape[1]

all_id = np.concatenate((train_split_noalign['id'], dev_split_noalign['id'], test_split_noalign['id']), axis=0)[:, 0]
all_id_list = all_id.tolist()

train_size = len(train_split_noalign['id'])
dev_size = len(dev_split_noalign['id'])
test_size = len(test_split_noalign['id'])

dev_start = train_size
test_start = train_size + dev_size

all_csv_id = [(vid[i], str(cid[i])) for i in range(len(vid))]

train = []
dev = []
test = []

for i, idd in enumerate(all_id_list):
    # get the video ID and the features out of the aligned dataset

    # matching process
    try:
        index = i
    except:
        import ipdb; ipdb.set_trace()

    _words = text[index].split()
    _label = label[i].astype(np.float32)
    _visual = v[i]
    _acoustic = a[i]
    _vlen = vlens[i]
    _alen = alens[i]
    _id = '{}[{}]'.format(all_csv_id[0], all_csv_id[1])

    # Remove nan values
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # Remove speech pause tokens - this is in general helpful
    # We should remove speech pauses and corresponding visual/acoustic features together
    # Otherwise, modalities would no longer be aligned
    actual_words = []
    words = []
    visual = []
    acoustic = []

    for word in _words:
        actual_words.append(word)

    visual = _visual[L_V - _vlen:, :]
    acoustic = _acoustic[L_A - _alen:, :]

    if i < dev_start:
        train.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= dev_start and i < test_start:
        dev.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    elif i >= test_start:
        test.append((words, visual, acoustic, actual_words, _vlen, _alen, _label, idd))
    else:
        print(f"Found video that doesn't belong to any splits: {idd}")

# ... Rest of the data processing code ...
train_df = pd.DataFrame(train,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])
dev_df = pd.DataFrame(dev,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])
test_df = pd.DataFrame(test,columns=['words', 'visual', 'acoustic', 'actual_words', '_vlen', '_alen', '_label', 'idd'])

# Define the model using TensorFlow
def create_acoustic_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    model = tf.keras.Model(inputs=inputs, outputs=x)
    return model

def create_video_model(input_shape):
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Reshape(input_shape)(inputs)
    x = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(32, activation='relu')(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


def create_bert_model(max_length):
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)

    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    bert_output = bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
    cls_token = bert_output[:, 0, :]

    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=cls_token)
    return model

def create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length):
    acoustic_model = create_acoustic_model(acoustic_input_shape)
    bert_model = create_bert_model(text_max_length)
    video_model = create_video_model(visual_input_shape)

    # ... Create the CNN model for visual input ...

    # Combine the models
    combined_input = tf.keras.layers.Concatenate()([acoustic_model.output, bert_model.output, video_model.output])
    x = tf.keras.layers.Dense(128, activation='relu')(combined_input)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=[acoustic_model.input, bert_model.input, video_model.input], outputs=output)
    return model

# ... Rest of the model code ...

# Preprocessing
acoustic_data = np.array([item[2] for item in train])
video_data = np.array([item[1] for item in train])
text_data = [item[3] for item in train]
labels = np.array([item[6] for item in train])

# Normalize the acoustic and visual data
acoustic_scaler = StandardScaler()
video_scaler = StandardScaler()
acoustic_data = acoustic_scaler.fit_transform(acoustic_data)
video_data = video_scaler.fit_transform(video_data)

# BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode text data using BERT tokenizer
def encode_text_data(text_data, max_length=50):
    encoded_texts = bert_tokenizer(text_data, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
    return encoded_texts

# Preprocess text data
text_data_encoded = encode_text_data(text_data)

# ... Rest of the training code ...

# Create TensorFlow Dataset
def create_tf_dataset(acoustic_data, video_data, text_data_encoded, labels):
    dataset = tf.data.Dataset.from_tensor_slices((acoustic_data, video_data, text_data_encoded, labels))
    return dataset

# Create a dataset for training
batch_size = 32
train_dataset = create_tf_dataset(acoustic_data, video_data, text_data_encoded, labels)
train_dataset = train_dataset.shuffle(buffer_size=len(labels)).batch(batch_size)

# Define the model
acoustic_input_shape = (acoustic_data.shape[1],)
visual_input_shape = (video_data.shape[1],)
text_max_length = 50  # This should be the maximum length of your text data
model = create_combined_model(acoustic_input_shape, visual_input_shape, text_max_length)

# ... Rest of the training code ...

# Train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_dataset, epochs=10)

# ... Rest of the training code ...


  from .autonotebook import tqdm as notebook_tqdm


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16326,) + inhomogeneous part.