In [3]:
import sys
import os
import re
import pickle
import pandas as pd
import numpy as np

# Directory (relative to the working directory) that holds the MOSEI pickle and label CSV
# read below, and into which the generated split pickles are written.
DATA_PATH = '../data/MOSEI/'

def to_pickle(obj, path):
    """Serialize ``obj`` with pickle into the file at ``path``.

    The file is opened in binary write mode, so an existing file is replaced.
    """
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink)

def load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, 'rb') as source:
        payload = pickle.load(source)
    return payload

def get_length(x):
    """Return the number of non-padding time steps of every sequence in ``x``.

    Parameters
    ----------
    x : np.ndarray
        Array indexed as (sequence, time step, feature): the last axis is
        reduced first, then axis 1.

    Returns
    -------
    np.ndarray
        One integer per sequence: ``x.shape[1]`` minus the number of time
        steps whose features are all exactly zero.
    """
    # A step is padding only when *every* feature is zero. Testing the feature
    # sum instead would also discard real frames whose values happen to cancel
    # out (e.g. [1, -1]).
    is_padding = np.all(x == 0, axis=-1)
    return x.shape[1] - is_padding.sum(axis=1)

# Assemble the MOSEI train/dev/test sets from two sources: the pickle holds the
# unaligned acoustic/visual features, labels and ids; the CSV holds the transcripts.
pickle_filename = DATA_PATH+'mosei_senti_data_noalign.pkl'
csv_filename = DATA_PATH+'MOSEI-label.csv'

# Reuse the helper defined above instead of a second hand-written open()/load().
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
d = load_pickle(pickle_filename)

# read csv file for label and text
df = pd.read_csv(csv_filename)
text = df['text']
vid = df['video_id']
cid = df['clip_id']

train_split_noalign = d['train']
dev_split_noalign = d['valid']
test_split_noalign = d['test']

# a sentinel epsilon for safe division (kept for compatibility; nothing in this cell uses it)
EPS = 1e-6

# place holders for the final train/dev/test dataset
train = []
dev = []
test = []

# define a regular expression to extract the video ID out of the keys
# (kept for compatibility; nothing in this cell uses it)
pattern = re.compile(r'(.*)_([.*])')
num_drop = 0  # counter of dropped data points; nothing in this cell increments it

# Stack the three splits in train -> valid -> test order so that a single index
# runs over every sample; the split boundaries are recovered from the sizes below.
splits = (train_split_noalign, dev_split_noalign, test_split_noalign)

v = np.concatenate([split['vision'] for split in splits], axis=0)
vlens = get_length(v)

a = np.concatenate([split['audio'] for split in splits], axis=0)
alens = get_length(a)

label = np.concatenate([split['labels'] for split in splits], axis=0)

# padded sequence lengths (axis 1) of the visual and acoustic arrays
L_V = v.shape[1]
L_A = a.shape[1]

# first column of the id arrays
all_id = np.concatenate([split['id'] for split in splits], axis=0)[:, 0]
all_id_list = all_id.tolist()

train_size = len(train_split_noalign['id'])
dev_size = len(dev_split_noalign['id'])
test_size = len(test_split_noalign['id'])

# index of the first dev / test sample inside the stacked arrays
dev_start = train_size
test_start = train_size + dev_size

all_csv_id = [(video, str(clip)) for video, clip in zip(vid, cid)]

# Transcripts are matched to pickle entries purely by row position, so a size
# mismatch would silently pair samples with the wrong text.
if len(df) != len(all_id_list):
    raise ValueError(
        'Label CSV has {} rows but the pickle holds {} samples; '
        'they cannot be matched by position.'.format(len(df), len(all_id_list))
    )

# Build one sample tuple per clip and route it to the split it came from.
# Row i of the label CSV is paired with entry i of the stacked pickle arrays.
for i, idd in enumerate(all_id_list):
    _words = text[i].split()
    _label = label[i].astype(np.float32)
    _vlen = vlens[i]
    _alen = alens[i]
    # "<video_id>[<clip_id>]" of the CSV row for *this* sample: index with i,
    # otherwise every sample would be formatted from rows 0 and 1.
    _id = '{}[{}]'.format(all_csv_id[i][0], all_csv_id[i][1])

    # replace NaN values in the feature matrices with zeros
    _visual = np.nan_to_num(v[i])
    _acoustic = np.nan_to_num(a[i])

    # `words` stays empty (no token ids are produced here); the whitespace
    # tokens of the transcript go into `actual_words`.
    words = []
    actual_words = list(_words)

    # keep only the last _vlen / _alen rows, i.e. drop the rows that precede them
    visual = _visual[L_V - _vlen:,:]
    acoustic = _acoustic[L_A - _alen:,:]

    sample = (words, visual, acoustic, actual_words, _vlen, _alen, _label, idd)
    # The stacked arrays are ordered train -> dev -> test, so the index alone
    # decides the split; the three ranges cover every index.
    if i < dev_start:
        train.append(sample)
    elif i < test_start:
        dev.append(sample)
    else:
        test.append(sample)


# Report how many samples ended up in each split.
print(f"Total number of {num_drop} datapoints have been dropped.")
print("Dataset split")
print("Train Set: {}".format(len(train)))
print("Validation Set: {}".format(len(dev)))
print("Test Set: {}".format(len(test)))

# No pretrained word embeddings are built or cached here.
pretrained_emb = None

# Save pickles. DATA_PATH already ends with a separator, so join the parts
# instead of concatenating '/name' (which yields a doubled '//').
to_pickle(train, os.path.join(DATA_PATH, 'dftrain.pkl'))
to_pickle(dev, os.path.join(DATA_PATH, 'dfdev.pkl'))
to_pickle(test, os.path.join(DATA_PATH, 'dftest.pkl'))

Total number of 0 datapoints have been dropped.
Dataset split
Train Set: 16326
Validation Set: 1871
Test Set: 4659


In [4]:
# Wrap the training samples in a DataFrame, one named column per tuple field.
train_df = pd.DataFrame(
    data=train,
    columns=[
        'words',
        'visual',
        'acoustic',
        'actual_words',
        '_vlen',
        '_alen',
        '_label',
        'idd',
    ],
)

In [5]:
# Display the training DataFrame built from the sample tuples.
train_df

Unnamed: 0,words,visual,acoustic,actual_words,_vlen,_alen,_label,idd
0,[],"[[-1.2108299732208252, -0.46178698539733887, -...","[[194.5, 0.0, 0.07899338752031326, 0.412973552...","[Key, is, part, of, the, people, that, we, use...",267,356,[[1.0]],-3g5yACwYnA
1,[],"[[-1.7858200073242188, -0.6380839943885803, 0....","[[105.5, 1.0, 0.021022265776991844, 0.08599962...","[They've, been, able, to, find, solutions, or,...",81,107,[[0.6666667]],-3g5yACwYnA
2,[],"[[-1.7642099857330322, -0.7958599925041199, -0...","[[106.5, 1.0, 0.11148008704185486, 0.726543664...","[We're, a, huge, user, of, adhesives, for, our...",215,286,[[0.0]],-3g5yACwYnA
3,[],"[[-1.2986199855804443, -0.2510870099067688, -0...","[[115.5, 1.0, 0.057251088321208954, 0.33875322...","[Key, Polymer, brings, a, technical, aspect, t...",138,184,[[0.0]],-3g5yACwYnA
4,[],"[[-1.6502399444580078, -0.3371959924697876, -0...","[[100.5, 1.0, 0.14517584443092346, 0.675116181...","[Key, brings, those, types, of, aspects, to, a...",221,295,[[1.0]],-3g5yACwYnA
...,...,...,...,...,...,...,...,...
16321,[],"[[-1.8842400312423706, -0.6028929948806763, 0....","[[127.5, 1.0, 0.11853820085525513, 0.720579862...","[I, read, other, articles,, what, other, train...",72,103,[[0.0]],zwTrXwi54us
16322,[],"[[-2.153140068054199, -0.04792049899697304, -0...","[[129.5, 1.0, 0.1991356760263443, 0.6945900321...","[I, do, all, of, that]",28,37,[[0.0]],zwTrXwi54us
16323,[],"[[-4.401090145111084, -1.0127899646759033, -1....","[[188.5, 0.0, 0.05910159647464752, 0.331484466...","[Now,, if, this, sounds, like, something, you'...",163,227,[[0.6666667]],zwTrXwi54us
16324,[],"[[-2.3272500038146973, -1.1171799898147583, 0....","[[133.0, 0.0, 0.027281710878014565, 0.07664652...","[I, actually, speak, to, the, experts, myself,...",80,106,[[1.0]],zwTrXwi54us


In [6]:
# Acoustic feature matrix of the first training sample.
train_df['acoustic'][0]

array([[ 1.94500000e+02,  0.00000000e+00,  7.89933875e-02, ...,
        -3.43945742e-01, -3.39845806e-01, -2.55919456e-01],
       [ 1.20500000e+02,  0.00000000e+00,  6.85892552e-02, ...,
        -3.29696000e-01, -3.68389398e-01, -2.19704941e-01],
       [ 1.68000000e+02,  0.00000000e+00,  5.81851192e-02, ...,
        -3.20934534e-01, -3.78141046e-01, -2.37061188e-01],
       ...,
       [ 8.55000000e+01,  0.00000000e+00,  5.52437678e-02, ...,
        -3.50019425e-01, -2.89983302e-01, -2.81043440e-01],
       [ 2.06000000e+02,  0.00000000e+00,  4.93893474e-02, ...,
        -3.87713164e-01, -3.68427217e-01, -2.66391695e-01],
       [ 1.01500000e+02,  0.00000000e+00,  4.35349271e-02, ...,
        -4.55446243e-01, -4.16983306e-01, -2.69667059e-01]])

In [10]:
# Shape of the acoustic matrix of the first training sample.
np.shape(train_df['acoustic'][0])

(356, 74)

In [13]:
# Shape of the visual matrix of training sample 15.
np.shape(train_df['visual'][15])

(64, 35)

In [15]:

# Number of transcript tokens stored for the first training sample.
np.shape(train_df['actual_words'][0])

(42,)

In [9]:
# Inspect the raw dictionary loaded from the pickle.
# NOTE(review): this echoes every array in full; consider showing only keys/shapes.
d

{'train': {'vision': array([[[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          ...,
          [-1.71436000e+00,  4.13351990e-02, -9.11785007e-01, ...,
            2.26524997e+00,  1.35622997e+01, -7.79251993e-01],
          [-1.46472001e+00,  1.42652005e-01, -9.68686998e-01, ...,
            1.66319001e+00,  1.43842001e+01, -1.30941999e+00],
          [-1.61250997e+00, -1.18235998e-01, -8.67457986e-01, ...,
            2.01573992e+00,  1.48933001e+01, -1.63349998e+00]],
  
         [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
          [ 0.00000000e+00,  0.00000000e+00,  0.0000

In [5]:
# Display the label/transcript table read from the CSV.
df

Unnamed: 0,video_id,clip_id,text,label,annotation,mode,label_by
0,-3g5yACwYnA,10,Key is part of the people that we use to solve...,1.000000,Positive,train,0
1,-3g5yACwYnA,13,They've been able to find solutions or at leas...,0.666667,Positive,train,0
2,-3g5yACwYnA,3,We're a huge user of adhesives for our operati...,0.000000,Neutral,train,0
3,-3g5yACwYnA,2,Key Polymer brings a technical aspect to our o...,0.000000,Neutral,train,0
4,-3g5yACwYnA,4,Key brings those types of aspects to a busines...,1.000000,Positive,train,0
...,...,...,...,...,...,...,...
22851,zhNksSReaQk,35,"And yet, it's like, how a/Autistic people defe...",0.000000,Neutral,test,0
22852,zhNksSReaQk,34,But the thing is that intelligence [scoffs] is...,-2.000000,Negative,test,0
22853,zhNksSReaQk,33,"They're like, ""Oh, they have a high IQ, they a...",0.000000,Neutral,test,0
22854,zvZd3V5D5Ik,3,"If you're ready to strengthen your skills, whi...",1.000000,Positive,test,0


In [None]:
# Echo the validation samples (a list of tuples).
# NOTE(review): this prints every sample in full; consider len(dev) or dev[:1].
dev

In [None]:
# Echo the test samples (a list of tuples).
# NOTE(review): this prints every sample in full; consider len(test) or test[:1].
test

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

# Define the model architecture
def create_model(input_shape, num_classes, text_shape=None, visual_shape=None):
    """Build a three-branch late-fusion classifier (acoustic, text, visual).

    Parameters
    ----------
    input_shape : tuple
        Per-sample shape of the acoustic input. Also used for the text and
        visual inputs unless they are overridden.
    num_classes : int
        Width of the final softmax layer.
    text_shape, visual_shape : tuple, optional
        Per-sample shapes of the text and visual inputs; each defaults to
        ``input_shape`` so existing two-argument calls keep working.

    Returns
    -------
    tf.keras.Model
        Model taking ``[acoustic, text, visual]`` inputs and producing
        ``num_classes`` softmax outputs.
    """
    if text_shape is None:
        text_shape = input_shape
    if visual_shape is None:
        visual_shape = input_shape

    # Acoustic LSTM
    acoustic_input = tf.keras.Input(shape=input_shape)
    acoustic_lstm = LSTM(units=64)(acoustic_input)

    # Text branch: a Dense projection standing in for a BERT encoder
    text_input = tf.keras.Input(shape=text_shape)
    bert_encoder = Dense(units=64)(text_input)
    if len(bert_encoder.shape) == 3:
        # Dense only transforms the last axis, so a sequence input is still
        # (batch, steps, 64) here. Average over the steps so the branch can be
        # concatenated with the 2-D LSTM outputs.
        bert_encoder = tf.keras.layers.GlobalAveragePooling1D()(bert_encoder)

    # Visual LSTM
    visual_input = tf.keras.Input(shape=visual_shape)
    visual_lstm = LSTM(units=64)(visual_input)

    # Concatenate the outputs
    concatenated = Concatenate()([acoustic_lstm, bert_encoder, visual_lstm])

    # MLP layers
    dense1 = Dense(units=128, activation='relu')(concatenated)
    dense2 = Dense(units=64, activation='relu')(dense1)
    output = Dense(units=num_classes, activation='softmax')(dense2)

    # Create the model
    model = Model(inputs=[acoustic_input, text_input, visual_input], outputs=output)
    return model

# Define the training process
def train_model(model, train_data, val_data, num_epochs, batch_size):
    """Compile ``model`` and fit it on ``train_data``.

    The model is compiled with the Adam optimizer, categorical cross-entropy
    loss and an accuracy metric, then fitted for ``num_epochs`` epochs with
    ``val_data`` passed as validation data.

    Returns
    -------
    The object returned by ``model.fit`` (the training history), so callers can
    inspect the loss and metric curves. Callers that ignored the previous
    ``None`` return value are unaffected.
    """
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # NOTE(review): train_data is passed as the sole data argument, so it must
    # yield targets itself; Keras documents that batch_size should not be given
    # for dataset/generator inputs -- confirm what callers pass.
    history = model.fit(train_data, epochs=num_epochs, batch_size=batch_size, validation_data=val_data)
    return history


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import cv2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Feature Extraction

# Extract audio features using Librosa
def extract_audio_features(audio_path):
    """Summarize an audio file as a vector of averaged MFCC coefficients.

    The file is loaded with librosa (``res_type='kaiser_fast'``), 40 MFCCs are
    computed, and the transposed MFCC matrix is averaged over axis 0
    (presumably the frame axis -- confirm against the librosa documentation).
    """
    waveform, rate = librosa.load(audio_path, res_type='kaiser_fast')
    mfcc = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=40)
    per_frame = mfcc.T
    return np.mean(per_frame, axis=0)

# Extract video features using OpenCV
def extract_video_features(video_path):
    """Return the flattened mean 64x64 grayscale frame of a video.

    Every frame is converted to grayscale and resized to 64x64; the frames are
    averaged pixel-wise and the result is flattened to a vector of 4096 values.

    Raises
    ------
    ValueError
        If no frame could be read (missing, unreadable or empty video).
        Averaging zero frames would otherwise yield a NaN array of the wrong
        length.
    """
    cap = cv2.VideoCapture(video_path)
    # Keep a running sum instead of a list of all frames so memory use does not
    # grow with the length of the video.
    frame_sum = np.zeros((64, 64), dtype=np.float64)
    frame_count = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame_sum += cv2.resize(frame, (64, 64))
            frame_count += 1
    finally:
        # release the capture even if a frame conversion fails
        cap.release()

    if frame_count == 0:
        raise ValueError(f"No frames could be read from video: {video_path!r}")
    return (frame_sum / frame_count).flatten()

# Step 2: Data Preparation

# Load text data from CSV
def load_text_data(text_csv_path):
    """Read a CSV and return its ``text`` and ``label`` columns as two lists.

    Returns
    -------
    tuple
        ``(texts, labels)`` in row order.
    """
    table = pd.read_csv(text_csv_path)
    texts, labels = (table[column].tolist() for column in ('text', 'label'))
    return texts, labels

# Load audio data from directory
def load_audio_data(audio_directory):
    """Extract audio features for every entry of ``audio_directory``.

    Returns a list with one feature vector per directory entry, ordered by
    file name.
    """
    # os.listdir returns entries in arbitrary order; sort them so the feature
    # list has a reproducible order that can be matched against labels.
    audio_files = sorted(os.listdir(audio_directory))
    audio_paths = [os.path.join(audio_directory, file) for file in audio_files]
    audio_features = [extract_audio_features(path) for path in audio_paths]
    return audio_features

# Load video data from directory
def load_video_data(video_directory):
    """Extract video features for every entry of ``video_directory``.

    Returns a list with one feature vector per directory entry, ordered by
    file name.
    """
    # os.listdir returns entries in arbitrary order; sort them so the feature
    # list has a reproducible order that can be matched against labels.
    video_files = sorted(os.listdir(video_directory))
    video_paths = [os.path.join(video_directory, file) for file in video_files]
    video_features = [extract_video_features(path) for path in video_paths]
    return video_features

# Step 3: Modality-specific Classification

# Train audio classifier
def train_audio_classifier(audio_features, labels, test_size=0.2, random_state=42):
    """Fit a random forest on audio features and score it on a held-out split.

    Parameters
    ----------
    audio_features, labels
        Feature vectors and their labels, matched by position.
    test_size : float, optional
        Fraction of the data held out for evaluation (default 0.2).
    random_state : int, optional
        Seed used for both the split and the forest, so repeated runs give the
        same accuracy (default 42).

    Returns
    -------
    tuple
        ``(classifier, accuracy)`` where accuracy is measured on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        audio_features, labels, test_size=test_size, random_state=random_state
    )
    # Seed the forest as well: a seeded split with an unseeded model is still
    # not reproducible.
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return classifier, accuracy

# Train video classifier
def train_video_classifier(video_features, labels, test_size=0.2, random_state=42):
    """Fit a random forest on video features and score it on a held-out split.

    Parameters
    ----------
    video_features, labels
        Feature vectors and their labels, matched by position.
    test_size : float, optional
        Fraction of the data held out for evaluation (default 0.2).
    random_state : int, optional
        Seed used for both the split and the forest, so repeated runs give the
        same accuracy (default 42).

    Returns
    -------
    tuple
        ``(classifier, accuracy)`` where accuracy is measured on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        video_features, labels, test_size=test_size, random_state=random_state
    )
    # Seed the forest as well: a seeded split with an unseeded model is still
    # not reproducible.
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return classifier, accuracy

# Train text classifier
def train_text_classifier(texts, labels):
    """Placeholder for the text classifier; not implemented yet.

    Currently ignores both arguments and returns ``None``.
    """
    # Implement your own text classification method here
    pass

# Step 4: Multi-modal Fusion

# Combine predictions from different classifiers
def combine_predictions(audio_pred, video_pred, text_pred):
    """Placeholder for fusing the per-modality predictions; not implemented yet.

    Currently ignores all three arguments and returns ``None``.
    """
    # Implement your own fusion method here (e.g., weighted average)
    pass

# Step 5: Usage

# Load data for each modality (the paths below are placeholders)
texts, labels = load_text_data('text_data.csv')
audio_features = load_audio_data('audio_directory')
# Call the video loader defined above; the bare name `load_video` is not
# defined anywhere and would raise NameError.
video_features = load_video_data('video_directory')


In [None]:
import tensorflow as tf

def create_audio_model(input_shape):
    """Build the audio sub-model: Conv2D(32) -> MaxPooling2D -> Flatten -> Dense(128).

    ``input_shape`` is the per-sample input shape (presumably
    (height, width, channels) since a Conv2D is applied -- confirm).
    Returns a Keras model mapping the input to a 128-unit ReLU embedding.
    """
    layers = tf.keras.layers

    model_input = layers.Input(shape=input_shape)
    # Example audio processing stack
    features = layers.Conv2D(32, (3, 3), activation='relu')(model_input)
    features = layers.MaxPooling2D((2, 2))(features)
    features = layers.Flatten()(features)
    embedding = layers.Dense(128, activation='relu')(features)

    return tf.keras.models.Model(inputs=model_input, outputs=embedding)


In [None]:
import tensorflow_hub as hub

def create_bert_model():
    """Wrap a trainable TF-Hub BERT layer in a Keras model with a scalar string input.

    NOTE(review): this hub encoder is presumably meant to receive tokenized
    inputs (word ids / mask / type ids) from a separate preprocessing model
    rather than raw strings, and to return a dict of outputs rather than a
    single tensor -- confirm against the model's TF-Hub page before use.
    """
    # Load the BERT model from TensorFlow Hub
    bert_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
    # trainable=True: the BERT weights are fine-tuned together with the rest of the model
    bert_layer = hub.KerasLayer(bert_url, trainable=True)

    # Define input and output
    input_text = tf.keras.layers.Input(shape=(), dtype=tf.string)
    bert_output = bert_layer(input_text)

    return tf.keras.models.Model(inputs=input_text, outputs=bert_output)


In [None]:
def create_cnn_model(input_shape):
    """Build the CNN sub-model: Conv2D(64) -> MaxPooling2D -> Flatten -> Dense(128).

    ``input_shape`` is the per-sample input shape (presumably
    (height, width, channels) since a Conv2D is applied -- confirm).
    Returns a Keras model mapping the input to a 128-unit ReLU embedding.
    """
    layers = tf.keras.layers

    model_input = layers.Input(shape=input_shape)
    # Example CNN stack
    features = layers.Conv2D(64, (3, 3), activation='relu')(model_input)
    features = layers.MaxPooling2D((2, 2))(features)
    features = layers.Flatten()(features)
    embedding = layers.Dense(128, activation='relu')(features)

    return tf.keras.models.Model(inputs=model_input, outputs=embedding)


In [None]:
def create_combined_model(audio_input_shape, cnn_input_shape):
    """Fuse the audio, BERT and CNN sub-models into one model with a sigmoid output.

    Each sub-model is applied to its own input layer, the three outputs are
    concatenated, and a single-unit sigmoid Dense layer produces the prediction.
    The returned model takes ``[audio_input, bert_input, cnn_input]``.
    """
    layers = tf.keras.layers

    # Sub-models, one per modality
    audio_branch = create_audio_model(audio_input_shape)
    text_branch = create_bert_model()
    cnn_branch = create_cnn_model(cnn_input_shape)

    # Input layers of the combined model
    audio_input = layers.Input(shape=audio_input_shape)
    bert_input = layers.Input(shape=(), dtype=tf.string)
    cnn_input = layers.Input(shape=cnn_input_shape)

    # Run every input through its sub-model and join the results
    branch_outputs = [
        audio_branch(audio_input),
        text_branch(bert_input),
        cnn_branch(cnn_input),
    ]
    fused = layers.concatenate(branch_outputs)

    # Classification head on top of the fused representation
    predictions = layers.Dense(1, activation='sigmoid')(fused)

    return tf.keras.models.Model(
        inputs=[audio_input, bert_input, cnn_input],
        outputs=predictions,
    )
