In [1]:
import os

import numpy as np
from mmsdk import mmdatasdk as md
import h5py

# Directory containing the CMU-MOSI computational sequence (.csd) files.
# Written as a raw string: the Windows path contains backslash pairs such as
# "\M", "\m", "\d" and "\c", which are invalid escape sequences in a normal
# string literal (newer Python versions warn about them). The value is unchanged.
MOSI_PATH = r"F:\MOSEI\model\data\MOSI\cmumosi"
MOSI_DATASET = md.cmu_mosi

# Alternative (disabled): build the datasets from the SDK's own recipes.
# cmumosi_highlevel=md.mmdataset(MOSI_DATASET.highlevel, MOSI_PATH)
# cmumosi_raw=md.mmdataset(MOSI_DATASET.raw, MOSI_PATH)
# cmumosi_labels=md.mmdataset(MOSI_DATASET.labels, MOSI_PATH)

# Names of the computational sequences used for each modality.
visual_field = 'CMU_MOSI_Visual_Facet_42'
acoustic_field = 'CMU_MOSI_COVAREP'
text_field = 'CMU_MOSI_TimestampedWords'

features = [
    text_field,
    visual_field,
    acoustic_field
]

# Map each sequence name to its local "<MOSI_PATH>/<name>.csd" file and load them.
recipe = {feat: os.path.join(MOSI_PATH, feat) + '.csd' for feat in features}
dataset = md.mmdataset(recipe)


# Collapse function used during alignment: a plain average that ignores the intervals.
def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
    """Average ``features`` along axis 0, ignoring ``intervals``.

    Args:
        intervals: Unused. Accepted only so the function has the two-argument
            signature it is passed with in ``collapse_functions``.
        features: Array whose first axis is averaged away.

    Returns:
        ``np.average(features, axis=0)``; if averaging raises an ordinary
        exception (e.g. the values cannot be summed), ``features`` is returned
        unchanged.
    """
    try:
        return np.average(features, axis=0)
    except Exception:
        # Deliberate best-effort fallback for features that cannot be averaged.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long alignment can still be interrupted.
        return features


# Step 1: word-level alignment. Every modality is aligned to the word sequence
# and collapsed with `avg` (collapse_functions takes a list of functions).
dataset.align(text_field, collapse_functions=[avg])

# Step 2: bring in the labels and align to them to obtain labeled segments.
# No collapse function is passed here, so the temporal sequences are preserved.
label_field = 'CMU_MOSI_Opinion_Labels'
label_recipe = {label_field: os.path.join(MOSI_PATH, f'{label_field}.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)


[92m[1m[2024-12-26 19:14:25.188] | Success | [0mComputational sequence read from file F:\MOSEI\model\data\MOSI\cmumosi\CMU_MOSI_TimestampedWords.csd ...
[94m[1m[2024-12-26 19:14:25.197] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-12-26 19:14:25.197] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                   

[92m[1m[2024-12-26 19:14:25.221] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-12-26 19:14:25.221] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2024-12-26 19:14:25.228] | Success | [0mComputational sequence read from file F:\MOSEI\model\data\MOSI\cmumosi\CMU_MOSI_Visual_Facet_42.csd ...
[94m[1m[2024-12-26 19:14:25.239] | Status  | [0mChecking the integrity of the <FACET_4.2> computational sequence ...
[94m[1m[2024-12-26 19:14:25.239] | Status  | [0mChecking the format of the data in <FACET_4.2> computational sequence ...


                                                                   

[92m[1m[2024-12-26 19:14:25.280] | Success | [0m<FACET_4.2> computational sequence data in correct format.
[94m[1m[2024-12-26 19:14:25.280] | Status  | [0mChecking the format of the metadata in <FACET_4.2> computational sequence ...
[92m[1m[2024-12-26 19:14:25.287] | Success | [0mComputational sequence read from file F:\MOSEI\model\data\MOSI\cmumosi\CMU_MOSI_COVAREP.csd ...
[94m[1m[2024-12-26 19:14:25.296] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-12-26 19:14:25.296] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                   

[92m[1m[2024-12-26 19:14:25.328] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-12-26 19:14:25.329] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-12-26 19:14:25.329] | Success | [0mDataset initialized successfully ... 
[94m[1m[2024-12-26 19:14:25.329] | Status  | [0mUnify was called ...
[92m[1m[2024-12-26 19:14:25.329] | Success | [0mUnify completed ...
[94m[1m[2024-12-26 19:14:25.329] | Status  | [0mPre-alignment based on <CMU_MOSI_TimestampedWords> computational sequence started ...




[94m[1m[2024-12-26 19:14:28.071] | Status  | [0mPre-alignment done for <CMU_MOSI_COVAREP> ...
[94m[1m[2024-12-26 19:14:28.666] | Status  | [0mPre-alignment done for <CMU_MOSI_Visual_Facet_42> ...
[94m[1m[2024-12-26 19:14:28.719] | Status  | [0mAlignment starting ...


Overall Progress:   0%|          | 0/92 [00:00<?, ? Computational Sequence Entries/s]
  0%|          | 0/464 [00:00<?, ? Segments/s][A
Aligning 03bSnISJMiM:   0%|          | 0/464 [00:00<?, ? Segments/s][A
Aligning 03bSnISJMiM:  19%|█▉        | 90/464 [00:00<00:00, 894.06 Segments/s][A
Aligning 03bSnISJMiM:  42%|████▏     | 193/464 [00:00<00:00, 969.13 Segments/s][A
Aligning 03bSnISJMiM:  62%|██████▎   | 290/464 [00:00<00:00, 951.41 Segments/s][A
Aligning 03bSnISJMiM:  83%|████████▎ | 386/464 [00:00<00:00, 945.20 Segments/s][A
Overall Progress:   1%|          | 1/92 [00:00<00:44,  2.03 Computational Sequence Entries/s]
  0%|          | 0/485 [00:00<?, ? Segments/s][A
Aligning 0h-zjBukYpk:   0%|          | 0/485 [00:00<?, ? Segments/s][A
Aligning 0h-zjBukYpk:  20%|█▉        | 95/485 [00:00<00:00, 949.35 Segments/s][A
Aligning 0h-zjBukYpk:  40%|████      | 195/485 [00:00<00:00, 978.66 Segments/s][A
Aligning 0h-zjBukYpk:  61%|██████    | 296/485 [00:00<00:00, 987.22 Segments/s]

[92m[1m[2024-12-26 19:15:18.277] | Success | [0mAlignment to <CMU_MOSI_TimestampedWords> complete.
[94m[1m[2024-12-26 19:15:18.278] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-12-26 19:15:18.285] | Success | [0mInitialized empty <CMU_MOSI_TimestampedWords> computational sequence.
[94m[1m[2024-12-26 19:15:18.285] | Status  | [0mChecking the format of the data in <CMU_MOSI_TimestampedWords> computational sequence ...


                                                                      

[92m[1m[2024-12-26 19:15:18.338] | Success | [0m<CMU_MOSI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:18.338] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_TimestampedWords> computational sequence ...
[92m[1m[2024-12-26 19:15:18.338] | Success | [0mInitialized empty <CMU_MOSI_Visual_Facet_42> computational sequence.
[94m[1m[2024-12-26 19:15:18.338] | Status  | [0mChecking the format of the data in <CMU_MOSI_Visual_Facet_42> computational sequence ...


                                                                      

[92m[1m[2024-12-26 19:15:18.388] | Success | [0m<CMU_MOSI_Visual_Facet_42> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:18.388] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Visual_Facet_42> computational sequence ...
[92m[1m[2024-12-26 19:15:18.388] | Success | [0mInitialized empty <CMU_MOSI_COVAREP> computational sequence.
[94m[1m[2024-12-26 19:15:18.388] | Status  | [0mChecking the format of the data in <CMU_MOSI_COVAREP> computational sequence ...


                                                                      

[92m[1m[2024-12-26 19:15:18.442] | Success | [0m<CMU_MOSI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:18.442] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_COVAREP> computational sequence ...
[92m[1m[2024-12-26 19:15:18.494] | Success | [0mComputational sequence read from file F:\MOSEI\model\data\MOSI\cmumosi\CMU_MOSI_Opinion_Labels.csd ...
[94m[1m[2024-12-26 19:15:18.502] | Status  | [0mChecking the integrity of the <Opinion Segment Labels> computational sequence ...
[94m[1m[2024-12-26 19:15:18.502] | Status  | [0mChecking the format of the data in <Opinion Segment Labels> computational sequence ...


                                                                   

[92m[1m[2024-12-26 19:15:18.534] | Success | [0m<Opinion Segment Labels> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:18.534] | Status  | [0mChecking the format of the metadata in <Opinion Segment Labels> computational sequence ...
[94m[1m[2024-12-26 19:15:18.534] | Status  | [0mUnify was called ...
[92m[1m[2024-12-26 19:15:18.604] | Success | [0mUnify completed ...
[94m[1m[2024-12-26 19:15:18.606] | Status  | [0mPre-alignment based on <CMU_MOSI_Opinion_Labels> computational sequence started ...
[94m[1m[2024-12-26 19:15:18.683] | Status  | [0mPre-alignment done for <CMU_MOSI_TimestampedWords> ...
[94m[1m[2024-12-26 19:15:18.780] | Status  | [0mPre-alignment done for <CMU_MOSI_COVAREP> ...
[94m[1m[2024-12-26 19:15:18.859] | Status  | [0mPre-alignment done for <CMU_MOSI_Visual_Facet_42> ...
[94m[1m[2024-12-26 19:15:18.861] | Status  | [0mAlignment starting ...


Overall Progress:   0%|          | 0/92 [00:00<?, ? Computational Sequence Entries/s]
  0%|          | 0/13 [00:00<?, ? Segments/s][A
Aligning 03bSnISJMiM:   0%|          | 0/13 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/25 [00:00<?, ? Segments/s][A
Aligning 0h-zjBukYpk:   0%|          | 0/25 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/14 [00:00<?, ? Segments/s][A
Aligning 1DmNV9C1hbY:   0%|          | 0/14 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/30 [00:00<?, ? Segments/s][A
Aligning 1iG0909rllw:   0%|          | 0/30 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/63 [00:00<?, ? Segments/s][A
Aligning 2WGyTLYerpo:   0%|          | 0/63 [00:00<?, ? Segments/s][A
                              

[92m[1m[2024-12-26 19:15:20.288] | Success | [0mAlignment to <CMU_MOSI_Opinion_Labels> complete.
[94m[1m[2024-12-26 19:15:20.288] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-12-26 19:15:20.330] | Success | [0mInitialized empty <CMU_MOSI_TimestampedWords> computational sequence.
[94m[1m[2024-12-26 19:15:20.330] | Status  | [0mChecking the format of the data in <CMU_MOSI_TimestampedWords> computational sequence ...


                                                                     

[92m[1m[2024-12-26 19:15:20.334] | Success | [0m<CMU_MOSI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:20.334] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_TimestampedWords> computational sequence ...
[92m[1m[2024-12-26 19:15:20.334] | Success | [0mInitialized empty <CMU_MOSI_Visual_Facet_42> computational sequence.
[94m[1m[2024-12-26 19:15:20.334] | Status  | [0mChecking the format of the data in <CMU_MOSI_Visual_Facet_42> computational sequence ...


                                                                     

[92m[1m[2024-12-26 19:15:20.338] | Success | [0m<CMU_MOSI_Visual_Facet_42> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:20.338] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Visual_Facet_42> computational sequence ...
[92m[1m[2024-12-26 19:15:20.338] | Success | [0mInitialized empty <CMU_MOSI_COVAREP> computational sequence.
[94m[1m[2024-12-26 19:15:20.338] | Status  | [0mChecking the format of the data in <CMU_MOSI_COVAREP> computational sequence ...


                                                                     

[92m[1m[2024-12-26 19:15:20.343] | Success | [0m<CMU_MOSI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:20.344] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_COVAREP> computational sequence ...
[92m[1m[2024-12-26 19:15:20.344] | Success | [0mInitialized empty <CMU_MOSI_Opinion_Labels> computational sequence.
[94m[1m[2024-12-26 19:15:20.344] | Status  | [0mChecking the format of the data in <CMU_MOSI_Opinion_Labels> computational sequence ...


                                                                     

[92m[1m[2024-12-26 19:15:20.347] | Success | [0m<CMU_MOSI_Opinion_Labels> computational sequence data in correct format.
[94m[1m[2024-12-26 19:15:20.347] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Opinion_Labels> computational sequence ...




In [2]:
# Sanity check: show which computational sequences the dataset now holds.
print(dataset.keys())

dict_keys(['CMU_MOSI_TimestampedWords', 'CMU_MOSI_Visual_Facet_42', 'CMU_MOSI_COVAREP', 'CMU_MOSI_Opinion_Labels'])


In [3]:
import pickle
import numpy as np
from collections import defaultdict
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm
import torchtext as text
from torchtext.vocab import Vectors

# The dataset is accessed below as dataset[<sequence name>][<entry id>]['features'].

# Sequence names for each modality and for the labels
AUDIO = 'CMU_MOSI_COVAREP'
VIDEO = 'CMU_MOSI_Visual_Facet_42'
WORD = 'CMU_MOSI_TimestampedWords'
LABEL = 'CMU_MOSI_Opinion_Labels'
MODALITIES = [AUDIO, VIDEO, WORD, LABEL]

# Fixed sequence length used for padding and tokenization
SEQ_LEN = 50

# Standard train / valid / test folds provided by the SDK for CMU-MOSI
DATASET = md.cmu_mosi
train_fold = DATASET.standard_folds.standard_train_fold
valid_fold = DATASET.standard_folds.standard_valid_fold
test_fold = DATASET.standard_folds.standard_test_fold

# Fold name -> collection of video ids belonging to that fold
data_folds = dict(train=train_fold, valid=valid_fold, test=test_fold)

# Pretrained BERT tokenizer and encoder; the encoder is switched to evaluation mode
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()

# Padding helper function
def lpad(this_array, seq_len):
    """Left-pad with zeros, or left-truncate, ``this_array`` to ``seq_len`` rows.

    Args:
        this_array: Array-like whose first axis is the sequence axis.
        seq_len: Number of rows in the result (non-negative).

    Returns:
        An array of shape ``(seq_len, *this_array.shape[1:])``. Shorter inputs
        get zero rows prepended; longer inputs keep only their last ``seq_len``
        rows. The dtype is whatever ``np.concatenate`` yields for a float64
        zero block and the input.
    """
    this_array = np.asarray(this_array)
    zero_block = np.zeros((seq_len,) + tuple(this_array.shape[1:]))
    padded = np.concatenate([zero_block, this_array], axis=0)
    # Slice with an explicit start index instead of ``[-seq_len:]``: for
    # seq_len == 0 the negative form becomes ``[-0:]`` and returns every row.
    return padded[padded.shape[0] - seq_len:, ...]

# Fold lookup for a dataset entry
def detect_entry_fold(entry, folds):
    """Return the name of the fold that contains ``entry``'s video id.

    The video id is the part of ``entry`` before the first ``"["``. ``folds``
    maps fold names to collections of video ids and is searched in iteration
    order; ``None`` is returned when no fold contains the id.
    """
    video_id, _, _ = entry.partition("[")
    matching = (name for name, videos in folds.items() if video_id in videos)
    return next(matching, None)

# Raw transcript extraction
def get_rawtext(dataset, text_field, video_ids):
    """Build one space-joined transcript string per entry id.

    For each id, the first element of every row in
    ``dataset[text_field][vid]['features']`` is UTF-8 decoded, rows equal to
    ``b'sp'`` are dropped, and the rest are joined with single spaces. Ids that
    raise ``KeyError`` are reported on stdout and skipped.

    Returns:
        ``(text_data, valid_video_ids)``: the transcripts and the ids they
        belong to, in the same order.
    """
    text_data, valid_video_ids = [], []
    for vid in video_ids:
        try:
            sentence = ' '.join(
                token[0].decode('utf-8')
                for token in dataset[text_field][vid]['features']
                if token[0] != b'sp'
            )
        except KeyError:
            print(f"Missing text data for {vid}")
        else:
            text_data.append(sentence)
            valid_video_ids.append(vid)
    return text_data, valid_video_ids

# BERT feature extraction
def bert_embeddings(text_data, max_seq_len=50):
    """Encode each string with the module-level ``tokenizer`` and ``bert_model``.

    Every string is tokenized on its own, padded/truncated to ``max_seq_len``
    tokens, and passed through the model with gradients disabled. The model's
    ``last_hidden_state`` (batch axis squeezed away) is collected per string.

    Returns:
        ``np.array`` built from the list of per-string hidden-state arrays.
    """
    per_sentence = []
    with torch.no_grad():
        for sentence in tqdm(text_data, desc="Extracting BERT embeddings"):
            batch = tokenizer(
                sentence,
                max_length=max_seq_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            hidden = bert_model(**batch).last_hidden_state
            per_sentence.append(hidden.squeeze(0).numpy())
    return np.array(per_sentence)

# GloVe pipeline, step 1: build the vocabulary and index the transcripts
def get_word2id(text_data, vids):
    """Assign integer ids to whitespace-separated words and index each segment.

    ``'unk'`` is registered first (id 0); every new word then receives the next
    free id in order of first appearance. Once all segments are processed, the
    mapping's default factory is switched so that unseen words resolve to the
    ``'unk'`` id.

    Returns:
        ``(data_processed, word2id)`` where ``data_processed[vids[i]]`` is the
        array of word ids for ``text_data[i]``.
    """
    word2id = defaultdict(lambda: len(word2id))
    UNK = word2id['unk']

    data_processed = {}
    for idx in range(len(text_data)):
        data_processed[vids[idx]] = np.asarray(
            [word2id[token] for token in text_data[idx].split()]
        )

    # From now on lookups of unknown words return the 'unk' id.
    word2id.default_factory = lambda: UNK
    return data_processed, word2id


def get_word_embeddings(word2id, glove_path='../glove.840B.300d.txt'):
    """Look up a pretrained vector for every word in ``word2id``.

    Args:
        word2id: Mapping whose keys are the vocabulary words. Rows of the
            result follow the mapping's iteration order.
        glove_path: Vector file handed to ``torchtext.vocab.Vectors``. Defaults
            to the path that was previously hard-coded, so existing calls are
            unaffected.

    Returns:
        The result of ``Vectors.get_vecs_by_tokens`` for the vocabulary, with
        ``lower_case_backup=True``.
        NOTE(review): presumably a ``(len(word2id), dim)`` tensor with a default
        vector for out-of-vocabulary words — confirm against torchtext.
    """
    vec = Vectors(name=glove_path)
    return vec.get_vecs_by_tokens(list(word2id), lower_case_backup=True)


def glove_embeddings(text_data, vids, paddings=50):
    """Turn each transcript into a fixed-length sequence of word vectors.

    Args:
        text_data: Transcript strings, parallel to ``vids``.
        vids: Entry ids; one output sequence is produced per id, in order.
        paddings: Target sequence length.

    Returns:
        ``np.array`` of per-segment arrays, each with ``paddings`` rows.
        Segments shorter than ``paddings`` are left-padded with zero vectors;
        longer ones keep only their first ``paddings`` words.
        NOTE(review): truncation keeps the *first* words while ``lpad`` keeps
        the *last* rows of the other modalities — confirm this is intended.
    """
    data_prod, w2id = get_word2id(text_data, vids)
    looks_up = get_word_embeddings(w2id).numpy()
    # Take the vector width from the table rather than hard-coding 300.
    embed_dim = looks_up.shape[1]

    embedd_data = []
    for vid in vids:
        kept_ids = data_prod[vid][:paddings]
        rows = [np.zeros(embed_dim) for _ in range(paddings - len(kept_ids))]
        rows.extend(looks_up[word_id] for word_id in kept_ids)
        embedd_data.append(np.array(rows))
    return np.array(embedd_data)

# Process audio, visual, and text modalities
def process_modalities(dataset, video_ids, seq_len, text_embeddings):
    """Assemble per-fold arrays of audio, vision, text and label data.

    Args:
        dataset: Aligned dataset, read as ``dataset[name][vid]['features']``.
        video_ids: Entry ids to process; ``text_embeddings[i]`` must belong to
            ``video_ids[i]``.
        seq_len: Length that audio and vision features are padded/truncated to
            with ``lpad``.
        text_embeddings: Indexable collection of text features, parallel to
            ``video_ids``.

    Returns:
        ``{fold: {"audio", "vision", "text", "labels", "id"}}`` for every fold
        in ``data_folds``. The first four values are ``np.array`` objects and
        ``"id"`` is a list of entry ids; within a fold, index ``k`` refers to
        the same entry in all five.

    Entries that belong to no fold, or that lack any modality or the label,
    are reported on stdout and skipped as a whole. Skipping the whole entry is
    what keeps the five collections row-aligned.
    """
    array_keys = ["audio", "vision", "text", "labels"]
    data = {fold: {key: [] for key in array_keys + ["id"]} for fold in data_folds}

    for i, vid in enumerate(video_ids):
        fold_name = detect_entry_fold(vid, data_folds)
        if not fold_name:
            print(f"Video {vid} doesn't belong to any fold")
            continue

        # Collect everything for this entry first; commit only if complete.
        sample = {}
        complete = True

        for modality, new_key in zip([AUDIO, VIDEO], ["audio", "vision"]):
            try:
                features = dataset[modality][vid]['features']
                sample[new_key] = lpad(features, seq_len)
            except KeyError:
                print(f"Missing {modality} data for {vid}")
                complete = False

        try:
            sample["labels"] = dataset[LABEL][vid]['features']
        except KeyError:
            print(f"Missing label for {vid}")
            complete = False

        if not complete:
            continue

        sample["text"] = text_embeddings[i]
        sample["id"] = vid
        for key, value in sample.items():
            data[fold_name][key].append(value)

    for fold_name in data_folds:
        for key in array_keys:
            data[fold_name][key] = np.array(data[fold_name][key])

    return data

# Main execution



In [4]:
# Collect every entry id present in the word sequence and build one transcript
# string per entry; valid_video_ids lists the ids that produced a transcript,
# in the same order as raw_text.
video_ids = list(dataset[WORD].keys())
raw_text, valid_video_ids = get_rawtext(dataset, WORD, video_ids)
print(f"Extracted {len(raw_text)} valid text entries")

# Generate BERT embeddings (one array per transcript, SEQ_LEN tokens each)
text_bert = bert_embeddings(raw_text, max_seq_len=SEQ_LEN)
print(f"BERT Embeddings Shape: {text_bert.shape}")

# Generate GloVe embeddings for the same transcripts, padded/truncated to SEQ_LEN words
text_glove = glove_embeddings(raw_text, valid_video_ids, paddings=SEQ_LEN)
print(f"GloVe Embeddings Shape: {text_glove.shape}")


Extracted 2183 valid text entries


Extracting BERT embeddings: 100%|██████████| 2183/2183 [02:19<00:00, 15.59it/s]


BERT Embeddings Shape: (2183, 50, 768)


100%|█████████▉| 2196016/2196017 [04:28<00:00, 8167.98it/s] 


GloVe Embeddings Shape: (2183, 50, 300)


In [5]:
# Process data using BERT embeddings: text_bert is parallel to valid_video_ids,
# so both are passed together.
processed_data_bert = process_modalities(dataset, valid_video_ids, SEQ_LEN, text_bert)
# Save processed data as a pickle in the current working directory
with open("mosi_raw_bert.pkl", "wb") as f:
    pickle.dump(processed_data_bert, f)
    
print("Processed data saved as mosi_raw_bert.pkl")

Processed data saved as mosi_raw_bert.pkl


In [6]:
# Process data using GloVe embeddings: same entries as the BERT variant, with
# text_glove as the text features.
processed_data_glove = process_modalities(dataset, valid_video_ids, SEQ_LEN, text_glove)

# Save processed data as a pickle in the current working directory
with open("mosi_raw_glove.pkl", "wb") as f:
    pickle.dump(processed_data_glove, f)
    
print("Processed data saved as mosi_raw_glove.pkl")

Processed data saved as mosi_raw_glove.pkl
