# I. Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import os
import pickle

import pandas as pd
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from transformers import BertTokenizer, BertModel
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# II. Define metadata paths

In [None]:
# ECES dataset paths: one metadata CSV per split (train / validation / test).
# The extraction cells read the `raw_text`, `language`, `audio_file` and `label`
# columns from these files.
ECES_TRAIN_PATH = "metadata/ECES_metadata_train.csv"
ECES_VAL_PATH = "metadata/ECES_metadata_val.csv"
ECES_TEST_PATH = "metadata/ECES_metadata_test.csv"
# IEMOCAP dataset paths: one metadata CSV per split (train / validation / test).
# The extraction cells read the `raw_text`, `audio_file` and `label` columns
# from these files.
IEMOCAP_TRAIN_PATH = "metadata/IEMOCAP_metadata_train.csv"
IEMOCAP_VAL_PATH = "metadata/IEMOCAP_metadata_val.csv"
IEMOCAP_TEST_PATH = "metadata/IEMOCAP_metadata_test.csv"

# III. ECAPA for audio embeddings + BERT for text embeddings

In [None]:
# NOTE(review): TEXT_MAX_LENGTH is not referenced by any other cell visible in
# this notebook — confirm whether it was meant to cap the tokenizer input.
TEXT_MAX_LENGTH = 100

# English text encoder: tokenizer plus BERT model, with the model moved to `device`.
tokenizer_eng = BertTokenizer.from_pretrained('bert-base-uncased')
text_model_eng = BertModel.from_pretrained('bert-base-uncased').to(device)

# Chinese text encoder: tokenizer plus BERT model, with the model moved to `device`.
tokenizer_cmn = BertTokenizer.from_pretrained('bert-base-chinese')
text_model_cmn = BertModel.from_pretrained('bert-base-chinese').to(device)

# Audio encoder (ECAPA, pretrained on VoxCeleb); it is not moved to `device` here.
audio_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb"
)

print("All loaded")

## 1. ECES ENG-CMN

In [None]:
# Create training set for ECES ENG-CMN
# Every row of the training metadata becomes one dict holding the text embedding
# (from the BERT model matching the row's language), the audio embedding and the
# label tensor. The list of dicts is pickled to the features folder.
train_list = pd.read_csv(ECES_TRAIN_PATH)
train_out_path = "features/ECESD_ENG_CMN_BERT_ECAPA_train.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(train_out_path), exist_ok=True)
train_pkl = []
with torch.no_grad():
    for idx in range(len(train_list)):
        text = train_list['raw_text'][idx]
        language = train_list['language'][idx]
        if language == "eng":
            tokenizer, text_model = tokenizer_eng, text_model_eng
        elif language == "cmn":
            tokenizer, text_model = tokenizer_cmn, text_model_cmn
        else:
            # Without this guard the previous row's text embedding would be
            # silently reused for this row.
            raise ValueError(
                f"Unsupported language {language!r} at row {idx} of {ECES_TRAIN_PATH}"
            )
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = train_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(train_list['label'][idx])
        train_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(train_out_path, "wb") as train_file:
    pickle.dump(train_pkl, train_file)

In [None]:
# Create validation set for ECES ENG-CMN
# Every row of the validation metadata becomes one dict holding the text embedding
# (from the BERT model matching the row's language), the audio embedding and the
# label tensor. The list of dicts is pickled to the features folder.
val_list = pd.read_csv(ECES_VAL_PATH)
val_out_path = "features/ECESD_ENG_CMN_BERT_ECAPA_val.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(val_out_path), exist_ok=True)
val_pkl = []
with torch.no_grad():
    for idx in range(len(val_list)):
        text = val_list['raw_text'][idx]
        language = val_list['language'][idx]
        if language == "eng":
            tokenizer, text_model = tokenizer_eng, text_model_eng
        elif language == "cmn":
            tokenizer, text_model = tokenizer_cmn, text_model_cmn
        else:
            # Without this guard the previous row's text embedding would be
            # silently reused for this row.
            raise ValueError(
                f"Unsupported language {language!r} at row {idx} of {ECES_VAL_PATH}"
            )
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = val_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(val_list['label'][idx])
        val_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(val_out_path, "wb") as val_file:
    pickle.dump(val_pkl, val_file)

In [None]:
# Create independent testing set for ECES ENG-CMN
# Every row of the test metadata becomes one dict holding the text embedding
# (from the BERT model matching the row's language), the audio embedding and the
# label tensor. The list of dicts is pickled to the features folder.
test_list = pd.read_csv(ECES_TEST_PATH)
test_out_path = "features/ECESD_ENG_CMN_BERT_ECAPA_test.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(test_out_path), exist_ok=True)
test_pkl = []
with torch.no_grad():
    for idx in range(len(test_list)):
        text = test_list['raw_text'][idx]
        language = test_list['language'][idx]
        if language == "eng":
            tokenizer, text_model = tokenizer_eng, text_model_eng
        elif language == "cmn":
            tokenizer, text_model = tokenizer_cmn, text_model_cmn
        else:
            # Without this guard the previous row's text embedding would be
            # silently reused for this row.
            raise ValueError(
                f"Unsupported language {language!r} at row {idx} of {ECES_TEST_PATH}"
            )
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = test_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(test_list['label'][idx])
        test_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(test_out_path, "wb") as test_file:
    pickle.dump(test_pkl, test_file)

## 2. ECES CMN

In [None]:
# Create training set for ECES CMN
# Only rows whose `language` is "cmn" are kept. Each kept row becomes one dict
# holding the text embedding, the audio embedding and the label tensor; the list
# of dicts is pickled to the features folder.
train_list = pd.read_csv(ECES_TRAIN_PATH)
train_out_path = "features/ECESD_CMN_BERT_ECAPA_train.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(train_out_path), exist_ok=True)
train_pkl = []
with torch.no_grad():
    for idx in range(len(train_list)):
        text = train_list['raw_text'][idx]
        if train_list['language'][idx] != "cmn":
            continue
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_cmn(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_cmn(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = train_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(train_list['label'][idx])
        train_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(train_out_path, "wb") as train_file:
    pickle.dump(train_pkl, train_file)

In [None]:
# Create validation set for ECES CMN
# Only rows whose `language` is "cmn" are kept. Each kept row becomes one dict
# holding the text embedding, the audio embedding and the label tensor; the list
# of dicts is pickled to the features folder.
val_list = pd.read_csv(ECES_VAL_PATH)
val_out_path = "features/ECESD_CMN_BERT_ECAPA_val.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(val_out_path), exist_ok=True)
val_pkl = []
with torch.no_grad():
    for idx in range(len(val_list)):
        text = val_list['raw_text'][idx]
        if val_list['language'][idx] != "cmn":
            continue
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_cmn(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_cmn(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = val_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(val_list['label'][idx])
        val_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(val_out_path, "wb") as val_file:
    pickle.dump(val_pkl, val_file)

In [None]:
# Create independent testing set for ECES CMN
# Only rows whose `language` is "cmn" are kept. Each kept row becomes one dict
# holding the text embedding, the audio embedding and the label tensor; the list
# of dicts is pickled to the features folder.
test_list = pd.read_csv(ECES_TEST_PATH)
test_out_path = "features/ECESD_CMN_BERT_ECAPA_test.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(test_out_path), exist_ok=True)
test_pkl = []
with torch.no_grad():
    for idx in range(len(test_list)):
        text = test_list['raw_text'][idx]
        if test_list['language'][idx] != "cmn":
            continue
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_cmn(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_cmn(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = test_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(test_list['label'][idx])
        test_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(test_out_path, "wb") as test_file:
    pickle.dump(test_pkl, test_file)

## 3. ECES ENG

In [None]:
# Create training set for ECES ENG
# Only rows whose `language` is "eng" are kept. Each kept row becomes one dict
# holding the text embedding, the audio embedding and the label tensor; the list
# of dicts is pickled to the features folder.
train_list = pd.read_csv(ECES_TRAIN_PATH)
train_out_path = "features/ECESD_ENG_BERT_ECAPA_train.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(train_out_path), exist_ok=True)
train_pkl = []
with torch.no_grad():
    for idx in range(len(train_list)):
        text = train_list['raw_text'][idx]
        if train_list['language'][idx] != "eng":
            continue
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_eng(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = train_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(train_list['label'][idx])
        train_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(train_out_path, "wb") as train_file:
    pickle.dump(train_pkl, train_file)

In [None]:
# Create validation set for ECES ENG
# Only rows whose `language` is "eng" are kept. Each kept row becomes one dict
# holding the text embedding, the audio embedding and the label tensor; the list
# of dicts is pickled to the features folder.
val_list = pd.read_csv(ECES_VAL_PATH)
val_out_path = "features/ECESD_ENG_BERT_ECAPA_val.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(val_out_path), exist_ok=True)
val_pkl = []
with torch.no_grad():
    for idx in range(len(val_list)):
        text = val_list['raw_text'][idx]
        if val_list['language'][idx] != "eng":
            continue
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_eng(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = val_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(val_list['label'][idx])
        val_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(val_out_path, "wb") as val_file:
    pickle.dump(val_pkl, val_file)

In [None]:
# Create independent testing set for ECES ENG
# Only rows whose `language` is "eng" are kept. Each kept row becomes one dict
# holding the text embedding, the audio embedding and the label tensor; the list
# of dicts is pickled to the features folder.
test_list = pd.read_csv(ECES_TEST_PATH)
test_out_path = "features/ECESD_ENG_BERT_ECAPA_test.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(test_out_path), exist_ok=True)
test_pkl = []
with torch.no_grad():
    for idx in range(len(test_list)):
        text = test_list['raw_text'][idx]
        if test_list['language'][idx] != "eng":
            continue
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_eng(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = test_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(test_list['label'][idx])
        test_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(test_out_path, "wb") as test_file:
    pickle.dump(test_pkl, test_file)

## 4. IEMOCAP

In [None]:
# Create training set for IEMOCAP
# Every row of the training metadata becomes one dict holding the English-BERT
# text embedding, the audio embedding and the label tensor; the list of dicts is
# pickled to the features folder.
train_list = pd.read_csv(IEMOCAP_TRAIN_PATH)
train_out_path = "features/IEMOCAP_BERT_ECAPA_train.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(train_out_path), exist_ok=True)
train_pkl = []
with torch.no_grad():
    for idx in range(len(train_list)):
        text = train_list['raw_text'][idx]
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_eng(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = train_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(train_list['label'][idx])
        train_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(train_out_path, "wb") as train_file:
    pickle.dump(train_pkl, train_file)

In [None]:
# Create validation set for IEMOCAP
# Every row of the validation metadata becomes one dict holding the English-BERT
# text embedding, the audio embedding and the label tensor; the list of dicts is
# pickled to the features folder.
val_list = pd.read_csv(IEMOCAP_VAL_PATH)
val_out_path = "features/IEMOCAP_BERT_ECAPA_val.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(val_out_path), exist_ok=True)
val_pkl = []
with torch.no_grad():
    for idx in range(len(val_list)):
        text = val_list['raw_text'][idx]
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_eng(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = val_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(val_list['label'][idx])
        val_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(val_out_path, "wb") as val_file:
    pickle.dump(val_pkl, val_file)

In [None]:
# Create independent testing set for IEMOCAP
# Every row of the test metadata becomes one dict holding the English-BERT text
# embedding, the audio embedding and the label tensor; the list of dicts is
# pickled to the features folder.
test_list = pd.read_csv(IEMOCAP_TEST_PATH)
test_out_path = "features/IEMOCAP_BERT_ECAPA_test.pkl"
# Create the output folder up front so a missing directory fails before the
# long extraction loop rather than after it.
os.makedirs(os.path.dirname(test_out_path), exist_ok=True)
test_pkl = []
with torch.no_grad():
    for idx in range(len(test_list)):
        text = test_list['raw_text'][idx]
        # truncation=True truncates over-long transcripts to the model's maximum
        # input length instead of letting the forward pass fail.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True).to(device)
        text_outputs = text_model_eng(**text_token)
        text_embeddings = text_outputs.last_hidden_state
        # Hidden state of the first token of the single sequence, moved to the CPU.
        text_embed = text_embeddings[:, 0, :][0].cpu()
        audio_file = test_list['audio_file'][idx]
        # NOTE(review): the sample rate returned by torchaudio.load is discarded —
        # confirm the files match the rate the audio model expects.
        audio_signal, _ = torchaudio.load(audio_file, normalize=True)
        audio_outputs = audio_model.encode_batch(audio_signal)
        # Mean over dim 0 (presumably the audio channels — TODO confirm), then first row.
        audio_embed = audio_outputs.mean(axis=0)[0]
        label = torch.tensor(test_list['label'][idx])
        test_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })
# The context manager closes the file even if pickle.dump raises.
with open(test_out_path, "wb") as test_file:
    pickle.dump(test_pkl, test_file)