In [1]:
# Silence Python warnings before the heavy libraries below are imported.
import warnings
warnings.filterwarnings('ignore')
# NOTE(review): the blanket filter above uses the default category (Warning),
# which already covers DeprecationWarning and FutureWarning, so the two
# category-specific filters below are redundant.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import torch
import torchaudio
import pickle
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from speechbrain.pretrained import EncoderClassifier
from transformers import Wav2Vec2Processor, Wav2Vec2Model
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [2]:
# Locations of the IEMOCAP metadata CSVs, one per split (train / validation / test).
IEMOCAP_TRAIN_PATH, IEMOCAP_VAL_PATH, IEMOCAP_TEST_PATH = (
    "C:/Users/admin/Documents/Speech-Emotion_Recognition-2/metadata/IEMOCAP_metadata_train.csv",
    "C:/Users/admin/Documents/Speech-Emotion_Recognition-2/metadata/IEMOCAP_metadata_val.csv",
    "C:/Users/admin/Documents/Speech-Emotion_Recognition-2/metadata/IEMOCAP_metadata_test.csv",
)

In [3]:
# English text encoder: tokenizer plus BERT weights, with the model placed on `device`.
tokenizer_eng = BertTokenizer.from_pretrained('bert-base-uncased')
text_model_eng = BertModel.from_pretrained('bert-base-uncased').to(device)

# Chinese text encoder, loaded the same way.
tokenizer_cmn = BertTokenizer.from_pretrained('bert-base-chinese')
text_model_cmn = BertModel.from_pretrained('bert-base-chinese').to(device)

TEXT_MAX_LENGTH = 100

# Audio encoders: SpeechBrain ECAPA classifier and wav2vec 2.0 (processor + model).
# Neither audio model is moved to `device` in this cell.
audio_model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
wav2vec_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

print("All loaded")


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder


Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All loaded


In [4]:
# Create training set for IEMOCAP
# For every row of the training metadata this stores:
#   - 'text_embed':  last-layer BERT hidden state of the first token ([CLS]) of 'raw_text'
#   - 'audio_embed': wav2vec 2.0 last hidden state averaged over time for 'audio_file'
#   - 'label':       the row's 'label' value as a tensor
# The resulting list of dicts is pickled to the features directory.
train_list = pd.read_csv(IEMOCAP_TRAIN_PATH)
train_pkl = []
target_rate = 16000  # sampling rate handed to wav2vec_processor
# Run wav2vec inputs on whatever device the model currently lives on.
wav2vec_device = next(wav2vec_model.parameters()).device
with torch.no_grad():
    for idx in range(len(train_list)):
        text = train_list['raw_text'][idx]
        # truncation=True keeps over-long transcripts within BERT's maximum
        # input length instead of crashing the whole extraction run.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True)
        text_token = text_token.to(device)
        text_outputs = text_model_eng(**text_token)
        text_embed = text_outputs.last_hidden_state[0, 0].cpu()

        audio_file = train_list['audio_file'][idx]
        audio_signal, sample_rate = torchaudio.load(audio_file, normalize=True)
        # Downmix to a 1-D mono waveform (identical to squeeze() for mono files).
        audio_signal = audio_signal.mean(dim=0)
        # Resample instead of silently assuming the file is already at target_rate.
        if sample_rate != target_rate:
            audio_signal = torchaudio.functional.resample(audio_signal, sample_rate, target_rate)

        inputs = wav2vec_processor(audio_signal.numpy(), return_tensors="pt", sampling_rate=target_rate)
        inputs = inputs.to(wav2vec_device)
        audio_outputs = wav2vec_model(**inputs)
        audio_embed = audio_outputs.last_hidden_state.mean(axis=1)[0].cpu()

        label = torch.tensor(train_list['label'][idx])
        train_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })

# The context manager closes the file even if pickling fails.
with open("C:/Users/admin/Documents/Speech-Emotion_Recognition-2/features/IEMOCAP_BERT_wav2vec_train.pkl", "wb") as train_file:
    pickle.dump(train_pkl, train_file)


In [5]:
# Create validation set for IEMOCAP
# For every row of the validation metadata this stores:
#   - 'text_embed':  last-layer BERT hidden state of the first token ([CLS]) of 'raw_text'
#   - 'audio_embed': wav2vec 2.0 last hidden state averaged over time for 'audio_file'
#   - 'label':       the row's 'label' value as a tensor
# The resulting list of dicts is pickled to the features directory.
val_list = pd.read_csv(IEMOCAP_VAL_PATH)
val_pkl = []
target_rate = 16000  # sampling rate handed to wav2vec_processor
# Run wav2vec inputs on whatever device the model currently lives on.
wav2vec_device = next(wav2vec_model.parameters()).device
with torch.no_grad():
    for idx in range(len(val_list)):
        text = val_list['raw_text'][idx]
        # truncation=True keeps over-long transcripts within BERT's maximum
        # input length instead of crashing the whole extraction run.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True)
        text_token = text_token.to(device)
        text_outputs = text_model_eng(**text_token)
        text_embed = text_outputs.last_hidden_state[0, 0].cpu()

        # Audio must come from the validation metadata so that text, audio and
        # label all describe the same utterance.
        audio_file = val_list['audio_file'][idx]
        audio_signal, sample_rate = torchaudio.load(audio_file, normalize=True)
        # Downmix to a 1-D mono waveform (identical to squeeze() for mono files).
        audio_signal = audio_signal.mean(dim=0)
        # Resample instead of silently assuming the file is already at target_rate.
        if sample_rate != target_rate:
            audio_signal = torchaudio.functional.resample(audio_signal, sample_rate, target_rate)

        inputs = wav2vec_processor(audio_signal.numpy(), return_tensors="pt", sampling_rate=target_rate)
        inputs = inputs.to(wav2vec_device)
        audio_outputs = wav2vec_model(**inputs)
        audio_embed = audio_outputs.last_hidden_state.mean(axis=1)[0].cpu()

        label = torch.tensor(val_list['label'][idx])
        val_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })

# The context manager closes the file even if pickling fails.
with open("C:/Users/admin/Documents/Speech-Emotion_Recognition-2/features/IEMOCAP_BERT_wav2vec_val.pkl", "wb") as val_file:
    pickle.dump(val_pkl, val_file)

In [6]:
# Create independent testing set for IEMOCAP
# For every row of the test metadata this stores:
#   - 'text_embed':  last-layer BERT hidden state of the first token ([CLS]) of 'raw_text'
#   - 'audio_embed': wav2vec 2.0 last hidden state averaged over time for 'audio_file'
#   - 'label':       the row's 'label' value as a tensor
# The resulting list of dicts is pickled to the features directory.
test_list = pd.read_csv(IEMOCAP_TEST_PATH)
test_pkl = []
target_rate = 16000  # sampling rate handed to wav2vec_processor
# Run wav2vec inputs on whatever device the model currently lives on.
wav2vec_device = next(wav2vec_model.parameters()).device
with torch.no_grad():
    for idx in range(len(test_list)):
        text = test_list['raw_text'][idx]
        # truncation=True keeps over-long transcripts within BERT's maximum
        # input length instead of crashing the whole extraction run.
        text_token = tokenizer_eng(text, return_tensors="pt", truncation=True)
        text_token = text_token.to(device)
        text_outputs = text_model_eng(**text_token)
        text_embed = text_outputs.last_hidden_state[0, 0].cpu()

        # Audio must come from the test metadata so that text, audio and label
        # all describe the same utterance.
        audio_file = test_list['audio_file'][idx]
        audio_signal, sample_rate = torchaudio.load(audio_file, normalize=True)
        # Downmix to a 1-D mono waveform (identical to squeeze() for mono files).
        audio_signal = audio_signal.mean(dim=0)
        # Resample instead of silently assuming the file is already at target_rate.
        if sample_rate != target_rate:
            audio_signal = torchaudio.functional.resample(audio_signal, sample_rate, target_rate)

        inputs = wav2vec_processor(audio_signal.numpy(), return_tensors="pt", sampling_rate=target_rate)
        inputs = inputs.to(wav2vec_device)
        audio_outputs = wav2vec_model(**inputs)
        audio_embed = audio_outputs.last_hidden_state.mean(axis=1)[0].cpu()

        label = torch.tensor(test_list['label'][idx])
        test_pkl.append({
            'text_embed': text_embed,
            'audio_embed': audio_embed,
            'label': label
        })

# The context manager closes the file even if pickling fails.
with open("C:/Users/admin/Documents/Speech-Emotion_Recognition-2/features/IEMOCAP_BERT_wav2vec_test.pkl", "wb") as test_file:
    pickle.dump(test_pkl, test_file)