# Dataset

In [1]:
%%capture
# Unpack the KSU Emotions archive into the current working directory.
# %%capture hides tar's verbose per-file listing from the cell output.
!tar xvf /content/drive/MyDrive/work/ksu_emotions_LDC2017S12.tar

In [2]:
import os
from os import walk

# Numeric emotion id (the last character of each emotion directory name) -> label.
emotions_map = {
    0:"Neutral",
    1:"Happiness",
    2:"Sadness",
    3:"Surprise",
    4:"Questioning",
    5:"Anger"
}


def collect_speech_records(speech_root, path_prefix="/content/"):
    """Walk ``speech_root`` and describe every ``.flac`` file found below it.

    The phase number is the last character of the grandparent directory name
    and the emotion id is the last character of the parent directory name.

    Args:
        speech_root: Directory to search recursively.
        path_prefix: Text removed from each directory path before it is
            stored, so the recorded file paths are relative to it.

    Returns:
        A tuple ``(ids, files, emotion_ids, emotion, phases)`` of parallel lists.

    Raises:
        ValueError: If a directory holding ``.flac`` files does not end in a digit.
        KeyError: If the parsed emotion id is missing from ``emotions_map``.
    """
    ids, files, emotion_ids, emotion, phases = [], [], [], [], []
    for dirpath, dirnames, filenames in walk(speech_root):
        # Sorting in place makes walk() descend in a reproducible order.
        dirnames.sort()
        for filename in sorted(filenames):
            stem, extension = os.path.splitext(filename)
            # Skip anything that is not audio (README files, .DS_Store, ...),
            # which would otherwise break the directory-name parsing below.
            if extension.lower() != ".flac":
                continue
            path_splits = dirpath.split("/")
            phase = int(path_splits[-2][-1])
            emotion_id = int(path_splits[-1][-1])
            ids.append(stem)
            files.append(dirpath.replace(path_prefix, "") + "/" + filename)
            emotion.append(emotions_map[emotion_id])
            emotion_ids.append(emotion_id)
            phases.append(phase)
    return ids, files, emotion_ids, emotion, phases


ids, files, emotion_ids, emotion, phases = collect_speech_records(
    "/content/ksu_emotions/data/SPEECH"
)



In [3]:
import pandas as pd


# Wrap each collected list in a pandas Series, rebinding the same five names.
ids, files, emotion_ids, emotion, phases = (
    pd.Series(values) for values in (ids, files, emotion_ids, emotion, phases)
)

# Column name -> Series, listed in the order the columns should appear.
frame = {
    'id': ids,
    'file': files,
    "emotion_id": emotion_ids,
    "emotion": emotion,
    "phase": phases,
}

# Assemble the metadata table and give it a fresh 0..n-1 index in one step.
ksu_emotions_dataset = pd.DataFrame(frame).reset_index(drop=True)

# Uncomment to preview the first rows:
# print(ksu_emotions_dataset.head())

In [4]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


In [5]:
# Persist the metadata table. The row index is written as the first (unnamed)
# CSV column, so "file" and "emotion_id" end up at column positions 2 and 3.
ksu_emotions_dataset.to_csv("ksu_emotions_metadata.csv", index=True)

In [6]:
import os
import pandas as pd
# from torchvision.io import read_image
from torch.utils.data import Dataset
from transformers import AutoFeatureExtractor, AutoModel, HubertModel, Wav2Vec2FeatureExtractor
import soundfile as sf


class KSUEMotionsDataset(Dataset):
    """Map-style dataset yielding ``(preprocessed_speech, label)`` pairs.

    Each item is read from the audio file named in the metadata CSV, passed
    through a pretrained feature extractor with fixed-length padding, and
    paired with the row's integer emotion id.

    Args:
        ksu_emotions_metadata_path: CSV file with at least the columns
            ``file`` (audio path relative to ``data_root``) and ``emotion_id``.
        model_path: Name or path given to ``AutoFeatureExtractor.from_pretrained``.
        transform: Optional callable applied to the preprocessed speech tensor.
        target_transform: Optional callable applied to the label.
        data_root: Directory the ``file`` column is relative to.
        max_length: Number of samples every clip is padded or truncated to.
        sampling_rate: Sampling rate reported to the feature extractor.
    """

    def __init__(self, ksu_emotions_metadata_path, model_path, transform=None, target_transform=None,
                 data_root="/content", max_length=250000, sampling_rate=16000):
        self.metadata = pd.read_csv(ksu_emotions_metadata_path)
        self.processor = AutoFeatureExtractor.from_pretrained(model_path)
        self.transform = transform
        self.target_transform = target_transform
        self.data_root = data_root
        self.max_length = max_length
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load, preprocess and label the clip at row position ``idx``.

        Returns:
            A ``(preprocessed_speech, label)`` tuple. The speech tensor is moved
            to the notebook-level ``device``; presumably it has shape
            ``(1, max_length)`` - TODO confirm against the feature extractor.
        """
        # Look columns up by name so the result does not depend on whether the
        # CSV was written with or without a leading index column.
        audio_path = os.path.join(self.data_root, self.metadata["file"].iloc[idx])
        # NOTE(review): the file's own sample rate is not checked against
        # self.sampling_rate; a mismatching file is processed without complaint.
        speech, _file_samplerate = sf.read(audio_path)

        # truncation=True keeps every item at exactly max_length samples, so
        # clips longer than max_length can still be stacked into a batch.
        preprocessed_speech = self.processor(
            speech,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            sampling_rate=self.sampling_rate,
        ).input_values.to(device)

        label = self.metadata["emotion_id"].iloc[idx]

        if self.transform is not None:
            preprocessed_speech = self.transform(preprocessed_speech)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return preprocessed_speech, label

In [7]:
# Build the dataset from the saved metadata CSV, using the HuBERT base
# checkpoint's feature extractor for preprocessing.
dataset = KSUEMotionsDataset(
    ksu_emotions_metadata_path="/content/ksu_emotions_metadata.csv",
    model_path="facebook/hubert-base-ls960",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [8]:
from torch.utils.data import DataLoader

# Serve the whole dataset in shuffled mini-batches of four clips.
allData_dataloader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
)

# Model Creation

In [9]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, AutoModel, HubertModel, Wav2Vec2FeatureExtractor
import soundfile as sf

In [10]:
# Classification Head
class ClassificationHead(nn.Module):
    """Two-layer classifier: Linear -> tanh -> Linear.

    Args:
        hidden_size: Width of the incoming features and of the hidden layer.
        num_labels: Number of output logits.
    """

    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.linearLayer = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, num_labels)

    def forward(self, features):
        """Map ``features`` to unnormalised class scores."""
        activated = torch.tanh(self.linearLayer(features))
        return self.out_proj(activated)

In [11]:
# Model class
class SER_model(nn.Module):
    """Speech-emotion classifier: pretrained backbone -> time pooling -> head.

    Args:
        model_path: Name or path given to ``AutoModel.from_pretrained``.
        pooling_mode: How frame features are merged over dim 1; one of
            ``"mean"``, ``"sum"`` or ``"max"``.
        num_output_labels: Number of logits produced by the classification head.
    """

    def __init__(self, model_path="facebook/hubert-base-ls960", pooling_mode="mean", num_output_labels=6):
        super().__init__()
        self.SSLModel = AutoModel.from_pretrained(model_path)
        # Size the head from the loaded backbone rather than assuming 768, so
        # checkpoints with a different width work too. 768 stays as the
        # fallback for configs that do not expose ``hidden_size``.
        hidden_size = getattr(self.SSLModel.config, "hidden_size", 768)
        self.classificationHead = ClassificationHead(hidden_size, num_output_labels)
        self.pooling_mode = pooling_mode

    def freeze_feature_extractor(self):
        """Freeze the backbone's ``feature_extractor`` sub-module parameters."""
        self.SSLModel.feature_extractor._freeze_parameters()

    def merged_strategy(self, output_features, mode="mean"):
        """Reduce ``output_features`` over dim 1 with the chosen pooling.

        Raises:
            ValueError: If ``mode`` is not ``"mean"``, ``"sum"`` or ``"max"``.
        """
        if mode == "mean":
            outputs = torch.mean(output_features, dim=1)
        elif mode == "sum":
            outputs = torch.sum(output_features, dim=1)
        elif mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            outputs = torch.max(output_features, dim=1)[0]
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        output_features = self.SSLModel(x).last_hidden_state
        hidden_states = self.merged_strategy(output_features, mode=self.pooling_mode)
        logits = self.classificationHead(hidden_states)

        return logits


In [12]:
# Instantiate the classifier with its defaults, move it to the selected
# device, then freeze the backbone's feature extractor.
model = SER_model()
model = model.to(device)
model.freeze_feature_extractor()

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_v', 'encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [13]:
# Test model
# Smoke test: push one dataset item through the model and show the logits' shape.
sample = dataset[1]
preprocessed_speech_sample, label = sample
output = model(preprocessed_speech_sample)
print(output.shape)

torch.Size([1, 6])


# Training

In [14]:
# Optimiser step size and number of passes over the training data.
learning_rate, epochs = 1e-3, 5

In [None]:
# Cross-entropy over the class logits, optimised with plain SGD.
loss_fn = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

size = len(allData_dataloader.dataset)

model.train()
# Run the configured number of passes instead of a single pass over the data.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch, (batch_input, batch_labels) in enumerate(allData_dataloader):
        # Each dataset item carries a singleton dim at position 1; drop it so
        # the model receives one row of samples per clip.
        batch_input = torch.squeeze(batch_input, 1).to(device)
        # Labels are collated on the CPU; they must live on the same device as
        # the predictions or the loss computation fails on a GPU.
        batch_labels = batch_labels.to(device)

        pred = model(batch_input)
        loss = loss_fn(pred, batch_labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            # Separate name so the loss tensor is not rebound to a float.
            loss_value, current = loss.item(), (batch + 1) * len(batch_input)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")


