# Dataset

In [None]:
%%capture
# Install the wandb client and extract the UASpeech tar archive; %%capture suppresses this cell's output.
!pip install wandb
!tar -xvf /content/drive/MyDrive/work/Dysarthria_VIVO_system/data/UASpeech.tar

In [None]:
import os
from os import walk

import librosa
import pandas as pd
import soundfile as sf
import torch
import wandb
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoFeatureExtractor, AutoModel, HubertModel, Wav2Vec2FeatureExtractor

In [None]:
# setup wandb
# The API key is read from the WANDB_API_KEY environment variable so that no
# credential is stored in the notebook. Any key that was previously committed
# in this cell must be treated as leaked and revoked in the wandb settings.
# When the variable is unset, key=None is passed and wandb is left to resolve
# credentials itself (stored login or interactive prompt).
wandb_API_key = os.environ.get("WANDB_API_KEY")
wandb.login(key = wandb_API_key)

[34m[1mwandb[0m: Currently logged in as: [33mmukhtaralgezoli[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# choose the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
class UASpeechDataset(Dataset):
    """Map-style dataset yielding ``(processed waveform, intelligibility label)`` pairs.

    The metadata CSV must contain a ``path`` column (audio file location) and
    an ``Intelligibility_Label_id`` column (class id); one row is one sample.

    Args:
        UASpeech_metadata_path: CSV file describing the utterances of a split.
        model_path: Checkpoint name/path handed to
            ``AutoFeatureExtractor.from_pretrained``.
        transform: Optional callable applied to the processed waveform tensor.
        target_transform: Optional callable applied to the label.
        max_length: Number of samples every waveform is cropped / padded to.
        sampling_rate: Rate (Hz) announced to the feature extractor; audio
            stored at a different rate is resampled to it first.
    """

    def __init__(self, UASpeech_metadata_path, model_path, transform=None, target_transform=None,
                 max_length=250000, sampling_rate=16000):
        self.metadata = pd.read_csv(UASpeech_metadata_path)
        self.processor = AutoFeatureExtractor.from_pretrained(model_path)
        # These two hooks used to be accepted and silently dropped.
        self.transform = transform
        self.target_transform = target_transform
        self.max_length = max_length
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load, normalise and label the sample at position ``idx``.

        Returns:
            ``(preprocessed_speech, label)`` where ``preprocessed_speech`` is
            the ``input_values`` tensor produced by the feature extractor and
            ``label`` is the value of ``Intelligibility_Label_id`` for the row.
        """
        # Positional access: stays a scalar even if index labels repeat,
        # whereas a label-based .loc lookup would return a Series.
        audio_path = self.metadata["path"].iloc[idx]
        label = self.metadata["Intelligibility_Label_id"].iloc[idx]

        speech, samplerate = sf.read(audio_path)

        # soundfile returns (frames, channels) for multi-channel files;
        # the feature extractor expects a single 1-D waveform.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)

        # The extractor is told `sampling_rate` below, so the samples must
        # really be at that rate or the model is fed time-stretched audio.
        if samplerate != self.sampling_rate:
            speech = librosa.resample(speech, orig_sr=samplerate, target_sr=self.sampling_rate)

        # Crop long recordings; shorter ones are padded by the processor.
        speech = speech[:self.max_length]

        preprocessed_speech = self.processor(
            speech,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
            sampling_rate=self.sampling_rate,
        ).input_values

        if self.transform is not None:
            preprocessed_speech = self.transform(preprocessed_speech)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return preprocessed_speech, label

In [None]:
# Build the three splits in one pass; every split uses the same HuBERT feature extractor.
train_set, test_set, val_set = (
    UASpeechDataset(metadata_csv, "facebook/hubert-base-ls960")
    for metadata_csv in (
        "/content/drive/MyDrive/work/Dysarthria_VIVO_system/data/train_df_4labels.csv",
        "/content/drive/MyDrive/work/Dysarthria_VIVO_system/data/test_df_4labels.csv",
        "/content/drive/MyDrive/work/Dysarthria_VIVO_system/data/val_df_4labels.csv",
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Mini-batch size shared by the train / test / val DataLoaders.
batch_size = 8

In [None]:
from torch.utils.data import DataLoader

# Only the training split is reshuffled every epoch. The evaluation splits are
# read in a fixed order so that repeated evaluations see identical batches.
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

# Model Creation

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, AutoModel, HubertModel, Wav2Vec2FeatureExtractor
import soundfile as sf

In [None]:
# Classification Head
class ClassificationHead(nn.Module):
    """Two-layer head: linear -> tanh -> linear projection to class scores.

    Args:
        hidden_size: Width of the incoming feature vectors (and of the hidden layer).
        num_labels: Number of output scores.
    """

    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.linearLayer = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.out_proj = nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, features):
        """Return the unnormalised class scores for ``features``."""
        activated = torch.tanh(self.linearLayer(features))
        return self.out_proj(activated)

In [None]:
# Model class
class Dysarthria_model(nn.Module):
    """Pretrained speech encoder, pooling over dim 1, then a classification head.

    Args:
        model_path: Checkpoint name/path passed to ``AutoModel.from_pretrained``.
        pooling_mode: How the encoder's ``last_hidden_state`` is collapsed
            along dim 1: ``"mean"``, ``"sum"`` or ``"max"``.
        num_output_labels: Number of scores produced by the head.
    """

    def __init__(self, model_path = "facebook/hubert-base-ls960", pooling_mode = "mean", num_output_labels = 4):
        super().__init__()
        # self.processor = AutoFeatureExtractor.from_pretrained(model_path)
        self.SSLModel = AutoModel.from_pretrained(model_path)
        # Take the head width from the loaded checkpoint so encoders with a
        # different hidden size work too; 768 (the previous hard-coded value)
        # is kept as the fallback when the config does not expose it.
        hidden_size = getattr(self.SSLModel.config, "hidden_size", 768)
        self.classificationHead = ClassificationHead(hidden_size, num_output_labels)
        self.pooling_mode = pooling_mode

    def freeze_feature_extractor(self):
        """Call ``_freeze_parameters()`` on the encoder's ``feature_extractor`` sub-module."""
        # NOTE(review): relies on a private method of the encoder; presumably it
        # disables gradient updates for the feature extractor — confirm.
        self.SSLModel.feature_extractor._freeze_parameters()

    def merged_strategy(self, output_features, mode="mean"):
        """Reduce ``output_features`` along dim 1 with the requested pooling.

        Raises:
            ValueError: If ``mode`` is not ``"mean"``, ``"sum"`` or ``"max"``.
        """
        if mode == "mean":
            outputs = torch.mean(output_features, dim=1)
        elif mode == "sum":
            outputs = torch.sum(output_features, dim=1)
        elif mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            outputs = torch.max(output_features, dim=1)[0]
        else:
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(self, x):
        """Encode ``x``, pool the encoder output and return the head's logits."""
        output_features = self.SSLModel(x).last_hidden_state
        hidden_states = self.merged_strategy(output_features, mode=self.pooling_mode)
        logits = self.classificationHead(hidden_states)

        return logits


In [None]:
# Build the classifier, move it to the selected device, then freeze its feature extractor.
model = Dysarthria_model(model_path="facebook/hubert-base-ls960")
model = model.to(device)
model.freeze_feature_extractor()

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
# checkpoint = torch.load("/content/drive/MyDrive/work/Emotions_SER_project/Saved Models/EXP4/Hubert/Checkpoint_Epoch23.pt")
# model.load_state_dict(checkpoint['model_state_dict'])
# # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# # epoch = checkpoint['epoch']
# # loss = checkpoint['loss']

In [None]:
# Test model
# Smoke-test a single forward pass. no_grad stops autograd from recording a
# graph for this throwaway call, so nothing extra is held in memory.
preprocessed_speech_sample, label = train_set[1]
with torch.no_grad():
    output = model(preprocessed_speech_sample.to(device))
print(output.shape)

torch.Size([1, 4])


# Training

In [None]:
# Step size passed to the optimizer and recorded in the wandb run config.
learning_rate = 1e-3

In [None]:
# open a fresh wandb run so this training session is tracked
wandb.init(
    project="Dysarthria_classification",  # wandb project that receives the run
    # id="pwim79rh",
    # resume="must",
    name="test-run",
    # hyperparameters and run metadata stored alongside the run
    config=dict(
        learning_rate=learning_rate,
        architecture="Hubert for classification",
        dataset="UASpeech",
        epochs=40,
    ),
)

In [None]:
# Remember the identifiers of the active wandb run; they are stored in every checkpoint.
run_id, run_name = wandb.run.id, wandb.run.name
print(run_id, run_name, sep="\n")

qehrpyfo
test-run


In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation epoch over ``dataloader`` and log its metrics to wandb.

    Args:
        dataloader: Yields ``(batch_input, batch_labels)`` pairs; dim 1 of
            ``batch_input`` is squeezed away before the forward pass.
        model: Module called on the squeezed batch to obtain class scores.
        loss_fn: Callable ``loss_fn(pred, labels)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(train_loss, train_accuracy)``: the mean of the per-batch losses and
        the percentage of samples whose arg-max prediction equals the label.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    model.train()
    train_loss, correct, seen = 0, 0, 0

    for batch, (batch_input, batch_labels) in enumerate(dataloader):
        batch_input = torch.squeeze(batch_input, 1).to(device)
        # Move the labels once; they are needed for both the loss and the accuracy.
        batch_labels = batch_labels.to(device)

        pred = model(batch_input)
        loss = loss_fn(pred, batch_labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Reuse the loss that was already computed instead of evaluating it again.
        batch_loss = loss.item()
        train_loss += batch_loss
        correct += (pred.argmax(1) == batch_labels).type(torch.float).sum().item()
        # Running sample count stays correct even when the last batch is short.
        seen += len(batch_input)

        if batch % 5 == 0:
            print(f"loss: {batch_loss:>7f}  [{seen:>5d}/{size:>5d}]")

    train_loss /= num_batches
    correct /= size

    # One log call so both metrics are recorded at the same wandb step.
    wandb.log({"train loss": train_loss, "train accuracy": 100*correct})
    return train_loss, 100*correct


def test_loop(dataloader, model, loss_fn):

    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0


    with torch.no_grad():
        for batch_input, batch_labels in dataloader:
            batch_input = torch.squeeze(batch_input, 1)
            pred = model(batch_input.to(device))
            test_loss += loss_fn(pred, batch_labels.to(device)).item()
            correct += (pred.argmax(1) == batch_labels.to(device)).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size

    wandb.log({"val loss": test_loss})
    wandb.log({"val accuracy": 100*correct})

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss, 100*correct


In [None]:
loss_fn = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

epochs = 40

# torch.save does not create missing folders. Create the checkpoint directory
# up front so a missing folder cannot abort the run after the first epoch.
checkpoint_dir = "/content/drive/MyDrive/work/Dysarthria_VIVO_system/Saved Models/test_run/Hubert"
os.makedirs(checkpoint_dir, exist_ok=True)

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss, train_acc = train_loop(train_dataloader, model, loss_fn, optimizer)
    val_loss, val_acc = test_loop(val_dataloader, model, loss_fn)
    # One checkpoint per epoch: weights, optimizer state, the epoch's metrics
    # and the wandb run identifiers.
    torch.save({
            'epoch': t,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train loss': train_loss,
            'val loss': val_loss,
            'train accuracy': train_acc,
            'val accuracy': val_acc,
            "run Id":run_id,
            "run name":run_name,
            }, f"{checkpoint_dir}/Checkpoint_4labels_Epoch{t}.pt")

print("Done!")

Epoch 1
-------------------------------
loss: 1.387804  [    8/ 7105]
loss: 1.406658  [   48/ 7105]
loss: 1.404901  [   88/ 7105]
loss: 1.397587  [  128/ 7105]
loss: 1.357024  [  168/ 7105]
loss: 1.322815  [  208/ 7105]
loss: 1.370412  [  248/ 7105]
loss: 1.346765  [  288/ 7105]
loss: 1.302205  [  328/ 7105]
loss: 1.279243  [  368/ 7105]
loss: 1.331513  [  408/ 7105]
loss: 1.419997  [  448/ 7105]
loss: 1.382215  [  488/ 7105]
loss: 1.274992  [  528/ 7105]
loss: 1.351652  [  568/ 7105]
loss: 1.361107  [  608/ 7105]
loss: 1.221396  [  648/ 7105]
loss: 1.353175  [  688/ 7105]
loss: 1.252706  [  728/ 7105]
loss: 1.260732  [  768/ 7105]
loss: 1.293949  [  808/ 7105]
loss: 1.268990  [  848/ 7105]
loss: 1.203893  [  888/ 7105]
loss: 1.354618  [  928/ 7105]
loss: 1.245376  [  968/ 7105]
loss: 1.439895  [ 1008/ 7105]
loss: 1.456529  [ 1048/ 7105]
loss: 1.309897  [ 1088/ 7105]
loss: 1.466358  [ 1128/ 7105]
loss: 1.449846  [ 1168/ 7105]
loss: 1.347154  [ 1208/ 7105]
loss: 1.318473  [ 1248/ 7105]


In [None]:
# Score the current model on the test split.
# NOTE(review): test_loop logs to wandb under the "val loss" / "val accuracy"
# keys, so these test metrics land in the validation charts — confirm intended.
test_loss, test_acc = test_loop(test_dataloader, model, loss_fn)

In [None]:
# Score the current model on the validation split.
val_loss, val_acc = test_loop(val_dataloader, model, loss_fn)