In [2]:
from pathlib import Path

from datasets import load_dataset, Audio
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the train / validation / test splits of the 'clean' configuration,
# in that order, from the People's Speech dataset.
train_ds, val_ds, test_ds = [
  load_dataset("MLCommons/peoples_speech", 'clean', split=split_name)
  for split_name in ('train', 'validation', 'test')
]

In [33]:
# Pull only the first training example to inspect the row structure.
first_example = next(iter(train_ds))
print(first_example)

{'id': '07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac', 'audio': {'path': '07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac', 'array': array([ 0.14205933,  0.20620728,  0.27151489, ...,  0.00402832,
       -0.00628662, -0.01422119]), 'sampling_rate': 16000}, 'duration_ms': 14920, 'text': "i wanted this to share a few things but i'm going to not share as much as i wanted to share because we are starting late i'd like to get this thing going so we all get home at a decent hour this this election is very important to"}


We observe the structure of the Dataset by printing this first row. 

The dataset should be already clean, but let's see if we can find something that is out of place.


The next step is to handle audio clips of different lengths. To do that, we'll:

1. pad: if the audio is shorter than 14 seconds

2. truncate: if the audio is longer than 14 seconds.

In [None]:
from typing import Any
from torch.utils.data import Dataset, DataLoader
import librosa


class Transform:
  """Callable preprocessing step that turns a raw audio sample into a mel spectrogram.

  Each sample is padded or truncated to a fixed duration, resampled to
  ``target_sr``, converted to a dB-scaled mel spectrogram, and finally reduced
  to its ``text`` and ``spectrogram`` fields.

  NOTE(review): the default ``target_sr`` of 1600 Hz places the default
  ``fmax`` of 8000 Hz far above the Nyquist frequency (800 Hz); 16000 was
  presumably intended. The default is left unchanged to keep the signature
  stable -- confirm, and pass ``target_sr`` explicitly in the meantime.
  """

  def __init__(
      self,
      n_mels: int,
      fmax: int = 8000,
      max_length_in_s: int = 14,
      target_sr: int = 1600
  ) -> None:
    """Store the preprocessing parameters.

    Args:
      n_mels: Number of mel bands of the generated spectrogram.
      fmax: Highest frequency (Hz) passed to the mel filter bank.
      max_length_in_s: Fixed clip duration in seconds; shorter clips are
        zero-padded and longer clips are truncated.
      target_sr: Sampling rate (Hz) the audio is resampled to.
    """
    self.max_length_in_s = max_length_in_s
    self.target_sr = target_sr
    self.n_mels = n_mels
    self.fmax = fmax

  def __call__(self, sample: dict[str, Any]) -> dict[str, Any]:
    """Run the full pipeline on one sample and return only text + spectrogram."""
    sample = self.pad_or_truncate(sample)
    sample = self.resample_sr(sample)
    sample['spectrogram'] = self.generate_mel_spectrogram(sample)
    sample = self.keep_necessary_columns(sample)

    return sample

  def pad_or_truncate(self, sample: dict[str, Any]) -> dict[str, Any]:
    """Force the waveform to exactly ``max_length_in_s`` seconds.

    The target length is computed from the sample's own sampling rate, so the
    clip duration is correct for inputs that are not 16 kHz. Shorter clips are
    right-padded with zeros; longer clips keep their first samples.

    The sample is modified in place and also returned. ``duration_ms`` is
    updated to the resulting duration.
    """
    sampling_rate = sample['audio']['sampling_rate']
    audio = np.asarray(sample['audio']['array'])

    # int() keeps the length usable as a slice bound even if the duration is
    # given as a float.
    max_length_per_sample = int(self.max_length_in_s * sampling_rate)

    audio_length = len(audio)

    if audio_length > max_length_per_sample:
      audio = audio[:max_length_per_sample]

    elif audio_length < max_length_per_sample:
      # pad_width is (before, after): append the missing samples as zeros.
      missing = max_length_per_sample - audio_length
      audio = np.pad(audio, (0, missing), mode='constant')

    sample['audio']['array'] = audio
    sample['duration_ms'] = (len(audio) / sampling_rate) * 1000

    return sample

  def resample_sr(self, sample: dict[str, Any]) -> dict[str, Any]:
    """Resample the waveform to ``target_sr`` and record the new rate.

    The sample is modified in place and also returned. The rate is written
    both next to the waveform (``sample['audio']['sampling_rate']``) so the
    audio dict stays self-consistent, and to the top-level ``sampling_rate``
    key that this method has always set.
    """
    frequency = sample['audio']['sampling_rate']
    audio = sample['audio']['array']

    # Nothing to resample when the rates already match.
    if frequency != self.target_sr:
      audio = librosa.resample(audio, orig_sr=frequency, target_sr=self.target_sr)

    sample['audio']['array'] = audio
    sample['audio']['sampling_rate'] = self.target_sr
    sample['sampling_rate'] = self.target_sr

    return sample

  def generate_mel_spectrogram(self, sample: dict[str, Any]) -> np.ndarray:
    """Return the mel spectrogram of the sample's waveform, converted to dB.

    The power spectrogram is computed with ``n_mels`` bands up to ``fmax`` at
    ``target_sr`` and converted to dB using the spectrogram's maximum as the
    reference.
    """
    audio = sample['audio']['array']
    spectro = librosa.feature.melspectrogram(
      y=audio,
      sr=self.target_sr,
      n_mels=self.n_mels,
      fmax=self.fmax
    )

    db_spectro = librosa.power_to_db(spectro, ref=np.max)

    return db_spectro

  def keep_necessary_columns(self, sample: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict holding only the ``text`` and ``spectrogram`` fields."""
    formatted_sample: dict[str, Any] = {}

    formatted_sample['text'] = sample['text']
    formatted_sample['spectrogram'] = sample['spectrogram']

    return formatted_sample

In [15]:
from sentence_transformers import (
  SentenceTransformer,
  SentenceTransformerTrainingArguments,
  SentenceTransformerModelCardData,
  SentenceTransformerTrainer
)
from sentence_transformers.losses import (
  MultipleNegativesRankingLoss,
  MatryoshkaLoss
)
from sentence_transformers.evaluation import (
  NanoBEIREvaluator
)
from sentence_transformers.training_args import BatchSamplers

In [16]:
# Start from the pretrained CLIP ViT-B/16 checkpoint and attach model-card
# metadata. The metadata uses a language code ('en') and a lowercase licence
# identifier ('mit'), the forms the model-card documentation gives as examples,
# and is passed by keyword so it does not depend on positional field order.
model = SentenceTransformer(
  "sentence-transformers/clip-ViT-B-16",
  model_card_data=SentenceTransformerModelCardData(
    language='en',
    license='mit',
    model_name='One4All',
  )
)

In [17]:
# Training configuration: one epoch, batches of 8, a checkpoint every 500 steps.
training_args = SentenceTransformerTrainingArguments(
  output_dir='./OneClip4All/',
  num_train_epochs=1,
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  optim='adamw_torch',
  seed=42,
  dataloader_num_workers=4,
  # eval_steps is only honoured when the evaluation strategy is 'steps';
  # without it no periodic evaluation is scheduled.
  eval_strategy='steps',
  eval_steps=500,
  save_steps=500,
  batch_sampler=BatchSamplers.NO_DUPLICATES
)

# Base ranking loss, wrapped so it is also applied at each of the listed
# truncated embedding sizes.
info_NCE_loss = MultipleNegativesRankingLoss(
  model
)
matryoska_loss = MatryoshkaLoss(
  model=model,
  loss=info_NCE_loss,
  matryoshka_dims=[512, 256, 128, 64]
)

In [18]:
# One evaluator per Matryoshka dimension. Each one truncates the embeddings to
# its own size, so the evaluators measure different dimensions instead of
# being four identical copies.
evaluators = [
  NanoBEIREvaluator(truncate_dim=dim)
  for dim in matryoska_loss.matryoshka_dims
]


[A
[A
[A
[A

Generating train split: 100%|██████████| 123/123 [00:00<00:00, 77068.93 examples/s]

[A

Generating train split: 100%|██████████| 5090/5090 [00:00<00:00, 657155.40 examples/s]


Generating train split: 100%|██████████| 50/50 [00:00<00:00, 30908.65 examples/s]


Generating train split: 100%|██████████| 100/100 [00:00<00:00, 47820.13 examples/s]

[A

Generating train split: 100%|██████████| 5043/5043 [00:00<00:00, 783025.77 examples/s]


Generating train split: 100%|██████████| 50/50 [00:00<00:00, 33303.99 examples/s]


Generating train split: 100%|██████████| 50/50 [00:00<00:00, 32676.10 examples/s]

[A

Generating train split: 100%|██████████| 2953/2953 [00:00<00:00, 196157.55 examples/s]


Generating train split: 100%|██████████| 50/50 [00:00<00:00, 34738.31 examples/s]


Generating train split: 100%|██████████| 2518/2518 [00:00<00:00, 1220954.62 examples/s]

[A

Generating train split: 100%|██████████| 5035/5035 [00:00<00:00, 622996.07 examples/s]


Generating t

In [None]:
# The dataset is already sampled at 16 kHz (see the first row printed earlier),
# and Transform's default fmax of 8000 Hz is the Nyquist frequency at that
# rate, so keep the native rate rather than Transform's 1600 Hz default.
transform = Transform(40, target_sr=16000)

# Dataset.map merges the returned dict into the existing row, so the raw
# columns have to be removed explicitly for only 'text' and 'spectrogram' to
# remain. Columns returned by the transform are kept even if listed here.
# NOTE: this cell replaces the raw splits; reload them before re-running it.
train_ds = train_ds.map(transform, remove_columns=train_ds.column_names)
val_ds = val_ds.map(transform, remove_columns=val_ds.column_names)
test_ds = test_ds.map(transform, remove_columns=test_ds.column_names)

In [None]:
# Wire the model, its training arguments, the loss, the data splits and the
# evaluators into a single trainer object.
trainer = SentenceTransformerTrainer(
  model=model,
  args=training_args,
  loss=matryoska_loss,
  train_dataset=train_ds,
  eval_dataset=val_ds,
  evaluator=evaluators,
)

NameError: name 'train_ds_low_nmel' is not defined

In [None]:
# Resume only when an earlier run left a checkpoint directory behind. With
# resume_from_checkpoint=True and no checkpoint available, training fails
# instead of starting from scratch, which breaks a fresh run of the notebook.
checkpoint_root = Path(training_args.output_dir)
has_checkpoint = checkpoint_root.is_dir() and any(
  child.is_dir() for child in checkpoint_root.glob('checkpoint-*')
)

trainer.train(
  resume_from_checkpoint=has_checkpoint,
)

# Upload the trained model to the Hugging Face Hub.
model.push_to_hub(
  'OneClip4All'
)