In [1]:
!yum install -y python3-devel libsndfile-devel ffmpeg
!pip install librosa jiwer sndfile torchaudio

Loaded plugins: fastestmirror, ovl
Loading mirror speeds from cached hostfile
 * base: mirror.keystealth.org
 * centos-sclo-rh: centos.mirror.shastacoe.net
 * centos-sclo-sclo: mirrors.ocf.berkeley.edu
 * epel: mirror.prgmr.com
 * extras: mirrors.ocf.berkeley.edu
 * updates: mirror.keystealth.org
Package python3-devel-3.6.8-18.el7.x86_64 already installed and latest version
Package libsndfile-devel-1.0.25-12.el7_9.1.x86_64 already installed and latest version
Package ffmpeg-3.2.4-1.el7.centos.x86_64 already installed and latest version
Nothing to do


In [2]:
from datasets import load_dataset, load_metric, Dataset
from pathlib import Path
from glob import glob
import IPython
import librosa
import torchaudio
from tqdm.notebook import tqdm
import numpy as np

In [3]:
#all_recitations = [Path(f).name for f in sorted(glob("./poetaexmachina-mp3-recitations/txt/*"))]

In [4]:
#txt_recitations = [Path(f"./poetaexmachina-mp3-recitations/txt/{f}").read_bytes() for f in all_recitations]
#mp3_recitations = [Path(f"./poetaexmachina-mp3-recitations/mp3/{f}").read_bytes() for f in all_recitations]

In [5]:
# Play one source recording at its native sample rate. The rate is taken from
# the decoded file instead of being hard-coded to 22050, so playback speed
# stays correct even if a file was encoded at a different rate.
sample_waveform, sample_rate_hz = torchaudio.load(
    "./poetaexmachina-mp3-recitations/mp3/2000000", format="mp3"
)
IPython.display.Audio(data=sample_waveform.numpy(), rate=sample_rate_hz)

In [6]:
# Shared resampler used throughout the notebook: 22.05 kHz in, 16 kHz out.
r22_16 = torchaudio.transforms.Resample(orig_freq=22050, new_freq=16000)

# Sanity check: resample one recording and listen to it at the new rate.
sample_16k = r22_16(
    torchaudio.load("./poetaexmachina-mp3-recitations/mp3/2000000", format="mp3")[0]
)
IPython.display.Audio(data=sample_16k.numpy(), rate=16000)

In [7]:
import re
def lowerjv(s):
    """Normalize a transcript to lowercase letters with j->i and v->u folded.

    The text is lowercased, every "j" becomes "i" and every "v" becomes "u",
    any character other than a-z, space or newline is replaced by a space,
    runs of spaces are collapsed to one, and leading/trailing whitespace is
    stripped. Newlines inside the text are preserved.
    """
    folded = s.lower().replace("j", "i").replace("v", "u")
    letters_only = re.sub("[^a-z \n]", " ", folded)
    single_spaced = re.sub(" +", " ", letters_only)
    return single_spaced.strip()

In [8]:
# Pick the recitation ids: text files whose name starts with "2" and whose
# content begins with a lowercase ASCII letter.
hex_recitations = []
for txt_path in sorted(glob("./poetaexmachina-mp3-recitations/txt/2*")):
    if re.match("[a-z]", Path(txt_path).read_text()):
        hex_recitations.append(Path(txt_path).name)

# Normalized transcript for every selected recitation.
txt_recitations = []
for recitation_id in tqdm(hex_recitations):
    raw_text = Path("./poetaexmachina-mp3-recitations/txt/" + recitation_id).read_text()
    txt_recitations.append(lowerjv(raw_text))

# Matching audio, decoded from mp3 and passed through the 22.05 kHz -> 16 kHz resampler.
mp3_recitations = []
for recitation_id in tqdm(hex_recitations):
    decoded = torchaudio.load("./poetaexmachina-mp3-recitations/mp3/" + recitation_id, format="mp3")[0]
    mp3_recitations.append(r22_16(decoded))

  0%|          | 0/11931 [00:00<?, ?it/s]

  0%|          | 0/11931 [00:00<?, ?it/s]

In [9]:
# Spot-check one normalized transcript.
txt_recitations[10000]

'sub laqueare domus animum non angit auarum'

In [10]:
# Listen to the resampled audio for the same index as the transcript spot-check.
IPython.display.Audio(data=mp3_recitations[10000], rate=16000)

In [11]:
import json

# Assign an id to every distinct character of the corpus, in sorted order.
# Joining with " " guarantees the space character is part of the inventory.
corpus_chars = sorted(set(" ".join(txt_recitations)))
vocab_dict = {char: index for index, char in enumerate(corpus_chars)}
print(vocab_dict)

# The tokenizer is configured with "|" as its word delimiter, so the space
# entry is re-keyed to "|" (keeping its id) before the special tokens are added.
vocab_dict["|"] = vocab_dict.pop(" ")
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

# Persist the vocabulary for Wav2Vec2CTCTokenizer.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))


{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'x': 21, 'y': 22, 'z': 23}
26


In [12]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Character-level CTC tokenizer backed by the vocab.json written earlier in the notebook.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for mono 16 kHz audio; inputs are normalized and no
# attention mask is returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# Single object wrapping both halves: audio -> input_values, text -> label ids.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)


In [13]:
# Check the shape of the processed audio for one recitation; the leading
# axis of size 1 is why later cells index [0][0].
processor(mp3_recitations[10000], sampling_rate=16000).input_values[0].shape

(1, 68546)

In [14]:
# Peek at the normalized sample values of that recitation (1-D float array).
processor(mp3_recitations[10000], sampling_rate=16000).input_values[0][0]

array([0.00075688, 0.00075688, 0.00075688, ..., 0.00075688, 0.00075688,
       0.00075688], dtype=float32)

In [15]:
# Inside as_target_processor() the processor call is routed to the tokenizer,
# so this prints the character ids (and attention mask) for one transcript.
with processor.as_target_processor():
    print(processor(txt_recitations[10000]))

{'input_ids': [18, 20, 2, 0, 11, 1, 16, 20, 5, 1, 17, 5, 0, 4, 14, 12, 20, 18, 0, 1, 13, 9, 12, 20, 12, 0, 13, 14, 13, 0, 1, 13, 7, 9, 19, 0, 1, 20, 1, 17, 20, 12], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
# Same shape check as before, using key access instead of attribute access.
processor(mp3_recitations[10000], sampling_rate=16000)['input_values'][0].shape

(1, 68546)

In [17]:
# Run every resampled recording through the processor and keep the 1-D
# array of normalized samples ([0][0] drops the two leading singleton axes).
mp3_input_values = [
    processor(waveform, sampling_rate=16000)['input_values'][0][0]
    for waveform in tqdm(mp3_recitations)
]

  0%|          | 0/11931 [00:00<?, ?it/s]

In [18]:
# Encode every transcript as a list of character ids; the target-processor
# context makes the processor call dispatch to the tokenizer.
with processor.as_target_processor():
    txt_labels = [
        processor(transcript).input_ids
        for transcript in tqdm(txt_recitations)
    ]

  0%|          | 0/11931 [00:00<?, ?it/s]

In [19]:
# Spot-check the encoded label ids of the first transcript.
txt_labels[0]

[11,
 9,
 19,
 14,
 17,
 1,
 0,
 12,
 20,
 11,
 19,
 20,
 12,
 0,
 9,
 11,
 11,
 5,
 0,
 5,
 19,
 0,
 19,
 5,
 17,
 17,
 9,
 18,
 0,
 9,
 1,
 3,
 19,
 1,
 19,
 20,
 18,
 0,
 5,
 19,
 0,
 1,
 11,
 19,
 14]

In [20]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC training examples into a batch, padding audio and labels separately.

    Each incoming feature is a mapping with an ``"input_values"`` entry (audio)
    and a ``"labels"`` entry (token ids). Audio is padded through the processor
    directly; labels are padded inside ``processor.as_target_processor()``.
    Padded label positions are set to -100 so that the loss ignores them.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad`` for both inputs and labels:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding when the
              batch holds a single sequence).
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length`, or to the maximum acceptable
              input length for the model when that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (sequences in the batch may differ in length).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pad ``input_values`` to a multiple of the provided value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, pad ``labels`` to a multiple of the provided value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch containing ``labels``."""
        # Audio and labels have different lengths and go through different
        # padding paths, so they are separated first.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 so they do not contribute to the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

In [21]:
# Collator instance handed to the Trainer; padding=True pads each batch to its longest item.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [22]:
# Word-error-rate metric used by compute_metrics during evaluation.
wer_metric = load_metric("wer")

In [23]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits whose last axis is the
            vocabulary) and ``label_ids`` (label ids with -100 marking
            positions to ignore).

    Returns:
        dict with a single key ``"wer"`` holding the value returned by
        ``wer_metric.compute``.

    Unlike a direct in-place overwrite, ``pred.label_ids`` is left untouched:
    the -100 markers are replaced on a copy, so the caller's array can still
    be reused afterwards.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the ignore marker for the pad id on a fresh array (np.where copies)
    # so the labels can be decoded without mutating the caller's data.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [24]:
from transformers import Wav2Vec2ForCTC

# Start from the Italian VoxPopuli base checkpoint. Per the load warning below,
# the CTC head (lm_head) is not in the checkpoint and is newly initialized, so
# the model must be fine-tuned before use.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-it-voxpopuli",  # pretrained checkpoint to fine-tune
    ctc_loss_reduction="mean",  # config override for how the CTC loss is reduced
    pad_token_id=processor.tokenizer.pad_token_id,  # keep the model's pad id in sync with the tokenizer
)


Some weights of the model checkpoint at facebook/wav2vec2-base-it-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.weight', 'project_hid.bias', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-it-voxpopuli and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be a

In [25]:
# Freeze the model's feature extractor before fine-tuning.
# NOTE(review): newer transformers releases appear to rename this method to
# freeze_feature_encoder() -- confirm against the installed version.
model.freeze_feature_extractor()



In [26]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; needed before the push_to_hub calls below.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
# List the git configuration in effect for this working directory.
!git config -l

filter.lfs.clean=git-lfs clean -- %f
filter.lfs.smudge=git-lfs smudge -- %f
filter.lfs.process=git-lfs filter-process
filter.lfs.required=true
filter.lfs.clean=git-lfs clean -- %f
filter.lfs.smudge=git-lfs smudge -- %f
filter.lfs.process=git-lfs filter-process
filter.lfs.required=true
credential.helper=store
core.repositoryformatversion=0
core.filemode=true
core.bare=false
core.logallrefupdates=true
remote.origin.url=git@github.com:lsb/tironiculum.git
remote.origin.fetch=+refs/heads/*:refs/remotes/origin/*
branch.trunk.remote=origin
branch.trunk.merge=refs/heads/trunk


In [28]:
# List only the user-level (global) git configuration.
!git config --global -l

filter.lfs.clean=git-lfs clean -- %f
filter.lfs.smudge=git-lfs smudge -- %f
filter.lfs.process=git-lfs filter-process
filter.lfs.required=true
credential.helper=store


In [29]:
# Set the global git credential helper to "store" (the listing above shows it
# was already set, so this is a no-op on this machine).
!git config --global credential.helper store

In [30]:
# Hub repository name; also reused as the Trainer output directory.
repo_name = "wav2vec2-base-it-latin"
# Upload the tokenizer files to that repository.
tokenizer.push_to_hub(repo_name)

To https://huggingface.co/lsb/wav2vec2-base-it-latin
   ddb3a9f..6a384b6  main -> main



'https://huggingface.co/lsb/wav2vec2-base-it-latin/commit/6a384b695d9226e609fe9d43922394cedacd72dc'

In [31]:
from transformers import TrainingArguments

# Fine-tuning hyperparameters. With 10737 training rows and a batch size of 4
# over 8 epochs, the training log reports 21480 optimization steps.
training_args = TrainingArguments(
  output_dir=repo_name,  # checkpoints are written under this directory
  group_by_length=True,
  per_device_train_batch_size=4,
  evaluation_strategy="steps",  # evaluate on a step schedule (see eval_steps)
  num_train_epochs=8,
  fp16=True,  # mixed precision; the Trainer logs "Using amp half precision backend"
  gradient_checkpointing=True,  # NOTE(review): presumably trades compute for memory -- confirm
  save_steps=500,  # checkpoint every 500 steps
  eval_steps=500,  # evaluate every 500 steps
  logging_steps=5, #500?
  learning_rate=4e-6,
  weight_decay=0.005,
  warmup_steps=10,
  save_total_limit=2,  # older checkpoints are deleted, as seen in the training log
)

In [32]:
# Pair the processed audio (mp3_input_values) with the encoded transcripts
# (txt_labels) in one Dataset, then hold out 10% of the rows for evaluation.
# The fixed seed keeps the split reproducible across runs.
full_dataset = Dataset.from_dict({"input_values": mp3_input_values, "labels": txt_labels})
splits = full_dataset.train_test_split(test_size=0.1, seed=42)

In [33]:
# Show the resulting train/test row counts.
splits

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 10737
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1194
    })
})

In [34]:
from transformers import Trainer

# Wire model, collator, metrics and data splits into a Trainer.
trainer = Trainer(
    model=model,
    data_collator=data_collator,  # pads audio and labels per batch
    args=training_args,
    compute_metrics=compute_metrics,  # reports WER at every evaluation
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    # The feature extractor is passed in the tokenizer slot; the log shows a
    # preprocessor_config.json being saved alongside each checkpoint.
    tokenizer=processor.feature_extractor,
)


Using amp half precision backend


In [35]:
# Confirm the collator's padding setting before training.
data_collator.padding

True

In [36]:
# Release cached GPU memory held by earlier cells before starting.
torch.cuda.empty_cache()
# Run the full fine-tuning loop (long-running: the recorded train_runtime is ~24872 s).
trainer.train()

***** Running training *****
  Num examples = 10737
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 21480


Step,Training Loss,Validation Loss,Wer
500,5.1809,4.714771,1.0
1000,4.1625,3.914009,1.0
1500,2.8799,2.836321,1.0
2000,1.7113,1.469339,1.0
2500,0.9095,0.668065,0.98589
3000,0.7212,0.54031,0.935865
3500,0.6637,0.467796,0.819779
4000,0.5471,0.429155,0.77745
4500,0.6913,0.393611,0.744613
5000,0.7837,0.366546,0.688815


***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-500
Configuration saved in wav2vec2-base-it-latin/checkpoint-500/config.json
Model weights saved in wav2vec2-base-it-latin/checkpoint-500/pytorch_model.bin
Configuration saved in wav2vec2-base-it-latin/checkpoint-500/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-1000
Configuration saved in wav2vec2-base-it-latin/checkpoint-1000/config.json
Model weights saved in wav2vec2-base-it-latin/checkpoint-1000/pytorch_model.bin
Configuration saved in wav2vec2-base-it-latin/checkpoint-1000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-1500
Configuration saved in wav2vec2-base-it-latin/checkpoint-1500/config.json
Model weights saved in wav2vec2-base-it-

Deleting older checkpoint [wav2vec2-base-it-latin/checkpoint-8000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-9500
Configuration saved in wav2vec2-base-it-latin/checkpoint-9500/config.json
Model weights saved in wav2vec2-base-it-latin/checkpoint-9500/pytorch_model.bin
Configuration saved in wav2vec2-base-it-latin/checkpoint-9500/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-it-latin/checkpoint-8500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-10000
Configuration saved in wav2vec2-base-it-latin/checkpoint-10000/config.json
Model weights saved in wav2vec2-base-it-latin/checkpoint-10000/pytorch_model.bin
Configuration saved in wav2vec2-base-it-latin/checkpoint-10000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-it-latin/

***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-18000
Configuration saved in wav2vec2-base-it-latin/checkpoint-18000/config.json
Model weights saved in wav2vec2-base-it-latin/checkpoint-18000/pytorch_model.bin
Configuration saved in wav2vec2-base-it-latin/checkpoint-18000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-it-latin/checkpoint-17000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1194
  Batch size = 8
Saving model checkpoint to wav2vec2-base-it-latin/checkpoint-18500
Configuration saved in wav2vec2-base-it-latin/checkpoint-18500/config.json
Model weights saved in wav2vec2-base-it-latin/checkpoint-18500/pytorch_model.bin
Configuration saved in wav2vec2-base-it-latin/checkpoint-18500/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-it-latin/checkpoint-17500] due to args.save_total_limit
***** Running Evaluation *****
  Num example

TrainOutput(global_step=21480, training_loss=0.8436163530169919, metrics={'train_runtime': 24871.8925, 'train_samples_per_second': 3.454, 'train_steps_per_second': 0.864, 'total_flos': 3.9900246686464696e+18, 'train_loss': 0.8436163530169919, 'epoch': 8.0})

In [39]:
# Upload the fine-tuned model config and weights to the Hub repository.
model.push_to_hub(repo_name)

Configuration saved in wav2vec2-base-it-latin/config.json
Model weights saved in wav2vec2-base-it-latin/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

To https://huggingface.co/lsb/wav2vec2-base-it-latin
   6a384b6..f8ff755  main -> main



'https://huggingface.co/lsb/wav2vec2-base-it-latin/commit/f8ff7552f7aa325dc21edf5dd0111cf1246dbc77'

In [38]:
# Transcribe a few recordings with the fine-tuned model and play each one back.
# eval() switches off dropout in case the model is still in training mode after
# trainer.train(); no_grad() skips autograd bookkeeping for pure inference.
model.eval()
for f in ["./vivamus.mp3", "./vae11.mp3", "./poetaexmachina-mp3-recitations/mp3/1100000", "./poetaexmachina-mp3-recitations/mp3/1010000"]:
    # Resample from the file's own sample rate rather than assuming 22.05 kHz,
    # so recordings from other sources are not pitch/speed shifted.
    waveform, sample_rate = torchaudio.load(f, format="mp3")
    soundfile = torchaudio.transforms.Resample(sample_rate, 16000)(waveform).numpy()
    soundfile_input_values = processor(soundfile, sampling_rate=16000)['input_values'][0][0]
    # Build the (1, samples) batch directly from the array (avoids the slow
    # list-of-ndarrays path) and place it on whatever device the model is on.
    batch = torch.as_tensor(soundfile_input_values, device=model.device).unsqueeze(0)
    with torch.no_grad():
        logits = model(batch).logits
    pred_ids = torch.argmax(logits, dim=-1)
    print(processor.batch_decode(pred_ids))
    # A bare Audio(...) expression inside a loop is never rendered; display() is
    # required to actually show the player for each file.
    IPython.display.display(IPython.display.Audio(data=soundfile, rate=16000))


  after removing the cwd from sys.path.


['uiuamus mea lesbit atqua memus rumoresque senum seueriorum pomnesunius aestimemus assis']
['arma uirumque cano troiae qui primus aboris']
['gloriabuntur']
['cos irauerunt']
