In [1]:
!yum install -y python3-devel libsndfile-devel ffmpeg
!pip install librosa jiwer sndfile torchaudio
!stat poetaexmachina-mp3-recitations || git clone https://github.com/lsb/poetaexmachina-mp3-recitations.git

Loaded plugins: fastestmirror, ovl
Loading mirror speeds from cached hostfile
 * base: mirror.keystealth.org
 * centos-sclo-rh: mirrors.sonic.net
 * centos-sclo-sclo: centos.mirror.shastacoe.net
 * epel: mirror.prgmr.com
 * extras: mirrors.xtom.com
 * updates: centos.mirror.shastacoe.net
Package python3-devel-3.6.8-18.el7.x86_64 already installed and latest version
Package libsndfile-devel-1.0.25-12.el7_9.1.x86_64 already installed and latest version
Package ffmpeg-3.2.4-1.el7.centos.x86_64 already installed and latest version
Nothing to do
  File: ‘poetaexmachina-mp3-recitations’ -> ‘/home/lsb/poetaexmachina-mp3-recitations/’
  Size: 41        	Blocks: 0          IO Block: 4096   symbolic link
Device: 802h/2050d	Inode: 2361893     Links: 1
Access: (0777/lrwxrwxrwx)  Uid: ( 1000/ UNKNOWN)   Gid: ( 1000/ UNKNOWN)
Access: 2022-02-16 17:13:57.960616580 +0000
Modify: 2022-02-03 10:27:40.381672775 +0000
Change: 2022-02-03 10:27:40.381672775 +0000
 Birth: -


In [2]:
from datasets import load_dataset, load_metric, Dataset
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments
from pathlib import Path
from glob import glob
import IPython
import librosa
import torchaudio
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import math
import json

In [3]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC examples into one padded batch.

    The audio inputs and the label ids are padded by two separate ``processor.pad``
    calls (the label call runs inside ``processor.as_target_processor()``), and every
    label position whose attention mask is not 1 is replaced by -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            Processor whose ``pad`` method is used for both inputs and labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Forwarded unchanged as ``padding`` to both ``pad`` calls.
        max_length (:obj:`int`, `optional`):
            Forwarded as ``max_length`` when padding ``input_values``.
        max_length_labels (:obj:`int`, `optional`):
            Forwarded as ``max_length`` when padding the labels.
        pad_to_multiple_of (:obj:`int`, `optional`):
            Forwarded as ``pad_to_multiple_of`` when padding ``input_values``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Forwarded as ``pad_to_multiple_of`` when padding the labels.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio and labels have different lengths and go through different padding
        # paths, so they are separated into two parallel lists first.
        audio_items = [{"input_values": example["input_values"]} for example in features]
        label_items = [{"input_ids": example["labels"]} for example in features]

        padded_batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions that are padding (attention mask != 1) become -100 so the loss skips them.
        is_padding = padded_labels.attention_mask.ne(1)
        padded_batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return padded_batch

In [4]:
import re
def lowerjv(s):
    """Normalize a transcript to lowercase a-z text with j -> i and v -> u.

    Every character other than a-z, space and newline becomes a space, runs of
    spaces collapse to one, and leading/trailing whitespace is stripped.
    """
    folded = s.lower().replace("j", "i").replace("v", "u")
    letters_only = re.sub("[^a-z \n]", " ", folded)
    single_spaced = re.sub(" +", " ", letters_only)
    return single_spaced.strip()

In [5]:
# Build the list of recitation file names to train on. Names starting with "3" are listed
# 857 times and names starting with "2" five times; names starting with "1" are listed once,
# and only when they end in an even digit. tqdm reports progress over that combined listing.
hex_recitations = [
    Path(f).name for f in tqdm(
        []
        + glob("./poetaexmachina-mp3-recitations/txt/3*") * 857
        + glob("./poetaexmachina-mp3-recitations/txt/2*") * 5
        + glob("./poetaexmachina-mp3-recitations/txt/1*0")
        + glob("./poetaexmachina-mp3-recitations/txt/1*2")
        + glob("./poetaexmachina-mp3-recitations/txt/1*4")
        + glob("./poetaexmachina-mp3-recitations/txt/1*6")
        + glob("./poetaexmachina-mp3-recitations/txt/1*8")
    # Keep an entry only if its transcript begins with an ASCII letter and the file at the
    # same path with "txt" replaced by "mp3" is larger than 2000 bytes.
    ) if re.match("[a-zA-Z]", Path(f).read_text()) and len(Path(f.replace("txt", "mp3")).read_bytes()) > 2000
]
# Normalized transcript for every listed entry (a repeated name yields a repeated transcript).
txt_recitations = [
    lowerjv(Path("./poetaexmachina-mp3-recitations/txt/" + f).read_text()) for f in tqdm(hex_recitations)
]
# One Resample transform for each first argument from 16000 to 24000 in steps of 80,
# all with 16000 as the second argument.
# NOTE(review): presumably a speed/pitch perturbation so repeated entries differ — confirm.
resamplers = [torchaudio.transforms.Resample(i, 16000) for i in (range(16000, 24001, 80))]
# Load each mp3, keep element [0] of torchaudio.load's result, apply the resampler chosen
# round-robin by list position, and store the result as a float16 NumPy array.
mp3_recitations = [
    resamplers[i % len(resamplers)](
        torchaudio.load("./poetaexmachina-mp3-recitations/mp3/" + f, format="mp3")[0]
    ).numpy().astype(np.float16) for (i,f) in enumerate(tqdm(hex_recitations))
]

  0%|          | 0/172262 [00:00<?, ?it/s]

  0%|          | 0/160174 [00:00<?, ?it/s]

  0%|          | 0/160174 [00:00<?, ?it/s]

In [6]:
# Show the first normalized transcript as a sanity check.
txt_recitations[0]

'quam diu etiam furor iste tuus nos eludet'

In [7]:
# Play the first loaded recitation.
# NOTE(review): playback uses rate=26000 while the rest of the notebook works at 16000 — confirm the intended rate.
IPython.display.Audio(data=mp3_recitations[0], rate=26000)

In [8]:
# Fixed character vocabulary. The commented-out line below is an alternative that derives
# the same kind of mapping from the characters found in the transcripts.
#vocab_dict = {v: k for k, v in enumerate(sorted(list(set(" ".join(txt_recitations)))))}
vocab_dict = {' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9,
              'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18,
              't': 19, 'u': 20, 'x': 21, 'y': 22, 'z': 23}
print(vocab_dict)
# Re-key the space entry as "|" while keeping its id; "|" is the word_delimiter_token
# given to the tokenizer when it is constructed.
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
# Append the unknown and padding tokens with the next free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))
# Write the vocabulary to vocab.json in the working directory.
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)


{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'x': 21, 'y': 22, 'z': 23}
26


In [9]:
# Character-level CTC tokenizer read from ./vocab.json, with "|" as the word delimiter.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
# Feature extractor for single-feature input at 16000 Hz, normalizing and not returning an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
# Bundle both so audio and text go through a single object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)


In [10]:
# Shape of the processed features for the first recitation.
processor(mp3_recitations[0], sampling_rate=16000).input_values[0].shape

(1, 148595)

In [11]:
# Inspect the processed values (and their dtype) of the first row for the first recitation.
processor(mp3_recitations[0], sampling_rate=16000).input_values[0][0]

array([-0.004185, -0.004185, -0.004185, ..., -0.00396 , -0.003412,
       -0.003464], dtype=float16)

In [12]:
# Inside as_target_processor() the processor is applied to text; print the encoding
# of the first transcript.
with processor.as_target_processor():
    print(processor(txt_recitations[0]))

{'input_ids': [16, 20, 1, 12, 0, 4, 9, 20, 0, 5, 19, 9, 1, 12, 0, 6, 20, 17, 14, 17, 0, 9, 18, 19, 5, 0, 19, 20, 20, 18, 0, 13, 14, 18, 0, 5, 11, 20, 4, 5, 19], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [13]:
# Same shape check as before, reading the result by key instead of by attribute.
processor(mp3_recitations[0], sampling_rate=16000)['input_values'][0].shape

(1, 148595)

In [14]:
# Run every recitation through the processor, keep element [0][0] of its input_values,
# and store it as float16.
mp3_input_values = [
    processor(waveform, sampling_rate=16000)['input_values'][0][0].astype(np.float16)
    for waveform in tqdm(mp3_recitations)
]
# Drop the reference to the raw audio list now that the features have been extracted.
mp3_recitations = None

  0%|          | 0/160174 [00:00<?, ?it/s]

In [15]:
# Shape of the last extracted feature array.
mp3_input_values[-1].shape

(23600,)

In [16]:
# File name of the last listed recitation.
hex_recitations[-1]

'1031698'

In [17]:
# Encode every normalized transcript into its list of token ids.
with processor.as_target_processor():
    txt_labels = [
        processor(transcript).input_ids
        for transcript in tqdm(txt_recitations)
    ]

  0%|          | 0/160174 [00:00<?, ?it/s]

In [18]:
# Show the token ids of the first transcript as a compact JSON string.
json.dumps(txt_labels[0])

'[16, 20, 1, 12, 0, 4, 9, 20, 0, 5, 19, 9, 1, 12, 0, 6, 20, 17, 14, 17, 0, 9, 18, 19, 5, 0, 19, 20, 20, 18, 0, 13, 14, 18, 0, 5, 11, 20, 4, 5, 19]'

In [19]:
# Collator that pads each batch through the processor (padding=True).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [20]:
# Word error rate metric used by compute_metrics.
wer_metric = load_metric("wer")

In [21]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits; the arg-max over the last
            axis is taken as the predicted token id) and ``label_ids`` (reference
            token ids in which -100 marks positions to ignore).

    Returns:
        dict with a single key ``"wer"`` holding the value returned by
        ``wer_metric.compute``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 markers for the pad token id so the labels can be decoded.
    # np.where builds a new array, so the caller's pred.label_ids is not modified.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when decoding the reference labels
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [22]:
# Load the facebook/wav2vec2-base-it-voxpopuli checkpoint into a CTC model, using the
# tokenizer's pad token id and a mean-reduced CTC loss.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-it-voxpopuli",
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
)


Some weights of the model checkpoint at facebook/wav2vec2-base-it-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_q.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-it-voxpopuli and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be a

In [23]:
# Freeze the model's feature extractor before fine-tuning.
model.freeze_feature_extractor()



In [24]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; needed before the push_to_hub calls.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# Name used both as the Trainer output directory and as the Hub repository name.
repo_name = "wav2vec2-base-pemlsb-la"
# tokenizer.push_to_hub(repo_name)

In [26]:
# Hyperparameters for the first fine-tuning pass.
# NOTE(review): output_dir is the same directory name that the push_to_hub calls in this
# notebook use as their local clone — confirm that writing checkpoints there is intended.
training_args = TrainingArguments(
    output_dir=repo_name,
    # --- batching ---
    group_by_length=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # --- schedule and optimizer ---
    num_train_epochs=1,
    learning_rate=1e-5,
    weight_decay=0.005,
    warmup_steps=1000,
    #optim="adamw_apex_fused",
    # --- precision and memory ---
    fp16=True,
    fp16_opt_level="O3",
    gradient_checkpointing=True,
    # --- evaluation, checkpointing, logging ---
    evaluation_strategy="steps",
    eval_steps=2000,
    save_steps=2000,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_steps=5,  # 500?
)

In [27]:
# Hold out 10% of the rows for evaluation. The fixed seed makes the shuffle, and therefore
# the train/test membership, identical across kernel restarts.
# NOTE(review): hex_recitations lists some files many times (its glob results are multiplied
# by 857 and by 5), so a row-level split can place copies of one recitation on both sides —
# confirm whether the reported WER is meant to be measured that way.
splits = Dataset.from_pandas(pd.DataFrame(
    {"input_values": mp3_input_values, "labels": txt_labels}
)).train_test_split(test_size=0.1, seed=42)

In [28]:
# Wire the model, collator, training arguments, metric function and both splits into a Trainer.
# The processor's feature extractor is what gets passed in the `tokenizer` slot.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=processor.feature_extractor,
)


Using amp half precision backend


In [29]:
# Run the first fine-tuning pass (evaluates and saves a checkpoint every 2000 steps per training_args).
trainer.train()

***** Running training *****
  Num examples = 144156
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 36039


Step,Training Loss,Validation Loss,Wer
2000,1.0536,0.515889,0.892301
4000,0.8894,0.262927,0.668806
6000,0.6863,0.226746,0.571854
8000,0.8792,0.171649,0.455049
10000,0.4741,0.17135,0.438983
12000,0.7879,0.145659,0.426325
14000,0.7251,0.125878,0.341724
16000,0.6659,0.121442,0.356076
18000,0.6042,0.110829,0.308709
20000,0.72,0.101714,0.28718


***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-2000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-2000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-4000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-4000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-6000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-6000
C

Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-36000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-36000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-36000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-30000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from wav2vec2-base-pemlsb-la/checkpoint-36000 (score: 0.0835757926106453).


TrainOutput(global_step=36039, training_loss=0.5076121249032763, metrics={'train_runtime': 83678.2136, 'train_samples_per_second': 1.723, 'train_steps_per_second': 0.431, 'total_flos': 4.464668585641189e+18, 'train_loss': 0.5076121249032763, 'epoch': 1.0})

In [30]:
# Transcribe a few recordings with the fine-tuned model and show an audio player for each.
# Put the model in eval mode so training-only behavior such as dropout is switched off.
model.eval()
for f in ["./vivamus.mp3", "./vae11.mp3", "./poetaexmachina-mp3-recitations/mp3/1100000", "./poetaexmachina-mp3-recitations/mp3/1010000"]:
    # torchaudio.load returns (waveform, sample_rate); resample from the rate the file
    # actually has instead of assuming 22050 Hz.
    waveform, source_rate = torchaudio.load(f, format="mp3")
    soundfile = torchaudio.transforms.Resample(source_rate, 16000)(waveform).numpy()
    soundfile_input_values = processor(soundfile, sampling_rate=16000)['input_values'][0][0]
    # Build the batch of one directly from the array (no Python list around the ndarray)
    # and place it on whatever device the model lives on.
    batch = torch.as_tensor(soundfile_input_values, device=model.device).unsqueeze(0)
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(batch).logits
    pred_ids = torch.argmax(logits, dim=-1)
    print(processor.batch_decode(pred_ids))
    # A bare expression inside a loop is never rendered by the notebook, so the
    # player has to be displayed explicitly.
    IPython.display.display(IPython.display.Audio(data=soundfile, rate=16000))


  after removing the cwd from sys.path.


['ui uamus mea lesbitatque a memus rumoresque senum seueriorum omne s unius aestimemusassis']
['arma uirumque cano troiae qui primus ab oris']
['gloniabuntur']
['conspirauerunt']


In [31]:
# Transcribe two more recordings and show an audio player for each.
# Put the model in eval mode so training-only behavior such as dropout is switched off.
model.eval()
for f in ["./qdefitne.mp3", "./qutacpn.mp3"]:
    # torchaudio.load returns (waveform, sample_rate); resample from the rate the file
    # actually has instead of assuming 44100 Hz.
    waveform, source_rate = torchaudio.load(f, format="mp3")
    soundfile = torchaudio.transforms.Resample(source_rate, 16000)(waveform).numpy()
    soundfile_input_values = processor(soundfile, sampling_rate=16000)['input_values'][0][0]
    # Build the batch of one directly from the array (no Python list around the ndarray)
    # and place it on whatever device the model lives on.
    batch = torch.as_tensor(soundfile_input_values, device=model.device).unsqueeze(0)
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(batch).logits
    pred_ids = torch.argmax(logits, dim=-1)
    print(processor.batch_decode(pred_ids))
    # A bare expression inside a loop is never rendered by the notebook, so the
    # player has to be displayed explicitly.
    IPython.display.display(IPython.display.Audio(data=soundfile, rate=16000))


['uam diu etiam furor iste tuus nos eludet']
['quo usque tanndem abutere catilina patientia nostra']


In [36]:
# Upload the model and then the tokenizer files to the Hub repository at the given URL,
# using the local directory named by repo_name.
model.push_to_hub(repo_name, "https://huggingface.co/lsb/wav2vec2-base-pemlsb-la")
tokenizer.push_to_hub(repo_name, "https://huggingface.co/lsb/wav2vec2-base-pemlsb-la")

/home/lsb/tironiculum/wav2vec2-base-pemlsb-la is already a clone of https://huggingface.co/lsb/wav2vec2-base-pemlsb-la. Make sure you pull the latest changes with `repo.git_pull()`.
Configuration saved in wav2vec2-base-pemlsb-la/config.json
Model weights saved in wav2vec2-base-pemlsb-la/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   81d9ca3..9482bb5  main -> main

/home/lsb/tironiculum/wav2vec2-base-pemlsb-la is already a clone of https://huggingface.co/lsb/wav2vec2-base-pemlsb-la. Make sure you pull the latest changes with `repo.git_pull()`.
tokenizer config file saved in wav2vec2-base-pemlsb-la/tokenizer_config.json
Special tokens file saved in wav2vec2-base-pemlsb-la/special_tokens_map.json
To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   9482bb5..e236ffb  main -> main



'https://huggingface.co/lsb/wav2vec2-base-pemlsb-la/commit/e236ffba82538917a35e6da44a4b3242ad1e7967'

In [38]:
# Push the tokenizer files again, this time giving only the repository name.
tokenizer.push_to_hub(repo_name)

tokenizer config file saved in wav2vec2-base-pemlsb-la/tokenizer_config.json
Special tokens file saved in wav2vec2-base-pemlsb-la/special_tokens_map.json
To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   e236ffb..b1d66e3  main -> main



'https://huggingface.co/lsb/wav2vec2-base-pemlsb-la/commit/b1d66e3a8924acb93b16da9762ab411881fddfac'

In [39]:
# Another fine-tuning pass over the same splits, continuing from the current `model` weights:
# per-device batch 4 with no gradient accumulation, learning rate 1e-5, 1000 warmup steps, 1 epoch.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=repo_name,
        # --- batching ---
        group_by_length=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=1,
        # --- schedule and optimizer ---
        num_train_epochs=1,
        learning_rate=1e-5,
        weight_decay=0.005,
        warmup_steps=1000,
        #optim="adamw_apex_fused",
        # --- precision and memory ---
        fp16=True,
        fp16_opt_level="O2",
        gradient_checkpointing=True,
        # --- evaluation, checkpointing, logging ---
        evaluation_strategy="steps",
        eval_steps=2000,
        save_steps=2000,
        save_total_limit=2,
        load_best_model_at_end=True,
        logging_steps=5,  # 500?
    ),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=processor.feature_extractor,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
***** Running training *****
  Num examples = 144156
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 36039


Step,Training Loss,Validation Loss,Wer
2000,0.4418,0.089742,0.260169
4000,0.4232,0.08065,0.24158
6000,0.3765,0.079562,0.229427
8000,0.2622,0.08058,0.228435
10000,0.2205,0.074916,0.20687
12000,0.3562,0.079235,0.225893
14000,0.3886,0.066214,0.196809
16000,0.5374,0.068627,0.185972
18000,0.5262,0.061899,0.177317
20000,0.4993,0.066939,0.188785


***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-2000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-2000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-4000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-4000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-6000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-6000/config.json
Model weights saved in 

Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-36000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-36000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-32000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from wav2vec2-base-pemlsb-la/checkpoint-34000 (score: 0.05467037484049797).


TrainOutput(global_step=36039, training_loss=0.19258626866929093, metrics={'train_runtime': 68072.939, 'train_samples_per_second': 2.118, 'train_steps_per_second': 0.529, 'total_flos': 4.5555867331466404e+18, 'train_loss': 0.19258626866929093, 'epoch': 1.0})

In [40]:
# Upload the current model weights to the Hub repository.
# NOTE(review): the upload log for this cell lists checkpoint-*/optimizer.pt and similar files,
# so Trainer checkpoints saved under this directory appear to be pushed too — confirm that is wanted.
model.push_to_hub(repo_name, "https://huggingface.co/lsb/wav2vec2-base-pemlsb-la")

/home/lsb/tironiculum/wav2vec2-base-pemlsb-la is already a clone of https://huggingface.co/lsb/wav2vec2-base-pemlsb-la. Make sure you pull the latest changes with `repo.git_pull()`.
Configuration saved in wav2vec2-base-pemlsb-la/config.json
Model weights saved in wav2vec2-base-pemlsb-la/pytorch_model.bin


Upload file checkpoint-34000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-36000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-34000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-34000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-34000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-34000/training_args.bin: 100%|##########| 2.92k/2.92k [00:00<?, ?B/s]

Upload file checkpoint-36000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-36000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-36000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-34000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

Upload file checkpoint-36000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   b1d66e3..8592e5f  main -> main



'https://huggingface.co/lsb/wav2vec2-base-pemlsb-la/commit/8592e5f3df959d85a989935b4793dc02e2073ff8'

In [41]:
# Another fine-tuning pass over the same splits, continuing from the current `model` weights:
# per-device batch 4 with 2 accumulation steps, learning rate 2e-5, 10 warmup steps, 1 epoch.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=repo_name,
        # --- batching ---
        group_by_length=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        # --- schedule and optimizer ---
        num_train_epochs=1,
        learning_rate=2e-5,
        weight_decay=0.005,
        warmup_steps=10,
        #optim="adamw_apex_fused",
        # --- precision and memory ---
        fp16=True,
        fp16_opt_level="O2",
        gradient_checkpointing=True,
        # --- evaluation, checkpointing, logging ---
        evaluation_strategy="steps",
        eval_steps=2000,
        save_steps=2000,
        save_total_limit=2,
        load_best_model_at_end=True,
        logging_steps=5,  # 500?
    ),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=processor.feature_extractor,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
***** Running training *****
  Num examples = 144156
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 18019


Step,Training Loss,Validation Loss,Wer
2000,0.4103,0.062143,0.187811
4000,0.4747,0.072895,0.202903
6000,0.5944,0.054132,0.151911
8000,0.3759,0.049066,0.129986
10000,0.3222,0.050108,0.132113
12000,0.3397,0.040807,0.119293
14000,0.1945,0.041531,0.111864
16000,0.3228,0.040702,0.108276
18000,0.2095,0.039904,0.107483


***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-2000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-2000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-34000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-4000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-4000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-36000] due to args.save_total_limit
***** Running Evaluation *****
  Num exa

TrainOutput(global_step=18019, training_loss=0.14957603770683175, metrics={'train_runtime': 35495.6669, 'train_samples_per_second': 4.061, 'train_steps_per_second': 0.508, 'total_flos': 4.499751589279761e+18, 'train_loss': 0.14957603770683175, 'epoch': 1.0})

In [42]:
# Upload the current model weights to the Hub repository.
model.push_to_hub(repo_name, "https://huggingface.co/lsb/wav2vec2-base-pemlsb-la")

/home/lsb/tironiculum/wav2vec2-base-pemlsb-la is already a clone of https://huggingface.co/lsb/wav2vec2-base-pemlsb-la. Make sure you pull the latest changes with `repo.git_pull()`.
Configuration saved in wav2vec2-base-pemlsb-la/config.json
Model weights saved in wav2vec2-base-pemlsb-la/pytorch_model.bin


Upload file checkpoint-16000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-16000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-18000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-18000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-16000/training_args.bin: 100%|##########| 2.92k/2.92k [00:00<?, ?B/s]

Upload file checkpoint-16000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-16000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-18000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-16000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

Upload file checkpoint-18000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-18000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   8592e5f..8486472  main -> main



'https://huggingface.co/lsb/wav2vec2-base-pemlsb-la/commit/84864726c9ca6121a1093a85e23dbc3527130e6f'

In [43]:
# Another fine-tuning pass over the same splits, continuing from the current `model` weights:
# per-device batch 4 with 4 accumulation steps, learning rate 2e-5, 10 warmup steps, 2 epochs.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=repo_name,
        # --- batching ---
        group_by_length=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        # --- schedule and optimizer ---
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.005,
        warmup_steps=10,
        #optim="adamw_apex_fused",
        # --- precision and memory ---
        fp16=True,
        fp16_opt_level="O2",
        gradient_checkpointing=True,
        # --- evaluation, checkpointing, logging ---
        evaluation_strategy="steps",
        eval_steps=2000,
        save_steps=2000,
        save_total_limit=2,
        load_best_model_at_end=True,
        logging_steps=5,  # 500?
    ),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=processor.feature_extractor,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
***** Running training *****
  Num examples = 144156
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 18018


Step,Training Loss,Validation Loss,Wer
2000,0.3407,0.050166,0.13123
4000,0.2844,0.039119,0.108853
6000,0.3014,0.036844,0.100613
8000,0.302,0.034749,0.088514
10000,0.083,0.033625,0.082185
12000,0.1069,0.035375,0.078146
14000,0.1064,0.029886,0.07115
16000,0.1009,0.031791,0.070123
18000,0.0696,0.030903,0.068554


***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-2000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-2000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-16000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-4000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-4000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-4000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-18000] due to args.save_total_limit
***** Running Evaluation *****
  Num exa

TrainOutput(global_step=18018, training_loss=0.10783736026461414, metrics={'train_runtime': 59702.7648, 'train_samples_per_second': 4.829, 'train_steps_per_second': 0.302, 'total_flos': 8.932124473346989e+18, 'train_loss': 0.10783736026461414, 'epoch': 2.0})

In [44]:
# Upload the current model weights to the Hub repository.
model.push_to_hub(repo_name, "https://huggingface.co/lsb/wav2vec2-base-pemlsb-la")

/home/lsb/tironiculum/wav2vec2-base-pemlsb-la is already a clone of https://huggingface.co/lsb/wav2vec2-base-pemlsb-la. Make sure you pull the latest changes with `repo.git_pull()`.
Configuration saved in wav2vec2-base-pemlsb-la/config.json
Model weights saved in wav2vec2-base-pemlsb-la/pytorch_model.bin


Upload file checkpoint-14000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-14000/training_args.bin: 100%|##########| 2.92k/2.92k [00:00<?, ?B/s]

Upload file checkpoint-18000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-14000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-14000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-18000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-18000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-14000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-18000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-14000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

Upload file checkpoint-18000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   8486472..e17aaba  main -> main



'https://huggingface.co/lsb/wav2vec2-base-pemlsb-la/commit/e17aaba8d2921786260ceadbfae4a06ff04678e6'

In [45]:
# Another fine-tuning pass over the same splits, continuing from the current `model` weights:
# per-device batch 4 with 8 accumulation steps, learning rate 2e-5, 10 warmup steps, 4 epochs,
# evaluating and saving every 1000 steps.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=repo_name,
        # --- batching ---
        group_by_length=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        # --- schedule and optimizer ---
        num_train_epochs=4,
        learning_rate=2e-5,
        weight_decay=0.005,
        warmup_steps=10,
        #optim="adamw_apex_fused",
        # --- precision and memory ---
        fp16=True,
        fp16_opt_level="O2",
        gradient_checkpointing=True,
        # --- evaluation, checkpointing, logging ---
        evaluation_strategy="steps",
        eval_steps=1000,
        save_steps=1000,
        save_total_limit=2,
        load_best_model_at_end=True,
        logging_steps=5,  # 500?
    ),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=processor.feature_extractor,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
***** Running training *****
  Num examples = 144156
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 18016


Step,Training Loss,Validation Loss,Wer
1000,0.2756,0.037704,0.081915
2000,0.2826,0.031898,0.073116
3000,0.3028,0.028801,0.068374
4000,0.2618,0.026282,0.063397
5000,0.1228,0.025643,0.058475
6000,0.0941,0.030543,0.058835
7000,0.1204,0.027117,0.056058
8000,0.0972,0.026916,0.054886
9000,0.1341,0.031054,0.056924
10000,0.0612,0.02774,0.053805


***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-1000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-1000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-1000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-1000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-14000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-2000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-2000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-2000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-18000] due to args.save_total_limit
***** Running Evaluation *****
  Num exa

  Num examples = 16018
  Batch size = 4
Saving model checkpoint to wav2vec2-base-pemlsb-la/checkpoint-18000
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-18000/config.json
Model weights saved in wav2vec2-base-pemlsb-la/checkpoint-18000/pytorch_model.bin
Configuration saved in wav2vec2-base-pemlsb-la/checkpoint-18000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-pemlsb-la/checkpoint-17000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from wav2vec2-base-pemlsb-la/checkpoint-16000 (score: 0.023078329861164093).


TrainOutput(global_step=18016, training_loss=0.07917359423865837, metrics={'train_runtime': 112171.2783, 'train_samples_per_second': 5.141, 'train_steps_per_second': 0.161, 'total_flos': 1.7787242525946636e+19, 'train_loss': 0.07917359423865837, 'epoch': 4.0})

In [46]:
# Save the fine-tuned weights into the local clone and push them to the Hub.
# The remote is passed by keyword so it is clear which argument is the URL;
# the cell's value is the URL of the resulting commit.
model.push_to_hub(
    repo_name,
    repo_url="https://huggingface.co/lsb/wav2vec2-base-pemlsb-la",
)

/home/lsb/tironiculum/wav2vec2-base-pemlsb-la is already a clone of https://huggingface.co/lsb/wav2vec2-base-pemlsb-la. Make sure you pull the latest changes with `repo.git_pull()`.
Configuration saved in wav2vec2-base-pemlsb-la/config.json
Model weights saved in wav2vec2-base-pemlsb-la/pytorch_model.bin


Upload file checkpoint-16000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-18000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-16000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-16000/training_args.bin: 100%|##########| 2.92k/2.92k [00:00<?, ?B/s]

Upload file checkpoint-16000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-16000/rng_state.pth: 100%|##########| 14.2k/14.2k [00:00<?, ?B/s]

Upload file checkpoint-18000/optimizer.pt:   0%|          | 32.0k/688M [00:00<?, ?B/s]

Upload file checkpoint-18000/pytorch_model.bin:   0%|          | 32.0k/360M [00:00<?, ?B/s]

Upload file checkpoint-18000/scheduler.pt: 100%|##########| 623/623 [00:00<?, ?B/s]

Upload file checkpoint-16000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

Upload file checkpoint-18000/scaler.pt: 100%|##########| 559/559 [00:00<?, ?B/s]

To https://huggingface.co/lsb/wav2vec2-base-pemlsb-la
   e17aaba..66874a0  main -> main



'https://huggingface.co/lsb/wav2vec2-base-pemlsb-la/commit/66874a0c746c7a733133f9c6ae89ed28ef2929e4'