In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/Data/combined

/content/drive/MyDrive/Data/combined


In [3]:
%%capture
!pip install transformers
!pip install datasets
!pip install pandas
!pip install soundfile
!pip install jiwer
!pip install wandb

In [4]:
import wandb

In [5]:
import os
import gc
import soundfile as sf
import pandas as pd
from scipy.io import wavfile
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
%matplotlib inline
matplotlib.style.use('ggplot')
import torch
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import Dataset, load_metric
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer

In [6]:
class AudioDataset(torch.utils.data.Dataset):
    

    def __init__(self, csv_file, root_dir, processor, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the audio.
            processor : contain FeatureExtractor and Tokenizer.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.df_annotation = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.df_annotation)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_name = os.path.join(self.root_dir,
                                self.df_annotation.iloc[idx, 1])
        audio, sr = sf.read(audio_name, dtype = 'float32')
        if self.transform:
            audio = self.transform(audio)
        transcript = self.df_annotation.iloc[idx, 2].lower()
        nor_audio = self.processor(audio,sampling_rate = sr).input_values
        with processor.as_target_processor():
          labels = self.processor([transcript]).input_ids
        sample = {'input_values': nor_audio[0], 'labels': labels[0]}
        return sample

In [7]:
tokenizer = Wav2Vec2CTCTokenizer('/content/drive/MyDrive/vocab.json', unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0,
                                                 do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [8]:
CSV_FILE = "/content/drive/MyDrive/Data/combined/train_data10s.csv"
ROOT_DIR = "/content/drive/MyDrive/Data/combined"

In [9]:
train_data = AudioDataset(csv_file= CSV_FILE, root_dir= ROOT_DIR, processor=processor)

In [10]:
dataloader = DataLoader(train_data, batch_size=4, shuffle=True, num_workers=1)

In [10]:
ipd.Audio('/content/drive/MyDrive/Data/combined/output-wav/f3wav00001.wav')

In [11]:
test1 = train_data.__getitem__(1)
test2 = train_data.__getitem__(2)
test3 = train_data.__getitem__(3)
test4 = train_data.__getitem__(4)

In [12]:
print(test1["input_values"].mean(), test1["input_values"].var())

-2.5318787e-09 0.9999928


In [13]:
@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        #print(features)
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=True,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [14]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [15]:
test5 = data_collator([test1, test2, test3, test4])

In [16]:
test5

{'input_values': tensor([[-0.0274, -0.0628, -0.1088,  ...,  0.0061, -0.0289,  0.0464],
        [ 0.0713,  0.0573,  0.0789,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1569, -0.2088, -0.1061,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5398, -0.5760, -0.6107,  ...,  0.0000,  0.0000,  0.0000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32), 'labels': tensor([[  11,   36,   27,   30,   13,    5,   35,    6,    4,   20,   39,   20,
            5,   36,   13,   35,    4, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -10

In [17]:
model = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-base",
        attention_dropout=0.1,
        hidden_dropout=0.1,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.1,
        gradient_checkpointing=True,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
    )

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'project_q.bias', 'project_hid.bias', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_hid.weight', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_h

In [18]:
model.freeze_feature_extractor()
wer_metric = load_metric("wer")

In [19]:
def compute_metrics(pred):
  pred_logits = pred.predictions
  pred_ids = np.argmax(pred_logits, axis=-1)
  pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
  pred_str = processor.batch_decode(pred_ids)
  # we do not want to group tokens when computing the metrics
  label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
  wer = wer_metric.compute(predictions=pred_str, references=label_str)
  return {"wer": wer}

In [20]:
training_args = TrainingArguments(
        output_dir="./wav2vec2-base-malay",
        group_by_length=True,
        per_device_train_batch_size=4,
        num_train_epochs=5,
        save_steps=500,
        eval_steps=500,
        logging_steps=500,
        learning_rate=1e-4,
        weight_decay=0.005,
        warmup_steps=1000,
        save_total_limit=1
    )

In [21]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_data,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
  )

In [22]:
trainer.train()

***** Running training *****
  Num examples = 10871
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 13590
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Step,Training Loss
500,5.3925
1000,3.086
1500,3.1378
2000,3.1165
2500,3.1356
3000,3.0777
3500,3.1799
4000,3.1554
4500,3.1406
5000,3.0886


Saving model checkpoint to ./wav2vec2-base-malay/checkpoint-500
Configuration saved in ./wav2vec2-base-malay/checkpoint-500/config.json
Model weights saved in ./wav2vec2-base-malay/checkpoint-500/pytorch_model.bin
Configuration saved in ./wav2vec2-base-malay/checkpoint-500/preprocessor_config.json
Saving model checkpoint to ./wav2vec2-base-malay/checkpoint-1000
Configuration saved in ./wav2vec2-base-malay/checkpoint-1000/config.json
Model weights saved in ./wav2vec2-base-malay/checkpoint-1000/pytorch_model.bin
Configuration saved in ./wav2vec2-base-malay/checkpoint-1000/preprocessor_config.json
Deleting older checkpoint [wav2vec2-base-malay/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./wav2vec2-base-malay/checkpoint-1500
Configuration saved in ./wav2vec2-base-malay/checkpoint-1500/config.json
Model weights saved in ./wav2vec2-base-malay/checkpoint-1500/pytorch_model.bin
Configuration saved in ./wav2vec2-base-malay/checkpoint-1500/preprocessor_config.json
Del

TrainOutput(global_step=13590, training_loss=3.222583724155945, metrics={'train_runtime': 10868.2516, 'train_samples_per_second': 5.001, 'train_steps_per_second': 1.25, 'total_flos': 2.8383904670739446e+18, 'train_loss': 3.222583724155945, 'epoch': 5.0})

In [23]:
trainer.save_model("mymodel")

Saving model checkpoint to mymodel
Configuration saved in mymodel/config.json
Model weights saved in mymodel/pytorch_model.bin
Configuration saved in mymodel/preprocessor_config.json


In [40]:
def inference(path_audio: str) -> str:
    """
    :param path_audio:
    :return:
    """
    speech, _ = sf.read(path_audio)
    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values.to("cuda")
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    print(predicted_ids)
    transcription = processor.decode(predicted_ids[0])
    print(transcription)
    return transcription

In [27]:
infer_processor = Wav2Vec2Processor.from_pretrained("mymodel")
infer_model = Wav2Vec2ForCTC.from_pretrained("mymodel")

loading feature extractor configuration file mymodel/preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

Didn't find file mymodel/tokenizer_config.json. We won't load it.
Didn't find file mymodel/added_tokens.json. We won't load it.
Didn't find file mymodel/special_tokens_map.json. We won't load it.
Didn't find file mymodel/tokenizer.json. We won't load it.
loading file mymodel/vocab.json
loading file None
loading file None
loading file None
loading file None
loading configuration file mymodel/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 25

In [33]:
dftest = pd.read_csv("/content/drive/MyDrive/Data/combined/annotationpart3.csv")

In [41]:
i = 10
print(dftest.iloc[i]["path"], dftest.iloc[i]["transcript"])
audio_file = os.path.join(ROOT_DIR, dftest.iloc[i]["path"])

output-wav/f3wav00010.wav no like macam cool kecil tak Sebenarnya orang selalu guna ayat tu Cuma dia tak ada highlight kan kita pi sebenarnya kopi kopi


In [42]:
print(inference(audio_file))

tensor([[41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
         41, 41, 41, 41, 41,