In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from transformers import Trainer
from datasets import load_metric, Audio
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
) 
import re
import librosa

import warnings
warnings.simplefilter('ignore')
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from transformers import TrainingArguments

2023-08-30 01:37:40.637986: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-08-30 01:37:43,140] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
class BengaliDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer)
    
    def __getitem__(self, idx):
        # First read and pre-process the audio file
        audio = self.read_audio(self.data.loc[idx,'path'])
        audio = processor(
            audio, 
            sampling_rate=16000
        ).input_values[0]
        
        with processor.as_target_processor():
            labels = processor(self.data.loc[idx,'sentence']).input_ids
            
        return {'input_values': audio, 'labels': labels}
        
    def __len__(self):
        return len(self.data)
    
    def read_audio(self, mp3_path):
        target_sr = 16000  # Set the target sampling rate
        
        audio, sr = librosa.load(mp3_path, sr=target_sr)  # Load with original sampling rate
        
        
        return audio

In [3]:
def save_vocab(dataframe):
    """
    Saves the processed vocab file as 'vocab.json', to be ingested by tokenizer
    """
    vocab = construct_vocab(dataframe['sentence'].tolist())
    vocab_dict = {v: k for k, v in enumerate(vocab)}
    vocab_dict["__"] = vocab_dict[" "]
    _ = vocab_dict.pop(" ")
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open('vocab.json', 'w') as fl:
        json.dump(vocab_dict, fl)


def ctc_data_collator(batch):
    """
    data collator function to dynamically pad the data
    """
    input_features = [{"input_values": sample["input_values"]} for sample in batch]
    label_features = [{"input_ids": sample["labels"]} for sample in batch]
    batch = processor.pad(
        input_features,
        padding=True,
        return_tensors="pt",
    )
    with processor.as_target_processor():
        labels_batch = processor.pad(
            label_features,
            padding=True,
            return_tensors="pt",
        )
        
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
    batch["labels"] = labels
    return batch

def construct_vocab(texts):
    """
    Get unique characters from all the text in a list
    """
    all_text = " ".join(texts)
    vocab = list(set(all_text))
    return vocab
    
### Data cleaning, remove punctuations and lowercase
def remove_special_characters(string):

    chars_to_ignore_regex = ', ? . ! - \; \: \" “ % ” �'
    
    clean = re.sub(chars_to_ignore_regex, "", string).lower() + " "
  
    return clean


# ### Word Error Rate (Evaluation Metrics)
# def compute_metrics(pred):

#     wer_metric = load_metric("wer")

#     pred_logits = pred.predictions
#     pred_ids = np.argmax(pred_logits, axis=-1)

#     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

#     pred_str = processor.batch_decode(pred_ids)
#     # we do not want to group tokens when computing the metrics
#     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

#     wer = wer_metric.compute(predictions=pred_str, references=label_str)

#     return {"wer": wer}
wer_metric = load_metric("wer")
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}
    

In [4]:
output_dir = './mms'
model_name = 'facebook/mms-300m'

# Load the CSV file
df = pd.read_csv("/home/ubuntu/bengali/data/train.csv")

df['sentence'] = df['sentence'].apply(lambda x: remove_special_characters(x))

# Add the full path to the audio files
df['path'] = df['id'].apply(lambda x: os.path.join('/home/ubuntu/bengali/data/train_mp3s', x+'.mp3'))

# Filter the dataset
train = df[df['split'] == 'train'].sample(frac=0.01, random_state=10).reset_index(drop=True)
val = df[df['split'] == 'valid'].sample(frac=0.01, random_state=200).reset_index(drop=True)

print(f"Training on samples: {len(train)}, Validation on samples: {len(val)}")

Training on samples: 9340, Validation on samples: 296


In [5]:
len(train)

9340

In [6]:
df.head()

Unnamed: 0,id,sentence,split,path
0,000005f3362c,ও বলেছে আপনার ঠিকানা!,train,/home/ubuntu/bengali/data/train_mp3s/000005f33...
1,00001dddd002,কোন মহান রাষ্ট্রের নাগরিক হতে চাও?,train,/home/ubuntu/bengali/data/train_mp3s/00001dddd...
2,00001e0bc131,"আমি তোমার কষ্টটা বুঝছি, কিন্তু এটা সঠিক পথ না।",train,/home/ubuntu/bengali/data/train_mp3s/00001e0bc...
3,000024b3d810,নাচ শেষ হওয়ার পর সকলে শরীর ধুয়ে একসঙ্গে ভোজন...,train,/home/ubuntu/bengali/data/train_mp3s/000024b3d...
4,000028220ab3,"হুমম, ওহ হেই, দেখো।",train,/home/ubuntu/bengali/data/train_mp3s/000028220...


In [7]:
train.head()

Unnamed: 0,id,sentence,split,path
0,93c612a8caa2,"তোমরা নিশ্চয়ই এখানে নতুন, তাই না?",train,/home/ubuntu/bengali/data/train_mp3s/93c612a8c...
1,eea39689fa84,দুই লেগের প্লে-অফের প্রথম ম্যাচে পর্তুগাল নিজে...,train,/home/ubuntu/bengali/data/train_mp3s/eea39689f...
2,2c3dabcc8c82,"ধারণা করা হচ্ছে, অতিরিক্ত মদ্যপান করার কারণে প...",train,/home/ubuntu/bengali/data/train_mp3s/2c3dabcc8...
3,4e1af4af2707,দুপুরে সেখানে যাওয়ার পর ইমন নানাভাবে কালক্ষেপণ...,train,/home/ubuntu/bengali/data/train_mp3s/4e1af4af2...
4,ebe5597e64ed,তিনি বেণীমাধবকে সাথে নিয়ে উত্তরের পথে গমন করেন।,train,/home/ubuntu/bengali/data/train_mp3s/ebe5597e6...


In [8]:
val.head()

Unnamed: 0,id,sentence,split,path
0,c64d5c2936c4,আব্দুল হাকিম মুক্তিযুদ্ধের সংগঠক ছিলেন।,valid,/home/ubuntu/bengali/data/train_mp3s/c64d5c293...
1,fa923f59c89c,তিনি অনুশীলন সমিতি কতিপয় সদস্যের সাথে তার বন্...,valid,/home/ubuntu/bengali/data/train_mp3s/fa923f59c...
2,9b69fbe6db68,চলচ্চিত্রটি পরিচালনা করেছেন অভিনেতা রাঘব লরেন্স।,valid,/home/ubuntu/bengali/data/train_mp3s/9b69fbe6d...
3,f4abf5b8fbf3,এটি এখন পর্যন্ত ফুটবলের সর্বোচ্চ সংস্থা ফিফার ...,valid,/home/ubuntu/bengali/data/train_mp3s/f4abf5b8f...
4,2041d3effe22,"বাংলা ও ইংরেজি ছাড়া উনি হিন্দি, ওড়িয়া, অসমী...",valid,/home/ubuntu/bengali/data/train_mp3s/2041d3eff...


In [9]:
save_vocab(df)

In [10]:
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json", 
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="__"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, 
    sampling_rate=16000, 
    padding_value=0.0, 
    do_normalize=True, 
    return_attention_mask=False
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer
)

model = Wav2Vec2ForCTC.from_pretrained(
    model_name,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
train_ds = BengaliDataset(train)
valid_ds = BengaliDataset(val)



model.freeze_feature_encoder()
model.to('cuda')


training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=5,
        evaluation_strategy="steps",
        num_train_epochs=10,
        gradient_checkpointing=True,
        fp16=True,
        save_steps=100,
        eval_steps=100,
        logging_steps=10,
        learning_rate=2e-5,
        warmup_steps=0,
        save_total_limit=2,
        do_eval = False)


trainer = Trainer(
    model=model,
    data_collator=ctc_data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,   
    eval_dataset=valid_ds,    
    tokenizer=processor.feature_extractor,
)

In [12]:
len(train_ds)

9340

In [13]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mariffnzhn[0m ([33mmalaysia-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


TypeError: 'NoneType' object is not subscriptable