### **Prepping Environments**

In [1]:
%%capture

!pip install --upgrade --quiet pip accelerate datasets transformers  evaluate jiwer tensorboard bitsandbytes

In [2]:
# view GPU information 

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Oct 24 10:27:52 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os
from huggingface_hub import notebook_login, login
from transformers import GenerationConfig
from datasets import Dataset, Audio, DatasetDict, load_dataset

In [4]:
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
# connect to huggingFace

notebook_login()
# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **Loading Dataset Spits**
(Train: 80%, Test: 20%)

In [6]:
train_test = ["/kaggle/input/fixed-ak-dataset2000/dataset/train", "/kaggle/input/fixed-ak-dataset2000/dataset/test"]

dataset_folder ="/kaggle/input/fixed-ak-dataset2000/dataset"

In [7]:
# count number of audio files (without two transcriptions)

def count_files(directory):
    # List all files in the directory
    files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
    return len(files)

train_count = count_files(train_test[0])
test_count = count_files(train_test[1])

print(f"Number of dataset: {(train_count + test_count) - 2}")

Number of dataset: 2000


In [8]:
#load custom test and train audio files
dataset = load_dataset("audiofolder", data_dir=dataset_folder)

Resolving data files:   0%|          | 0/1601 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/401 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/1603 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/403 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [9]:
# downsample audios from 41kHz to 16kHz
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset 

DatasetDict({
    train: Dataset({
        features: ['audio', 'UNAMED', 'File_NO', 'IMAGE_PATH', 'IMAGE_SRC_URL', 'AUDIO_PATH', 'transcription', 'SPEAKER_ID', 'ORG_NAME', 'PROJECT_NAME', 'LOCALE', 'GENDER', 'AGE', 'DEVICE', 'ENVIRONMENT', 'YEAR', 'FILENAME'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['audio', 'UNAMED', 'File_NO', 'IMAGE_PATH', 'IMAGE_SRC_URL', 'AUDIO_PATH', 'transcription', 'SPEAKER_ID', 'ORG_NAME', 'PROJECT_NAME', 'LOCALE', 'GENDER', 'AGE', 'DEVICE', 'ENVIRONMENT', 'YEAR', 'FILENAME'],
        num_rows: 400
    })
})

In [10]:
# remove dataset metadata
metadata_columns = ['UNAMED', 'File_NO', 'IMAGE_PATH', 'IMAGE_SRC_URL', 'AUDIO_PATH', 'SPEAKER_ID', 'ORG_NAME', 'PROJECT_NAME', 'LOCALE', 'GENDER', 'AGE', 'DEVICE', 'ENVIRONMENT', 'YEAR', 'FILENAME']

dataset = dataset.remove_columns(metadata_columns)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 400
    })
})

In [11]:
dataset["train"][0]

{'audio': {'path': '/kaggle/input/fixed-ak-dataset2000/dataset/train/_image_0003_u192_1_1679700553382_03142.mp3',
  'array': array([ 0.00462688,  0.01018035,  0.00862849, ..., -0.00363392,
         -0.0027472 , -0.00274452]),
  'sampling_rate': 16000},
 'transcription': 'Ogya atɔ baabi ama baabi ahyew ama hɔ nyinaa nneɛma a ɛwɔ hɔ nyinaa ayɛ dadaw. Cars no a ɛwɔ hɔ nyinaa ayɛ tumm ɛwɔ hɔnom ma hɔnom nyinaa ayɛ nneɛma a ɛwɔ hɔ no nyinaa ayɛ basabasa ɛyɛ cars a ɛwɔ no nyinaa. '}

In [12]:
dataset["test"][0]

{'audio': {'path': '/kaggle/input/fixed-ak-dataset2000/dataset/test/_image_0011_u893_5_1683649978142_18932.mp3',
  'array': array([ 1.91111434e-02,  3.25603820e-02,  2.51497962e-02, ...,
          1.33048030e-04,  4.23055899e-05, -5.94349185e-05]),
  'sampling_rate': 16000},
 'transcription': 'Mmɔfra anan a ɔde bokiti ato nsu mu resaw. Asaase na ɛda wɔn ɛkyi no ɛyɛ kesee but ntama bɔ baako mu a ɔresrew. '}

In [13]:
# # save split dataset for future use
# split_folder = "/kaggle/input/fixed-ak-dataset2000/saved_dataset_split"

# dataset.save_to_disk(split_folder)

### **Prepare Feature Extractor, Tokenizer, Data**

In [14]:
model_ckeckpoint = "openai/whisper-small"
model_task = "transcribe"
model_language = "Yoruba"
hf_repo = "nyarkssss/epoch_small_pwer"

In [15]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained(f"{model_ckeckpoint}")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [16]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained(f"{model_ckeckpoint}", language=f"{model_language}", task=f"{model_task}")
# f"{{model_language}}"

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [17]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(f"{model_ckeckpoint}", language=f"{model_language}", task=f"{model_task}")

In [18]:
processor.save_pretrained(f"{hf_repo}", push_to_hub=True)

No files have been modified since last commit. Skipping to prevent empty commit.


[]

In [19]:
tokenizer.save_pretrained(f"{hf_repo}", push_to_hub=True)

No files have been modified since last commit. Skipping to prevent empty commit.


('nyarkssss/epoch_small_pwer/tokenizer_config.json',
 'nyarkssss/epoch_small_pwer/special_tokens_map.json',
 'nyarkssss/epoch_small_pwer/vocab.json',
 'nyarkssss/epoch_small_pwer/merges.txt',
 'nyarkssss/epoch_small_pwer/normalizer.json',
 'nyarkssss/epoch_small_pwer/added_tokens.json')

In [20]:
feature_extractor.save_pretrained(f"{hf_repo}", push_to_hub=True)

No files have been modified since last commit. Skipping to prevent empty commit.


['nyarkssss/epoch_small_pwer/preprocessor_config.json']

In [21]:
# verify that the tokenizer correctly encodes Akan characters 

input_str = dataset["train"][0]["transcription"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str} \n")
print(f"Decoded w/ special:    {decoded_with_special} \n")
print(f"Decoded w/out special: {decoded_str}\n")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 Ogya atɔ baabi ama baabi ahyew ama hɔ nyinaa nneɛma a ɛwɔ hɔ nyinaa ayɛ dadaw. Cars no a ɛwɔ hɔ nyinaa ayɛ tumm ɛwɔ hɔnom ma hɔnom nyinaa ayɛ nneɛma a ɛwɔ hɔ no nyinaa ayɛ basabasa ɛyɛ cars a ɛwɔ no nyinaa.  

Decoded w/ special:    <|startoftranscript|><|yo|><|transcribe|><|notimestamps|>Ogya atɔ baabi ama baabi ahyew ama hɔ nyinaa nneɛma a ɛwɔ hɔ nyinaa ayɛ dadaw. Cars no a ɛwɔ hɔ nyinaa ayɛ tumm ɛwɔ hɔnom ma hɔnom nyinaa ayɛ nneɛma a ɛwɔ hɔ no nyinaa ayɛ basabasa ɛyɛ cars a ɛwɔ no nyinaa. <|endoftext|> 

Decoded w/out special: Ogya atɔ baabi ama baabi ahyew ama hɔ nyinaa nneɛma a ɛwɔ hɔ nyinaa ayɛ dadaw. Cars no a ɛwɔ hɔ nyinaa ayɛ tumm ɛwɔ hɔnom ma hɔnom nyinaa ayɛ nneɛma a ɛwɔ hɔ no nyinaa ayɛ basabasa ɛyɛ cars a ɛwɔ no nyinaa. 

Are equal:             True


### **Prepare Dataset for Whisper Model**

In [22]:
def prepare_dataset(batch):
    # load audio
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch

In [23]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=4)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1600 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/400 [00:00<?, ? examples/s]



## **Model Training and Evaluation**

### **Load a Pre-Trained Model Checkpoint**

In [24]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(f"{model_ckeckpoint}", use_cache=False)

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [25]:
# setup model generation config

model.generation_config.forced_decoder_ids = None
model.config.suppress_tokens = []
# model.generation_config.task = "transcribe"
# model.generation_config.language = "yoruba"

In [26]:
# # setup model config

model.config.dropout = 0.15
model.config.activation_dropout = 0.1

# model.config.use_cache = False

In [27]:
# model.freeze_encoder()

### **Define a Data Collator**

In [28]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [29]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

### **Configure Evaluation Metrics**

In [30]:
import evaluate

metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [31]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [32]:
# # normalise transcriptions and predictions
# import jiwer   

# transcriptions_transform = jiwer.Compose([
# 	jiwer.ToLowerCase(),
# 	# jiwer.RemoveKaldiNonWords(),
# 	jiwer.RemoveMultipleSpaces(),
# 	jiwer.Strip(),
# 	jiwer.RemovePunctuation(),
# 	jiwer.ReduceToListOfListOfWords(),
# ])

In [33]:
# def compute_metrics(pred):
# 	pred_ids = pred.predictions
# 	label_ids = pred.label_ids

#     # replace -100 with the pad_token_id
# 	label_ids[label_ids == -100] = tokenizer.pad_token_id  # replace -100 with the pad_token_id

# 	wer = jiwer.wer(  # we do not want to group tokens when computing the metrics
# 		reference=tokenizer.batch_decode(label_ids, skip_special_tokens=True),
# 		hypothesis=tokenizer.batch_decode(pred_ids, skip_special_tokens=True),
# 		reference_transform=transcriptions_transform, hypothesis_transform=transcriptions_transform
# 	)
    
# 	return {"wer": 100 * wer}

### **Seting up Training Callbacks**

In [34]:
from transformers import EarlyStoppingCallback

# defining early stopping criteria
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

In [35]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{hf_repo}",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-4, #2.5e-5, 2.5e-5
    warmup_steps=200,
    # warmup_ratio=0.2,
    # num_train_epochs=10,
    max_steps=2000,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    fp16=True,
    eval_strategy="steps", #steps, epoch
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # save_strategy="epoch",
    save_steps=250,
    eval_steps=250,
    logging_steps=25,
    resume_from_checkpoint=True,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    save_total_limit=1,
    optim="adamw_bnb_8bit",
)

In [36]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    callbacks=[early_stopping],
)


  trainer = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


### **Model Training**

In [37]:
trainer.train() #resume_from_checkpoint=True  

  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
250,0.3432,0.663709,46.895973
500,0.1538,0.712659,45.729103
750,0.0515,0.834023,45.66809
1000,0.0282,0.864405,42.647956
1250,0.0111,0.892264,42.602196
1500,0.0016,0.905549,40.66504
1750,0.0003,0.925552,40.466748
2000,0.0003,0.931576,40.497254


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instea

TrainOutput(global_step=2000, training_loss=0.15395174433570355, metrics={'train_runtime': 22506.3266, 'train_samples_per_second': 1.422, 'train_steps_per_second': 0.089, 'total_flos': 9.23473281024e+18, 'train_loss': 0.15395174433570355, 'epoch': 20.0})

In [38]:
# push trained model to huggingface repo

model_name = (model_ckeckpoint.split('/')[1]).split('-')[1]

kwargs = {
    "dataset_tags": "speech_data_ghana_ug",
    "dataset": "Speech Data Ghana UG - Ghanaian Multilingual Sample Data",
    "language": "ak",
    "model_name": f"Whisper {model_name.capitalize()} Akan",
    "finetuned_from": f"openai/whisper-{model_name.lower()}",
    "tasks": "automatic-speech-recognition",
    "tags": f"akan-whispher-{model_name.lower()}",
}

trainer.push_to_hub(**kwargs)

CommitInfo(commit_url='https://huggingface.co/nyarkssss/epoch_small_pwer/commit/97a2bab2503b020900ca07b1f21f4075e9ee5488', commit_message='End of training', commit_description='', oid='97a2bab2503b020900ca07b1f21f4075e9ee5488', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nyarkssss/epoch_small_pwer', endpoint='https://huggingface.co', repo_type='model', repo_id='nyarkssss/epoch_small_pwer'), pr_revision=None, pr_num=None)

### **Miscellaneous**

In [39]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# FGHJKL