# fine-tune whisper large with PEFT-LoRA + int8

[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/phineas-pta/fine-tune-whisper-vi/blob/main/train/whisper-large-lora.ipynb)

on colab: mount gdrive using GUI before training

on kaggle: do not use the P100 GPU because int8 is not properly supported on it

https://github.com/openai/whisper/discussions/988

https://github.com/huggingface/peft/blob/main/examples/int8_training/peft_bnb_whisper_large_v2_training.ipynb

https://www.kaggle.com/code/leonidkulyk/train-infer-mega-pack-wav2vec2-whisper-qlora

evaluation on the test sets is disabled because it gets stuck indefinitely

In [None]:
# log in to the Hugging Face Hub from inside the notebook
from huggingface_hub import notebook_login
notebook_login()
# command-line alternative (token redacted):
# !huggingface-cli login --token=███

In [None]:
# workaround for a bug in `datasets` package:
# uninstall cudf / dask-cuda / dask-cudf, then install `cudf-cu12` from the NVIDIA package index
%pip uninstall -y cudf dask-cuda dask-cudf
%pip install -q cudf-cu12 --extra-index-url=https://pypi.nvidia.com
# install / upgrade the libraries imported by the following cells
%pip install -qU "datasets[audio]" accelerate transformers bitsandbytes peft
# metrics are not computed in this notebook, so `jiwer` is not installed

In [None]:
from dataclasses import dataclass
import datasets as hugDS
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, BitsAndBytesConfig, Seq2SeqTrainingArguments, Seq2SeqTrainer
import accelerate
import peft

In [None]:
SAMPLING_RATE = 16_000  # Hz; the "audio" column of every dataset is cast to this rate


def load_my_data(mode, **kwargs):
	"""Stream one dataset and reduce it to the columns used for training.

	Args:
		mode: how the columns of the source dataset are handled:
			0 - keep every column unchanged;
			1 - keep only "audio" and "transcription";
			2 - keep only "audio" and "sentence", renaming "sentence" to "transcription".
		**kwargs: forwarded to `datasets.load_dataset` (e.g. path, name, split).

	Returns:
		The streamed dataset with its "audio" column cast to SAMPLING_RATE.

	Raises:
		ValueError: if `mode` is not 0, 1 or 2.
	"""
	# validate first so that a bad mode fails before any dataset is resolved
	if mode not in (0, 1, 2):
		raise ValueError(f"unsupported mode {mode!r}: expected 0, 1 or 2")

	dataset = hugDS.load_dataset(**kwargs, trust_remote_code=True, streaming=True)
	dataset = dataset.cast_column("audio", hugDS.Audio(sampling_rate=SAMPLING_RATE))

	if mode == 1:
		return dataset.select_columns(["audio", "transcription"])
	if mode == 2:
		return dataset.select_columns(["audio", "sentence"]).rename_column("sentence", "transcription")
	return dataset

# training corpora as (load_dataset kwargs, column mode); every source uses its "train" split
# total: 159k samples
_TRAIN_SOURCES = (
	({"path": "google/fleurs", "name": "vi_vn"}, 1),                    # 3k
	({"path": "mozilla-foundation/common_voice_16_1", "name": "vi"}, 2),  # 2.3k
	({"path": "vivos"}, 2),                                              # 11.7k
	({"path": "doof-ferb/fpt_fosd"}, 0),                                 # 25.9k
	({"path": "doof-ferb/infore1_25hours"}, 0),                          # 14.9k
	({"path": "doof-ferb/vlsp2020_vinai_100h"}, 0),                      # 56.4k
	({"path": "doof-ferb/LSVSC"}, 1),                                    # 45k
)

# load the sources in the listed order and chain them into a single dataset
MY_DATA = hugDS.concatenate_datasets([
	load_my_data(**source_kwargs, split="train", mode=column_mode)
	for source_kwargs, column_mode in _TRAIN_SOURCES
])

In [None]:
modelID = "openai/whisper-large-v3"
FEATURE_EXTRACTOR = WhisperFeatureExtractor.from_pretrained(modelID)
TOKENIZER = WhisperTokenizer.from_pretrained(modelID, language="vi", task="transcribe")
MODEL = WhisperForConditionalGeneration.from_pretrained(
	modelID,
	use_cache=False,
	quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # load the weights in 8-bit
	device_map="auto",
)
# in case weird bug in `peft`: load_in_8bit=False, device_map={"": 0}

# clear the forced decoder ids and the suppressed-token list stored in the model config
MODEL.config.forced_decoder_ids = None
MODEL.config.suppress_tokens = []

# setup to train on multi-GPU with PEFT, see: https://github.com/huggingface/peft/issues/242#issuecomment-1491447956
# start from the device map chosen at load time and place the three modules below on the model's execution device
DEV_MAP = MODEL.hf_device_map.copy()
DEV_MAP.update(dict.fromkeys(
	("model.decoder.embed_tokens", "model.decoder.embed_positions", "proj_out"),
	MODEL._hf_hook.execution_device,
))
accelerate.dispatch_model(MODEL, device_map=DEV_MAP)
MODEL.model_parallel = True
MODEL.is_parallelizable = True

DUMMY_TOKEN = -100  # written over label padding by the data collator so that padding is ignored by the loss

In [None]:
def prepare_dataset(batch):
	"""Add model inputs, label ids and both lengths to one example.

	Reads `batch["audio"]["array"]` and `batch["transcription"]`; writes the
	"input_length", "input_features", "labels" and "labels_length" entries
	into `batch` and returns the same object.
	"""
	waveform = batch["audio"]["array"]
	sample_count = len(waveform)  # raw input length, in samples
	log_mel = FEATURE_EXTRACTOR(waveform, sampling_rate=SAMPLING_RATE).input_features[0]  # log-Mel input features
	token_ids = TOKENIZER(batch["transcription"]).input_ids  # target text encoded as label ids

	batch["input_length"] = sample_count
	batch["input_features"] = log_mel
	batch["labels"] = token_ids
	batch["labels_length"] = len(token_ids)
	return batch

def filter_inputs(input_length):
	"""Return True for inputs that are non-empty and strictly shorter than 30 s."""
	max_samples = 30 * 16_000  # 30s × 16kHz = 480 000 samples
	return input_length > 0 and input_length < max_samples

def filter_labels(labels_length):
	"""Return True for label sequences strictly shorter than 448 tokens."""
	token_limit = 448  # MODEL.config.max_length
	return token_limit > labels_length

# lazily preprocess every example, keep only those passing both length filters,
# then drop the two helper length columns that exist only for filtering
MY_DATA = (MY_DATA
	# .shuffle(seed=42)  # useless coz streaming multiple datasets (cannot set buffer too high coz not enough RAM)
	.map(prepare_dataset)  # no `num_proc` coz streaming
	.filter(filter_inputs, input_columns=["input_length"])
	.filter(filter_labels, input_columns=["labels_length"])
	# explicit step: `remove_columns` is not among the documented parameters of `filter` on a streaming dataset
	.remove_columns(["input_length", "labels_length"])
)  # TODO: enable `batched=True` but don’t know how to write functions

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
	"""Collate preprocessed examples into one batch of padded input features and labels."""

	def __call__(self, features):
		# inputs and labels have different lengths and need different padding methods, so they are padded separately
		audio_items = [{"input_features": example["input_features"]} for example in features]
		text_items = [{"input_ids": example["labels"]} for example in features]  # tokenized label sequences

		batch = FEATURE_EXTRACTOR.pad(audio_items, return_tensors="pt")  # audio inputs returned as torch tensors
		padded_text = TOKENIZER.pad(text_items, return_tensors="pt")  # labels padded to the longest sequence

		# overwrite padding positions with DUMMY_TOKEN so that they are ignored by the loss
		is_padding = padded_text.attention_mask.ne(1)
		labels = padded_text["input_ids"].masked_fill(is_padding, DUMMY_TOKEN)

		# if every sequence starts with the bos token (added during tokenization), cut it: it is appended later anyway
		starts_with_bos = (labels[:, 0] == TOKENIZER.bos_token_id).all().cpu().item()
		batch["labels"] = labels[:, 1:] if starts_with_bos else labels
		return batch


DATA_COLLATOR = DataCollatorSpeechSeq2SeqWithPadding()

In [None]:
# prepare the quantized model for k-bit training (with gradient checkpointing),
# then attach LoRA adapters to the modules named `q_proj` and `v_proj`
MODEL_BIS = peft.get_peft_model(
	peft.prepare_model_for_kbit_training(
		MODEL,
		use_gradient_checkpointing=True,
		gradient_checkpointing_kwargs={"use_reentrant": False},
	),
	peft.LoraConfig(
		r=32,
		lora_alpha=64,
		target_modules=["q_proj", "v_proj"],
		lora_dropout=.05,
		bias="none",
	),
	# 3× bigger alternative:
	# peft.LoraConfig(r=32, lora_alpha=64, target_modules=["fc1", "fc2", "q_proj", "v_proj", "k_proj", "out_proj"], lora_dropout=.1, bias="none")
)
MODEL_BIS.print_trainable_parameters()  # 16 millions = 1% of 1.6 billions params of whisper large v3
# if 3× bigger: 57 millions = 3.6% of 1.6 billions params, but no better WER improvement

In [None]:
# mount gdrive using GUI before training
# colab: change into the working folder on the mounted drive
%cd '/content/drive/My Drive/coder'
# kaggle alternative:
# %cd /kaggle/working
# uncomment to delete a previous output directory before starting over:
# !rm -rf ./my-whisper-lora

In [None]:
SAVE_PATH = "./my-whisper-lora"  # mount gdrive using GUI before training
BATCH_SIZE = 16  # should be a multiple of 8

# colab free tier can only run for 8-12h max daily
# kaggle free tier can only run for 30h max weekly but max 12h per session

TRAINING_ARGS = Seq2SeqTrainingArguments(
	output_dir=SAVE_PATH,
	per_device_train_batch_size=BATCH_SIZE,
	per_device_eval_batch_size=BATCH_SIZE,
	fp16=True,
	# bf16=True, tf32=True, torch_compile=True,  # GPU Ampere or later
	report_to=["tensorboard"],

	max_steps=3600,  # no `num_train_epochs` coz streaming
	logging_steps=25,
	save_steps=50,
	save_total_limit=3,
	# evaluation stays disabled: the evaluation strategy defaults to "no", so the deprecated
	# `evaluation_strategy` keyword is intentionally not passed
	# to re-enable: eval_strategy="steps" (`evaluation_strategy` on older transformers) together with eval_steps=50

	learning_rate=1e-3,
	warmup_ratio=.05,  # keep between 5-15%
	# gradient_accumulation_steps=1,  # to increase if decrease batch size
	remove_unused_columns=False, label_names=["labels"],  # required by PEFT
	# predict_with_generate=True,  # must disable coz PEFT
)

# trainer over the streamed training set only: no eval dataset and no metric function are passed
TRAINER = Seq2SeqTrainer(
	model=MODEL_BIS,
	args=TRAINING_ARGS,
	data_collator=DATA_COLLATOR,
	train_dataset=MY_DATA,
	tokenizer=FEATURE_EXTRACTOR,  # not TOKENIZER
	# compute_metrics=compute_metrics,  # must disable coz PEFT
)

In [None]:
# to resume an interrupted run, call TRAINER.train(resume_from_checkpoint=True) instead
TRAINER.train()

In [None]:
# save the final model (no path is given — presumably it goes to SAVE_PATH; confirm), then archive ./my-whisper-lora into res.zip
TRAINER.save_model()
!zip -FSr res.zip ./my-whisper-lora