# Step 0 - Setting up environment

## Useful scripts for Colab

In [None]:
# Setup switch used by the cells below: cells guarded by `if not persistent:` (pip installs,
# git clones, file patches) run only while this flag is False.
# Leave it False if you're using the free version of Colab, which does not save your
# settings/data/env; set it to True to skip that one-time setup.
persistent = False

In [None]:
import os
from google.colab import files
import shutil


In [None]:
# Colab's file browser won't delete folders, so this cell does it from code.
# Flip the flag to True to wipe the cached openai/whisper-large-v2 download.
remove = False

if remove:
    shutil.rmtree(
        '/root/.cache/huggingface/hub/models--openai--whisper-large-v2'
    )

In [None]:
# for downloading entire folders, colab won't let me do it in the GUI
# Set `download` to True to zip `dir_to_zip` and send the archive to the browser.
download = False

if download:
  # Absolute path, consistent with the other cells: a relative 'content/...' path would be
  # resolved against the current working directory.
  dir_to_zip = '/content/distil-whisper' #@param {type: "string"}
  output_filename = 'output.zip' #@param {type: "string"}

  # Fail early with a clear message instead of trying to download an archive that was never built.
  if not os.path.isdir(dir_to_zip):
    raise FileNotFoundError(f"Folder to zip not found: '{dir_to_zip}'")

  # shutil.make_archive appends '.zip' itself, so strip it from the requested file name.
  if output_filename.lower().endswith('.zip'):
    archive_base = output_filename[:-len('.zip')]
  else:
    archive_base = output_filename

  # Archive the folder under its own name (e.g. 'distil-whisper/...') without going through
  # a shell, so names containing spaces or shell metacharacters are handled safely.
  parent_dir, folder_name = os.path.split(os.path.abspath(dir_to_zip))
  archive_path = shutil.make_archive(archive_base, 'zip', root_dir=parent_dir, base_dir=folder_name)

  files.download(archive_path)

## Installing everything

In [None]:
# One-time setup: clone the distil-whisper repo into the current working directory and
# check out commit 398e93b. The `cd` and `git checkout` are chained on a single `!` line
# so the checkout happens inside the cloned folder.
# Later cells reference /content/distil-whisper, so this assumes the working directory is /content.
if not persistent:
  !git clone https://github.com/huggingface/distil-whisper.git
  !cd distil-whisper && git checkout 398e93b # go back to old version of distil-whisper, latest has unfixable errors!

In [None]:
# One-time setup: install the Python dependencies, in this order:
#   1. upgrade pip itself
#   2. the training requirements shipped with the cloned distil-whisper repo
#   3. torch / torchvision / torchaudio from the cu121 wheel index
#   4. optimum and deepspeed
# NOTE(review): none of the explicit installs pin a version, so the resulting environment
# may differ between sessions.
if not persistent:
  !pip install --upgrade pip
  !pip install -r /content/distil-whisper/training/requirements.txt
  !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
  !pip install optimum
  !pip install deepspeed

In [None]:
# replace bettertransformer on line 532 with 'pass'
#
# The patch is only applied when line 532 really mentions bettertransformer, so a changed
# upstream script is reported instead of being silently corrupted. Re-running the cell on an
# already patched file is a no-op.

if not persistent:
  file_path = '/content/distil-whisper/training/run_pseudo_labelling.py'
  line_number = 532
  replacement = '        pass\n'

  try:
      # Explicit encoding so reading/writing does not depend on the session's locale.
      with open(file_path, 'r', encoding='utf-8') as file:
          lines = file.readlines()

      if not 1 <= line_number <= len(lines):
          print(f"Error: File '{file_path}' has {len(lines)} lines, cannot replace line {line_number}.")
      elif lines[line_number - 1] == replacement:
          print("Line already replaced.")
      elif 'bettertransformer' not in lines[line_number - 1].lower():
          # Unexpected content: leave the file untouched rather than overwrite unrelated code.
          print(f"Error: Line {line_number} of '{file_path}' does not mention bettertransformer, file left unchanged.")
      else:
          lines[line_number - 1] = replacement

          with open(file_path, 'w', encoding='utf-8') as file:
              file.writelines(lines)

          print("Line replaced.")

  except FileNotFoundError:
      print(f"Error: File '{file_path}' not found.")
  except IOError:
      print(f"Error: Unable to read or write to file '{file_path}'.")


In [None]:
# create accelerate config file, default works fine
# The import runs on every execution; the config file itself is only written during
# one-time setup (i.e. while `persistent` is False).
from accelerate.utils import write_basic_config

if not persistent:
  write_basic_config()

In [None]:
# connect Huggingface account
!git config --global credential.helper store
!huggingface-cli login --token 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 # PLEASE USE YOUR OWN TOKEN

In [None]:
# Set the locale to UTF-8, else the accelerate scripts won't run
import locale

# Keep the real function's optional `do_setlocale` argument so that code calling
# `locale.getpreferredencoding(False)` keeps working after the patch instead of
# raising a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# Configure PyTorch's CUDA allocator so it takes up a little less GPU space.
# This fixes the 'cuda out of memory' issue when evaluating with batch size 16.
os.environ.update({'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

## Test if install was successful

In [None]:
# Smoke test for the environment: transcribe one streamed Common Voice sample with
# whisper-tiny on the GPU and compare the length of the generated sequence.
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Audio

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", low_cpu_mem_usage=True)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

model.to("cuda")

# Single source of truth for the sampling rate: the same value is used to resample the
# audio and to tell the processor what it receives, so the two can never drift apart.
sampling_rate = processor.feature_extractor.sampling_rate

common_voice = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="validation", streaming=True)
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Only the first sample of the stream is needed.
inputs = processor(next(iter(common_voice))["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
input_features = inputs.input_features

generated_ids = model.generate(input_features.to("cuda"), max_new_tokens=128)
pred_text = processor.decode(generated_ids[0], skip_special_tokens=True)

print("Pred text:", pred_text)
# 19 is the expected number of generated token ids for this sample
# (presumably taken from a known-good run — TODO confirm).
print("Environment set up successful?", generated_ids.shape[-1] == 19)

# Step 1 - Pseudo-labelling

Run the data through the teacher (full) Whisper model to pseudo-label it.

In [None]:
# Pseudo-label the Dutch ("nl") VoxPopuli TRAIN split with the openai/whisper-medium model.
# Results go to ./voxpopuli_nl_TRAIN_pseudo_labelled and are pushed to the Hub (--push_to_hub).
# Other datasets/splits are presumably labelled by editing --dataset_name, --dataset_config_name,
# --dataset_split_name, --text_column_name, --id_column_name and --output_dir — TODO confirm.
# NOTE: with --report_to "wandb" the script asks for a W&B choice interactively (see the output below).
!accelerate launch distil-whisper/training/run_pseudo_labelling.py \
  --model_name_or_path "openai/whisper-medium" \
  --dataset_name "facebook/voxpopuli" \
  --dataset_config_name "nl" \
  --dataset_split_name "train" \
  --text_column_name "normalized_text" \
  --id_column_name "audio_id" \
  --output_dir "./voxpopuli_nl_TRAIN_pseudo_labelled" \
  --wandb_project "distil-whisper-labelling" \
  --per_device_eval_batch_size 16 \
  --dtype "float16" \
  --dataloader_num_workers 1 \
  --preprocessing_num_workers 1 \
  --logging_steps 500 \
  --max_label_length 128 \
  --report_to "wandb" \
  --language "nl" \
  --task "transcribe" \
  --return_timestamps \
  --attn_type "flash_attn" \
  --streaming False \
  --generation_num_beams 1 \
  --decode_token_ids False \
  --push_to_hub

2024-04-03 13:11:06.046309: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 13:11:06.046356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 13:11:06.048124: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: 3
[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0

# Step 2 - Initialisation

Create the student (distilled) model.

In [None]:
# uncomment to create new model repo on huggingface

# !huggingface-cli repo create distil-whisper-nl

In [None]:
# clone new model repo to Colab
# One-time setup: enable git-lfs, then clone the Hugging Face model repo into the current
# working directory. Later cells reference it as /content/distil-whisper-nl, so this
# assumes the working directory is /content.
if not persistent:
  !git lfs install
  !git clone https://huggingface.co/monsoonery/distil-whisper-nl

In [None]:
# Copy relevant scripts to huggingface repo
# One-time setup: create_student_model.py and run_distillation.py are copied from the
# distil-whisper checkout into the cloned model repo folder.
if not persistent:
  !cp /content/distil-whisper/training/create_student_model.py /content/distil-whisper-nl
  !cp /content/distil-whisper/training/run_distillation.py /content/distil-whisper-nl

In [None]:
# Initialize student model (aka the distilled model) and save it in the init folder
# The script is given openai/whisper-large-v2 as teacher checkpoint, 32 encoder layers and
# 2 decoder layers, and saves to /content/distil-whisper-nl/distil-large-v2-init.
# NOTE(review): the distillation cell in Step 3 passes openai/whisper-medium as the teacher,
# while the student here is initialised from whisper-large-v2 — confirm this is intentional.
if not persistent:
  !python /content/distil-whisper/training/create_student_model.py --teacher_checkpoint 'openai/whisper-large-v2' --encoder_layers 32 --decoder_layers 2 --save_dir "/content/distil-whisper-nl/distil-large-v2-init"

# Step 3 - Training

Use the pseudo-labelled datasets to train the newly created student/distilled model.

In [None]:
# clone pseudo-labelled datasets if needed (i.e. streaming set to False in cell below)

# if not persistent:
#   !git clone https://huggingface.co/monsoonery/common_voice_13_0_nl_TEST_pseudo_labelled
#   !git clone https://huggingface.co/monsoonery/common_voice_13_0_nl_TRAIN_pseudo_labelled

In [None]:
# train the model on commonvoice and voxpopuli
# - student: the checkpoint in /content/distil-whisper-nl/distil-large-v2-init
# - teacher: openai/whisper-medium
# - "+" separates per-dataset values (names, configs, splits, text columns, sample counts);
#   presumably the order must match across these flags — confirm against run_distillation.py
# - evaluation is switched off (--do_eval False), so the --eval_* settings are presumably
#   unused in this run — TODO confirm
# - datasets are streamed (--streaming True)
# - output goes to /content/distil-whisper-nl/ (overwritten via --overwrite_output_dir)
#   and is pushed to the Hub (--push_to_hub)
!accelerate launch /content/distil-whisper/training/run_distillation.py \
  --model_name_or_path "/content/distil-whisper-nl/distil-large-v2-init" \
  --teacher_model_name_or_path "openai/whisper-medium" \
  --train_dataset_name "monsoonery/voxpopuli_nl_TRAIN_pseudo_labelled+monsoonery/common_voice_13_0_nl_TRAIN_pseudo_labelled" \
  --train_dataset_config_name "nl+nl" \
  --train_split_name "train+train" \
  --text_column_name "normalized_text+sentence" \
  --train_dataset_samples "10+10" \
  --eval_dataset_name "monsoonery/voxpopuli_nl_EVAL_pseudo_labelled+monsoonery/common_voice_13_0_nl_EVAL_pseudo_labelled" \
  --eval_dataset_config_name "nl+nl" \
  --eval_split_name "validation+validation" \
  --eval_text_column_name "normalized_text+sentence" \
  --eval_steps 1000 \
  --save_steps 1000 \
  --warmup_steps 50 \
  --learning_rate 0.0001 \
  --lr_scheduler_type "constant_with_warmup" \
  --logging_steps 10 \
  --save_total_limit 1 \
  --max_steps 1000 \
  --wer_threshold 10 \
  --per_device_train_batch_size 8 --per_device_eval_batch_size 8 \
  --dataloader_num_workers 1 \
  --preprocessing_num_workers 2 \
  --ddp_timeout 7200 \
  --dtype "float16" \
  --output_dir "/content/distil-whisper-nl/" \
  --do_train \
  --do_eval False\
  --gradient_checkpointing \
  --overwrite_output_dir \
  --predict_with_generate \
  --freeze_encoder \
  --streaming True \
  --push_to_hub \
  --language "nl"

2024-04-04 13:11:47.499737: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-04 13:11:47.499803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-04 13:11:47.502176: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: 3
[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0