# Whisper

general-purpose speech recognition model

In [None]:
# Show the GPU(s) visible to this runtime; later cells move models to "cuda".
!nvidia-smi

download audio

In [None]:
# Use the %pip magic (as the other install cells do) so the package is
# installed into the environment of the running kernel, not whatever `pip`
# happens to be first on the shell PATH.
%pip install -q yt-dlp

In [None]:
from yt_dlp import YoutubeDL

# Options passed to YoutubeDL: pick the best audio-only format, always write
# to "audio.<ext>" (overwriting an earlier download) and run the FFmpeg
# audio-extraction postprocessor with WAV as the preferred codec.
ydl_opts = dict(
	format="bestaudio",
	outtmpl="audio.%(ext)s",
	overwrites=True,
	postprocessors=[dict(key="FFmpegExtractAudio", preferredcodec="wav")],
)

In [None]:
# Download the video's audio using the options defined above.
with YoutubeDL(ydl_opts) as ydl:
	# download() takes a list of URLs, so pass one explicitly instead of
	# relying on a bare string being accepted. The entry is a YouTube video ID.
	ydl.download(["0ncZ-4BENRU"])

## Voice extraction

for better transcription

using Facebook's SOTA `demucs` source-separation model

In [None]:
# Demucs provides the source-separation model used below to isolate the vocals.
%pip install -q demucs

In [None]:
import torchaudio
from demucs.pretrained import get_model
from demucs.separate import load_track
from demucs.apply import apply_model

In [None]:
# Fetch the pretrained "htdemucs" separation model and place it on the GPU.
demucs_model = get_model("htdemucs").to("cuda")

In [None]:
# Load the downloaded track with the channel count and sample rate the model expects.
raw_audio = load_track("audio.wav", demucs_model.audio_channels, demucs_model.samplerate)
# should not be on GPU because sometimes not enough VRAM

# Bring the waveform to the (batch, channels, samples) layout that the next
# cell indexes into, with two channels. The steps are independent `if`s rather
# than an if/elif chain, so a 2-D mono tensor of shape (1, samples) is both
# duplicated to stereo AND given its batch axis (the chain skipped the latter).
if raw_audio.dim() == 1:
	raw_audio = raw_audio[None]  # (samples,) -> (1, samples)
if raw_audio.shape[-2] == 1:
	raw_audio = raw_audio.repeat_interleave(2, -2)  # mono -> two identical channels
if raw_audio.dim() < 3:
	raw_audio = raw_audio[None]  # prepend the batch axis

In [None]:
# Run the separation model on the mix. The result is indexed below as
# [batch item, source, ...].
demucs_extract = apply_model(demucs_model, raw_audio, device="cuda", split=True, overlap=.25)

# Pick the "vocals" source of the first (only) batch item, average over its
# leading axis (presumably the two channels) to get mono, then restore a
# leading axis of size 1 before saving.
vocals_index = demucs_model.sources.index("vocals")
vocals = demucs_extract[0, vocals_index]
vocals_mono = vocals.mean(0)[None]
torchaudio.save("audio.vocals.wav", vocals_mono, demucs_model.samplerate)

## Original model

In [None]:
# Install the `whisper` package imported in the next code cell.
%pip install -q openai-whisper

5 models:

|  Size  | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
|  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~32x      |
|  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~16x      |
| small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~6x       |
| medium |   769 M    |    `medium.en`     |      `medium`      |     ~5 GB     |      ~2x       |
| large  |   1550 M   |        N/A         |      `large`       |    ~10 GB     |       1x       |

In [None]:
import whisper
# Pin the checkpoint explicitly. The bare "large" alias is not tied to one
# release, whereas the fine-tuned weights loaded into this model further down
# are stated to match large-v2, so the layer shapes must be large-v2's.
model = whisper.load_model("large-v2")
# Writer that emits SubRip (.srt) subtitle files into the current directory.
writer = whisper.utils.get_writer("srt", ".")

In [None]:
# Decode the downloaded track into the audio array passed to transcribe().
# NOTE(review): this reads "audio.wav", not the "audio.vocals.wav" written by
# the voice-extraction section — confirm which input is intended.
audio = whisper.load_audio("audio.wav")
# Transcribe the whole track; `result` is written out as subtitles further down.
result = model.transcribe(audio, verbose=False)

possible extension: Silero VAD (voice activity detection) to drop silent / non-speech stretches before transcription

save as subtitle `.srt` file

In [None]:
# Write the transcription as an SRT subtitle file using the "srt" writer
# created together with the model.
writer(result, "yolo.srt")

### load fine-tuned model from `huggingface`

The fine-tuned checkpoint should have the same size (architecture) as `openai/whisper-large-v2`, so that its weights fit the model loaded above.

In [None]:
# -nc (no-clobber): skip the download when pytorch_model.bin already exists, so
# re-running this cell neither re-fetches the checkpoint nor leaves a
# "pytorch_model.bin.1" copy next to the file that is actually loaded.
# If a previous download was interrupted, delete the partial file first.
!wget -nc https://huggingface.co/vumichien/whisper-large-v2-mix-jp/resolve/main/pytorch_model.bin

In [None]:
import torch
# The checkpoint is a pickle downloaded from the internet. weights_only=True
# restricts unpickling to tensors and plain containers, so loading the file
# cannot execute arbitrary code embedded in it.
hf_state_dict = torch.load(
	"pytorch_model.bin",
	map_location=torch.device("cpu"),  # in case not enough VRAM
	weights_only=True,
)

compare `model.state_dict().keys()` vs `hf_state_dict.keys()`

ref:
 - https://github.com/openai/whisper/discussions/830#discussioncomment-4652413
 - https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/convert_openai_to_hf.py

In [None]:
def rename_keys(text):
	"""Map a Hugging Face Whisper parameter name to its OpenAI Whisper name.

	The substitutions are plain substring replacements applied one after the
	other, in the order listed. The order matters: the specific patterns
	(e.g. "final_layer_norm", ".self_attn_layer_norm") must be consumed before
	the generic "layer_norm" catch-all at the end.

	Args:
		text: A key of the Hugging Face state dict.

	Returns:
		The key with every substitution applied.
	"""
	substitutions = (
		# strip the wrapper prefix and rename the generic containers
		("model.", ""),
		("layers", "blocks"),
		# feed-forward sub-layers
		("fc1", "mlp.0"),
		("fc2", "mlp.2"),
		("final_layer_norm", "mlp_ln"),
		# self-attention
		(".self_attn.q_proj", ".attn.query"),
		(".self_attn.k_proj", ".attn.key"),
		(".self_attn.v_proj", ".attn.value"),
		(".self_attn_layer_norm", ".attn_ln"),
		(".self_attn.out_proj", ".attn.out"),
		# cross-attention
		(".encoder_attn.q_proj", ".cross_attn.query"),
		(".encoder_attn.k_proj", ".cross_attn.key"),
		(".encoder_attn.v_proj", ".cross_attn.value"),
		(".encoder_attn_layer_norm", ".cross_attn_ln"),
		(".encoder_attn.out_proj", ".cross_attn.out"),
		# final norms of each stack
		("decoder.layer_norm.", "decoder.ln."),
		("encoder.layer_norm.", "encoder.ln_post."),
		# embeddings
		("embed_tokens", "token_embedding"),
		("encoder.embed_positions.weight", "encoder.positional_embedding"),
		("decoder.embed_positions.weight", "decoder.positional_embedding"),
		# catch-all for any layer norm not matched above
		("layer_norm", "ln_post"),
	)
	for old, new in substitutions:
		text = text.replace(old, new)
	return text

In [None]:
# Rename every checkpoint key in place to the OpenAI Whisper naming scheme.
# Iterate over a snapshot of the keys because the dict is modified in the loop.
for key in [*hf_state_dict]:
	new_key = rename_keys(key)
	hf_state_dict[new_key] = hf_state_dict.pop(key)
# A Hugging Face checkpoint may carry a separate output projection
# ("proj_out.weight", presumably tied to the token embedding). The OpenAI model
# has no parameter of that name, so the strict load below would reject it as an
# unexpected key. Drop it if present; this is a no-op otherwise.
hf_state_dict.pop("proj_out.weight", None)
model.load_state_dict(hf_state_dict)

In [None]:
# Run the model again on the same audio, now with the fine-tuned weights loaded:
# the language is fixed to "ja" and the task is "translate" instead of the
# default. This overwrites the earlier `result`.
result = model.transcribe(audio, verbose=False, language="ja", task="translate")

## Faster Whisper

able to run `large` model as `float16` with <6GB VRAM

In [None]:
# faster-whisper provides the `faster_whisper` package used in this section.
# NOTE(review): transformers is presumably needed by the
# ct2-transformers-converter cell further down — confirm.
%pip install -q faster-whisper transformers

In [None]:
import faster_whisper
from tqdm import tqdm

# Load the "large-v2" model on the GPU. This rebinds `model`, which until now
# referred to the openai-whisper model from the previous section.
model = faster_whisper.WhisperModel("large-v2", device="cuda") # compute_type="float16"

In [None]:
def convert_to_hms(seconds: float) -> str:
	"""Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

	The value is rounded to whole milliseconds *before* it is split into
	fields, so a fractional part such as 0.9996 s carries over into the
	seconds (and minutes/hours) instead of producing an invalid ",1000"
	millisecond field.

	Args:
		seconds: Offset from the start of the audio, in seconds. Negative
			values are clamped to zero.

	Returns:
		The formatted timestamp, e.g. ``"01:01:01,500"``.
	"""
	total_ms = max(0, int(round(seconds * 1000)))
	total_seconds, milliseconds = divmod(total_ms, 1000)
	total_minutes, secs = divmod(total_seconds, 60)
	hours, minutes = divmod(total_minutes, 60)
	return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def convert_seg(segment: faster_whisper.transcribe.Segment) -> str:
	"""Render one segment as the body of an SRT entry (without the index line).

	Args:
		segment: Object exposing ``start``, ``end`` and ``text``.

	Returns:
		``"<start> --> <end>"`` on the first line and the segment text, with
		surrounding whitespace stripped, on the second.
	"""
	start_stamp = convert_to_hms(segment.start)
	end_stamp = convert_to_hms(segment.end)
	caption = segment.text.strip()
	return f"{start_stamp} --> {end_stamp}\n{caption}"

In [None]:
# Transcribe the track; `segments` is consumed one by one in the loop below and
# `info.duration` sizes the progress bar.
segments, info = model.transcribe("audio.wav", vad_filter=True) # Silero VAD model to remove silence

full_txt = []
timestamps = 0.0  # end time of the last segment seen; drives the progress bar
with tqdm(total=round(info.duration, 2), unit=" audio seconds") as pbar:
	for i, segment in enumerate(segments, start=1):
		# One SRT entry: running index, time range + text, then a blank separator line.
		full_txt.append("\n".join((str(i), convert_seg(segment), "", "")))
		# Advance the bar by the audio time covered since the previous segment.
		pbar.update(segment.end - timestamps)
		timestamps = segment.end
	# Fill the bar when the last segment ends before the end of the audio.
	remaining = info.duration - timestamps
	if remaining > 0:
		pbar.update(remaining)

with open("yolo.srt", mode="w", encoding="UTF-8") as f:
	f.write("".join(full_txt))

### load fine-tuned model from `huggingface`

The fine-tuned checkpoint should have the same size (architecture) as `openai/whisper-large-v2`.

In [None]:
# Convert the fine-tuned Hugging Face checkpoint into the "test-ct2" directory,
# quantized to float16; that directory is what the next cell loads.
!ct2-transformers-converter --model vumichien/whisper-large-v2-mix-jp --output_dir test-ct2 --quantization float16 # in case not enough RAM

In [None]:
# Load the converted model from the local "test-ct2" directory on the GPU in
# float16. This rebinds `model` once more.
model = faster_whisper.WhisperModel("test-ct2", device="cuda", compute_type="float16")