In [1]:
from sparseml.pytorch.optim import ScheduledModifierManager
from fastprogress import master_bar, progress_bar
from fastai.vision.all import SimpleNamespace, set_seed
import wandb
import whisper
import torch
from datasets import load_dataset, DatasetDict, Audio
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from dataset import JvsSpeechDataset, WhisperDataCollatorWhithPadding
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from os.path import isfile
import evaluate
from utils import (
    create_dirs_if_not_exist,
    set_weight_decay,
    define_metrics,
    compute_metrics,
)
import os

models_dir = os.getenv("MODELS")
model_size = "small"
run_name = f"{model_size}_sparsify_full_data"
save_dir = f"{models_dir}/checkpoints/FinetuneWhisper/{model_size}/"
model_to_load = "small_10e_full_data_(9).tar"

if not isfile(f"{save_dir}{model_to_load}"):
    raise ValueError("Model to load does not exist")

create_dirs_if_not_exist(save_dir)

config = SimpleNamespace(
    seed=42,
    lr=0.0005,
    batch_size=2,
    epochs=10,
    dropout=0.2,
    weight_decay=0.01,
    acu_steps=128,
    sample_rate=16000,
)

run = wandb.init(
    project="finetune-whisper",
    entity="ludeksvoboda",
    config=config,
    job_type=run_name,
    name=run_name,
)

set_seed(config.seed)

config = wandb.config

common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_13_0", "cs", split="train+validation", token=True
)
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_13_0", "cs", split="test", token=True
)
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)

feature_extractor = WhisperFeatureExtractor.from_pretrained(
    f"openai/whisper-{model_size}"
)
tokenizer = WhisperTokenizer.from_pretrained(
    f"openai/whisper-{model_size}", language="cs", task="transcribe"
)
processor = WhisperProcessor.from_pretrained(
    f"openai/whisper-{model_size}", language="cs", task="transcribe"
)

common_voice = common_voice.cast_column(
    "audio", Audio(sampling_rate=config.sample_rate)
)


def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch


common_voice = common_voice.map(
    prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=4
)

woptions = whisper.DecodingOptions(language="cs", without_timestamps=True)
model = whisper.load_model(model_size)

checkpoint = torch.load(f"{save_dir}{model_to_load}")
model.load_state_dict(checkpoint["model_state_dict"])

dataset = JvsSpeechDataset(common_voice["train"])
loader = torch.utils.data.DataLoader(
    dataset, batch_size=config.batch_size, collate_fn=WhisperDataCollatorWhithPadding()
)

test_dataset = JvsSpeechDataset(common_voice["test"])
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    collate_fn=WhisperDataCollatorWhithPadding(),
    shuffle=False,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mludeksvoboda[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
model = whisper.load_model(model_size).encoder

In [2]:
batch = next(iter(loader))

In [8]:
print((batch["input_ids"].shape))

torch.Size([2, 80, 3000])


In [8]:
models_dir = os.getenv("MODELS")
model_size = "small"
save_dir = f"{models_dir}/checkpoints/FinetuneWhisper/{model_size}/"
model_to_load = "small_sparsify_full_data_(9).tar"

if not isfile(f"{save_dir}{model_to_load}"):
    raise ValueError("Model to load does not exist")

model = whisper.load_model(model_size)

checkpoint = torch.load(f"{save_dir}{model_to_load}")
model.load_state_dict(checkpoint["model_state_dict"])
device = torch.device("cpu")
model = model.eval().to(device)

model = model.encoder

In [None]:
out = model.decoder(dec_input_ids, audio_features)

In [4]:
encoder = model.encoder

AttributeError: 'AudioEncoder' object has no attribute 'encoder'

In [6]:
batch['input_ids']

tensor([[[-0.6728, -0.6728, -0.6728,  ..., -0.6728, -0.6728, -0.6728],
         [-0.6728, -0.6728, -0.6728,  ..., -0.6728, -0.6728, -0.6728],
         [-0.6728, -0.6728, -0.6728,  ..., -0.6728, -0.6728, -0.6728],
         ...,
         [-0.6728, -0.6728, -0.6728,  ..., -0.6728, -0.6728, -0.6728],
         [-0.6728, -0.6728, -0.6728,  ..., -0.6728, -0.6728, -0.6728],
         [-0.6728, -0.6728, -0.6728,  ..., -0.6728, -0.6728, -0.6728]],

        [[-0.7202, -0.7202, -0.7202,  ..., -0.7202, -0.7202, -0.7202],
         [-0.7202, -0.7202, -0.7202,  ..., -0.7202, -0.7202, -0.7202],
         [-0.7202, -0.7202, -0.7202,  ..., -0.7202, -0.7202, -0.7202],
         ...,
         [-0.7202, -0.7202, -0.7202,  ..., -0.7202, -0.7202, -0.7202],
         [-0.7202, -0.7202, -0.7202,  ..., -0.7202, -0.7202, -0.7202],
         [-0.7202, -0.7202, -0.7202,  ..., -0.7202, -0.7202, -0.7202]]])

In [5]:
f = model(batch["input_ids"].cuda())

In [11]:
print(batch["dec_input_ids"].shape)

torch.Size([2, 31])


In [10]:
print(f.shape)

torch.Size([2, 1500, 768])


In [18]:
torch.randint(20, (1, 31))

tensor([[15, 16, 10,  5, 15,  1, 14, 15,  0, 10,  1,  2, 17,  7,  1,  1, 12, 12,
          9, 19,  9,  8,  7, 18,  0,  9,  0, 18,  3,  9, 16]])

: 

In [12]:
torch.cuda.empty_cache()