https://dev.classmethod.jp/articles/whisper-fine-tuning-by-huggingface/
Hugging FaceでOpenAIの音声認識”Whisper”をFine Tuningする方法が公開されました

In [1]:
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Get:5 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Get:11 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [3,226 kB]
Ign:12 http://ppa.launchpad.net/jonathonf/ffmpeg-4/ubuntu focal InRelease
Hit:13 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu focal InRelease
Err:14 http://pp

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0"
    , "ja", split="train", use_auth_token=True)
common_voice["validation"] = load_dataset("mozilla-foundation/common_voice_11_0"
    , "ja", split="validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0"
    , "ja", split="test", use_auth_token=True)



In [4]:
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 6505
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 4485
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 4604
    })
})


In [5]:
common_voice["train"][0]

{'client_id': '3c9d94093b88b178118bb3be8de72d64c0423a0b8e32bfd799ce83e1e54249d8552205c1a9407bc11a3a0400d934b0277042caeb24e90b8087a83d884591e5b5',
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/0fb75edb8bd454df8bb23ae8bee20532bae3434c4b63d0b4e126d443f35c56e7/common_voice_ja_25861545.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/0fb75edb8bd454df8bb23ae8bee20532bae3434c4b63d0b4e126d443f35c56e7/common_voice_ja_25861545.mp3',
  'array': array([ 0.00000000e+00, -9.94759830e-14,  3.19744231e-14, ...,
         -1.35150884e-07, -2.39865514e-08, -3.71753970e-08]),
  'sampling_rate': 48000},
 'sentence': '別の話を持ちかけられた。',
 'up_votes': 2,
 'down_votes': 0,
 'age': 'thirties',
 'gender': 'male',
 'accent': '',
 'locale': 'ja',
 'segment': ''}

In [6]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id"
    , "down_votes", "gender", "locale", "path", "segment", "up_votes"])
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 6505
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4485
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4604
    })
})

In [7]:
# 実験のためデータセットを縮小したい場合はコチラを有効化
common_voice = DatasetDict({
    "train": common_voice['train'].select(range(100)),
    "validation": common_voice['validation'].select(range(100)),
    "test": common_voice['test'].select(range(100)),
})

In [8]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 100
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 100
    })
})

In [9]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [10]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [11]:
# サンプル
batch = common_voice["train"][0]

# 変換実行
audio = batch["audio"]
input_features = feature_extractor(audio["array"]
    , sampling_rate=audio["sampling_rate"]).input_features[0]

print(input_features.shape)

(80, 3000)


In [12]:
input_features

array([[-0.69148946, -0.69148946, -0.69148946, ..., -0.69148946,
        -0.69148946, -0.69148946],
       [-0.69148946, -0.69148946, -0.69148946, ..., -0.69148946,
        -0.69148946, -0.69148946],
       [-0.69148946, -0.69148946, -0.69148946, ..., -0.69148946,
        -0.69148946, -0.69148946],
       ...,
       [-0.69148946, -0.69148946, -0.69148946, ..., -0.69148946,
        -0.69148946, -0.69148946],
       [-0.69148946, -0.69148946, -0.69148946, ..., -0.69148946,
        -0.69148946, -0.69148946],
       [-0.69148946, -0.69148946, -0.69148946, ..., -0.69148946,
        -0.69148946, -0.69148946]], dtype=float32)

In [13]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small"
    , language="Japanese", task="transcribe")

In [14]:
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 別の話を持ちかけられた。
Decoded w/ special:    <|startoftranscript|><|ja|><|transcribe|><|notimestamps|>別の話を持ちかけられた。<|endoftext|>
Decoded w/out special: 別の話を持ちかけられた。
Are equal:             True


In [15]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small"
    , language="Japanese", task="transcribe")

In [16]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = processor.feature_extractor(audio["array"]
        , sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

In [17]:
common_voice = common_voice.map(prepare_dataset
    , remove_columns=common_voice.column_names["train"], num_proc=1)



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [18]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int]
        , torch.Tensor]]]) -> Dict[str, torch.Tensor]:

        # 音響特徴量側をまとめる処理
        # (一応バッチ単位でパディングしているが、すべて30秒分であるはず)
        input_features \
            = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # トークン化された系列をバッチ単位でパディング
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # attention_maskが0の部分は、トークンを-100に置き換えてロス計算時に無視させる
        # -100を無視するのは、PyTorchの仕様
        labels \
            = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # BOSトークンがある場合は削除
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        # 整形したlabelsをバッチにまとめる
        batch["labels"] = labels

        return batch

In [19]:
features = common_voice["train"][0:5]
label_features = [{"input_ids": labels} for labels in features["labels"]]
labels_batch = processor.tokenizer.pad(label_features, return_tensors="pt")
labels_batch

{'input_ids': tensor([[50258, 50266, 50359, 50363, 16158,  2972, 11103,  5998, 17694,  6574,
          3703,  7625,  5154, 35478,  1543, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257],
        [50258, 50266, 50359, 50363, 33261, 33261,  1543, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257],
        [50258, 50266, 50359, 50363, 33393, 12488,  5591,  1231, 29098, 47219,
           123, 41397,  1543, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257],
        [50258, 50266, 50359, 50363, 35268, 15266,   232,  8276, 39406,  3193,
          8040, 17794, 39780, 48405, 32026,  1231,  7594, 39046,  5142, 13806,
           171,   120,   253, 50257],
        [50258, 50266, 50359, 50363, 38739,   250,  7732,    95, 22570, 36215,
          2474, 40063,   108, 47885,  4108, 11561,    95, 25266,   106, 43200,
          4895, 27113, 50257, 50257]]), 'att

In [20]:
processor.tokenizer.decode(labels_batch["input_ids"][0])

'<|startoftranscript|><|ja|><|transcribe|><|notimestamps|>別の話を持ちかけられた。<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [21]:
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
labels

tensor([[50258, 50266, 50359, 50363, 16158,  2972, 11103,  5998, 17694,  6574,
          3703,  7625,  5154, 35478,  1543, 50257,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [50258, 50266, 50359, 50363, 33261, 33261,  1543, 50257,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [50258, 50266, 50359, 50363, 33393, 12488,  5591,  1231, 29098, 47219,
           123, 41397,  1543, 50257,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [50258, 50266, 50359, 50363, 35268, 15266,   232,  8276, 39406,  3193,
          8040, 17794, 39780, 48405, 32026,  1231,  7594, 39046,  5142, 13806,
           171,   120,   253, 50257],
        [50258, 50266, 50359, 50363, 38739,   250,  7732,    95, 22570, 36215,
          2474, 40063,   108, 47885,  4108, 11561,    95, 25266,   106, 43200,
          4895, 27113, 50257,  -100]])

In [22]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [23]:
!pip install ginza==4.0.5 ja-ginza 
!pip install sortedcontainers~=2.1.0
import pkg_resources, imp
imp.reload(pkg_resources)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


<module 'pkg_resources' from '/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py'>

In [24]:
import evaluate
import spacy
import ginza

metric = evaluate.load("wer")
nlp = spacy.load("ja_ginza")
ginza.set_split_mode(nlp, "C") # CはNEologdの意らしいです

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # 分かち書きして空白区切りに変換
    pred_str = [" ".join([ str(i) for i in nlp(j) ]) for j in pred_str]
    label_str = [" ".join([ str(i) for i in nlp(j) ]) for j in label_str]

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [25]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [26]:
model.config.forced_decoder_ids \
    = processor.get_decoder_prompt_ids(language = "ja", task = "transcribe")
model.config.suppress_tokens = []

In [27]:
processor.tokenizer.decode([i[1] for i in model.config.forced_decoder_ids])

'<|ja|><|transcribe|><|notimestamps|>'

In [28]:
!pwd

/content


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.19.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`


In [29]:
!pip show accelerate

Name: accelerate
Version: 0.19.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, packaging, psutil, pyyaml, torch
Required-by: 


In [30]:
import sys
print(sys.path)

['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/root/.ipython', '/root/.cache/huggingface/modules']


In [31]:
# !pip install accelerate==0.19.0
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
!pip show pytorch

[0m

In [34]:
!accelerate env


Copy-and-paste the text below in your GitHub issue

- `Accelerate` version: 0.19.0
- Platform: Linux-5.15.107+-x86_64-with-glibc2.31
- Python version: 3.10.11
- Numpy version: 1.22.4
- PyTorch version (GPU?): 2.0.1+cu118 (True)
- System RAM: 12.68 GB
- GPU type: Tesla T4
- `Accelerate` default config:
	Not found


↓でエラーになった場合は、!pip install accelerate -U 後、カーネル再起動で最初から実行するとOK（https://discuss.huggingface.co/t/importerror-using-the-trainer-with-pytorch-seq2seqtrainingarguments/40599）

Trying to implement Whisper fine tuning on colab: Google Colab 3
When trying the step: training_args = Seq2SeqTrainingArguments(…) got the following error message:
ImportError: Using the Trainer with PyTorch requires accelerate>=0.19.0: Please run pip install transformers[torch] or pip install accelerate -U

In [36]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-ja",  # change to a repo name of your choice
    # output_dir=".",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    # warmup_steps=500, # Hugging Faceブログではこちら
    warmup_steps=5,
    # max_steps=4000, # Hugging Faceブログではこちら
    max_steps=40,
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # save_steps=1000, # Hugging Faceブログではこちら
    save_steps=10,
    # eval_steps=1000, # Hugging Faceブログではこちら
    eval_steps=10,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [37]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [38]:
import pandas as pd
pd.DataFrame([
    {"split":"train"
        , "wer": trainer.predict(common_voice["train"]).metrics["test_wer"]},
    {"split":"validation"
        , "wer": trainer.predict(common_voice["validation"]).metrics["test_wer"]},
    {"split":"test"
        , "wer": trainer.predict(common_voice["test"]).metrics["test_wer"]}
])

Unnamed: 0,split,wer
0,train,32.523759
1,validation,23.704333
2,test,25.29458


In [39]:
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
10,No log,1.957535,20.220901
20,No log,0.894369,22.429907
30,2.129900,0.764919,19.796092
40,2.129900,0.737672,20.390824


TrainOutput(global_step=40, training_loss=1.55040385723114, metrics={'train_runtime': 499.3451, 'train_samples_per_second': 1.282, 'train_steps_per_second': 0.08, 'total_flos': 1.673795321856e+17, 'train_loss': 1.55040385723114, 'epoch': 5.71})

In [40]:
pd.DataFrame([
    {"split":"train"
        , "wer": trainer.predict(common_voice["train"]).metrics["test_wer"]},
    {"split":"validation"
        , "wer": trainer.predict(common_voice["validation"]).metrics["test_wer"]},
    {"split":"test"
        , "wer": trainer.predict(common_voice["test"]).metrics["test_wer"]}
])

Unnamed: 0,split,wer
0,train,4.435058
1,validation,19.796092
2,test,23.723488


In [41]:
prediction_output = trainer.predict(common_voice["test"].select([0]))
pred_ids = prediction_output.predictions
processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)

'そぼをおむいきげよくさいころころがしている。'

In [42]:
# カラムを消してしまったため再度ロード
common_voice_test = load_dataset("mozilla-foundation/common_voice_11_0"
    , "ja", split="test", use_auth_token=True)
common_voice_test = common_voice_test.select(range(1))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))

device = "cuda" if torch.cuda.is_available() else "cpu"

# 推論
speech_data = common_voice_test['audio'][0]["array"]
inputs = processor.feature_extractor(speech_data
    , return_tensors="pt", sampling_rate=16_000).input_features.to(device)
predicted_ids = model.generate(inputs, max_length=480_000)
processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=False)[0]



'<|startoftranscript|><|ja|><|transcribe|><|notimestamps|>そぼをおむいきげよくさいころころがしている。<|endoftext|>'