# Whisper Small 다단계 파인튜닝 노트북

이 노트북은 전처리된 **MIT 영어강의** 데이터셋으로 OpenAI **Whisper‑small** 모델을 두 단계로 파인튜닝합니다.

## 1. 환경 준비

In [None]:
# Core training stack: pinned Transformers plus datasets, accelerate and the
# packages used for the WER metric (evaluate, jiwer).
!pip install -q -U "transformers==4.52.3" datasets accelerate evaluate jiwer
# OpenAI's reference Whisper package, installed straight from GitHub.
# NOTE(review): none of the visible cells import `whisper` - confirm this is still needed.
!pip install -q git+https://github.com/openai/whisper.git

In [None]:
from huggingface_hub import notebook_login
# Authenticate this notebook session with the Hugging Face Hub.
notebook_login()

## 2. 데이터 로드 & 안전한 검증 split 생성
`validation` split이 없으면 `test` split을 평가용으로 사용하고, `test`도 없으면 `train`의 10 %를 떼어 평가용으로 사용합니다.

In [None]:
from datasets import load_dataset

def get_eval_split(dataset_dict, pct=0.1):
    """Return an evaluation split, carving one out of ``train`` if necessary.

    Lookup order:
      1. ``dataset_dict["validation"]`` when present.
      2. ``dataset_dict["test"]`` when present.
      3. Otherwise ``dataset_dict["train"]`` is split with a fixed seed (42);
         the held-out ``pct`` fraction is returned and ``dataset_dict["train"]``
         is overwritten IN PLACE with the remaining examples.

    Args:
        dataset_dict: Mapping of split name to dataset. The ``train`` entry
            must provide ``train_test_split(test_size=..., seed=...)`` when the
            fallback in step 3 is reached.
        pct: Fraction of ``train`` to hold out in the fallback case.

    Returns:
        The dataset to evaluate on.
    """
    # An existing validation split is used as-is.
    if "validation" in dataset_dict:
        return dataset_dict["validation"]

    # Otherwise fall back to the test split for evaluation.
    if "test" in dataset_dict:
        return dataset_dict["test"]

    # Neither exists: hold out `pct` of train.
    # round() rather than int(): int(0.29 * 100) truncates 28.999... to 28.
    print(f"⚠️ 'validation' split이 없어 train의 {round(pct * 100)}%를 eval로 사용합니다.")
    split = dataset_dict["train"].train_test_split(test_size=pct, seed=42)
    # Callers rely on this mutation: train must no longer contain the eval rows.
    dataset_dict["train"] = split["train"]
    return split["test"]

# ──────────────────────────────────────────────────────────────────────────

# Load the pre-processed MIT OpenCourseWare dataset from the Hub.
MIT = load_dataset("yongjune2002/MITOCW-Whisper-Processed")

# 1) Build eval_ds first: when get_eval_split has to carve it out of train,
#    it also overwrites MIT["train"] with the remaining 90%.
eval_ds = get_eval_split(MIT, pct=0.1)

# 2) MIT["train"] now excludes any rows held out above, so use it directly.
train_ds = MIT["train"]

# (If needed, apply shuffle / map / tokenization to train_ds only.)
print(train_ds)
print(eval_ds)

## 3. 데이터 콜레이터

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded seq2seq batch.

    Each input feature dict must contain ``"input_features"`` and ``"labels"``.
    Audio features and label ids are padded separately because they need
    different padding logic (feature extractor vs. tokenizer).
    """

    # Processor-like object exposing `.feature_extractor.pad` and `.tokenizer.pad`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]):
        """Pad ``features`` and return the batch with a ``"labels"`` tensor.

        Padded label positions are set to -100 so the loss ignores them.
        """
        input_feats = [{"input_features": f["input_features"]} for f in features]
        label_feats = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(input_feats, return_tensors="pt")
        label_batch = self.processor.tokenizer.pad(label_feats, padding=True, return_tensors="pt")

        # Mask by attention mask, not by token id: Whisper's pad token is the
        # same id as its end-of-text token, so comparing against pad_token_id
        # would also hide the genuine EOS from the loss.
        padded_positions = label_batch["attention_mask"].ne(1)
        batch["labels"] = label_batch["input_ids"].masked_fill(padded_positions, -100)
        return batch

## 4. 모델 & 프로세서 로드

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Base checkpoint used for both the processor and the model.
checkpoint = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(checkpoint)
# Configure the tokenizer's prefix tokens for English transcription.
processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

# Stage 1: freeze every encoder parameter.
for p in model.model.encoder.parameters():
    p.requires_grad = False

In [None]:
# Sanity check: number of training examples.
len(train_ds)

## 5. 두 단계 학습 콜백

In [None]:
from transformers import TrainerCallback

class UnfreezeBottom2Callback(TrainerCallback):
    """Stage-2 switch: make the lowest encoder layers trainable mid-run.

    Setting ``requires_grad = True`` alone is not enough, because the Trainer
    builds its optimizer only from parameters that are trainable when training
    starts. The newly unfrozen parameters are therefore also registered with
    the running optimizer (and, where possible, the LR scheduler).

    Args:
        unfreeze_step: Global step at (or after) which to unfreeze.
        num_layers: How many encoder layers, counted from the bottom, to unfreeze.

    NOTE(review): checkpoints saved after unfreezing contain extra optimizer
    param groups; resuming from one likely needs the same groups recreated
    before the optimizer state is loaded - confirm before relying on resume.
    """

    def __init__(self, unfreeze_step=100, num_layers=2):
        self.unfreeze_step = unfreeze_step
        self.num_layers = num_layers
        self._done = False

    def on_step_end(self, args, state, control, **kwargs):
        """Unfreeze once, on the first step end at or past ``unfreeze_step``."""
        # `>=` plus a one-shot flag: an exact `==` match is missed if training
        # starts or resumes beyond the threshold.
        if self._done or state.global_step < self.unfreeze_step:
            return
        self._done = True

        # Prefer the model handed in by the Trainer; fall back to the
        # notebook-level `model` for callers that do not pass one.
        target = kwargs.get("model")
        if target is None:
            target = model

        print(f"▶️  Unfreezing bottom {self.num_layers} encoder layers …")
        thawed = []
        for layer in target.model.encoder.layers[: self.num_layers]:
            for p in layer.parameters():
                if not p.requires_grad:
                    p.requires_grad = True
                    thawed.append(p)

        optimizer = kwargs.get("optimizer")
        if optimizer is not None and thawed:
            self._register_with_optimizer(thawed, args, optimizer, kwargs.get("lr_scheduler"))

    @staticmethod
    def _register_with_optimizer(params, args, optimizer, lr_scheduler):
        """Add ``params`` to ``optimizer`` so they actually receive updates."""
        tracked = {id(p) for group in optimizer.param_groups for p in group["params"]}
        fresh = [p for p in params if id(p) not in tracked]
        if not fresh:
            return

        # Start the new groups at the LR the existing groups currently use.
        reference = optimizer.param_groups[0]
        current_lr = reference["lr"]
        initial_lr = reference.get("initial_lr", args.learning_rate)

        # Usual convention: weight decay on matrices only, none on biases and
        # LayerNorm weights (both are 1-D).
        buckets = (
            ([p for p in fresh if p.ndim >= 2], args.weight_decay),
            ([p for p in fresh if p.ndim < 2], 0.0),
        )
        base_lrs = getattr(lr_scheduler, "base_lrs", None)
        lr_lambdas = getattr(lr_scheduler, "lr_lambdas", None)
        for members, decay in buckets:
            if not members:
                continue
            optimizer.add_param_group(
                {
                    "params": members,
                    "weight_decay": decay,
                    "lr": current_lr,
                    "initial_lr": initial_lr,
                }
            )
            # Keep per-group scheduler lists the same length as param_groups so
            # the new group follows the same schedule as the existing ones.
            # NOTE(review): relies on LambdaLR-style attributes; schedulers
            # without them are left untouched.
            if isinstance(base_lrs, list) and base_lrs:
                base_lrs.append(base_lrs[0])
            if isinstance(lr_lambdas, list) and lr_lambdas:
                lr_lambdas.append(lr_lambdas[0])

## 6. 트레이닝 인자

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyper-parameters for the run. Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 16.
training_args = Seq2SeqTrainingArguments(
    output_dir="whisper-mit",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-6,
    warmup_steps=50,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=False,

    eval_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="epoch",           # save at the end of every epoch

    save_total_limit=1,

    logging_strategy="steps",
    logging_steps=50,

    gradient_checkpointing=True,
    # Non-reentrant checkpointing: the reentrant variant produces no gradients
    # for a checkpointed layer whose inputs do not require grad, which is the
    # case for the bottom encoder layers sitting on top of frozen weights.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    predict_with_generate=True,      # compute WER on generated text

    # Keep the checkpoint with the lowest WER.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)


## 7. 평가 함수: WER

In [None]:
import evaluate
wer_metric = evaluate.load("wer")


def compute_metrics(pred):
    """Compute word error rate for one evaluation pass.

    Args:
        pred: Prediction object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, padded with -100).
            Assumes both are NumPy arrays, as the Trainer passes them.

    Returns:
        ``{"wer": <float>}`` as reported by the ``wer`` metric.
    """
    pred_ids = pred.predictions
    # Some models return a tuple of outputs; the token ids come first.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_id = processor.tokenizer.pad_token_id

    # -100 is the loss-ignore marker, not a real token id, and cannot be
    # decoded. Swap it for the pad token (dropped by skip_special_tokens).
    # Work on copies so the caller's arrays are left untouched.
    pred_ids = pred_ids.copy()
    label_ids = pred.label_ids.copy()
    pred_ids[pred_ids == -100] = pad_id
    label_ids[label_ids == -100] = pad_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    return {"wer": wer_metric.compute(predictions=pred_str, references=label_str)}

## 8. Trainer 초기화

In [None]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

# Wire model, data, metric and callbacks into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=DataCollatorSpeechSeq2SeqWithPadding(processor),
    # `tokenizer=` is deprecated in favour of `processing_class`. Passing the
    # whole processor also saves the feature-extractor config with checkpoints.
    processing_class=processor,
    compute_metrics=compute_metrics,
    callbacks=[
        UnfreezeBottom2Callback(),
        # Stop when WER has not improved for 2 consecutive evaluations.
        EarlyStoppingCallback(early_stopping_patience=2),
    ],
)

## 9. 학습 시작 🚀

In [None]:
# Run the training loop configured above.
trainer.train()

## 10. 모델 업로드

In [None]:
from huggingface_hub import login
import os

# SECURITY: never hard-code an access token in a notebook. The token that was
# previously committed here must be treated as leaked and revoked.
# 1) Authenticate: use HF_TOKEN when set; with token=None, login() prompts.
login(token=os.environ.get("HF_TOKEN"))

# 2) Push the model & processor.
#    repo_name has the form "<username>/<repo_id>".
repo_name = "tfbghjk/whisper-mit-small"

# Push through the model itself: Trainer.push_to_hub() takes a commit message
# as its first positional argument, not a repo id, so passing repo_name there
# would upload to the Trainer's default repo instead.
trainer.model.push_to_hub(repo_name)
processor.push_to_hub(repo_name)

In [None]:
trainer.save_model()                         # writes the model files into the trainer's output_dir (whisper-mit)
processor.save_pretrained("./whisper-mit")   # writes the processor (tokenizer) files next to them

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from huggingface_hub import login
import os

# Authenticate without embedding a secret in the notebook: use HF_TOKEN when
# set; with token=None, login() prompts interactively.
login(token=os.environ.get("HF_TOKEN"))


local_model_dir = "./whisper-mit"
repo_name = "tfbghjk/whisper-mit-small_v2"

# Reload the saved artifacts from disk and push both to the same repo.
processor = WhisperProcessor.from_pretrained(local_model_dir)
processor.push_to_hub(repo_name)

model = WhisperForConditionalGeneration.from_pretrained(local_model_dir)
model.push_to_hub(repo_name)

from transformers import Trainer
# No trainer.push_to_hub(repo_name) here: its first positional argument is the
# commit message, so it would upload to the Trainer's default repo rather than
# repo_name. The model and processor pushes above already cover repo_name.

## 11. 간단 테스트

In [None]:
import torch

# Smoke test: transcribe the first evaluation example.
sample = eval_ds[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the target device, then force float32 weights.
model = model.to(device)
model = model.float()

# Build a batch of one from the stored features, also forced to float32.
input_features = torch.tensor(sample['input_features'])
input_features = input_features.unsqueeze(0).to(device).float()

predicted_ids = model.generate(input_features)
decoded_text = processor.decode(predicted_ids[0], skip_special_tokens=True)
print(decoded_text)

## 12. (선택) Gradio 데모

In [None]:
# Gradio is only needed for the optional demo cells below.
!pip install gradio

## 13. Evaluation


In [None]:
# Evaluate the trainer's current model on the eval dataset after training.
results = trainer.evaluate()

# Print the results (WER and other metrics)
print("Evaluation results:", results)

## 14. Comparing: 사전학습 모델 vs. 파인튜닝 모델

In [None]:
from transformers import WhisperForConditionalGeneration

def _evaluate_with(candidate):
    """Point the shared trainer at ``candidate`` and return its eval metrics."""
    trainer.model = candidate
    return trainer.evaluate()


# 1) Baseline: the untouched pretrained checkpoint, on the same device in float32.
pretrained_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
pretrained_model = pretrained_model.to(device).float()
pretrained_results = _evaluate_with(pretrained_model)
print("Pretrained model WER:", pretrained_results["eval_wer"])

# 2) Fine-tuned model: swap it back into the trainer and evaluate again.
fine_tuned_results = _evaluate_with(model)
print("Fine-tuned model WER:", fine_tuned_results["eval_wer"])

# 3) Positive values mean fine-tuning lowered the WER.
baseline_wer = pretrained_results['eval_wer']
tuned_wer = fine_tuned_results['eval_wer']
improvement_in_wer = baseline_wer - tuned_wer
print(f"WER improvement after fine-tuning: {improvement_in_wer:.4f}")

In [None]:
import torch
import torchaudio
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the pretrained whisper-small checkpoint from the Hugging Face Hub.
checkpoint = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(checkpoint)
# Place the model on the GPU when one is available, otherwise on the CPU.
model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda" if torch.cuda.is_available() else "cpu")

def transcribe(file):
    """Transcribe one audio file with the notebook-level Whisper ``model``.

    Args:
        file: Path to an audio file, as provided by ``gr.Audio(type="filepath")``.
            May be ``None`` when the user submits without any audio.

    Returns:
        The decoded transcription, or an empty string when no file was given.
    """
    # Gradio passes None when nothing was recorded or uploaded.
    if not file:
        return ""

    # Load the waveform from the file.
    speech_array, sampling_rate = torchaudio.load(file)

    # torchaudio returns (channels, samples). Average the channels so stereo
    # input becomes one mono waveform; squeeze() alone would leave a
    # two-channel clip 2-D.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # Whisper expects 16 kHz audio; resample when needed.
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = resampler(speech_array)

    # Extract Whisper input features and move them to the model's device.
    # NOTE(review): the feature extractor works on fixed-length windows, so
    # long clips are presumably truncated - confirm if long audio matters.
    input_features = processor.feature_extractor(
        speech_array.numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features.to(model.device)

    # Generate token ids and decode them to text.
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)

# Launch the Gradio interface: audio file path in, transcription text out.
gr.Interface(fn=transcribe, inputs=gr.Audio(type="filepath"), outputs="text").launch()


In [None]:
import torch
import torchaudio
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Fine-tuned checkpoint pushed to the Hub earlier in this notebook.
checkpoint = "tfbghjk/whisper-mit-small_v2"

# 1) Load the processor (tokenizer + feature extractor).
processor = WhisperProcessor.from_pretrained(checkpoint)
processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")

# 2) Load the model and place it on the GPU or CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to(device)

# Clear any forced decoder ids stored in the generation config and model config.
model.generation_config.forced_decoder_ids = None
model.config.forced_decoder_ids = None


def transcribe(file):
    """Transcribe one audio file with the notebook-level fine-tuned ``model``.

    Args:
        file: Path to an audio file, as provided by ``gr.Audio(type="filepath")``.
            May be ``None`` when the user submits without any audio.

    Returns:
        The decoded transcription, or an empty string when no file was given.
    """
    # Gradio passes None when nothing was recorded or uploaded.
    if not file:
        return ""

    speech_array, sampling_rate = torchaudio.load(file)

    # torchaudio returns (channels, samples). Average the channels so stereo
    # input becomes one mono waveform; squeeze() alone would leave a
    # two-channel clip 2-D.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # Whisper expects 16 kHz audio; resample when needed.
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = resampler(speech_array)

    # Follow the model's own device so features and weights always agree.
    input_features = processor.feature_extractor(
        speech_array.numpy(),
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features.to(model.device)

    predicted_ids = model.generate(input_features)

    return processor.decode(predicted_ids[0], skip_special_tokens=True)


# 6) Launch the Gradio interface: audio file path in, transcription text out.
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text"
).launch()
