In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import os
import soundfile as sf
from tqdm import tqdm
import random
import csv
import json
import librosa

# CPU 설정
torch.set_num_threads(torch.get_num_threads())  # CPU 연산 최적화
device = torch.device("cpu")

# 모델 및 프로세서 로드
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

class AudioDataset(Dataset):
    def __init__(self, data_list, processor):
        self.data_list = data_list
        self.processor = processor

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        audio_path, transcription = self.data_list[idx]
        # Load audio and resample to 16,000 Hz
        speech_array, sampling_rate = librosa.load(audio_path, sr=16000)  # Resample to 16,000 Hz
        input_features = self.processor(
            speech_array,
            sampling_rate=16000,  # Ensure the correct sampling rate
            return_tensors="pt"
        ).input_features
        return input_features.squeeze(0), transcription

# 매칭 작업 함수
def collect_audio_text_pairs(label_dir, audio_dir, output_file=None):
    """JSON과 오디오 파일을 매칭하여 CSV 파일로 저장."""
    audio_text_pairs = []
    audio_cache = {}

    # 오디오 파일 캐싱
    print("Building audio file cache...")
    for root, _, files in os.walk(audio_dir):
        for file in files:
            if file.endswith(".wav"):
                audio_cache[file] = os.path.join(root, file)

    # JSON 파일 처리
    print("Processing label files...")
    for root, _, files in os.walk(label_dir):
        for file in tqdm(files, desc="Matching data"):
            if file.endswith(".json"):
                json_path = os.path.join(root, file)
                with open(json_path, "r", encoding="utf-8") as f:
                    try:
                        data = json.load(f)
                        file_name = data['File']['FileName']
                        transcription = data['Transcription']['LabelText']
                        if file_name in audio_cache:
                            audio_text_pairs.append((audio_cache[file_name], transcription))
                        else:
                            print(f"Audio file not found for {file_name}")
                    except (json.JSONDecodeError, KeyError) as e:
                        print(f"Error parsing {json_path}: {e}")

    # 매칭 결과를 CSV 파일로 저장
    if output_file:
        print(f"Saving matched pairs to {output_file}...")
        with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["audio_path", "transcription"])
            writer.writerows(audio_text_pairs)

    return audio_text_pairs


# CSV 파일에서 데이터 로드
def load_matching_pairs(input_file):
    """Load audio-text pairs from a CSV file."""
    pairs = []
    with open(input_file, "r", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        for row in reader:
            pairs.append((row[0], row[1]))
    return pairs


# 체크포인트 저장 함수
def save_step_checkpoint(model, optimizer, epoch, step, checkpoint_dir):
    step_checkpoint_path = os.path.join(checkpoint_dir, f"step_checkpoint.pth")
    torch.save({
        'epoch': epoch,
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, step_checkpoint_path)
    print(f"Step checkpoint saved: {step_checkpoint_path}")


def save_best_checkpoint(model, optimizer, epoch, best_train_loss, checkpoint_dir):
    best_checkpoint_path = os.path.join(checkpoint_dir, "best_checkpoint.pth")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_train_loss': best_train_loss,
    }, best_checkpoint_path)
    print(f"Best checkpoint updated: {best_checkpoint_path}")

# 체크포인트 로드 함수
def load_checkpoint(checkpoint_path, model, optimizer):
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        step = checkpoint['step']
        print(f"Checkpoint loaded: {checkpoint_path}, Epoch: {epoch}, Step: {step}")
        return epoch, step
    else:
        print("No checkpoint found, starting from scratch.")
        return 0, 0


# 학습 설정
num_epochs = 3
train_sample_size = 4000  # 학습 데이터 샘플 크기
batch_size = 8
train_cache_file = "train_audio_text_pairs.csv"
checkpoint_dir = "./model_checkpoints"
label_dir = "./아카이브/labeling_data"  # 라벨 데이터 경로
audio_dir = "./아카이브/wav_data"  # 오디오 데이터 경로

# 매칭 작업 (매칭된 CSV가 없으면 수행)
if not os.path.exists(train_cache_file):
    print("Matching training data...")
    train_audio_text_pairs = collect_audio_text_pairs(label_dir, audio_dir, train_cache_file)
else:
    print("Loading matched training data...")
    train_audio_text_pairs = load_matching_pairs(train_cache_file)

# 체크포인트 디렉토리 설정
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, "best_checkpoint.pth")

# 체크포인트 로드
start_epoch, step = load_checkpoint(checkpoint_path, model, optimizer)

# 학습 루프
best_train_loss = float('inf')

for epoch in range(start_epoch + 1, num_epochs + 1):
    print(f"Epoch {epoch}/{num_epochs}")

    # 랜덤 샘플링 데이터 로드
    train_sample = random.sample(train_audio_text_pairs, min(train_sample_size, len(train_audio_text_pairs)))
    train_dataset = AudioDataset(train_sample, processor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # 학습 루프
    model.train()
    train_loss = 0
    for batch_idx, (input_features, labels) in tqdm(enumerate(train_loader), total=len(train_loader), desc="Training"):
        step += 1
        input_features = input_features.to(device)
        labels = processor(text=labels, return_tensors="pt", padding=True).input_ids.to(device)

        outputs = model(input_features, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Step {step}, Loss: {loss.item()}")  # 스텝마다 손실 출력

        # 스텝 체크포인트 저장
        save_step_checkpoint(model, optimizer, epoch, step, checkpoint_dir)

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch}, Average Training Loss: {avg_train_loss}")

    # 최적 손실 업데이트
    if avg_train_loss < best_train_loss:
        best_train_loss = avg_train_loss
        save_best_checkpoint(model, optimizer, epoch, best_train_loss, checkpoint_dir)

# 모델 저장
model_save_path = "./fine_tuned_whisper_model"
os.makedirs(model_save_path, exist_ok=True)
model.save_pretrained(model_save_path)
processor.save_pretrained(model_save_path)

print("모델 학습 완료 및 저장 완료")


  from .autonotebook import tqdm as notebook_tqdm


Loading matched training data...


  checkpoint = torch.load(checkpoint_path)


KeyError: 'step'

In [2]:
checkpoint = torch.load("./model_checkpoints/best_checkpoint.pth")
print("Best Training Loss:", checkpoint['best_train_loss'])

  checkpoint = torch.load("./model_checkpoints/best_checkpoint.pth")


Best Training Loss: 0.0323933675793319


In [3]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# 1. Whisper 모델과 Processor 로드
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# 2. 체크포인트 파일 경로
checkpoint_path = "./model_checkpoints/best_checkpoint.pth"

# 3. 체크포인트 로드
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# 4. 모델 상태 복원
model.load_state_dict(checkpoint['model_state_dict'])
print("Model weights loaded from best checkpoint.")

# 5. 모델 저장 디렉터리 설정
save_dir = "./fine_tuned_whisper_best"
os.makedirs(save_dir, exist_ok=True)

# 6. 모델과 프로세서 저장
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

print(f"Model and processor saved to {save_dir}.")


  checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))


Model weights loaded from best checkpoint.




Model and processor saved to ./fine_tuned_whisper_best.
