In [9]:
# Google Colab용 코드

# 필수 라이브러리 설치
!pip install torch torchaudio transformers accelerate

import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from google.colab import files

# Device selection: use the first CUDA GPU with half precision when one is
# available, otherwise fall back to the CPU with full float32 precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Identifier of the Whisper checkpoint to load.
model_id = "openai/whisper-large-v3"

# Load the speech-to-text model in the selected dtype and move it to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
model.to(device)

# The processor bundles the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Keyword arguments for the speech-recognition pipeline, collected in one place.
asr_options = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "max_new_tokens": 256,
    "chunk_length_s": 30,
    "batch_size": 4,
    "return_timestamps": True,
    "torch_dtype": torch_dtype,
    "device": device,
}
pipe = pipeline("automatic-speech-recognition", **asr_options)

# Upload an audio file from the local machine into the Colab runtime.
from pathlib import Path

uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No audio file was uploaded.")

# files.upload() returns a dict of {original filename: file bytes}. Colab may
# save the file under a different name when one already exists (for example
# "sample (1).wav"), so a hard-coded path can silently load a stale file.
# Write the uploaded bytes to a path we control and load exactly that file.
upload_name, audio_bytes = next(iter(uploaded.items()))
audio_path = str(Path("/content") / f"uploaded_audio{Path(upload_name).suffix}")
Path(audio_path).write_bytes(audio_bytes)
output_path = "transcription.txt"

# Load the audio; torchaudio returns a (channels, samples) tensor and its rate.
audio, sr = torchaudio.load(audio_path)

# Mix all channels down to a single 1-D waveform. A plain squeeze() leaves
# stereo input 2-D, which the pipeline does not accept as a raw waveform.
audio = audio.mean(dim=0)

# A raw array handed to the pipeline is assumed to already be at the feature
# extractor's sampling rate, so resample here when the file's rate differs.
target_sr = processor.feature_extractor.sampling_rate
if sr != target_sr:
    audio = torchaudio.functional.resample(audio, orig_freq=sr, new_freq=target_sr)
    sr = target_sr

audio = audio.numpy()

# Run speech recognition on the loaded waveform.
result = pipe(audio)

# Strip surrounding whitespace once and reuse the value for saving and printing.
transcript = result["text"].strip()

# Save the transcript. The encoding is set explicitly because the text can be
# non-ASCII (e.g. Korean) and the platform default encoding is not guaranteed
# to be UTF-8.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(transcript)

# Show the transcript in the notebook output.
print(transcript)

# Trigger a browser download of the transcript file from the Colab runtime.
files.download(output_path)


Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.30.1


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Saving sample.wav to sample (1).wav


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


인플루엔자 같은 거는 저희가 2년 내지 1년을 앞을 내다보고 예방접종약을 만들어서 국민들한테 예방접종을,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>