## TTS 테스트
- 짧은 문장으로 음성 생성

In [9]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Read the variables defined in the local .env file into the process
# environment, then pick up the OpenAI API key from there.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Create the OpenAI client used by every later cell.
client = OpenAI(api_key=api_key)

# Ask the TTS endpoint to synthesize one short sentence.
hello_request = {
    "model": "tts-1-hd",
    "voice": "alloy",
    "input": "Hello world! This is a TTS test.",
}
response = client.audio.speech.create(**hello_request)

# Save the returned audio to an MP3 file in the working directory.
hello_mp3 = "hello_world.mp3"
response.write_to_file(hello_mp3)

# Play the saved file; this must stay the last expression so the notebook
# renders the audio widget as the cell output.
import IPython.display as ipd
ipd.Audio(filename=hello_mp3)

## 목소리를 ash로 바꾸고 테스트하기
- voice 파라미터 변경 테스트

In [10]:
# Repeat the short TTS test with a different voice.
voice = "ash"
mp3_file = f"hello_world_{voice}.mp3"

# The spoken sentence names the voice so the result is easy to identify.
ash_greeting = f"Hello world! I'm {voice}. This is a TTS test."
response = client.audio.speech.create(
    input=ash_greeting,
    voice=voice,
    model="tts-1-hd",
)

# Save the returned audio under the voice-specific file name.
response.write_to_file(mp3_file)

# Play the saved file; kept as the last expression so the notebook renders
# the audio widget as the cell output.
import IPython.display as ipd
ipd.Audio(filename=mp3_file)

## JSON 파일 로드하여 TTS 수행
- 이미지 퀴즈에서 만든 영문 스크립트 로드
- 생성된 mp3 재생 확인

In [11]:
import json

# Read the English quiz script (UTF-8 JSON) from disk and parse it.
quiz_json_path = '/Users/donggyeong/develop/now/GPT_AGENT_2025_BOOK/chap06/sec02/image_quiz_eng.json'
with open(quiz_json_path, mode='r', encoding='utf-8') as quiz_file:
    eng_dict = json.loads(quiz_file.read())

# Display the parsed content. NOTE: despite its name, the value shown in the
# cell output is a list of dicts with the keys 'no', 'eng' and 'img'.
eng_dict

[{'no': 1,
  'eng': 'Which of the following descriptions of the image is incorrect?\n- (1) Many people are gathered and seated at the event.\n- (2) The words "DIVE 2024 IN BUSAN" are visible in the image.\n- (3) People are lining up to receive food.\n- (4) The event is taking place indoors.',
  'img': 'busan_dive.jpg'},
 {'no': 2,
  'eng': "Which of the following descriptions of the image is incorrect?\n- (1) A yellow sculpture is visible.\n- (2) The building facade has the words 'Local Stitch'.\n- (3) There is a black sign attached to the wall.\n- (4) The sculpture is green.",
  'img': 'local_stitch.jpg'}]

In [12]:
# Candidate voices. Each question picks one by `no % len(voices)`.
# Fixed: the last entry was misspelled 'shimer'; the voice name is 'shimmer',
# so any question number that mapped to the last index would have sent an
# invalid voice to the API.
voices = ['alloy', 'ash', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer']

# Spoken replacements for the written option markers. The "(n)" markers are
# swapped for number words so the options are read aloud naturally; the
# trailing space + tab is carried over unchanged from the original strings.
option_labels = {
    "- (1)": "- One. \t",
    "- (2)": "- Two. \t",
    "- (3)": "- Three. \t",
    "- (4)": "- Four. \t",
}

# Directory that receives one "<no>.mp3" file per question.
output_dir = "/Users/donggyeong/develop/now/GPT_AGENT_2025_BOOK/chap06/sec02"

for q in eng_dict:
    no = q['no']
    quiz = q['eng']

    # Convert the option numbering into a TTS-friendly form.
    for marker, spoken in option_labels.items():
        quiz = quiz.replace(marker, spoken)

    print(no, quiz)

    # Rotate through the voices using the question number.
    voice = voices[no % len(voices)]
    response = client.audio.speech.create(
        model="tts-1-hd",
        voice=voice,
        input=f'#{no}. {quiz}',
    )

    # Save one MP3 per question, named after the question number.
    response.write_to_file(f"{output_dir}/{no}.mp3")

1 Which of the following descriptions of the image is incorrect?
- One. 	 Many people are gathered and seated at the event.
- Two. 	 The words "DIVE 2024 IN BUSAN" are visible in the image.
- Three. 	 People are lining up to receive food.
- Four. 	 The event is taking place indoors.
2 Which of the following descriptions of the image is incorrect?
- One. 	 A yellow sculpture is visible.
- Two. 	 The building facade has the words 'Local Stitch'.
- Three. 	 There is a black sign attached to the wall.
- Four. 	 The sculpture is green.


In [13]:
# Play back the MP3 generated for question 1.
# The path is passed explicitly as `filename=` (matching the earlier playback
# cells) instead of positionally, where it lands in the `data` parameter; the
# placeholder-less f-string prefix is dropped as well.
ipd.Audio(filename="/Users/donggyeong/develop/now/GPT_AGENT_2025_BOOK/chap06/sec02/1.mp3")

In [14]:
# Play back the MP3 generated for question 2.
# The path is passed explicitly as `filename=` (matching the earlier playback
# cells) instead of positionally, where it lands in the `data` parameter; the
# placeholder-less f-string prefix is dropped as well.
ipd.Audio(filename="/Users/donggyeong/develop/now/GPT_AGENT_2025_BOOK/chap06/sec02/2.mp3")