# 듣고 질문에 답하기 유형 데이터 만들기

- 질문 만들기
- 질문에 대한 오디오 만들기


## 질문 생성 Chain

In [1]:
import json
from typing import List

from tqdm.notebook import tqdm
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema import HumanMessage, AIMessage, StrOutputParser
import pandas as pd

In [2]:
# Chat model used by the topic-sampling chain built in the following cells.
model = ChatOpenAI(model="gpt-3.5-turbo")

### 질문 주제 샘플링하기

In [3]:
# Output parser for comma-separated replies; the recorded cell outputs show it
# producing a Python list of strings.
csv_parser = CommaSeparatedListOutputParser()

In [4]:
# Formatting instructions supplied by the parser. They are appended to the
# prompt text so that the model's reply is intended to be parseable by csv_parser.
csv_format_instruction = csv_parser.get_format_instructions()

In [5]:
# Prompt (in Korean) asking for everyday topics likely to appear on an English
# exam, given as single words. The parser's format instructions are pre-filled
# as a partial variable, so the template has no variables left to supply.
subjet_prompt_template = PromptTemplate.from_template(
    template=(
        "영어 시험에 나올 법한 일상적인 주제를 단어 형식으로 만들어줘.\n"
        "{format_instruction}"
    ),
    partial_variables={"format_instruction": csv_format_instruction},
)

In [6]:
# Pipeline: prompt -> chat model -> comma-separated list parser.
# NOTE: "subjet" is the spelling used where the template is defined; keep both in sync.
subject_chain = subjet_prompt_template | model | csv_parser

In [10]:
# Try the chain once. The input dict is empty because the template's only
# variable is already filled in as a partial variable.
subject_chain.invoke({})

['family',
 'friends',
 'hobbies',
 'school',
 'work',
 'food',
 'travel',
 'shopping',
 'sports',
 'movies',
 'music',
 'books',
 'technology',
 'health',
 'weather',
 'fashion',
 'pets',
 'holidays',
 'transportation',
 'social media']

In [11]:
# Same request without the chain: call the model directly with the prompt text,
# then hand its reply to the parser.
csv_parser.invoke(
    model.invoke(
        "영어 시험에 나올 법한 일상적인 주제를 단어 형식으로 만들어줘.\n"
        f"{csv_format_instruction}"
    )
)

['family',
 'school',
 'hobbies',
 'technology',
 'sports',
 'travel',
 'food',
 'music',
 'movies',
 'fashion',
 'health',
 'environment']

In [12]:
# Sample the topic list that the later cells turn into questions.
subject_list = subject_chain.invoke({})

In [13]:
# Inspect the sampled topics. The recorded outputs of earlier runs of the same
# prompt differ from one another, so expect this list to change between runs.
subject_list

['family',
 'school',
 'food',
 'sports',
 'hobbies',
 'travel',
 'weather',
 'movies',
 'music',
 'fashion']

In [14]:
# Keep only the first four topics.
# NOTE(review): presumably to limit the number of question/TTS API calls — confirm.
subject_list = subject_list[:4]

In [15]:
# Check the truncated topic list.
subject_list

['family', 'school', 'food', 'sports']

### 질문 만들기

In [38]:
# Rebind `model` to a different chat model for question generation.
# The topic chain built earlier holds its own reference to the previous model
# object, so only chains composed after this cell use the new one.
model = ChatOpenAI(model="gpt-4-1106-preview")

In [39]:
# Prompt for generating one question. In English, the Korean text says:
#   "# Previous questions {prev_questions}"
#   - make it a different type from the previous questions
#   - make one easy question about the {input} topic, as might appear on an English exam
#   - relate it to the person being asked
#   - make only one sentence
#   - do not produce multiple examples
#   - in English
# Template variables: `prev_questions` and `input`.
template = """\
# 이전 질문들 {prev_questions}
- 이전 질문들과는 다른 유형으로 만들어줘
- 영어 시험에 나올 법한 {input} 주제에 관한 쉬운 질문 하나 만들어줘.
- 상대방과 연관지어 만들어줘
- 한 문장만 만들어줘
- 여러 예시 만들지마
- 영어로"""

question_prompt_template = PromptTemplate.from_template(template=template)

In [40]:
# Pipeline: question prompt -> chat model -> plain string output.
question_chain = question_prompt_template | model | StrOutputParser()

In [43]:
# Generate one question per subject. Every call also receives the questions
# produced so far, which the prompt uses to ask for a different type of question.
question_list = []
for subject in tqdm(subject_list):
    prompt_inputs = {"input": subject, "prev_questions": question_list}
    generated_question = question_chain.invoke(prompt_inputs)
    question_list.append(generated_question)

  0%|          | 0/4 [00:00<?, ?it/s]

In [44]:
question_list

['How many siblings do you have?',
 'What is the name of your favorite subject in school?',
 'What is your favorite fruit to eat as a snack?',
 'Which sport do you prefer to play, soccer or basketball?']

## 질문에 대한 오디오 파일 만들기

In [45]:
from openai import OpenAI

In [46]:
# OpenAI SDK client used for text-to-speech below. No credentials are passed
# explicitly; NOTE(review): presumably they come from the environment — confirm.
client = OpenAI()

In [47]:
def gen_speech_file(text, output_file_path):
    """Synthesize ``text`` to speech and save the audio to ``output_file_path``.

    The audio encoding requested from the API follows the file extension, so
    a ``.wav`` path really receives WAV data. Without an explicit
    ``response_format`` the speech endpoint returns MP3, which would leave a
    file named ``.wav`` holding MP3 bytes.

    Args:
        text: The sentence to be spoken.
        output_file_path: Destination path (``str`` or path-like). The
            extensions ``mp3``, ``opus``, ``aac``, ``flac``, ``wav`` and
            ``pcm`` select that format; any other (or missing) extension
            falls back to MP3, the API default.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    supported_formats = {"mp3", "opus", "aac", "flac", "wav", "pcm"}
    extension = Path(output_file_path).suffix.lower().lstrip(".")
    response_format = extension if extension in supported_formats else "mp3"

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",  # alloy, echo, fable, onyx, nova, and shimmer
        input=text,
        response_format=response_format,
    )
    response.stream_to_file(output_file_path)

In [48]:
# Create the output directory (and any missing parents); does nothing when it
# already exists. Done in Python rather than through a shell command so the
# cell does not depend on IPython's `!` syntax or a POSIX `mkdir -p`.
import os

os.makedirs("./data/speaking__listen_and_answer", exist_ok=True)

In [49]:
# Directory that receives the generated audio files and the CSV index.
save_dir = "./data/speaking__listen_and_answer"

In [50]:
# Review the questions that are about to be converted to audio.
question_list

['How many siblings do you have?',
 'What is the name of your favorite subject in school?',
 'What is your favorite fruit to eat as a snack?',
 'Which sport do you prefer to play, soccer or basketball?']

In [51]:
# Synthesize one audio file per question and remember which file belongs to
# which question.
record_list = []

for i, q in enumerate(tqdm(question_list)):
    output_file_path = f"{save_dir}/question_{i}.wav"
    gen_speech_file(q, output_file_path)
    record_list.append({"question": q, "audio_file_path": output_file_path})

  0%|          | 0/4 [00:00<?, ?it/s]

In [52]:
# Tabulate the question/audio-path records (one row per question) and show
# the resulting frame.
df = pd.DataFrame.from_records(record_list)
df

Unnamed: 0,question,audio_file_path
0,How many siblings do you have?,./data/speaking__listen_and_answer/question_0.wav
1,What is the name of your favorite subject in s...,./data/speaking__listen_and_answer/question_1.wav
2,What is your favorite fruit to eat as a snack?,./data/speaking__listen_and_answer/question_2.wav
3,"Which sport do you prefer to play, soccer or b...",./data/speaking__listen_and_answer/question_3.wav


In [53]:
# Persist the question -> audio file index next to the audio files;
# index=False leaves the row index out of the CSV.
df.to_csv(f"{save_dir}/question_and_audio.csv", index=False)

In [54]:
from IPython.display import Audio

In [57]:
# Spot-check one generated file (index 2) with the notebook's inline audio player.
Audio(f"{save_dir}/question_2.wav")