In [1]:
import os

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [2]:
# The API key is taken from the UPSTAGE_API_KEY environment variable so that it is never
# stored in (or committed with) the notebook. Fail fast when it is missing: otherwise every
# request would be rejected later and the generation loop would only record empty results.
UPSTAGE_API_KEY = os.environ.get("UPSTAGE_API_KEY", "")
if not UPSTAGE_API_KEY:
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set; export it before running this notebook."
    )

# The OpenAI client is pointed at the Upstage Solar endpoint through base_url.
client = OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)

In [3]:
# Read both input tables in one pass:
#   train_df  - summaries used later as generation inputs
#   sample_df - examples used to build the few-shot prompt
train_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/cleaned_train.csv",
        "../dataset/top5_samples.csv",
    )
)

In [4]:
# Build the few-shot section of the user prompt from the first 10 example rows.
# Each example is rendered as a "Summary: ... / Dialogue: ..." pair followed by a blank line.
few_shot_samples = sample_df.head(10)
few_shot_prompt = "".join(
    f"Summary:\n{example['summary']}\nDialogue:\n{example['dialogue']}\n\n"
    for _, example in few_shot_samples.iterrows()
)

In [5]:
# System prompt configuration: the message sent first in every request. It sets the
# assistant's task (summary -> dialogue) and the output rules: Korean only, no
# parenthesised sentences, one '#Person#: ...' turn per line.
system_prompt1 = dict(
    role="system",
    content=(
        "You are a helpful assistant for generating dialogues from summaries. "
        "Make sure not to generate any sentences in English or sentences enclosed in parentheses. "
        "Ensure that each speaker's turn is separated by exactly one newline character, and avoid using double newlines. "
        "The dialogue format should follow this structure: '#Person#: dialogue text'. All dialogues must be generated in Korean."
    ),
)

# Few-shot dialogue generation function
def generate_dialogue_with_few_shot(summary, model="solar-1-mini-chat"):
    """Generate a dialogue for ``summary`` with the few-shot prompt.

    The request consists of the module-level ``system_prompt1`` followed by one user
    message made of ``few_shot_prompt``, the generation instruction and ``summary``.
    It is sent through the module-level ``client``.

    Args:
        summary: Summary text to turn into a dialogue. Values that are not strings
            (e.g. ``None`` or a NaN read from a CSV) or that contain only whitespace
            are rejected without calling the API.
        model: Name of the chat model to query. Defaults to the model this notebook
            was written for.

    Returns:
        The content of the first returned choice, or ``None`` when the summary is
        unusable or the request fails for any reason.
    """
    # Without this guard a missing value would be formatted into the prompt as the
    # literal text "nan"/"None" and the model would invent a dialogue for it.
    if not isinstance(summary, str) or not summary.strip():
        print(f"Error generating dialogue: empty or non-text summary ({summary!r})")
        return None

    try:
        # Repeat the Korean-only / formatting request in the user prompt as well
        user_prompt = (
            f"{few_shot_prompt}\n"
            f"Now generate a dialogue based on the following summary, ensuring the dialogue is entirely in Korean, "
            f"with no English or sentences in parentheses. Ensure proper formatting as explained above:\n{summary}"
        )

        response = client.chat.completions.create(
            model=model,
            messages=[
                system_prompt1,
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            stream=False  # Set to True if you want to use streaming
        )
        # Return the generated dialogue text
        return response.choices[0].message.content

    except Exception as e:
        # Deliberately best-effort: one failed request must not abort a long batch run.
        # The caller sees None for this summary and the error is reported on stdout.
        print(f"Error generating dialogue: {e}")
        return None

In [6]:
# Settings for the generation run. The loop issues one API request per row and takes
# hours on the full sample, so progress is written to disk at regular intervals.
SAMPLE_FRAC = 0.8
SAMPLE_SEED = 42
CHECKPOINT_EVERY = 100  # number of processed rows between partial saves
CHECKPOINT_PATH = '../dataset/generated_dialogue.partial.csv'

# Draw a reproducible random subset of the training rows and (re)initialise the
# 'dialogue' column so it can be filled with generated text.
sampled_df = train_df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED).reset_index(drop=True)
sampled_df['dialogue'] = None

# reset_index(drop=True) gives labels 0..n-1, so the enumerate position is a valid .at label.
for idx, summary in enumerate(tqdm(sampled_df['summary'], total=len(sampled_df))):
    sampled_df.at[idx, 'dialogue'] = generate_dialogue_with_few_shot(summary)
    # Save what has been generated so far; a kernel crash then costs at most
    # CHECKPOINT_EVERY requests instead of the whole run.
    if (idx + 1) % CHECKPOINT_EVERY == 0:
        sampled_df.to_csv(CHECKPOINT_PATH, index=False)

# generate_dialogue_with_few_shot returns None on failure; report how many rows are
# affected instead of letting them pass silently into the saved dataset.
failed_count = int(sampled_df['dialogue'].isna().sum())
if failed_count:
    print(f"{failed_count} of {len(sampled_df)} summaries have no generated dialogue")

print(sampled_df[['summary', 'dialogue']].head())

  1%|▏         | 147/9966 [08:19<6:24:17,  2.35s/it]

In [7]:
# Write the sampled summaries with their generated dialogues to CSV.
# index=False keeps the positional index out of the file.
output_file_path = '../dataset/generated_dialogue.csv'
sampled_df.to_csv(
    path_or_buf=output_file_path,
    index=False,
)