In [1]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

In [3]:
# Read the four CSV inputs in one pass; the targets on the left are bound in
# the same order as the paths listed below.
train_df, valid_df, test_df, sample_df = map(
    pd.read_csv,
    (
        "../dataset/cleaned_train.csv",
        "../dataset/cleaned_dev.csv",
        "../dataset/test.csv",
        "../dataset/top5_samples.csv",
    ),
)

In [4]:
# Credentials are not stored in the notebook. With no `api_key` argument the
# OpenAI client reads the key from the OPENAI_API_KEY environment variable and
# raises immediately when it is missing, instead of failing on every request
# of a multi-hour generation loop.
#
# NOTE(review): an Upstage key used to be committed here in a commented-out
# block; treat it as exposed and rotate it. To target Upstage Solar, build the
# client with that provider's key taken from the environment and
# base_url="https://api.upstage.ai/v1/solar".
client = OpenAI()

In [5]:
# Few-shot block for summary -> dialogue generation: every example shows the
# summary first and the dialogue it corresponds to second.
few_shot_samples = sample_df.head(10)
few_shot_prompt = "".join(
    f"Summary:\n{summary}\nDialogue:\n{dialogue}\n\n"
    for summary, dialogue in zip(few_shot_samples['summary'], few_shot_samples['dialogue'])
)

In [9]:
# System message for dialogue generation. The content is assembled from one
# fragment per instruction; fragments carry their own trailing space, so they
# are joined without a separator.
system_prompt1 = {
    "role": "system",
    "content": "".join(
        [
            "You are a helpful assistant for generating dialogues from summaries. ",
            "Make sure not to generate any sentences in English or sentences enclosed in parentheses. ",
            "Ensure that each speaker's turn is separated by exactly one newline character, and avoid using double newlines. ",
            "The dialogue format should follow this structure: '#Person#: dialogue text'. All dialogues must be generated in Korean. ",
            "Ensure to incorporate various speech styles such as casual (구어체), formal (문어체), and translated (번역체) tones throughout the dialogue.",
        ]
    ),
}

# Few-shot dialogue generation
def generate_dialogue_with_few_shot(summary, few_shot_examples=None, model="gpt-4o-mini"):
    """Generate a dialogue for ``summary`` using a few-shot chat prompt.

    Args:
        summary: Summary text to turn into a dialogue.
        few_shot_examples: Pre-formatted example block placed at the start of
            the user prompt. Defaults to the notebook-level ``few_shot_prompt``
            as it is bound at call time; pass it explicitly to avoid depending
            on which cell last assigned that global.
        model: Name of the chat-completion model to request.

    Returns:
        The message content of the first choice in the response, or ``None``
        when ``summary`` is not a non-blank string or when building or sending
        the request raises.
    """
    # A missing value (None, or NaN from an empty CSV cell) would otherwise be
    # formatted into the prompt as the literal text "None"/"nan" and sent.
    if not isinstance(summary, str) or not summary.strip():
        print(f"Error generating dialogue: summary must be a non-empty string, got {summary!r}")
        return None

    try:
        # Resolved inside the try block so an unset global is reported through
        # the same error path as any other failure.
        if few_shot_examples is None:
            few_shot_examples = few_shot_prompt

        # The user prompt restates the Korean-only and mixed speech-style
        # requirements and ends with the summary to expand.
        user_prompt = (
            f"{few_shot_examples}\n"
            f"Now generate a dialogue based on the following summary, ensuring the dialogue is entirely in Korean, "
            f"with no English or sentences in parentheses. Ensure proper formatting as explained above:\n"
            f"Additionally, use a mix of casual (구어체), formal (문어체), and translated (번역체) speech styles throughout the dialogue:\n{summary}"
        )

        response = client.chat.completions.create(
            model=model,
            messages=[
                system_prompt1,
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            # Non-streaming: the whole completion is read from
            # `response.choices` below.
            stream=False
        )
        # Return the generated dialogue text
        return response.choices[0].message.content

    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # the next row.
        print(f"Error generating dialogue: {e}")
        return None


In [11]:
# Take a seeded 40% sample of the training rows and blank the dialogue column
# so it can be refilled with generated text.
sampled_df = train_df.sample(frac=0.4, random_state=42).reset_index(drop=True)
sampled_df['dialogue'] = None

# Results are written into the frame one row at a time, so whatever has been
# generated so far is still in `sampled_df` if the loop is interrupted.
summary_items = tqdm(sampled_df['summary'].items(), total=len(sampled_df))
for row_label, summary_text in summary_items:
    sampled_df.at[row_label, 'dialogue'] = generate_dialogue_with_few_shot(summary_text)

 81%|████████  | 4027/4983 [4:03:48<1:47:29,  6.75s/it]

In [7]:
# Write the current `sampled_df` to CSV without the index column.
output_file_path = "../dataset/generated_dialogue2.csv"
sampled_df.to_csv(path_or_buf=output_file_path, index=False)

In [4]:
# Few-shot block for dialogue -> summary generation: every example shows the
# dialogue first and its summary second. This rebinds `few_shot_prompt`.
few_shot_samples = sample_df.head(10)
few_shot_prompt = "".join(
    f"Dialogue:\n{dialogue}\nSummary:\n{summary}\n\n"
    for dialogue, summary in zip(few_shot_samples['dialogue'], few_shot_samples['summary'])
)

In [7]:
# System message for summarization. The content is assembled from one fragment
# per instruction; fragments carry their own trailing space, so they are
# joined without a separator.
system_prompt2 = {
    "role": "system",
    "content": "".join(
        [
            "You are a helpful assistant for summarizing dialogues. ",
            "Make sure not to generate any sentences in English or sentences enclosed in parentheses. ",
            "Ensure that the summary captures the main points of the dialogue in concise Korean.",
        ]
    ),
}

def generate_summary_with_few_shot(dialogue, few_shot_examples=None, model="gpt-4o-mini"):
    """Generate a summary of ``dialogue`` using a few-shot chat prompt.

    Args:
        dialogue: Dialogue text to summarize.
        few_shot_examples: Pre-formatted example block placed at the start of
            the user prompt. Defaults to the notebook-level ``few_shot_prompt``
            as it is bound at call time; pass it explicitly to avoid depending
            on which cell last assigned that global.
        model: Name of the chat-completion model to request.

    Returns:
        The message content of the first choice in the response, or ``None``
        when ``dialogue`` is not a non-blank string or when building or
        sending the request raises.
    """
    # A missing value (None, or NaN from an empty CSV cell) would otherwise be
    # formatted into the prompt as the literal text "None"/"nan" and sent.
    if not isinstance(dialogue, str) or not dialogue.strip():
        print(f"Error generating summary: dialogue must be a non-empty string, got {dialogue!r}")
        return None

    try:
        # Resolved inside the try block so an unset global is reported through
        # the same error path as any other failure.
        if few_shot_examples is None:
            few_shot_examples = few_shot_prompt

        # The user prompt restates the Korean-only requirement and ends with
        # the dialogue to summarize.
        user_prompt = (
            f"{few_shot_examples}\n"
            f"Now generate a summary based on the following dialogue, ensuring the summary is entirely in Korean, "
            f"with no English or sentences in parentheses. Ensure the summary is concise and captures the main points:\n{dialogue}"
        )

        response = client.chat.completions.create(
            model=model,
            messages=[
                system_prompt2,
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            # Non-streaming: the whole completion is read from
            # `response.choices` below.
            stream=False
        )
        # Return the generated summary text
        return response.choices[0].message.content

    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # the next row.
        print(f"Error generating summary: {e}")
        return None

In [10]:
# Take a seeded 80% sample of the training rows and blank the summary column
# so it can be refilled with generated text.
sampled_df = train_df.sample(frac=0.8, random_state=42).reset_index(drop=True)
sampled_df['summary'] = None

# Summarize each dialogue and store the result row by row, so whatever has
# been generated so far is still in `sampled_df` if the loop is interrupted.
dialogue_items = tqdm(sampled_df['dialogue'].items(), total=len(sampled_df))
for row_label, dialogue_text in dialogue_items:
    sampled_df.at[row_label, 'summary'] = generate_summary_with_few_shot(dialogue_text)

100%|██████████| 9966/9966 [3:30:43<00:00,  1.27s/it]  


In [11]:
# Write the current `sampled_df` to CSV without the index column.
output_file_path = "../dataset/generated_summary.csv"
sampled_df.to_csv(path_or_buf=output_file_path, index=False)