In [1]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

In [2]:
# No credential is stored in the notebook: when `api_key` is omitted the
# OpenAI client reads it from the OPENAI_API_KEY environment variable and
# raises a clear error if the variable is missing.
#
# To target the Upstage Solar endpoint instead, construct the client with
# that service's base URL and supply its key from the environment as well:
#     client = OpenAI(
#         api_key=...,
#         base_url="https://api.upstage.ai/v1/solar",
#     )
client = OpenAI()

In [3]:
# Read the three input tables in one pass: the cleaned training set, the test
# set, and the rows used as few-shot examples.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/cleaned_train.csv",
        "../dataset/test.csv",
        "../dataset/top5_samples.csv",
    )
)

In [4]:
# Build the few-shot block for the summary -> dialogue task: each of the first
# ten sample rows contributes one "Summary / Dialogue" example.
few_shot_samples = sample_df.head(10)
few_shot_prompt = "".join(
    f"Summary:\n{example_summary}\nDialogue:\n{example_dialogue}\n\n"
    for example_summary, example_dialogue in zip(
        few_shot_samples['summary'], few_shot_samples['dialogue']
    )
)

In [5]:
# System prompt for the summary -> dialogue generation task.
# NOTE(review): the recorded output of the generation run contains dialogues
# tagged '#사람1#' rather than '#Person1#'; consider normalising speaker tags
# after generation or spelling the tag format out more explicitly here.
system_prompt1 = {
    "role": "system",
    "content": (
        "You are a helpful assistant for generating dialogues from summaries. "
        "Make sure not to generate any sentences in English or sentences enclosed in parentheses. "
        "Ensure that each speaker's turn is separated by exactly one newline character, and avoid using double newlines. "
        "The dialogue format should follow this structure: '#Person#: dialogue text'. All dialogues must be generated in Korean."
    )
}


# Few-shot dialogue generation
def generate_dialogue_with_few_shot(summary, model="solar-1-mini-chat", api_client=None):
    """Generate a Korean dialogue for ``summary`` using few-shot prompting.

    The user message is the module-level ``few_shot_prompt`` followed by the
    generation instruction and the summary itself; ``system_prompt1`` is sent
    as the system message.

    Args:
        summary: Summary text to expand into a dialogue.
        model: Chat model name passed to the completion endpoint. Defaults to
            the value the notebook originally hard-coded.
        api_client: Object exposing ``chat.completions.create``. When omitted,
            the module-level ``client`` is used.
            NOTE(review): the default model is a Solar model, so the client in
            use must point at an endpoint that serves it - confirm.

    Returns:
        The generated dialogue text, or ``None`` when the summary is blank or
        not a string, or when the request fails (the error is printed).
    """
    # A missing value (None / NaN) would otherwise be formatted into the
    # prompt as the literal text "None" / "nan" and still cost an API call.
    if not isinstance(summary, str) or not summary.strip():
        print(f"Error generating dialogue: empty or non-text summary ({summary!r})")
        return None

    try:
        # Resolved inside the try block so a missing global client is reported
        # the same way as any other failure.
        active_client = client if api_client is None else api_client

        # The user prompt repeats the "Korean only" requirement from the system prompt.
        user_prompt = (
            f"{few_shot_prompt}\n"
            f"Now generate a dialogue based on the following summary, ensuring the dialogue is entirely in Korean, "
            f"with no English or sentences in parentheses. Ensure proper formatting as explained above:\n{summary}"
        )

        response = active_client.chat.completions.create(
            model=model,
            messages=[
                system_prompt1,
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            stream=False  # Set to True if you want to use streaming
        )
        # Return the text of the first completion choice.
        return response.choices[0].message.content

    except Exception as e:
        # Best effort: one failed request must not abort a long batch run.
        print(f"Error generating dialogue: {e}")
        return None

In [6]:
# Take a reproducible 80% sample of the training rows and blank out the
# dialogue column so it can be refilled with generated text.
sampled_df = train_df.sample(frac=0.8, random_state=42).reset_index(drop=True)
sampled_df['dialogue'] = None

# Series.items() yields (index label, summary) pairs, so every generated
# dialogue is written back to the row its summary came from.
summary_items = sampled_df['summary'].items()
for row_label, summary_text in tqdm(summary_items, total=len(sampled_df)):
    sampled_df.at[row_label, 'dialogue'] = generate_dialogue_with_few_shot(summary_text)

print(sampled_df[['summary', 'dialogue']].head())

100%|██████████| 9966/9966 [7:28:41<00:00,  2.70s/it]   

                                             summary  \
0                 로즈가 잭에게 전화를 걸어 이번 토요일 저녁 식사에 초대한다.   
1  #Person2#는 #Person1#에게 정장에 넥타이를 착용하고 면접에서 최선을 ...   
2  존은 #Person1#에게 그의 동기, 그의 경영 스타일, 그리고 그의 사업과 개인...   
3  #Person1#은 다음 베를린행 비행기 티켓을 구매하려 한다. #Person2#은...   
4  조지는 주가 코트를 고르는 것을 도와주고 있다. 그들은 점원에게 더 큰 사이즈를 물...   

                                            dialogue  
0  #Person1#: 안녕하세요, 잭입니다.\n#Person2#: 안녕하세요, 로즈입...  
1  #Person1#: 안녕하세요, 선배님. 내일 면접이 있는데 조언 좀 구할 수 있을...  
2  #사람1#: 안녕하세요, 존. 오늘 어떻게 도와드릴까요?\n#사람2#: 안녕하세요....  
3  #Person1#: 안녕하세요. 다음 베를린행 비행기 티켓을 구매하고 싶은데요.\n...  
4  #사람1#: 조지가 도와줄게. 어떤 스타일이 좋을까?\n#사람2#: 음... 잘 모...  





In [7]:
# Write the frame holding the generated dialogues to disk without the index column.
output_file_path = '../dataset/generated_dialogue.csv'
sampled_df.to_csv(path_or_buf=output_file_path, index=False)

In [18]:
# Rebuild the few-shot block for the dialogue -> summary task: the same ten
# sample rows, now with the dialogue shown first and its summary second.
few_shot_samples = sample_df.head(10)
few_shot_prompt = "".join(
    f"Dialogue:\n{example_dialogue}\nSummary:\n{example_summary}\n\n"
    for example_dialogue, example_summary in zip(
        few_shot_samples['dialogue'], few_shot_samples['summary']
    )
)

In [19]:
# System prompt for the dialogue -> summary task.
system_prompt2 = {
    "role": "system",
    "content": (
        "You are a helpful assistant for summarizing dialogues. "
        "Make sure not to generate any sentences in English or sentences enclosed in parentheses. "
        "Ensure that the summary captures the main points of the dialogue in concise Korean. "
        "Use translation-like, formal Korean, ensuring the style reflects a translated text tone."
    )
}


def generate_summary_with_few_shot(dialogue, model='gpt-4o-mini', api_client=None):
    """Generate a concise Korean summary of ``dialogue`` using few-shot prompting.

    The user message is the module-level ``few_shot_prompt`` followed by the
    summarization instruction and the dialogue itself; ``system_prompt2`` is
    sent as the system message.

    Args:
        dialogue: Dialogue text to summarize.
        model: Chat model name passed to the completion endpoint. Defaults to
            the value the notebook originally hard-coded.
        api_client: Object exposing ``chat.completions.create``. When omitted,
            the module-level ``client`` is used.

    Returns:
        The generated summary text, or ``None`` when the dialogue is blank or
        not a string, or when the request fails (the error is printed).
    """
    # A missing value (None / NaN) would otherwise be formatted into the
    # prompt as the literal text "None" / "nan" and still cost an API call.
    if not isinstance(dialogue, str) or not dialogue.strip():
        print(f"Error generating summary: empty or non-text dialogue ({dialogue!r})")
        return None

    try:
        # Resolved inside the try block so a missing global client is reported
        # the same way as any other failure.
        active_client = client if api_client is None else api_client

        user_prompt = (
            f"{few_shot_prompt}\n"
            f"Now generate a summary based on the following dialogue, ensuring the summary is entirely in Korean, "
            f"with no English or sentences in parentheses. Use translation-like, formal Korean, ensuring the style reflects a translated text tone. "
            f"Ensure the summary is concise and captures the main points:\n{dialogue}"
        )

        response = active_client.chat.completions.create(
            model=model,
            messages=[
                system_prompt2,
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            stream=False  # Set to True if you want to use streaming
        )
        # Return the text of the first completion choice.
        return response.choices[0].message.content

    except Exception as e:
        # Best effort: one failed request must not abort the whole batch.
        print(f"Error generating summary: {e}")
        return None


In [20]:
# Work on a copy: `sampled_df = test_df` would only create a second name for
# the same frame, so the 'summary' column written below would also appear on
# `test_df` and leak into every later use of it.
sampled_df = test_df.copy()
sampled_df['summary'] = None  # filled row by row below

for idx, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
    # Generate a summary from this row's dialogue and store it on the same row.
    generated_summary = generate_summary_with_few_shot(row['dialogue'])
    sampled_df.at[idx, 'summary'] = generated_summary


100%|██████████| 499/499 [10:19<00:00,  1.24s/it]


In [22]:
# Remove the 'dialogue' column, keeping every other column.
sampled_df = sampled_df.drop('dialogue', axis=1)

In [23]:
# Write the frame holding the generated summaries to disk without the index column.
output_file_path = '../dataset/generated_sum.csv'
sampled_df.to_csv(path_or_buf=output_file_path, index=False)

In [24]:
import numpy as np

# Remove every comma from the generated summaries (literal match, not a regex).
sampled_df['summary'] = sampled_df['summary'].str.replace(',', '', regex=False)

# Only rows that actually have a summary may replace a prediction: a failed
# generation is stored as None and would otherwise blank out the existing one.
candidates = sampled_df.dropna(subset=['summary'])

# Pick up to 100 *distinct* rows with a fixed seed. Drawing indices with
# replacement could pick the same row more than once (so fewer than 100
# predictions were replaced) and, being unseeded, differed on every run.
n = min(100, len(candidates))
rng = np.random.default_rng(42)
random_indices = rng.choice(len(candidates), size=n, replace=False)
selected_rows = candidates.iloc[random_indices]

pred_df = pd.read_csv("../notebooks/unsloth/prediction.csv")

# Overwrite the prediction of every file chosen above with its generated summary.
for _, row in selected_rows.iterrows():
    pred_df.loc[pred_df['fname'] == row['fname'], 'summary'] = row['summary']

# NOTE(review): unlike the other exports in this notebook, this one also
# writes the index column - confirm that the consumer of this file expects it.
pred_df.to_csv("../notebooks/unsloth/test.csv")

In [25]:
# If `test_df` already has a 'summary' column (for example because it was
# filled through another name bound to the same frame), merging would produce
# 'summary_x' / 'summary_y' instead of a single 'summary' column. Drop any
# such column first; `errors='ignore'` makes this a no-op when it is absent.
base_df = test_df.drop(columns=['summary'], errors='ignore')

# Left join keeps every test row, including those without a generated summary.
merged_df = pd.merge(base_df, sampled_df[['fname', 'summary']], on='fname', how='left')

# NOTE(review): "new_teset" looks like a typo for "new_test"; the path is left
# unchanged in case other code reads this exact file name.
merged_df.to_csv("../dataset/new_teset.csv", index=False)