# Conversation Generation Pipeline

This notebook generates natural conversations for the Tactful ToM dataset.

In [None]:
import pandas as pd
import random
import kagglehub
import os
import json
import re
import tiktoken

In [None]:
from conv_generation_gpt_api import (
    generate_natural_conversation_step1, 
    generate_natural_conversation_step2, 
    generate_natural_conversation_step3, 
    generate_natural_conversation_step4
)
from conv_generation_utils import (
    get_leave_reasons,
    replace_ABCD_with_name,
    populate_template,
    append_data_to_json,
    load_conversation_elements,
    extract_data_fields
)

## 1. Setup and Load Data

In [None]:
# Fetch the popular-names dataset through kagglehub. The returned `path` is
# treated as a directory below (it is joined with a file name).
path = kagglehub.dataset_download("ryanburnsworth/popular-names-by-birth-year-1880-2022")
print("Path to dataset files:", path)

# Read the names CSV out of that directory and preview the first rows
file_path = os.path.join(path, 'names_by_birth_year.csv')
df = pd.read_csv(file_path)
print(df.head())

# Rank rows by "Count" (largest first) and keep the leading 20% of ROWS.
# NOTE(review): this is a row-level cut, not a cut over unique names —
# confirm that is the intended notion of "top 20%".
df_sorted = df.sort_values("Count", ascending=False)
top_20_count = int(0.2 * len(df_sorted))
df_top_20 = df_sorted.head(top_20_count)

In [None]:
# Load the pool of leave reasons via get_leave_reasons() (imported from
# conv_generation_utils). Three entries are drawn from this pool with
# random.sample() in the configuration section, so it must be a sequence
# holding at least three items.
leave_reasons = get_leave_reasons()

## 2. Configure Conversation

In [None]:
# Draw three distinct entries from the leave-reason pool (sampling without
# replacement), then hand them out: one for B and two for D.
sampled_reasons = random.sample(leave_reasons, 3)
leave_reason_B, leave_reason_D_1, leave_reason_D_2 = sampled_reasons
print(f"Leave reasons: {leave_reason_B}, {leave_reason_D_1}, {leave_reason_D_2}")

In [None]:
# Sample four character names for A, B, C and D.
#
# df_top_20 is a row-level slice of the names table, and the same name can
# occupy several rows (presumably one per birth year — verify against the
# CSV schema). Series.sample() draws rows without replacement, which does NOT
# guarantee distinct values, so two characters could end up sharing a name.
# Deduplicating first guarantees four different names; the draw is then
# uniform over unique names rather than weighted by how many rows a name has.
unique_names = df_top_20['Name'].drop_duplicates()
random_names = unique_names.sample(n=4).to_list()
A_name, B_name, C_name, D_name = random_names
print(f"Characters: {A_name}, {B_name}, {C_name}, {D_name}")

In [None]:
# --- Identifiers and flags for the conversation being generated ---
lie_type = "altruistic_white_lies"
emotion = "sad"
set_id = "2-10-5-2"
lie_id = "5-2"
conv_id = 10
truth_id = 1
falsification = False
# NOTE(review): "muiltiple" looks like a typo for "multiple"; the spelling is
# kept because other cells may read this exact name.
muiltiple_liar = True

# --- Scenario description ---
scenario = "discussing pet care"
relationship = "families (parents, one kid, aunt/uncle)"
situation_topic = "talking about what happened"
situation = "kid is upset"
lie_objective = "comfort feelings"

# --- Lie content (the `_c` suffix presumably refers to character C — confirm) ---
real_reason_c = "child will be upset"
truth_c = "pet won't return"
lie_c = "pet is safe somewhere"

# Run each text through replace_ABCD_with_name() with the sampled character
# names (presumably substitutes A/B/C/D placeholders — see
# conv_generation_utils). Calls happen in the order: real reason, lie, truth.
real_reason_c, lie_c, truth_c = (
    replace_ABCD_with_name(text, A_name, B_name, C_name, D_name)
    for text in (real_reason_c, lie_c, truth_c)
)

## 3. Define Templates

In [None]:
# Define conversation templates here
# Use populate_template() to fill in variables

## 4. Generate Conversation

In [None]:
# Step 1
# result_1 = generate_natural_conversation_step1(filled_step_1)
# part_1 = result_1.choices[0].message['content']

In [None]:
# Step 2
# result_2 = generate_natural_conversation_step2(input_2)
# part_2 = result_2.choices[0].message['content']

In [None]:
# Step 3
# result_3 = generate_natural_conversation_step3(input_3)
# part_3 = result_3.choices[0].message['content']

In [None]:
# Step 4
# result_4 = generate_natural_conversation_step4(input_4)
# part_4 = result_4.choices[0].message['content']

## 5. Save Results

In [None]:
# Combine and save
# full_context = "\n\n".join([part_1, part_2, part_3, part_4])
# short_context = "\n\n".join([part_2, part_3, part_4])

# Create data dict and save with append_data_to_json()