In [38]:
import argparse
import os
import random
import re
import sys
from datetime import datetime
from typing import Dict, List, Tuple, Any

import pandas as pd
from sdg_config import *

def sdg(
    sample_size: int,
    labels: List[str],
    label_descriptions: str,
    categories_types: Dict[str, List[str]],
    use_case: str,
    channel_type: List[str],
    prompt_examples: str,
    model: str,
    max_tokens: int = 2000,
    temperature: float = 0.7,
) -> List[Dict[str, Any]]:
    """
    Builds OpenAI Batch API request payloads for synthetic data generation.

    One chat-completion request is built for every combination of
    sample index x channel x category x category type x label, in that nesting
    order (label varies fastest). The function only assembles the request
    dictionaries; it does not call the API and does not write any file.

    Args:
        sample_size (int): How many requests to build per (channel, category, type, label) combination.
        labels (List[str]): The labels used to classify the synthetic data.
        label_descriptions (str): A description of the meaning of each label.
        categories_types (Dict[str, List[str]]): The categories and their types for data generation and diversification.
        use_case (str): The use case of the synthetic data to provide context for the language model.
        channel_type (List[str]): The contact channels; each one is inserted into the prompt as CONTACT.
        prompt_examples (str): The examples used in the Few-Shot or Chain-of-Thought prompting.
        model (str): The large language model used for generating the synthetic data.
        max_tokens (int): Currently unused; no ``max_tokens`` field is placed in the request body.
        temperature (float): Currently unused; every request gets its own temperature drawn
            from ``random.uniform(0.6, 0.9)``.

    Returns:
        List[Dict[str, Any]]: One dictionary per request with the keys ``custom_id``,
        ``method``, ``url`` and ``body``. ``body`` holds ``model``, ``messages``
        (a system message followed by a user message) and ``temperature``.
    """
    # Flatten the nested iteration into a single stream of combinations; the
    # clause order mirrors the nesting so the output order is unchanged.
    combinations = (
        (idx, channel, category, sub_category, label)
        for idx in range(sample_size)
        for channel in channel_type
        for category, sub_category_list in categories_types.items()
        for sub_category in sub_category_list
        for label in labels
    )

    total_samples: List[Dict[str, Any]] = []
    for idx, channel, category, sub_category, label in combinations:
        # NOTE: the leading whitespace on the continuation lines of the two
        # prompt literals below is part of the text sent to the model. It is
        # kept exactly as it was so the generated prompts do not change.
        prompt = f"""You should create synthetic data for specified labels and categories.
                        This is especially useful for {use_case}.
                        
                        *Label Descriptions*
                        {label_descriptions}

                        *Examples*
                        {prompt_examples}

                        ####################

                        Generate one output for the classification below.
                        You may use the examples I have provided as a guide, but you cannot simply modify or copy them.
                        Only return the OUTPUT and REASONING. The first token in your response must be OUTPUT.
                        Do not return the LABEL, CATEGORY, or TYPE.

                        LABEL: {label}
                        CONTACT : {channel}
                        CATEGORY: {category}
                        TYPE: {sub_category}
                        OUTPUT:
                        REASONING: 
                        """

        system_content = f"""You are a helpful assistant designed to generate synthetic data for {use_case} with labels {labels} in categories {category} with conversation {sub_category} theme. 
                                The first token in your generated text must be OUTPUT: This must be followed by the token REASONING: as in the prompt examples."""

        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": prompt},
        ]

        # The custom_id encodes the full combination so each response can be
        # traced back to the inputs that produced it.
        request = {
            "custom_id": f"{idx}_{channel}_{label}_{category}_{sub_category}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": messages,
                # A fresh temperature per request; the `temperature` argument is not consulted.
                "temperature": random.uniform(0.6, 0.9),
            },
        }
        total_samples.append(request)

    return total_samples

### Synthetic Data Generation

In [62]:
# Build one Batch API request per Sample Size x Contact Channel x Category x Category Type x Label combination.
sdg_list_batch_api_format_list = sdg(
    sample_size=5,
    labels=labels,
    label_descriptions=label_descriptions,
    categories_types=categories_types,
    use_case=use_case,
    channel_type=contact_chanel,
    prompt_examples=prompt_examples,
    model='gpt-4.1',
)

# Persist the requests as JSON Lines: one request object per line.
batch_api_requests_df = pd.DataFrame(sdg_list_batch_api_format_list)
batch_api_requests_df.to_json('tesing_sdg_openai.jsonl', orient='records', lines=True)

In [63]:
# Number of batch requests (one DataFrame row per request).
len(batch_api_requests_df.index)

2500

### Cost Estimation for OpenAI Batch API

In [64]:
# Sample model response: a Thai OUTPUT paragraph followed by an English REASONING paragraph.
# The cost-estimation loop tokenizes this text as a stand-in for the completion of every request.
output_prompt_ex = """OUTPUT: ข้าพเจ้าใคร่ขอกราบเรียนมายังท่านอาจารย์ผู้ทรงคุณวุฒิ ด้วยความเคารพยิ่ง ข้าพเจ้ากำลังดำเนินงานวิจัยตามโครงการที่ได้รับมอบหมาย แต่อาจมีข้อสงสัยบางประการเกี่ยวกับแนวทางการวิเคราะห์ข้อมูล จึงขอกราบกรานความเมตตาโปรดประทานคำแนะนำอันทรงคุณค่าในการปรับปรุงและพัฒนางานวิจัยดังกล่าวต่อไปด้วยจักเป็นพระกรุณาอย่างยิ่ง

REASONING: This text is highly ceremonial and formal. It utilizes very respectful pronouns like "ข้าพเจ้า" and addresses the professor as "ท่านอาจารย์ผู้ทรงคุณวุฒิ." The sentence structure is complex and eloquent, using phrases such as "ขอกราบกรานความเมตตาโปรดประทานคำแนะนำอันทรงคุณค่า." This demonstrates both a high level of politeness and reverence, typical for royal ceremonies or national events, and uses grammatically flawless language indicative of the "พิธีการ" level."""

In [65]:
import tiktoken


def estimate_token_count(text:str, model_encoding):
    """
    Counts the tokens that `model_encoding` produces for `text`.

    Args:
        text (str): The text to tokenize.
        model_encoding: Any object exposing ``encode(text)`` that returns a sized
            sequence of tokens (the notebook passes a tiktoken encoding).

    Returns:
        The length of the encoded token sequence.
    """
    encoded_tokens = model_encoding.encode(text)
    return len(encoded_tokens)

def estimmate_cost_batch(tokten_count_input:int, token_count_output:int,
                         model_cost_input:float, model_cost_output:float) -> float:
    """
    Computes the cost of one request from its token counts and per-million-token prices.

    Args:
        tokten_count_input (int): Number of input (prompt) tokens.
        token_count_output (int): Number of output (completion) tokens.
        model_cost_input (float): Price charged per 1,000,000 input tokens.
        model_cost_output (float): Price charged per 1,000,000 output tokens.

    Returns:
        float: Input cost plus output cost, in the same currency unit as the prices.
    """
    tokens_per_price_unit = 1000000  # prices are quoted per one million tokens
    prompt_side_cost = (tokten_count_input / tokens_per_price_unit) * model_cost_input
    completion_side_cost = (token_count_output / tokens_per_price_unit) * model_cost_output
    total_cost = prompt_side_cost + completion_side_cost
    return total_cost

# Tokenizer used for the estimate.
# NOTE(review): the requests target 'gpt-4.1' while the encoding is looked up for "gpt-4o";
# presumably both models share the same encoding — confirm.
enc = tiktoken.encoding_for_model("gpt-4o")

# Prices per 1,000,000 tokens, as expected by estimmate_cost_batch.
GPT_4_1_BATCH_INPUT_COST = 1.00  # gpt-4.1 input cost batch
GPT_4_1_BATCH_OUTPUT_COST = 4.00  # gpt-4.1 output cost batch

# The sample completion is the same for every request, so tokenize it once
# instead of once per loop iteration.
output_prompt = output_prompt_ex
output_prompt_token_count = estimate_token_count(output_prompt, enc)

acc_cost = 0
total_token_count = 0
for prompt_information in sdg_list_batch_api_format_list:
    # Only the raw message contents are tokenized; any per-message framing
    # added by the chat format is not included in the estimate.
    system_prompt = prompt_information['body']['messages'][0]['content']
    user_prompt = prompt_information['body']['messages'][1]['content']

    system_prompt_token_count = estimate_token_count(system_prompt, enc)
    user_prompt_token_count = estimate_token_count(user_prompt, enc)

    cost_per_requests = estimmate_cost_batch(
        system_prompt_token_count + user_prompt_token_count,
        output_prompt_token_count,
        GPT_4_1_BATCH_INPUT_COST,
        GPT_4_1_BATCH_OUTPUT_COST,
        )

    acc_cost += cost_per_requests
    total_token_count += (system_prompt_token_count + user_prompt_token_count + output_prompt_token_count)

In [66]:
# Report the totals accumulated by the cost-estimation loop, one per line.
summary_lines = (
    f"Total Token Count: {total_token_count}",
    f"Total Cost: {acc_cost}",
)
print(*summary_lines, sep="\n")

Total Token Count: 5816500
Total Cost: 7.781499999999924
