### Typhoon Synthetic Dataset





In [None]:
!pip install openai openpyxl -q

import pandas as pd
import random
import time
import re
from openai import OpenAI
import os
import sys

sys.path.append("/mnt/data")
import sdg_config as config

# Number of generations requested for every (label, contact, category, type) combination.
samples_per_combination = 10
# Excel workbook that receives one row per generated sample.
output_file = "typhoon_results.xlsx"
# Model identifier sent with each chat-completion request and stored in the "model" column.
model_name = "typhoon-v2-70b-instruct"

# System prompt text, read once and reused for every request.
with open("system_prompt.txt", "r", encoding="utf-8") as f:
    system_prompt = f.read().strip()

# User prompt template; its example fields are substituted per combination in the generation loop.
with open("user_prompt.txt", "r", encoding="utf-8") as f:
    user_prompt_template = f.read().strip()

# OpenAI-compatible client pointed at the Typhoon endpoint.
# NOTE(review): "Typhoon_API" looks like a placeholder — supply a real key
# (preferably loaded from the environment rather than committed) before running.
client = OpenAI(
    api_key="Typhoon_API",
    base_url="https://api.opentyphoon.ai/v1"
)

# Expand the configuration into one request spec per sample: every
# label x contact channel x (category, type) pair, repeated
# `samples_per_combination` times, in the same nesting order as the config.
combinations = [
    {"label": label, "contact": contact, "category": category, "type": type_}
    for label in config.labels
    for contact in config.contact_chanel
    for category, types in config.categories_types.items()
    for type_ in types
    for _ in range(samples_per_combination)
]

# On first run, create the workbook with a header-only sheet so that rows
# can be appended to it later.
result_columns = ["label", "contact", "category", "type", "output", "reasoning", "model"]
if not os.path.exists(output_file):
    pd.DataFrame(columns=result_columns).to_excel(output_file, index=False)

def parse_output_reasoning(text):
    """Split a model reply into its ``OUTPUT:`` and ``REASONING:`` sections.

    Args:
        text: Raw reply text. May be ``None`` or empty when the API returns
            no message content.

    Returns:
        A ``(output, reasoning)`` tuple of stripped strings, or
        ``("", "[FAILED TO PARSE]")`` when the text is missing or does not
        contain both markers in order.
    """
    if not text:
        # A missing reply body is a parse failure, not a transport error.
        return ("", "[FAILED TO PARSE]")
    # DOTALL lets both sections span several lines; the output capture is
    # lazy so it stops at the first REASONING: marker.
    match = re.search(r"OUTPUT:\s*(.*?)\s*REASONING:\s*(.*)", text, re.DOTALL)
    if match is None:
        return ("", "[FAILED TO PARSE]")
    return (match.group(1).strip(), match.group(2).strip())


def get_output_reasoning(prompt):
    """Send *prompt* to the chat model and return ``(output, reasoning)``.

    The request uses the module-level ``client``, ``model_name`` and
    ``system_prompt``. This is deliberately best-effort so that one failed
    request does not abort a long generation run: any exception raised while
    calling the API or reading the reply is reported in the reasoning slot
    as ``"[ERROR] <message>"`` with an empty output.

    Args:
        prompt: The fully substituted user prompt.

    Returns:
        ``(output, reasoning)`` on success, ``("", "[FAILED TO PARSE]")`` when
        the reply lacks the expected markers, or ``("", "[ERROR] ...")`` when
        the request itself fails.
    """
    try:
        res = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        return parse_output_reasoning(res.choices[0].message.content)
    except Exception as e:
        return ("", f"[ERROR] {str(e)}")

# Pools used when the config has no entry for a given key.
fallback_phrases = ["ขออนุญาตสอบถามเพิ่มเติม", "ขอคำแนะนำ", "รบกวนช่วยดูหัวข้อให้หน่อย"]
fallback_pronouns = ["ฉัน", "ผม", "หนู"]

total = len(combinations)
for idx, row in enumerate(combinations):
    label = row["label"]
    category = row["category"]

    # --- Randomly pick a seed phrase for this (category, label) pair ---
    key = (category, label)
    phrases = config.prompt_data.get(key, {}).get("phrases", fallback_phrases)
    prompt_data = random.choice(phrases)

    # --- Randomly pick a pronoun suited to the label ---
    pronouns = config.pronouns_by_level.get(label, fallback_pronouns)
    pronoun = random.choice(pronouns)

    # --- Substitute into the template ---
    # Replacements are applied in order; the last one injects the pronoun and
    # seed phrase immediately before the OUTPUT: marker.
    substitutions = (
        ("LABEL: พิธีการ", f"LABEL: {label}"),
        ("CONTACT: Email", f"CONTACT: {row['contact']}"),
        ("CATEGORY: attendance_issues", f"CATEGORY: {category}"),
        ("TYPE: request leave", f"TYPE: {row['type']}"),
        ("OUTPUT:", f"PRONOUN: {pronoun}\nPROMPT_DATA: {prompt_data}\nOUTPUT:"),
    )
    user_prompt = user_prompt_template
    for placeholder, replacement in substitutions:
        user_prompt = user_prompt.replace(placeholder, replacement)

    output, reasoning = get_output_reasoning(user_prompt)

    # `row` already carries label/contact/category/type in column order.
    new_row = pd.DataFrame([{
        **row,
        "output": output,
        "reasoning": reasoning,
        "model": model_name,
    }])

    # Append after the last used row so every result is saved as soon as it
    # is produced.
    with pd.ExcelWriter(output_file, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
        next_row = writer.sheets['Sheet1'].max_row
        new_row.to_excel(writer, header=False, index=False, startrow=next_row)

    print(f"[{idx+1}/{total}] ✅")
    time.sleep(1)


### GPT Cleaning Dataset

In [None]:
import pandas as pd

# Read the batch output file (JSON Lines: one JSON object per line).
batch_output = pd.read_json('./batch_6823ed3a1104819084d5eeb5b97556d5_output.jsonl', lines=True)
# Preview the 'response' column flattened into dotted column names.
pd.json_normalize(batch_output['response'])

In [None]:
# Reload the batch results (JSON Lines: one JSON object per line).
batch_output = pd.read_json('./batch_6823ed3a1104819084d5eeb5b97556d5_output.jsonl', lines=True)

# Flatten response -> body.choices -> first choice so that the message text
# becomes a plain column.
body_flatten = pd.json_normalize(batch_output['response'])
choices_flatten = pd.json_normalize(body_flatten['body.choices'])
choices_flatten_2ndlayer = pd.json_normalize(choices_flatten[0])

batch_output['output_prompt'] = choices_flatten_2ndlayer['message.content']
batch_output = batch_output.drop(columns=['error'])
batch_output['body.model'] = body_flatten['body.model']

# Split each reply once on the reasoning marker: the first piece is the main
# output, the last piece (re-prefixed with the marker) is the reasoning.
reasoning_split = batch_output['output_prompt'].str.split('REASONING: ')
batch_output['main_output'] = reasoning_split.str[0]
batch_output['reasoning_output'] = "REASONING: " + reasoning_split.str[-1]

batch_output

Export to Excel

In [None]:
# Working copy without the raw response payload and helper columns.
df = batch_output.drop(columns=['id', 'response', 'output_prompt'])

# Empty frame with the target column layout; filled in by the cells below.
columns = ['label', 'contact', 'category', 'type', 'output', 'reasoning', 'model']
gpt_output = pd.DataFrame(columns=columns)
gpt_output

In [None]:
# custom_id is split on '_' into six pieces; the category is spread over two
# of them ('category' and 'category1').
custom_id_parts = df['custom_id'].str.split('_', expand=True)
custom_id_parts.columns = ['index', 'contact', 'label', 'category', 'category1', 'type']

# Re-join the two category pieces with a space, then discard the second piece.
custom_id_parts['category'] = custom_id_parts['category'] + ' ' + custom_id_parts['category1']
split_cols = custom_id_parts.drop(columns='category1')

split_cols

In [None]:
# Fill the target frame: generated text and model name come from `df`, the
# metadata columns from the pieces parsed out of custom_id.
gpt_output['reasoning'] = df['reasoning_output']
gpt_output['output'] = df['main_output']
gpt_output['model'] = df['body.model']
gpt_output['label'] = split_cols['label']
gpt_output['contact'] = split_cols['contact']
gpt_output['category'] = split_cols['category']
gpt_output['type'] = split_cols['type']

# Strip the leading section markers. The patterns are raw strings so that
# `\s` reaches the regex engine as written; in a plain literal `\s` is an
# invalid string escape that recent Python versions warn about.
gpt_output['output'] = gpt_output['output'].str.replace(r'^OUTPUT:\s*', '', regex=True)

gpt_output['reasoning'] = gpt_output['reasoning'].str.replace(r'^REASONING:\s*', '', regex=True)

gpt_output

In [None]:
# Save the cleaned GPT rows to an Excel workbook, omitting the row index.
gpt_output.to_excel('gpt_output.xlsx', index=False)

### Final Dataset

In [None]:
import os
import pandas as pd

def list_input_workbooks(file_names, exclude=("Final_Data.xlsx",)):
    """Return the sorted ``.xlsx`` names from *file_names* that should be merged.

    Args:
        file_names: Iterable of file names (not paths) to filter.
        exclude: Names to skip. Defaults to the merged output workbook, which
            would otherwise be read back in and duplicate every row when the
            notebook is re-run.

    Returns:
        A sorted list of workbook names. Names starting with ``~$`` (Excel
        lock files left while a workbook is open) are skipped as well.
    """
    return sorted(
        name
        for name in file_names
        if name.endswith(".xlsx") and not name.startswith("~$") and name not in exclude
    )


file_list = os.listdir("./")
# Sorted so the merged row order does not depend on directory listing order.
xlsx_file_list = list_input_workbooks(file_list)

# Read every workbook, concatenate once, and drop rows with any missing value.
frames = [pd.read_excel(xlsx_file) for xlsx_file in xlsx_file_list]
Data = pd.concat(frames, ignore_index=True).dropna() if frames else pd.DataFrame()

In [None]:
# Literal (non-regex) clean-up: underscores in the category and commas in the
# generated text are each replaced by a single space.
for column_name, unwanted in (('category', '_'), ('output', ',')):
    Data[column_name] = Data[column_name].str.replace(unwanted, ' ', regex=False)
Data

In [None]:
# Write the merged dataset to Excel without the row index.
Data.to_excel('Final_Data.xlsx', index=False)

For Hugging Face

In [8]:
# Remove the 'contact' and 'type' columns in place before the CSV export
# (presumably they are not part of the published dataset — confirm).
Data.drop(columns=['contact','type'], inplace=True)
#Data

In [9]:
# Export the current Data frame as CSV without the row index.
Data.to_csv('Final_Data.csv', index=False)

### EDA Data

In [None]:
!pip install -U datasets

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the train split and convert it straight to a pandas DataFrame.
ds = load_dataset("nnudee/Thai-Thangkarn-sentence", split = 'train').to_pandas()
ds

In [None]:
# Categorical columns whose value distribution is printed below.
columns_to_count = ["label", "contact", "category", "type", "model"]

# Print the frequency of every distinct value, one column at a time.
for col in columns_to_count:
    print(ds[col].value_counts())


Typhoon - EDA

In [None]:
# Restrict the EDA to rows generated by the Typhoon model.
data = ds[ds['model'] == 'typhoon-v2-70b-instruct']

# Email - Chat: print up to five example sentences per label and channel.

labels = data['label'].unique()

for label in labels:
    print(f"\n=== Label: {label} ===")

    for contact in ('Email', 'Chat'):
        print(f"{contact} ตัวอย่าง:")
        subset = data[(data['label'] == label) & (data['contact'] == contact)]
        # DataFrame.sample raises ValueError when n exceeds the number of
        # rows, so cap the sample size; groups with 5+ rows are unaffected.
        n_examples = min(5, len(subset))
        examples = subset.sample(n=n_examples, random_state=42) if n_examples else subset
        for text in examples['output']:
            print("-", text)

In [None]:
# Each label: print up to five example sentences per label.

labels = data['label'].unique()

for label in labels:
    print(f"\n=== Label: {label} ===")

    outputs = data[data['label'] == label]['output']
    # Cap the sample size: Series.sample raises ValueError when n exceeds the
    # number of rows. Labels with 5+ rows produce the same sample as before.
    for text in outputs.sample(n=min(5, len(outputs)), random_state=42):
        print("-", text)

In [None]:
labels = ds["label"].unique()
types = ds["type"].unique()

# Show three examples for every (label, type) pair that has at least three rows.
for label in labels:
    label_rows = ds[ds["label"] == label]
    for t in types:
        subset = label_rows[label_rows["type"] == t]
        if len(subset) < 3:
            continue
        examples = subset.sample(3, random_state=42).reset_index(drop=True)
        print(f"\n Label: {label} |  Type: {t}")
        for text in examples["output"]:
            print(f"\n- {text}")

GPT-4.1 - EDA

In [None]:
# Restrict the EDA to rows generated by the GPT-4.1 model.
data = ds[ds['model'] == 'gpt-4.1-2025-04-14']

# Email - Chat: print up to five example sentences per label and channel.

labels = data['label'].unique()

for label in labels:
    print(f"\n=== Label: {label} ===")

    for contact in ('Email', 'Chat'):
        print(f"{contact} ตัวอย่าง:")
        subset = data[(data['label'] == label) & (data['contact'] == contact)]
        # DataFrame.sample raises ValueError when n exceeds the number of
        # rows, so cap the sample size; groups with 5+ rows are unaffected.
        n_examples = min(5, len(subset))
        examples = subset.sample(n=n_examples, random_state=42) if n_examples else subset
        for text in examples['output']:
            print("-", text)

In [None]:
# Each label: print up to five example sentences per label.

labels = data['label'].unique()

for label in labels:
    print(f"\n=== Label: {label} ===")

    outputs = data[data['label'] == label]['output']
    # Cap the sample size: Series.sample raises ValueError when n exceeds the
    # number of rows. Labels with 5+ rows produce the same sample as before.
    for text in outputs.sample(n=min(5, len(outputs)), random_state=42):
        print("-", text)

Overall

In [None]:
!pip install pythainlp

In [19]:
from pythainlp.tokenize import word_tokenize
import pandas as pd

In [None]:
# Example: tokenize five reproducibly sampled sentences and show the raw tokens.
sample_df = ds.sample(5, random_state=42)

for text in sample_df["output"]:
    print(word_tokenize(str(text), engine="newmm"))

In [None]:
# Same samples again, this time with whitespace-only tokens removed.
for text in sample_df["output"]:
    tokens = word_tokenize(str(text), engine="newmm")
    print([tok for tok in tokens if tok.strip()])

In [None]:
def count_words(text):
    """Return the number of non-whitespace tokens in *text*.

    The value is converted with ``str`` and tokenized with the ``newmm``
    engine. Whitespace-only tokens are not counted, matching the filtering
    applied in the other tokenization cells of this notebook; otherwise
    every space between phrases would be counted as a word.

    Args:
        text: The sentence to measure (any value; it is passed through ``str``).

    Returns:
        The count of tokens that contain at least one non-whitespace character.
    """
    words = word_tokenize(str(text), engine="newmm")
    return sum(1 for w in words if w.strip())

# Token count per generated sentence.
ds["thai_word_count"] = ds["output"].apply(count_words)

grouped = ds.groupby(["model", "label"])["thai_word_count"].mean()


def _pivot_word_counts(aggfunc):
    """Pivot ``thai_word_count`` into a model x label table using *aggfunc*."""
    return ds.pivot_table(
        values="thai_word_count",
        index="model",
        columns="label",
        aggfunc=aggfunc,
    )


# Mean table by model and label, plus one extra row averaging over all models.
mean_table = _pivot_word_counts("mean").round(2)
overall_avg = ds.groupby("label")["thai_word_count"].mean().round(2)
mean_table.loc["average_all_models"] = overall_avg

max_table = _pivot_word_counts("max")
min_table = _pivot_word_counts("min")

for heading, table in (
    ("Average:", mean_table),
    ("\nMax:", max_table),
    ("\nMin:", min_table),
):
    print(heading)
    print(table)

In [None]:
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def extract_english_words(text):
    """Return the lower-cased ASCII-letter words found in *text*.

    ``re.ASCII`` makes ``\\b`` treat only ``[a-zA-Z0-9_]`` as word characters.
    Without it Thai letters count as word characters too, so an English word
    written directly next to Thai text (no space in between) has no boundary
    around it and is silently skipped.

    Args:
        text: Any value; it is converted with ``str`` before matching.

    Returns:
        A list of lower-cased words in order of appearance. Runs of letters
        that touch ASCII digits or underscores are still excluded.
    """
    return [w.lower() for w in re.findall(r'\b[a-zA-Z]+\b', str(text), flags=re.ASCII)]

# Per-row list of English words found in the generated text.
ds["english_words"] = ds["output"].apply(extract_english_words)

model_names = ds["model"].unique()
model_counters = {}

for model in model_names:
    word_lists = ds[ds["model"] == model]["english_words"]
    # Count straight from the nested lists; sum(lists, []) copies the growing
    # list on every addition, which is quadratic in the number of rows.
    model_counters[model] = Counter(word for words in word_lists for word in words)

# The 20 most frequent words across all models combined.
all_words_combined = sum(model_counters.values(), Counter())
top_20_words = [word for word, _ in all_words_combined.most_common(20)]

# One row per (word, model) pair so seaborn can group the bars by model.
plot_data = [
    {
        "word": word.lower(),
        "model": model.lower(),
        "frequency": model_counters[model][word],
    }
    for word in top_20_words
    for model in model_names
]

plot_df = pd.DataFrame(plot_data)

plt.figure(figsize=(14, 6))
sns.barplot(data=plot_df, x="word", y="frequency", hue="model")
plt.title("word frequency comparison by model (lowercase)", fontsize=16)
plt.xlabel("word", fontsize=14)
plt.ylabel("frequency", fontsize=14)
plt.xticks(rotation=45)
plt.legend(title="model")
plt.tight_layout()
plt.show()


In [None]:
# Case-insensitive search for "abc" in the generated text; missing values
# count as no match. Every matching sentence is printed in full.
matches_abc = ds["output"].str.contains("abc", case=False, na=False)
contain = ds[matches_abc]

for text in contain["output"]:
    print(f"ข้อความเต็ม: {text}")


In [9]:
!wget -q https://github.com/Phonbopit/sarabun-webfont/raw/master/fonts/thsarabunnew-webfont.ttf

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib as mpl

# Path of the downloaded Thai font file.
font_path = "thsarabunnew-webfont.ttf"
fm.fontManager.addfont(font_path)  # register the font with matplotlib's font manager
plt.rcParams["font.family"] = "TH Sarabun New"
mpl.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly


In [None]:
from pythainlp.tokenize import word_tokenize
from collections import Counter
import seaborn as sns

labels = ds["label"].unique()

# For every label: pool all sentences, tokenize once, and plot the 20 most
# frequent non-whitespace tokens.
for label in labels:
    print(f"\n label: {label}")

    all_text = " ".join(ds.loc[ds["label"] == label, "output"].astype(str))
    tokens = [tok for tok in word_tokenize(all_text, engine="newmm") if tok.strip()]

    word_freq = Counter(tokens)
    top_words = word_freq.most_common(20)

    # Unzip the (word, count) pairs into parallel sequences for plotting.
    words, freqs = zip(*top_words)

    plt.figure(figsize=(10, 5))
    sns.barplot(x=list(freqs), y=list(words), palette="Blues_r")
    plt.title(f"คำที่พบบ่อยในระดับภาษา: {label}", fontsize=18)
    plt.xlabel("จำนวนครั้ง", fontsize=14)
    plt.ylabel("คำ", fontsize=14)
    plt.tight_layout()
    plt.show()
