In [38]:
import os
import gc
import json
import torch
import unicodedata
import pandas as pd
import soundfile as sf
import re

from transformers import pipeline
from langdetect import detect
from pyannote.audio import Pipeline

from pythainlp.transliterate import pronunciate
from g2p_en import G2p
from g2pk import G2p as G2pKO

In [39]:
# !pip install langdetect pythainlp epitran g2p_en g2pk

In [40]:
SPECIAL = ["<pad>", "<bos>", "<eos>", "<sil>", "<space>"]


In [41]:
import nltk

nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [42]:
def split_by_language(token):
    if re.search(r"[ก-๙]", token):
        return "th"
    if re.search(r"[a-z]", token):
        return "en"
    return "unknown"

def thai_g2p(text):
    ipa = pronunciate(text, engine="ipa")
    ipa = ipa.replace(".", " ").split()

    out = []
    for p in ipa:
        if "́" in p:
            tone = "T3"
        elif "̀" in p:
            tone = "T1"
        else:
            tone = "T0"

        base = p.replace("́","").replace("̀","")
        out.append(f"TH_{base}_{tone}")
    return out

g2p_en = G2p()
def english_g2p(text):
    out = []
    for p in g2p_en(text):
        if p == " ":
            continue
        if p[-1].isdigit():
            stress = p[-1]
            base = p[:-1]
            tone = "T3" if stress == "1" else "T1" if stress == "2" else "T0"
        else:
            base = p
            tone = "T0"
        out.append(f"EN_{base}_{tone}")
    return out

g2p_ko = G2pKO()
def korean_g2p(text):
    norm = g2p_ko(text)
    jamos = [c for c in norm if "HANGUL" in unicodedata.name(c)]

    out = []
    for i, j in enumerate(jamos):
        tone = "T3" if i == 0 else "T2" if i == len(jamos)-1 else "T0"
        out.append(f"KO_{j}_{tone}")
    return out

def text_to_phonemes_with_silence(seq):
    phonemes = ["<bos>"]

    for item in seq.split():
        if item == "<sil>":
            phonemes.append("<sil>")
            continue

        lang = split_by_language(item)
        if lang == "th":
            phonemes.extend(thai_g2p(item))
        elif lang == "en":
            phonemes.extend(english_g2p(item))
        elif lang == "ko":
            phonemes.extend(korean_g2p(item))

        phonemes.append("<space>")

    if phonemes[-1] == "<space>":
        phonemes.pop()

    phonemes.append("<eos>")
    return phonemes

In [43]:
df = pd.read_csv("transcription_results.csv")
df.head()

Unnamed: 0,file_path,whisper_text,pathumma_text,speaker
0,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0000.wav,คือเรื่องที่เราพูดกันวันเนี้ยมันมาจากใจแล้วหนู...,คือเรื่องที่เราพูดกันวันนี้มันมาจากใจแล้วหนูไม...,Lisa
1,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0001.wav,มันก็คือละไฟหอดRoller Coaster Rideเอาจริงๆ ตอน...,มันก็คือระไฟเหาะเนอะรอเลอคอสเตอร์ไหว้เอาจริงจร...,Lisa
2,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0002.wav,เครียด,เครียด,Lisa
3,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0003.wav,เหมือนมันมองไม่เห็นอนาคต giving me any prior a...,เหมือนมันมองไม่เห็นอนาคตอันใกล้ แค่แบบเดือนหน้...,Lisa
4,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0004.wav,เราทำเต็มที่แล้ว เราภูมิใจในของเราอืมแต่มันก็จ...,เราทำเต็มที่แล้วเราภูมิใจในของเราแต่มันก็จะมีบ...,Lisa


In [44]:
def normalize_text(t):
    t = t.lower()
    t = re.sub(r"\[.*?\]", "", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()

In [45]:
whisper_text = df['whisper_text'].dropna().astype(str).to_list()
pathumma_text = df['pathumma_text'].dropna().astype(str).to_list()

all_texts = whisper_text + pathumma_text
all_texts = [normalize_text(t) for t in all_texts if t.strip()]


In [46]:
phoneme_set = set()

for text in all_texts:
    phs = text_to_phonemes_with_silence(text)
    phoneme_set.update(phs)

phoneme_list = SPECIAL + sorted(phoneme_set - set(SPECIAL))
phoneme2id = {p:i for i,p in enumerate(phoneme_list)}


In [47]:
json.dump(phoneme2id, open("phoneme_vocab.json","w"))


In [48]:
def get_phonemes_output(text):
    if not isinstance(text, str) or text.strip() == "":
        return None

    text = normalize_text(text)
    phonemes = text_to_phonemes_with_silence(text)
    return [phoneme2id[p] for p in phonemes]



In [49]:
df = df.reset_index(drop=True)

In [50]:
whisper_series = df['whisper_text'].apply(get_phonemes_output)
pathumma_series = df['pathumma_text'].apply(get_phonemes_output)

whisper_series.name = "whisper_phonemes"
pathumma_series.name = "pathumma_phonemes"


In [51]:
final_df = pd.concat(
    [df, whisper_series, pathumma_series],
    axis=1
)

final_df.to_csv("transcription_phonemes.csv", index=False)

In [52]:
final_df.head()

Unnamed: 0,file_path,whisper_text,pathumma_text,speaker,whisper_phonemes,pathumma_phonemes
0,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0000.wav,คือเรื่องที่เราพูดกันวันเนี้ยมันมาจากใจแล้วหนู...,คือเรื่องที่เราพูดกันวันนี้มันมาจากใจแล้วหนูไม...,Lisa,"[1, 628, 4, 30, 16, 61, 66, 36, 4, 390, 4, 52,...","[1, 627, 4, 55, 71, 4, 30, 16, 61, 66, 36, 4, ..."
1,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0001.wav,มันก็คือละไฟหอดRoller Coaster Rideเอาจริงๆ ตอน...,มันก็คือระไฟเหาะเนอะรอเลอคอสเตอร์ไหว้เอาจริงจร...,Lisa,"[1, 2427, 4, 51, 58, 64, 66, 36, 4, 2918, 4, 1...","[1, 2408, 2]"
2,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0002.wav,เครียด,เครียด,Lisa,"[1, 3946, 2]","[1, 3946, 2]"
3,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0003.wav,เหมือนมันมองไม่เห็นอนาคต giving me any prior a...,เหมือนมันมองไม่เห็นอนาคตอันใกล้ แค่แบบเดือนหน้...,Lisa,"[1, 4513, 4, 42, 46, 72, 44, 54, 4, 53, 49, 4,...","[1, 4508, 4, 4615, 4, 4381, 4, 4335, 4, 4721, ..."
4,Lisa/dataset/5QmaVImK37w/SPEAKER_06_sent_0004.wav,เราทำเต็มที่แล้ว เราภูมิใจในของเราอืมแต่มันก็จ...,เราทำเต็มที่แล้วเราภูมิใจในของเราแต่มันก็จะมีบ...,Lisa,"[1, 4331, 4, 4340, 2]","[1, 3976, 2]"


In [59]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2590 entries, 0 to 2589
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   file_path          2590 non-null   object
 1   whisper_text       2558 non-null   object
 2   pathumma_text      2590 non-null   object
 3   speaker            2590 non-null   object
 4   whisper_phonemes   2558 non-null   object
 5   pathumma_phonemes  2590 non-null   object
dtypes: object(6)
memory usage: 121.5+ KB


In [60]:
import json
import pandas as pd

def build_manifest(df, phoneme_col):
    manifest = []

    for _, row in df.iterrows():
        phonemes = row[phoneme_col]

        if not isinstance(phonemes, list):
            continue  # ข้าม NaN / None

        item = {
            "audio": row["file_path"],
            "phonemes": phonemes,
            "speaker": row["speaker"]
        }
        manifest.append(item)

    return manifest


In [61]:
whisper_manifest = build_manifest(final_df, "whisper_phonemes")
pathumma_manifest = build_manifest(final_df, "pathumma_phonemes")


In [62]:
with open("manifest_whisper.json", "w", encoding="utf-8") as f:
    json.dump(whisper_manifest, f, ensure_ascii=False, indent=2)

with open("manifest_pathumma.json", "w", encoding="utf-8") as f:
    json.dump(pathumma_manifest, f, ensure_ascii=False, indent=2)
