In [1]:
from functools import lru_cache
from itertools import combinations

import pandas as pd
from tqdm.auto import tqdm

In [2]:
@lru_cache(maxsize=None)
def get_prefix(v, s):
    """Return the leading segment of ``s`` as a one-element tuple.

    The segment consists of every item of ``s`` before index ``v[0]``,
    joined with "-". Both arguments must be hashable (e.g. tuples) because
    the result is memoized with ``lru_cache``.
    """
    first_cut = v[0]
    head = "-".join(s[:first_cut])
    return (head,)


@lru_cache(maxsize=None)
def get_suffix(v, s):
    """Return the trailing segment of ``s`` as a one-element tuple.

    The segment consists of every item of ``s`` from index ``v[-1]`` onward,
    joined with "-". Both arguments must be hashable (e.g. tuples) because
    the result is memoized with ``lru_cache``.
    """
    last_cut = v[-1]
    tail = "-".join(s[last_cut:])
    return (tail,)


@lru_cache(maxsize=None)
def get_infix(v, s, j):
    """Return the ``j``-th inner segment of ``s`` as a "-"-joined string.

    The segment spans the items of ``s`` from index ``v[j]`` up to, but not
    including, index ``v[j+1]``. Unlike ``get_prefix``/``get_suffix`` this
    returns a bare string, not a tuple.
    """
    start = v[j]
    stop = v[j + 1]
    return "-".join(s[start:stop])


@lru_cache(maxsize=None)
def map_intervals_to_syllable_segments(vs, s):
    """Turn each tuple of cut positions in ``vs`` into a segmentation of ``s``.

    For every ``v`` in ``vs`` the result holds one tuple of "-"-joined
    strings: the prefix before ``v[0]``, one infix per consecutive pair of
    cut positions, and the suffix from ``v[-1]`` onward.

    Returns a list with one segmentation per element of ``vs``, in order.
    The list object is memoized by ``lru_cache``, so callers should treat it
    as read-only.
    """
    segmentations = []
    for cuts in vs:
        head = get_prefix(cuts, s)
        middle = tuple(get_infix(cuts, s, k) for k in range(len(cuts) - 1))
        tail = get_suffix(cuts, s)
        segmentations.append(head + middle + tail)
    return segmentations


@lru_cache(maxsize=None)
def get_top1_script_and_logprob(hyphenated_syllable_str):
    """Look up the highest-logprob entry for an exact transcript.

    Reads the module-level ``zh_Hant_df`` and keeps the rows whose
    ``transcript`` equals ``hyphenated_syllable_str``.

    Returns a ``(script, logprob)`` pair taken from the matching row with
    the largest ``logprob``, or ``("<unk>", -inf)`` when nothing matches.
    Results are memoized, so later changes to ``zh_Hant_df`` are not seen
    for transcripts that were already queried.
    """
    matches = zh_Hant_df[zh_Hant_df["transcript"] == hyphenated_syllable_str]
    if matches.size != 0:
        # idxmax yields the index label of the best-scoring row.
        best_label = matches.logprob.idxmax()
        best_row = matches.loc[best_label]
        return best_row.script, best_row.logprob
    return "<unk>", -float("inf")


@lru_cache(maxsize=None)
def get_argmax_script_seg_and_logprob_sum(hyphenated_syllable_str_tpl):
    """Score a segmentation by its per-segment top-1 lookups.

    Each element of ``hyphenated_syllable_str_tpl`` is passed to
    ``get_top1_script_and_logprob``.

    Returns ``(script, logprob_sum)``: the per-segment scripts concatenated
    in order and the sum of the per-segment logprobs. A segment with no
    match contributes ``"<unk>"`` and ``-inf``.

    Raises ``IndexError`` when the input tuple is empty.
    """
    lookups = [
        get_top1_script_and_logprob(segment)
        for segment in hyphenated_syllable_str_tpl
    ]
    # Transpose [(script, logprob), ...] into [scripts, logprobs].
    columns = list(zip(*lookups))
    scripts = columns[0]
    logprobs = columns[1]
    return "".join(scripts), sum(logprobs)


@lru_cache(maxsize=None)
def get_best_alt_t_seg(hyphenated_syllable):
    """Find the best-scoring way to split a transcript into shorter entries.

    The "-"-separated transcript is cut at every non-empty subset of its
    inner boundaries, i.e. every split into two or more contiguous segments.
    Each split is scored with ``get_argmax_script_seg_and_logprob_sum``;
    splits scoring ``-inf`` (some segment has no entry) are discarded.

    Returns a tuple ``(segments, script, logprob_sum)`` for the
    highest-scoring split; on ties the first one enumerated wins. When no
    split is usable (including single-syllable input) the result is
    ``((None,), "<unk>", -inf)``.

    The result is always an immutable tuple: it is shared through
    ``lru_cache``, so a mutable return value could be corrupted by a caller.
    """
    s_tpl = tuple(hyphenated_syllable.split("-"))
    max_len = len(s_tpl)
    neg_inf = -float("inf")

    best = None
    # i is the number of cut points; range(1, max_len) are the inner boundaries.
    for i in range(1, max_len):
        intervals = tuple(combinations(range(1, max_len), i))
        for alt_t_seg in map_intervals_to_syllable_segments(intervals, s_tpl):
            alt_s, alt_l = get_argmax_script_seg_and_logprob_sum(alt_t_seg)
            if alt_l <= neg_inf:
                continue
            # Strict ">" keeps the earliest candidate among equal scores.
            if best is None or alt_l > best[2]:
                best = (alt_t_seg, alt_s, alt_l)

    if best is None:
        return ((None,), "<unk>", neg_inf)
    return best
    

In [3]:
# Load the space-separated table as raw strings; columns are named
# transcript, script and logprob.
# quoting=3 is csv.QUOTE_NONE, so quote characters in the data are literal.
# NOTE(review): the first 400 lines are skipped - presumably a preamble;
# confirm against the data file.
df = pd.read_table(
    "../data.txt",
    sep=" ",
    names=("transcript", "script", "logprob"),
    header=None,
    skiprows=400,
    quoting=3,
    dtype=str,
)
# A transcript is "-"-separated, so its piece count is the syllable count.
df = df.assign(syllable_count=df["transcript"].str.split("-").apply(len))

# Drop rows whose raw logprob string is exactly "-8", then parse the rest as
# numbers. NOTE(review): "-8" looks like a sentinel score - confirm meaning.
zh_Hant_df = df.loc[df["logprob"] != "-8"].assign(
    logprob=lambda frame: pd.to_numeric(frame["logprob"])
)

# Summaries of the numeric columns, then of the string (object) columns.
display(zh_Hant_df.describe())
display(zh_Hant_df.describe(include=["O"]))

Unnamed: 0,logprob,syllable_count
count,152150.0,152150.0
mean,-6.52143,2.547217
std,1.056862,0.977678
min,-99.0,1.0
25%,-7.275731,2.0
50%,-6.641368,2.0
75%,-5.942398,3.0
max,-1.619924,6.0


Unnamed: 0,transcript,script
count,152150,152150
unique,126666,147327
top,ㄧˋ,一個個
freq,132,6


In [4]:
def verify(syl_cnt):
    """Collect ``syl_cnt``-syllable entries that are out-scored by a split.

    From the module-level ``zh_Hant_df`` this keeps, for every distinct
    transcript with ``syllable_count == syl_cnt``, the row with the highest
    ``logprob``. Each such row is compared with the result of
    ``get_best_alt_t_seg`` for its transcript.

    Returns a list of tuples
    ``(transcript, script, logprob, alt_segments, alt_script, alt_logprob)``
    for rows where the alternative yields a different script and a strictly
    higher logprob.
    """
    # Sorting by descending logprob before drop_duplicates keeps the
    # best-scoring row per transcript.
    top1_per_transcript = (
        zh_Hant_df[zh_Hant_df.syllable_count == syl_cnt]
        .sort_values("logprob", ascending=False)
        .drop_duplicates("transcript")
    )

    shadowed_records = []
    rows = top1_per_transcript.itertuples()
    for row in tqdm(rows, total=len(top1_per_transcript.index)):
        alt_seg, alt_script, alt_logprob = get_best_alt_t_seg(row.transcript)
        if alt_script == row.script or not (alt_logprob > row.logprob):
            continue
        shadowed_records.append(
            (row.transcript, row.script, row.logprob, alt_seg, alt_script, alt_logprob)
        )
    return shadowed_records


def display_helper(syl_cnt, shadowed_records, top_n=5):
    display(
        pd.DataFrame(
            list(
                sorted(
                    shadowed_records, key=lambda x: x[2], reverse=True
                )
            )[:top_n],
            columns=[
                "transcript",
                "script",
                "logprb",
                "alt_transcript_seg",
                "alt_script",
                "alt_logprob",
            ]
        ).style.set_caption(
            f"{len(shadowed_records):_} shadowed {syl_cnt}-syllable scripts by their parts; top-{top_n}:"
        )
    )

In [5]:
# 6-syllable entries: collect those out-scored by a split into shorter
# entries, then show the top 5 (display_helper's default top_n).
syl_cnt = 6
shadowed_records_6 = verify(syl_cnt)
display_helper(syl_cnt, shadowed_records_6)

  0%|          | 0/1118 [00:00<?, ?it/s]

Unnamed: 0,transcript,script,logprb,alt_transcript_seg,alt_script,alt_logprob


In [6]:
# 5-syllable entries: collect those out-scored by a split into shorter
# entries, then show the top 5 (display_helper's default top_n).
syl_cnt = 5
shadowed_records_5 = verify(syl_cnt)
display_helper(syl_cnt, shadowed_records_5)

  0%|          | 0/2072 [00:00<?, ?it/s]

Unnamed: 0,transcript,script,logprb,alt_transcript_seg,alt_script,alt_logprob


In [7]:
# 4-syllable entries: collect those out-scored by a split into shorter
# entries, then show the top 5 (display_helper's default top_n).
syl_cnt = 4
shadowed_records_4 = verify(syl_cnt)
display_helper(syl_cnt, shadowed_records_4)

  0%|          | 0/26118 [00:00<?, ?it/s]

Unnamed: 0,transcript,script,logprb,alt_transcript_seg,alt_script,alt_logprob
0,ㄧ-ㄒㄧㄠˇ-ㄕˊ-ㄏㄡˋ,一小時後,-6.673671,"('ㄧ', 'ㄒㄧㄠˇ-ㄕˊ-ㄏㄡˋ')",一小時候,-6.540373
1,ㄧㄢˊ-ㄐㄧㄡˋ-ㄙㄨㄛˇ-ㄌㄧˇ,研究所裡,-6.974701,"('ㄧㄢˊ-ㄐㄧㄡˋ-ㄙㄨㄛˇ', 'ㄌㄧˇ')",研究所理,-6.958653
2,ㄊㄞˊ-ㄅㄟˇ-ㄕˋ-ㄕㄤ,台北士商,-6.974701,"('ㄊㄞˊ-ㄅㄟˇ-ㄕˋ', 'ㄕㄤ')",台北市商,-6.869624
3,ㄅㄨˋ-ㄅㄨˋ-ㄒㄧㄠˇ-ㄒㄧㄣ,步步小心,-7.275731,"('ㄅㄨˋ', 'ㄅㄨˋ-ㄒㄧㄠˇ-ㄒㄧㄣ')",不不小心,-7.033613
4,ㄐㄧㄠˇ-ㄊㄚˋ-ㄔㄜ-ㄉㄠˋ,腳踏車道,-7.275731,"('ㄐㄧㄠˇ-ㄊㄚˋ-ㄔㄜ', 'ㄉㄠˋ')",腳踏車到,-7.265441


In [8]:
# 3-syllable entries: collect those out-scored by a split into shorter
# entries, then show the top 5 (display_helper's default top_n).
syl_cnt = 3
shadowed_records_3 = verify(syl_cnt)
display_helper(syl_cnt, shadowed_records_3)

  0%|          | 0/32890 [00:00<?, ?it/s]

Unnamed: 0,transcript,script,logprb,alt_transcript_seg,alt_script,alt_logprob
0,ㄅㄨˋ-ㄈㄣˋ-ㄉㄜ˙,部份的,-5.261156,"('ㄅㄨˋ-ㄈㄣˋ', 'ㄉㄜ˙')",部分的,-5.074531
1,ㄗㄞˋ-ㄇㄟˊ-ㄧㄡˇ,再沒有,-5.475036,"('ㄗㄞˋ', 'ㄇㄟˊ-ㄧㄡˇ')",在沒有,-5.069755
2,ㄓㄨㄥ-ㄍㄨㄛˊ-ㄕˋ,中國式,-5.627127,"('ㄓㄨㄥ-ㄍㄨㄛˊ', 'ㄕˋ')",中國是,-5.383699
3,ㄔㄤˊ-ㄔㄤˊ-ㄉㄜ˙,長長的,-5.738278,"('ㄔㄤˊ-ㄔㄤˊ', 'ㄉㄜ˙')",常常的,-5.623324
4,ㄧ-ㄐㄧㄡˋ-ㄕˋ,依舊是,-5.776066,"('ㄧ', 'ㄐㄧㄡˋ-ㄕˋ')",一就是,-5.221533


In [9]:
# 2-syllable entries: collect those out-scored by a split into single
# syllables, then show the top 5 (display_helper's default top_n).
syl_cnt = 2
shadowed_records_2 = verify(syl_cnt)
display_helper(syl_cnt, shadowed_records_2)

  0%|          | 0/63123 [00:00<?, ?it/s]

Unnamed: 0,transcript,script,logprb,alt_transcript_seg,alt_script,alt_logprob
0,ㄧ-ㄗㄞˋ,一再,-4.576535,"('ㄧ', 'ㄗㄞˋ')",一在,-4.316461
1,ㄕˋ-ㄕˋ,試試,-4.589256,"('ㄕˋ', 'ㄕˋ')",是是,-4.07838
2,ㄓㄨㄥ-ㄕˋ,中市,-4.614894,"('ㄓㄨㄥ', 'ㄕˋ')",中是,-4.539011
3,ㄍㄜˋ-ㄕˋ,各式,-4.676816,"('ㄍㄜˋ', 'ㄕˋ')",個是,-4.441402
4,ㄓㄨㄥ-ㄅㄨˋ,中部,-4.786606,"('ㄓㄨㄥ', 'ㄅㄨˋ')",中不,-4.677396
