# This notebook aims to compute the subsyllabic-level features from HKCanCor database

Subsyllabic-level features include:
- Initial surprisal
- Final surprisal
- Tone surprisal

Inspired by Kries et al. (2024).

The computation procedures in brief:
1. Use [pycantonese](https://github.com/jacksonllee/pycantonese) to parse HKCanCor sentences into jyutping
2. Set `onset` as the initial, merge `nucleus` and `coda` as the final.
3. Map initials and finals to their corresponding IPA representations.
4. After mapping, compute initial/final surprisal values.
5. The tone surprisal values can then be computed after step 4.

The two output dataframes are:
1. `hkcancor_initial_final_ipa_surprisal.xlsx` - surprisal values for initials and finals
2. `hkcancor_tone_surprisal.xlsx` - surprisal values for tones, conditioned on initials and finals

The `jyutping_ipa_mapping.xlsx` file is created manually based on the mapping provided in https://jyutping.org/blog/table/


## References
Kries, J., De Clercq, P., Gillis, M., Vanthornhout, J., Lemmens, R., Francart, T., & Vandermosten, M. (2024). Exploring neural tracking of acoustic and linguistic speech representations in individuals with post‐stroke aphasia. Human Brain Mapping, 45(8), e26676. https://doi.org/10.1002/hbm.26676


In [1]:
import pandas as pd
import numpy as np
import pycantonese
import os

def find_nearest_idx(array, value):
    """Find the index of the element of `array` that is closest to `value`.

    Parameters
    ----------
    array : array-like
        Numeric values to search. Any array-like is accepted (NumPy array,
        list, tuple, pandas Series); it is coerced with `np.asarray`.
    value : scalar
        Target value.

    Returns
    -------
    int
        Position of the nearest element. Ties resolve to the first
        occurrence, as `np.argmin` does. For multi-dimensional input the
        index refers to the flattened array.

    Raises
    ------
    ValueError
        If `array` is empty (raised by `np.argmin`).
    """
    # np.asarray lets plain Python sequences work; `list - scalar` would raise TypeError.
    return np.argmin(np.abs(np.asarray(array) - value))

In [2]:
# Build one row per syllable of the HKCanCor corpus with its initial, final
# (nucleus + coda), their IPA representations, and the tone.
df_jyutping_ipa_mapping = pd.read_excel("jyutping_ipa_mapping.xlsx", sheet_name="jyutping_ipa_mapping")

# Build the jyutping -> IPA lookup once instead of scanning the mapping frame
# for every syllable. keep="first" reproduces the first-match behaviour of
# `.loc[...].values[0]` if a jyutping string is listed more than once.
jyutping_to_ipa = (
    df_jyutping_ipa_mapping
    .drop_duplicates(subset="jyutping", keep="first")
    .set_index("jyutping")["ipa"]
    .to_dict()
)

hkcancor = pycantonese.hkcancor()
list_dict = []
all_words = hkcancor.words()
for idx_jyutping, this_syllables in enumerate(hkcancor.jyutpings()):
    # this_syllables example: "wai3" / "leoi5hang4"
    list_parse_jyutping = pycantonese.parse_jyutping(this_syllables)

    # An empty parse result (presumably tokens without a romanisation -- confirm
    # against the pycantonese docs) simply contributes no rows.
    for idx_parse_jyutping, this_jyutping in enumerate(list_parse_jyutping):
        this_initial = this_jyutping.onset
        this_final = this_jyutping.nucleus + this_jyutping.coda

        if this_initial == "":
            # Zero-initial syllable: there is no initial to map.
            this_initial_ipa = ""
            if this_final == "m":
                # Syllabic nasal, handled explicitly rather than via the table.
                # NOTE(review): syllabic "ng" probably needs the same treatment
                # (expected "ŋ̩") if the table lists "ng" as an initial first -- confirm.
                this_final_ipa = "m̩"
            else:
                this_final_ipa = jyutping_to_ipa[this_final]
        else:
            # A missing entry raises KeyError naming the unmapped jyutping string.
            this_initial_ipa = jyutping_to_ipa[this_initial]
            this_final_ipa = jyutping_to_ipa[this_final]

        # Pair the n-th syllable with the n-th character of the word; when the
        # word has fewer characters than syllables, fall back to a marker value.
        try:
            match_syllable = all_words[idx_jyutping][idx_parse_jyutping]
        except (IndexError, TypeError):
            match_syllable = "ERROR"

        list_dict.append({
            "syllable": match_syllable,
            "initial": this_initial,
            "final": this_final,
            "initial_ipa": this_initial_ipa,
            "final_ipa": this_final_ipa,
            "tone": int(this_jyutping.tone)
        })

df_hkcancor_ipa = pd.DataFrame(list_dict)
# len(df_hkcancor_ipa)

In [3]:
# Preview the first rows of the per-syllable table
df_hkcancor_ipa.head(n=5)

Unnamed: 0,syllable,initial,final,initial_ipa,final_ipa,tone
0,喂,w,ai,w,ɐi,3
1,遲,c,i,tsʰ,iː,4
2,啲,d,i,t,iː,1
3,去,h,eoi,h,ɵy,3
4,唔,,m,,m̩,4


In [4]:
# Counts of every initial (including the aggregate "" row for zero-initial syllables).
# The column names are set explicitly so they do not depend on the pandas version.
df_initial_ipa_counts = (
    df_hkcancor_ipa["initial_ipa"]
    .value_counts()
    .rename_axis("initial_ipa")
    .reset_index(name="count")
)
# For zero-initial syllables the final is the first phoneme, so count those finals separately.
df_zero_initial_ipa_counts = (
    df_hkcancor_ipa.loc[df_hkcancor_ipa["initial_ipa"] == "", "final_ipa"]
    .value_counts()
    .rename_axis("final_ipa")
    .reset_index(name="count")
)

df_first_phoneme_counts = pd.concat([df_initial_ipa_counts, df_zero_initial_ipa_counts], axis=0).reset_index(drop=True)
df_first_phoneme_counts = df_first_phoneme_counts[["initial_ipa", "final_ipa", "count"]]
# Remove the aggregate "" row: its mass is already represented by the per-final
# zero-initial rows (whose initial_ipa is NaN, so they are kept by this filter).
# Filtering also works when the corpus has no zero-initial syllable at all.
df_first_phoneme_counts = df_first_phoneme_counts.loc[df_first_phoneme_counts["initial_ipa"] != ""].copy()
df_first_phoneme_counts["prob"] = df_first_phoneme_counts["count"] / df_first_phoneme_counts["count"].sum()
# Surprisal in nats (natural logarithm).
df_first_phoneme_counts["surprisal"] = -np.log(df_first_phoneme_counts["prob"])

# Look at the surprisal of possible initial phoneme
df_first_phoneme_counts.head()

Unnamed: 0,initial_ipa,final_ipa,count,prob,surprisal
0,k,,22347,0.138564,1.976425
1,h,,19075,0.118276,2.134739
2,j,,16138,0.100064,2.30194
3,t,,14706,0.091185,2.394862
4,ts,,14234,0.088259,2.427484


In [5]:
# Surprisal of each final given the preceding initial, with add-one smoothing
# over every final attested anywhere in the corpus.
list_initial_ipa = df_initial_ipa_counts["initial_ipa"].unique()
# The inventory of finals is the same for every initial, so compute it once.
list_final_ipa = df_hkcancor_ipa["final_ipa"].unique()

list_df_next_final_ipa_all = []
for this_initial_ipa in list_initial_ipa:
    if this_initial_ipa == "":
        # Zero-initial syllables are covered by the first-phoneme table.
        continue

    # Observed counts of finals following this initial; column names set
    # explicitly so they do not depend on the pandas version.
    df_temp = (
        df_hkcancor_ipa.loc[df_hkcancor_ipa["initial_ipa"] == this_initial_ipa, "final_ipa"]
        .value_counts()
        .rename_axis("final_ipa")
        .reset_index(name="count")
    )
    # Left-merge onto the full inventory so unobserved finals get a count of 0 ...
    df_this_initial = pd.merge(
        pd.DataFrame({"final_ipa": list_final_ipa}), df_temp, on=["final_ipa"], how="left"
    ).fillna(0)
    # ... and add one to every count so no final has zero probability.
    df_this_initial["count"] += 1
    df_this_initial["prob"] = df_this_initial["count"] / df_this_initial["count"].sum()
    df_this_initial["surprisal"] = -np.log(df_this_initial["prob"])
    df_this_initial.insert(loc=0, column="initial_ipa", value=this_initial_ipa)
    list_df_next_final_ipa_all.append(df_this_initial)

df_next_final_ipa_all = pd.concat(list_df_next_final_ipa_all)

In [6]:
# Stack the first-phoneme table on top of the final-given-initial table.
df_initial_final_surprisal = pd.concat((df_first_phoneme_counts, df_next_final_ipa_all))

# Output (create the target directory first so a fresh checkout does not fail)
output_dir = os.path.join("..", "output")
os.makedirs(output_dir, exist_ok=True)
df_initial_final_surprisal.to_excel(os.path.join(output_dir, "hkcancor_initial_final_ipa_surprisal.xlsx"), index=False)

# Last expression of the cell so the preview is actually displayed
df_initial_final_surprisal.head()

The tone surprisal dataframe can be computed after the initial-final surprisal dataframe is computed.

In [7]:
# Surprisal of each tone given the (initial, final) pair, with add-one
# smoothing over the six tones.
list_tone_surprisal = []
for idx, row in df_initial_final_surprisal.iterrows():
    # There are three kinds of rows in df_initial_final_surprisal:
    # 1. initial_ipa is NaN, final_ipa = xxx  -> zero-initial syllable (no initial consonant)
    # 2. initial_ipa = xxx, final_ipa = yyy  -> syllable with an initial consonant and a final
    # 3. initial_ipa = xxx, final_ipa is NaN -> surprisal of the initial on its own
    # Rows of the third kind have no final to condition on, so they are skipped.

    # Use `not`, not `~`: pd.isna returns a Python bool for scalars, and `~True`
    # is -2 (truthy), so bitwise inversion does not negate the flag.
    has_initial = not pd.isna(row["initial_ipa"])
    has_final = not pd.isna(row["final_ipa"])

    if not has_final:
        continue

    # Zero-initial syllables are stored with an empty-string initial in df_hkcancor_ipa.
    this_initial_ipa = row["initial_ipa"] if has_initial else ""
    this_final_ipa = row["final_ipa"]
    df_hkcancor_ipa_subset = df_hkcancor_ipa.loc[
        (df_hkcancor_ipa["initial_ipa"] == this_initial_ipa)
        & (df_hkcancor_ipa["final_ipa"] == this_final_ipa),
        :,
    ]

    # One row per tone so that unobserved tones are still represented.
    df_temp_placeholder = pd.DataFrame({"initial_ipa": this_initial_ipa,
                "final_ipa": this_final_ipa,
                "tone": [1, 2, 3, 4, 5, 6]})
    # Column names set explicitly so they do not depend on the pandas version.
    df_temp_tone = (
        df_hkcancor_ipa_subset["tone"]
        .value_counts()
        .rename_axis("tone")
        .reset_index(name="count")
    )

    df_this_tone_surprisal = pd.merge(df_temp_placeholder, df_temp_tone,  on=["tone"], how="left").fillna(0)
    # Add-one smoothing: every tone gets a non-zero probability.
    df_this_tone_surprisal["count"] += 1
    df_this_tone_surprisal["prob"] = df_this_tone_surprisal["count"] / df_this_tone_surprisal["count"].sum()
    df_this_tone_surprisal["surprisal"] = -np.log(df_this_tone_surprisal["prob"])
    list_tone_surprisal.append(df_this_tone_surprisal)

In [8]:
# Stack the per-(initial, final) tone tables into one dataframe.
df_tone_surprisal = pd.concat(list_tone_surprisal)

# Output (create the target directory first so a fresh checkout does not fail)
output_dir = os.path.join("..", "output")
os.makedirs(output_dir, exist_ok=True)
df_tone_surprisal.to_excel(os.path.join(output_dir, "hkcancor_tone_surprisal.xlsx"), index=False)

# Last expression of the cell so the preview is actually displayed
df_tone_surprisal.head()