In [13]:
from arc.core.syllable import syllable_from_phonemes
from arc.io import *


def read_syllables_dict_deu_special(check_phonemes: bool = True):
    """Read the special German syllable corpus into a dict keyed by IPA syllable.

    The tab-separated file at ``SYLLABLES_DEFAULT_PATH_DEU_SPECIAL`` is parsed
    row by row; the first row is skipped (presumably a header). For each
    remaining row, column 1 is converted from X-SAMPA to IPA, column 2 is
    parsed as an integer frequency and column 3 as a float probability.

    Args:
        check_phonemes: Must be falsy. Phoneme checking is not implemented for
            this corpus, so a truthy value is rejected up front.

    Returns:
        A dict mapping each IPA syllable string to a ``Syllable`` whose info is
        ``{"freq": ..., "prob": ...}`` and whose phoneme and feature lists are
        empty. If a syllable occurs in several rows, the last row wins.

    Raises:
        AssertionError: If ``check_phonemes`` is truthy.
    """
    if check_phonemes:
        # Raised explicitly instead of via `assert` so the guard is not
        # stripped under `python -O`; exception type and message are the same
        # as before.
        raise AssertionError("check phonemes not supported yet for german corpus")

    syllables_dict: Dict[str, Syllable] = {}

    with open(SYLLABLES_DEFAULT_PATH_DEU_SPECIAL, "r", encoding='utf-8') as csv_file:
        rows = csv.reader(csv_file, delimiter='\t')
        next(rows, None)  # skip the first row without failing on an empty file

        for syll_stats in rows:
            if not syll_stats:
                # csv.reader yields an empty list for blank lines
                continue

            syll_ipa = phonecodes.xsampa2ipa(syll_stats[1], "deu")
            info = {"freq": int(syll_stats[2]), "prob": float(syll_stats[3])}

            previous = syllables_dict.get(syll_ipa)
            if previous is not None and previous.info != info:
                logger.warning(
                    f"Syllable '{syll_ipa}' with conflicting stats. Keeping new stats."
                )

            syllables_dict[syll_ipa] = Syllable(
                id=syll_ipa,
                phonemes=[],
                info=info,
                binary_features=[],
                phonotactic_features=[],
            )

    return syllables_dict


def read_syllables_dict_celex(lang: str, check_phonemes: bool = True):
    """Read a CELEX syllable file into a dict keyed by IPA syllable.

    The backslash-delimited file for ``lang`` is loaded and its first row is
    skipped (presumably a header). For each remaining row, column 0 holds the
    syllable in DISC notation and the last column holds its integer frequency.

    Args:
        lang: Corpus language, ``"deu"`` or ``"eng"``.
        check_phonemes: If true, each syllable is built from its individual
            phonemes via ``syllable_from_phonemes`` and syllables for which
            that raises ``KeyError`` are skipped. If false, a bare ``Syllable``
            carrying only the id and frequency info is created.

    Returns:
        A dict mapping each IPA syllable string to its ``Syllable``. If a
        syllable occurs in several rows, the last accepted row wins.

    Raises:
        ValueError: If ``lang`` is not one of the supported languages.
    """
    # Validate before touching the path constants so an unsupported language
    # fails with a clear message instead of `open(None)` raising a TypeError.
    if lang not in ("deu", "eng"):
        raise ValueError(f"Language {lang} not supported.")
    path = SYLLABLES_DEFAULT_PATH_DEU if lang == "deu" else SYLLABLES_DEFAULT_PATH_ENG

    with open(path, "r", encoding='utf-8') as csv_file:
        fdata = list(csv.reader(csv_file, delimiter='\\'))

    syllables_dict: Dict[str, Syllable] = {}
    # The phoneme feature table is only needed when phonemes are checked.
    feature_phonemes = load_default_phonemes() if check_phonemes else None

    for syll_stats in fdata[1:]:
        if not syll_stats:
            # csv.reader yields an empty list for blank lines
            continue

        syll_ipa = phonecodes.disc2ipa(syll_stats[0], lang)
        info = {"freq": int(syll_stats[-1])}

        previous = syllables_dict.get(syll_ipa)
        if previous is not None and previous.info != info:
            logger.info(
                f"Syllable '{syll_ipa}' with conflicting stats. Keeping new stats."
            )

        if check_phonemes:
            # DISC uses one character per phoneme, so the raw DISC string can
            # be split character by character. Each phoneme is converted with
            # the same language as the syllable id above so both agree.
            try:
                phonemes = [phonecodes.disc2ipa(phon, lang) for phon in syll_stats[0]]
                # NOTE(review): `info` (the frequency) is not attached to
                # syllables built this way - confirm whether that is intended.
                syllable_object = syllable_from_phonemes(feature_phonemes, phoneme_combination=phonemes)
            except KeyError:
                # Unknown phoneme: leave this syllable out of the result.
                continue
        else:
            syllable_object = Syllable(id=syll_ipa, phonemes=[], info=info, binary_features=[], phonotactic_features=[])

        syllables_dict[syll_ipa] = syllable_object

    return syllables_dict


def read_syllables_corpus(
        lang: Literal["deu", "nld", "eng"] = "deu",
        check_phonemes: bool = True,
) -> Register[str, Syllable]:
    """Load the syllable corpus for ``lang`` and wrap it in a ``Register``.

    ``"eng"`` is read through ``read_syllables_dict_celex`` and ``"deu"``
    through ``read_syllables_dict_deu_special``; ``check_phonemes`` is passed
    on unchanged to whichever reader is used.

    Args:
        lang: Language code selecting the corpus reader.
        check_phonemes: Forwarded to the selected reader.

    Returns:
        A ``Register`` built from the reader's syllable dict.

    Raises:
        ValueError: If ``lang`` is neither ``"eng"`` nor ``"deu"``.
    """
    logger.info("READ SYLLABLES, FREQUENCIES AND PROBABILITIES FROM CORPUS AND CONVERT SYLLABLES TO IPA")

    if lang == "eng":
        # for german maybe: .intersection(read_syllables_dict_deu_special())
        return Register(read_syllables_dict_celex(lang=lang, check_phonemes=check_phonemes))

    if lang == "deu":
        return Register(read_syllables_dict_deu_special(check_phonemes=check_phonemes))

    raise ValueError(f"Language {lang} not supported.")

# Load the English syllable corpus with phoneme checking enabled; the
# resulting Register is what the notebook displays as this cell's output.
read_syllables_corpus(lang="eng", check_phonemes=True)

Register([('ɑː',
           Syllable(id='ɑː', info={'binary_features': [1, 0, 1, 0, 1, 1], 'phonotactic_features': [[]]}, phonemes=[Phoneme(id='ɑː', info={'features': ['+', '+', '-', '+', '0', '-', '-', '0', '+', '-', '-', '-', '-', '-', '-', '-', '+', '+', '-', '+', '+']})])),
          ('ɑːɲ',
           Syllable(id='ɑːɲ', info={'binary_features': [1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1], 'phonotactic_features': [[], ['son', 'oth']]}, phonemes=[Phoneme(id='ɑː', info={'features': ['+', '+', '-', '+', '0', '-', '-', '0', '+', '-', '-', '-', '-', '-', '-', '-', '+', '+', '-', '+', '+']}), Phoneme(id='ɲ', info={'features': ['-', '+', '+', '-', '-', '-', '+', '-', '+', '-', '-', '+', '-', '0', '-', '+', '-', '-', '-', '0', '-']})])),
          ('ɑːɲt',
           Syllable(id='ɑːɲt', info={'binary_features': [1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0], 'phonotactic_features': [[], ['son', 'oth'], ['plo', 'den']]}, phonemes=[Phoneme(id='ɑː', info={'features