In [72]:
import datasets 
import string

def create_corpus_file(glottocode, verbose=True):
    """Write the cleaned, unsegmented training transcriptions of one language to a text file.

    The GlossLM corpus is loaded, the training split matching the language
    (in-distribution or out-of-distribution) is selected, and every row that
    is not already segmented is written, one transcription per line, to
    ``{glottocode}.corpus.txt`` in the current working directory.

    Cleaning consists of removing ASCII punctuation (everything in
    ``string.punctuation`` except the apostrophe) and lower-casing.
    Non-ASCII punctuation is left untouched.

    Args:
        glottocode: Glottocode of the language; it must occur in either the
            ``eval_ID`` or the ``eval_OOD`` split of the corpus.
        verbose: When true (the default), each cleaned transcription is also
            printed as it is written.

    Raises:
        ValueError: If ``glottocode`` occurs in neither evaluation split.
    """
    glosslm_corpus = datasets.load_dataset("lecslab/glosslm-corpus-split")

    glottocodes_ID = set(glosslm_corpus['eval_ID']['glottocode'])
    glottocodes_OOD = set(glosslm_corpus['eval_OOD']['glottocode'])

    if glottocode in glottocodes_ID:
        id_or_ood = "ID"
    elif glottocode in glottocodes_OOD:
        id_or_ood = "OOD"
    else:
        raise ValueError(
            f"Glottocode should be one of: {list(glottocodes_ID) + list(glottocodes_OOD)}")

    # Filter only the split that is actually used, testing both conditions in
    # a single pass instead of first filtering every split of the corpus.
    train_dataset = glosslm_corpus[f"train_{id_or_ood}"].filter(
        lambda row: row["is_segmented"] == "no" and row['glottocode'] == glottocode)

    # Build the deletion table once; the apostrophe is kept because it is
    # part of the orthography (e.g. "ha'niigoots").
    punctuation_to_remove = string.punctuation.replace("'", "")
    deletion_table = str.maketrans("", "", punctuation_to_remove)

    # The transcriptions contain non-ASCII characters, so the encoding is
    # fixed explicitly rather than left to the platform default.
    with open(f"{glottocode}.corpus.txt", "w", encoding="utf-8") as file:
        for row in train_dataset:
            transcription = row['transcription'].translate(deletion_table).lower()
            if verbose:
                print(transcription)
            file.write(transcription + "\n")

# Produce one "<glottocode>.corpus.txt" file for each language of interest.
for glottocode in ('gitx1241', 'lezg1247', 'natu1246', 'uspa1245'):
    create_corpus_file(glottocode)

gupits maryhl hun 
ha'niigoots james jit gups tylerhl anaax 
needii bas'y 
neediit hilen'y 
hilenit 'nii'y 
hilmooyi'yt mary 
gubis jeremyhl honn 
luu sga hetxw nii'y 
dim amksiwaamaxda 
dim yookwt james ji taahlakxw
yugwimaahl xsdaadiit 
yugwimaahl dim xsdaadiit 
ha'niigoodi'y dim yukwhl bax̱t
hasak̲t dim bax̱t
yukwhl si'ix dim bax̱t
gyoo’n sik’ihl gupdihl hun t’aahlakw
bagat dim bax̱t
ha'niigoodi'y yukwhl bax̱t
ha'niigoodi'y bax̱t k'yoots
mehldis mary loo’y ky'oots win hlebiksxws susan
mehldis mary loo’y ky'oots win hlebiksxws susan ha’niisgwaa’ytxwsa
ha'niigoodi'y dim yukwhl bax̱t
ha'niigoodi'y dim bax̱t
gilbilhl anuutxwhl nda mahlis diana dim wil yeet g̱o’ohl winnipeg am k’i’yhl g̱anuutxw 
hes john ky'oots dim 'witxwg̱att mary jihlaa ha'niisgwaa'ytxw 
sim hasaga’y dim algal–i’y ahl tenet
 hasag̱a'y dim bax̱a'y ehl g̱olt 
hasag̱a'y dim bax̱a'y ehl g̱olt 
 hasaga’y ni dim guphl hun
yukwhl si'ix dim bax̱t 
bagat dim wis ky’oots
si'ix ḵ’otsdis johnhl tomato  ii ap nee dii doxhl tomato

In [73]:
import morfessor

import random

# Seed used before every training run so that re-running this cell yields the
# same models. NOTE(review): this relies on Morfessor's batch training drawing
# its randomness from the stdlib `random` module -- confirm against the
# installed Morfessor version.
MORFESSOR_SEED = 0

# NOTE: this name shadows the stdlib `io` module; it is kept because a later
# cell calls `io.read_binary_model_file`.
io = morfessor.MorfessorIO()


# Train one baseline model per language from its corpus file and save it as
# "<glottocode>.model".
for glottocode in ['gitx1241', 'lezg1247', 'natu1246', 'uspa1245']:
    # Re-seed per language so each model is reproducible on its own,
    # independent of how many languages were trained before it.
    random.seed(MORFESSOR_SEED)
    train_data = list(io.read_corpus_file(f'{glottocode}.corpus.txt'))
    model = morfessor.BaselineModel()
    model.load_data(train_data)
    model.train_batch()
    io.write_binary_model_file(f"{glottocode}.model", model)

..........................................................
..........................................................
..........................................................
..........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
............................................

In [78]:
import regex as re

# Load the saved model for glottocode gitx1241.
model = io.read_binary_model_file("gitx1241.model")
# Characters to strip before segmenting: all ASCII punctuation except the
# apostrophe.
punctuation_to_remove = "".join(ch for ch in string.punctuation if ch != "'")

def _segment(row):
    """Add a ``segmentation`` field to ``row`` and return the row.

    The transcription is stripped of the characters in the module-level
    ``punctuation_to_remove``, split on whitespace, and each word is
    lower-cased and segmented with the module-level ``model``. The morphs of
    a word are joined with single spaces, and the words are joined with
    single spaces as well.

    NOTE(review): because both joins use a space, word boundaries cannot be
    told apart from morph boundaries in the result -- confirm this is what
    downstream code expects.
    """
    deletion_table = str.maketrans("", "", punctuation_to_remove)
    cleaned = row['transcription'].translate(deletion_table)

    segmented_words = []
    for token in cleaned.split():
        # viterbi_segment returns a pair; its first element is the morph list.
        morphs = model.viterbi_segment(token.lower())[0]
        segmented_words.append(' '.join(morphs))

    row['segmentation'] = ' '.join(segmented_words)
    return row

# Unsegmented rows for glottocode gitx1241 from the out-of-distribution
# training split.
glosslm_corpus = datasets.load_dataset("lecslab/glosslm-corpus-split")
train_dataset = glosslm_corpus["train_OOD"].filter(
        lambda row: row['glottocode'] == 'gitx1241' and row['is_segmented'] == 'no')

# `map` returns a new dataset rather than modifying `train_dataset`, so the
# result is bound to a name; otherwise the segmentations would be discarded
# as soon as the cell output is rendered.
segmented_dataset = train_dataset.map(_segment)
segmented_dataset

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Dataset({
    features: ['transcription', 'glosses', 'translation', 'glottocode', 'id', 'source', 'metalang_glottocode', 'is_segmented', 'language', 'metalang', 'segmentation'],
    num_rows: 74
})

In [77]:
# Spot-check the loaded model on a made-up word; the call returns a
# (list of morphs, cost) pair.
model.viterbi_segment('hiiiii')

(['hi', 'ii', 'ii'], 27.75087844357871)

In [65]:
# Write the cleaned, lower-cased transcription of record 43 to text.txt.
# The transcription contains non-ASCII characters, so the encoding is set
# explicitly instead of relying on the platform default.
with open('text.txt', 'w', encoding='utf-8') as f:
    f.write(train_dataset[43]['transcription'].translate(str.maketrans("", "", punctuation_to_remove)).lower())

In [66]:
# Reload the corpus and keep only the unsegmented gitx1241 rows of the
# out-of-distribution training split.
glosslm_corpus = datasets.load_dataset("lecslab/glosslm-corpus-split")
train_dataset = glosslm_corpus["train_OOD"].filter(
    lambda example: example['glottocode'] == 'gitx1241' and example['is_segmented'] == 'no'
)

In [67]:
# Show the full raw record at index 43, the same record whose transcription
# is written to text.txt.
train_dataset[43]

{'transcription': "'Nakwhl hlidaa 'wihl wili'y g̲oohl wag̲ayt andoosda wil jok̲hl amxsiwaa .",
 'glosses': 'long-CN PART-SPT around LVB-1SG.II LOC-CN completely NMLZ-across COMP live-CN white.person',
 'translation': 'A long time ago, I lived overseas where the white people lived.',
 'glottocode': 'gitx1241',
 'id': 'st_train_gitx1241_0',
 'source': 'sigmorphon_st',
 'metalang_glottocode': 'stan1293',
 'is_segmented': 'no',
 'language': 'Gitxsan',
 'metalang': 'English'}