In [71]:
import datasets 
import string

def create_corpus_file(glottocode):
    glosslm_corpus = datasets.load_dataset("lecslab/glosslm-corpus-split")

    glottocodes_ID = set(glosslm_corpus['eval_ID']['glottocode'])
    glottocodes_OOD = set(glosslm_corpus['eval_OOD']['glottocode'])

    if glottocode in glottocodes_ID:
        id_or_ood = "ID"
    elif glottocode in glottocodes_OOD:
        id_or_ood = "OOD"
    else:
        raise Exception(
            f"Glottocode should be one of: {list(glottocodes_ID) + list(glottocodes_OOD)}")

    # Filter by segmentation and glottocode
    glosslm_corpus = glosslm_corpus.filter(lambda row: row["is_segmented"] == "no")
    train_dataset = glosslm_corpus[f"train_{id_or_ood}"].filter(
        lambda row: row['glottocode'] == glottocode)
    
    # Process each transcription and write to a file
    punctuation_to_remove = string.punctuation.replace("'", "")
    with open(f"{glottocode}.corpus.txt", "w") as file:
        for row in train_dataset:
            transcription = row['transcription']
            # Remove punctuation
            transcription = transcription.translate(str.maketrans("", "", punctuation_to_remove))
            transcription = transcription.lower()
            # Write the cleaned transcription to the file
            print(transcription)
            file.write(transcription + "\n")

for glottocode in ['gitx1241', 'lezg1247', 'natu1246', 'uspa1245']:
    create_corpus_file(glottocode)

Gupit=s Mary=hl hun .
gupits maryhl hun 
Ha'niigoot=s James [ji=t gup=s Tyler=hl anaax .]
ha'niigoots james jit gups tylerhl anaax 
Nee=dii bas'y .
needii bas'y 
Nee=dii=t hilen'y .
neediit hilen'y 
Hilenit 'nii'y .
hilenit 'nii'y 
Hilmooyi'y=t Mary .
hilmooyi'yt mary 
Gubi=s Jeremy=hl honn .
gubis jeremyhl honn 
Luu sga hetxw `nii'y .
luu sga hetxw nii'y 
Dim amksiwaamaxda .
dim amksiwaamaxda 
Dim yookwt James (ji taahlakxw).
dim yookwt james ji taahlakxw
Yugw=imaa=hl xsdaadiit .
yugwimaahl xsdaadiit 
Yugw=imaa=hl dim xsdaadiit .
yugwimaahl dim xsdaadiit 
Ha'niigoodi'y [(dim) yukw=hl bax̱t].
ha'niigoodi'y dim yukwhl bax̱t
Hasak̲t [#(dim) bax̱t].
hasak̲t dim bax̱t
Yukw[=hl] si'ix [(#dim) bax̱t].
yukwhl si'ix dim bax̱t
Gyoo’n sik’ihl [gupdi=hl hun t’aahlakw].
gyoo’n sik’ihl gupdihl hun t’aahlakw
Bagat [#(dim) bax̱t].
bagat dim bax̱t
Ha'niigoodi'y [yukw=hl bax̱t].
ha'niigoodi'y yukwhl bax̱t
Ha'niigoodi'y [bax̱t k'yoots].
ha'niigoodi'y bax̱t k'yoots
Mehldi=s Mary loo’y ky'oots [win hlebik

Filter:   0%|          | 0/3670 [00:00<?, ? examples/s]

sewreqhaj
sewreqhaj
inal aburun wilik
inal aburun wilik
Bazar .din juğ adaz [tarsar awač luhuz] tak’an x̂anwaj .
bazar din juğ adaz tarsar awač luhuz tak’an x̂anwaj 
Adaz balk’an akuna .
adaz balk’an akuna 
Фу кайла гум гьатна кӀвала , хтана зу стха Бешир стха хтана акуна .
фу кайла гум гьатна кӏвала  хтана зу стха бешир стха хтана акуна 
Къанивилин , мугьубатдин са гаф гудайвал хьуй .
къанивилин  мугьубатдин са гаф гудайвал хьуй 
Бакидиз хъфена
бакидиз хъфена
Им и вири цехера акъвенва .
им и вири цехера акъвенва 
ахпа няни хьана , ксана йифиз ана .
ахпа няни хьана  ксана йифиз ана 
« Твах и зи шикил », лугьуда , « а ,» лугьуда « анавай вилаятдин кьилел алай булахдин кьилел эциг » лугьуда .
« твах и зи шикил » лугьуда  « а » лугьуда « анавай вилаятдин кьилел алай булахдин кьилел эциг » лугьуда 
Сарах , сарах зунни Имран тфенгни къачуна фена чун руказ .
сарах  сарах зунни имран тфенгни къачуна фена чун руказ 
муаллим юлдашари ...
муаллим юлдашари 
фена зун и Бакуда авай зу аялдин патав


Filter:   0%|          | 0/3670 [00:00<?, ? examples/s]

Ncblo lcng nzrlimzng x lr Gwalekana esz' .
ncblo lcng nzrlimzng x lr gwalekana esz' 
Zbz sc tqwaipele tqnginipenge kraesz'mz .
zbz sc tqwaipele tqnginipenge kraesz'mz 
Mrlz voomc x sc tqrtangrtipex nqvi lr legru kc tqdopx-ngrne mrkc wq .
mrlz voomc x sc tqrtangrtipex nqvi lr legru kc tqdopxngrne mrkc wq 
78 . Daniel Boerger X Doa lr Manyzdeng
78  daniel boerger x doa lr manyzdeng
Mz Zbqkranzlvqn li mz temz kxesz' , nzayrlwrpx-lxblr-krgr doa lr mztea , mz nzlcapqbzngr x nzalvztrngr nidr mz nqmq ngr mztea , nqmq sc gzpman , x nqmq ngr makxtr .
mz zbqkranzlvqn li mz temz kxesz'  nzayrlwrpxlxblrkrgr doa lr mztea  mz nzlcapqbzngr x nzalvztrngr nidr mz nqmq ngr mztea  nqmq sc gzpman  x nqmq ngr makxtr 
Kx vcmc kc sc tqalvznetrpexng mz nqmq ngr nzalvztrngr x kabo badr da kx na-alvztr-ngrdr .
kx vcmc kc sc tqalvznetrpexng mz nqmq ngr nzalvztrngr x kabo badr da kx naalvztrngrdr 
Pulis lc atwzlr-ngrm Mr Lore mrkc Tobaita .
pulis lc atwzlrngrm mr lore mrkc tobaita 
X ninge kc tqyzlupe-moux Boo mz

Filter:   0%|          | 0/52464 [00:00<?, ? examples/s]

o sey xtok rixoqiil
o sey xtok rixoqiil
ta' nada chi wi' ra richooch
ta' nada chi wi' ra richooch
pwes ra jupul jaa qe
pwes ra jupul jaa qe
k'ark'aq jaa qe
k'ark'aq jaa qe
xe' juntiir
xe' juntiir
rechaq galani' richoochaq juntiir
rechaq galani' richoochaq juntiir
qusi' juntiir qleen rechaq ,
qusi' juntiir qleen rechaq 
wi' jkwa'yaq ,
wi' jkwa'yaq 
wákixaq .
wákixaq 
Juntiir qleen rechaq ,
juntiir qleen rechaq 
i sik' rech'elxikaq
i sik' rech'elxikaq
koom , neen tren re jpobriil juun .
koom  neen tren re jpobriil juun 
Entoons re xtok rixóqil ,
entoons re xtok rixóqil 
kita' jq'unik twer taq
kita' jq'unik twer taq
i kita' juntiir
i kita' juntiir
kita' qleen rechaq ,
kita' qleen rechaq 
i xen taq maq kirtyaan xe' jchajej taq sik' wunaq .
i xen taq maq kirtyaan xe' jchajej taq sik' wunaq 
Jo' qachajej man kristyaan li
jo' qachajej man kristyaan li
¿nemoo twer laj richooch ?
¿nemoo twer laj richooch 
porke ta' k'ii qleen re ,
porke ta' k'ii qleen re 
tiqil ooj .
tiqil ooj 
Che',
che'
che' 

In [56]:
import morfessor

io = morfessor.MorfessorIO()


for glottocode in ['gitx1241', 'lezg1247', 'natu1246', 'uspa1245']:
    train_data = list(io.read_corpus_file(f'{glottocode}.corpus.txt'))
    model = morfessor.BaselineModel()
    model.load_data(train_data)
    model.train_batch()
    io.write_binary_model_file(f"{glottocode}.model", model)

...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
..........................................................
..........................................................
..........................................................
..........................................................
..........................................................
..........................................................
..........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...............................................

In [50]:
model.segment("'Nakwhl")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/1y/v6m6qm2d3p52jn2sv6smp5cr0000gn/T/ipykernel_26038/3397289203.py", line 1, in <module>
    model.segment("'Nakwhl")
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/morfessor/baseline.py", line 550, in segment
    rcount, count, splitloc = self._analyses[compound]
                              ~~~~~~~~~~~~~~^^^^^^^^^^
KeyError: "'Nakwhl"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/milesper/.pyenv/versions/3.

In [57]:
import regex as re

model = io.read_binary_model_file(f"gitx1241.model")
punctuation_to_remove = string.punctuation.replace("'", "")

def _segment(row):
    transcription = row['transcription'].translate(str.maketrans("", "", punctuation_to_remove))
    row['segmentation'] = ' '.join([' '.join(model.segment(word.lower())) for word in transcription.split()])
    return row

glosslm_corpus = datasets.load_dataset("lecslab/glosslm-corpus-split")
train_dataset = glosslm_corpus[f"train_OOD"].filter(
        lambda row: row['glottocode'] == 'gitx1241' and row['is_segmented'] == 'no')
train_dataset.map(_segment)

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/1y/v6m6qm2d3p52jn2sv6smp5cr0000gn/T/ipykernel_26038/3560710456.py", line 14, in <module>
    train_dataset.map(_segment)
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 593, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 558, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/milesper/.pyenv/versions/3.11.6/lib/python3.11/site-packages/datasets/arrow_dataset.py

In [52]:
train_data = list(io.read_corpus_file(f'gitx1241.corpus.txt'))
train_data

[(1, 'Gupits'),
 (1, 'Maryhl'),
 (1, 'hun'),
 (0, ()),
 (1, "Ha'niigoots"),
 (1, 'James'),
 (1, 'jit'),
 (1, 'gups'),
 (1, 'Tylerhl'),
 (1, 'anaax'),
 (0, ()),
 (1, 'Needii'),
 (1, "bas'y"),
 (0, ()),
 (1, 'Neediit'),
 (1, "hilen'y"),
 (0, ()),
 (1, 'Hilenit'),
 (1, "'nii'y"),
 (0, ()),
 (1, "Hilmooyi'yt"),
 (1, 'Mary'),
 (0, ()),
 (1, 'Gubis'),
 (1, 'Jeremyhl'),
 (1, 'honn'),
 (0, ()),
 (1, 'Luu'),
 (1, 'sga'),
 (1, 'hetxw'),
 (1, "nii'y"),
 (0, ()),
 (1, 'Dim'),
 (1, 'amksiwaamaxda'),
 (0, ()),
 (1, 'Dim'),
 (1, 'yookwt'),
 (1, 'James'),
 (1, 'ji'),
 (1, 'taahlakxw'),
 (0, ()),
 (1, 'Yugwimaahl'),
 (1, 'xsdaadiit'),
 (0, ()),
 (1, 'Yugwimaahl'),
 (1, 'dim'),
 (1, 'xsdaadiit'),
 (0, ()),
 (1, "Ha'niigoodi'y"),
 (1, 'dim'),
 (1, 'yukwhl'),
 (1, 'bax̱t'),
 (0, ()),
 (1, 'Hasak̲t'),
 (1, 'dim'),
 (1, 'bax̱t'),
 (0, ()),
 (1, 'Yukwhl'),
 (1, "si'ix"),
 (1, 'dim'),
 (1, 'bax̱t'),
 (0, ()),
 (1, 'Gyoo’n'),
 (1, 'sik’ihl'),
 (1, 'gupdihl'),
 (1, 'hun'),
 (1, 't’aahlakw'),
 (0, ()),
 (1, 'Bag

In [65]:
with open('text.txt', 'w') as f:
    f.write(train_dataset[43]['transcription'].translate(str.maketrans("", "", punctuation_to_remove)).lower())

In [66]:
glosslm_corpus = datasets.load_dataset("lecslab/glosslm-corpus-split")
train_dataset = glosslm_corpus[f"train_OOD"].filter(
        lambda row: row['glottocode'] == 'gitx1241' and row['is_segmented'] == 'no')

In [67]:
train_dataset[43]

{'transcription': "'Nakwhl hlidaa 'wihl wili'y g̲oohl wag̲ayt andoosda wil jok̲hl amxsiwaa .",
 'glosses': 'long-CN PART-SPT around LVB-1SG.II LOC-CN completely NMLZ-across COMP live-CN white.person',
 'translation': 'A long time ago, I lived overseas where the white people lived.',
 'glottocode': 'gitx1241',
 'id': 'st_train_gitx1241_0',
 'source': 'sigmorphon_st',
 'metalang_glottocode': 'stan1293',
 'is_segmented': 'no',
 'language': 'Gitxsan',
 'metalang': 'English'}