In [44]:
# split and sample from the paradigms according to the SIGMORPHON–UniMorph 2023 Shared Task 0 Paper
def get_lemma(line):
    """Return the first tab-separated field of ``line``.

    The whole line is returned when it contains no tab character, and the
    empty string is returned for an empty line.
    """
    first_field, _, _ = line.partition("\t")
    return first_field

from random import sample,shuffle
from math import floor
from itertools import groupby
path_data = "data/"
LANGS = ["ote","pol","csb"]

# Number of forms drawn for each split, and the paradigm-table counts that are
# tried in order until the sampled tables contain enough forms for every split.
N_TRAIN_FORMS, N_DEV_FORMS, N_TEST_FORMS = 10000, 1000, 1000
TABLE_COUNTS = (500, 1000, 2000, 3000)


def _write_forms(path, forms):
    """Write ``forms`` to ``path`` as UTF-8 text, one form per line."""
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(forms) + "\n")


# NOTE(review): the sampling below is unseeded, so every run yields a different split.
for lang in LANGS:
    # Read the whole input before any output file is opened, so that a failure
    # during sampling cannot leave previously written splits truncated.
    with open(path_data + lang, "r", encoding="utf-8") as data_file:
        data_lines = [line.strip() for line in data_file]

    # One table per run of consecutive lines sharing a lemma (groupby does not
    # merge non-adjacent runs). Blank lines produce the lemma "" and are dropped.
    tables = [(k, list(g)) for k, g in groupby(data_lines, get_lemma) if k != ""]

    if lang != "csb":
        for n in TABLE_COUNTS:
            # sample() raises ValueError when asked for more items than exist,
            # so never request more tables than the language provides.
            sampled_tables = sample(tables, min(n, len(tables)))
            size = len(sampled_tables)
            # 80% / 10% / 10% of the sampled tables; each table goes to exactly one split.
            datasets = [
                sampled_tables[:floor(0.8*size)],
                sampled_tables[floor(0.8*size):floor(0.9*size)],
                sampled_tables[floor(0.9*size):],
            ]

            train_forms = [form for lemma, table in datasets[0] for form in table]
            dev_forms = [form for lemma, table in datasets[1] for form in table]
            test_forms = [form for lemma, table in datasets[2] for form in table]

            if len(dev_forms) < N_DEV_FORMS or len(test_forms) < N_TEST_FORMS or len(train_forms) < N_TRAIN_FORMS:
                print(f"more tables than {n} needed for lang: {lang} ",len(dev_forms),len(test_forms) ,len(train_forms))
            else:
                print(f"{lang} needed {n} tables.")
                break
        else:
            # No table count produced enough forms; fail with a clear message
            # instead of letting the form sampling below raise an opaque error.
            raise ValueError(
                f"{lang}: could not collect {N_TRAIN_FORMS}/{N_DEV_FORMS}/{N_TEST_FORMS} "
                f"train/dev/test forms from at most {max(TABLE_COUNTS)} tables "
                f"({len(tables)} tables available)"
            )

        # Sample positions and sort them so the chosen forms keep their original order.
        indices_train = sorted(sample(range(len(train_forms)), N_TRAIN_FORMS))
        indices_dev = sorted(sample(range(len(dev_forms)), N_DEV_FORMS))
        indices_test = sorted(sample(range(len(test_forms)), N_TEST_FORMS))

        train_samples = [train_forms[i] for i in indices_train]
        dev_samples = [dev_forms[i] for i in indices_dev]
        test_samples = [test_forms[i] for i in indices_test]

        _write_forms(path_data + lang + ".trn", train_samples)
        _write_forms(path_data + lang + ".dev", dev_samples)
        _write_forms(path_data + lang + ".tst", test_samples)
    else:
        # csb: every table goes into the test file; only the .tst file is written.
        shuffle(tables) # lemmas should be randomly ordered
        test_forms = [form for lemma, table in tables for form in table]
        # Count the lemma runs as they appear in the shuffled output.
        length = sum(1 for _ in groupby(test_forms, get_lemma))
        print("number of lemmas in csb testfile: ", length)
        _write_forms(path_data + lang + ".tst", test_forms)


            
        


more tables than 500 needed for lang: ote  732 792 6366
ote needed 1000 tables.
more tables than 500 needed for lang: pol  840 936 6816
pol needed 1000 tables.
number of lemmas in csb testfile:  37


In [46]:
# unique tags
# Collect every distinct feature tag found in the split files, in order of
# first appearance.
tags = []
seen_tags = set()  # constant-time membership test; `tags` keeps the order
path_data = "data/"
LANGS = ["ote","pol","csb"]
for lang in LANGS:
    # Only the test split is read for csb.
    datasets = ["tst"] if lang == "csb" else ["trn","dev","tst"]
    for dataset in datasets:
        split_path = path_data + lang + "." + dataset
        print(split_path)
        # Explicit encoding: the default is locale-dependent and the data is not ASCII.
        with open(split_path, "r", encoding="utf-8") as data_file:
            for line in data_file:
                line = line.strip()
                if line == "": continue
                # Each non-blank line must have exactly three tab-separated
                # fields; the third is a ";"-separated feature list.
                lemma, target, features = line.split("\t")
                for feature in features.split(";"):
                    if feature not in seen_tags:
                        seen_tags.add(feature)
                        tags.append(feature)
print(tags)



<_io.TextIOWrapper name='data/ote.trn' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/ote.dev' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/ote.tst' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/pol.trn' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/pol.dev' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/pol.tst' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/csb.tst' mode='r' encoding='UTF-8'>
['V', 'IPFV', 'SG', '2', 'PRS', '3', 'PST', 'PFV', 'PRF', '1', 'IRR', 'N', 'ESS', 'PL', 'INS', 'NOM', 'GEN', 'DAT', 'ACC', 'VOC', 'ADJ', 'FEM', 'NEUT', 'MASC', 'ANIM', 'HUM', 'INAN', 'IMP', 'FUT', 'V.MSDR', 'V.PTCP', 'PASS', 'COND', 'V.CVB', 'ACT', 'NFIN']


In [45]:
# classify each feature, sort and rearrange into hierarchical schema
# Earlier inventory, kept for reference only (presumably ote + pol, judging by the name):
# features_in_ote_pol =     ['V', 'IPFV', 'SG', 'PFV', 'PRF', '2', '3', 'IRR', '1', 'N', 'ESS', 'PL', 'GEN', 'NOM', 'ADJ', 'INS', 'DAT', 'VOC', 'FEM', 'FUT', 'PRS', 'PST', 'COND', 'IMP', 'ACC', 'MASC', 'HUM', 'NEUT', 'V.PTCP', 'PASS', 'ACT', 'ANIM', 'V.MSDR', 'INAN', 'V.CVB', 'NFIN']
features_in_ote_pol_csb = [
    'V', 'IPFV', 'SG', '2', 'PRS', '1', 'PST', '3', 'PFV', 'PRF', 'IRR', 'N',
    'ESS', 'PL', 'GEN', 'NOM', 'ADJ', 'INS', 'DAT', 'VOC', 'FEM', 'FUT', 'MASC',
    'COND', 'IMP', 'HUM', 'ACC', 'NEUT', 'V.PTCP', 'PASS', 'ACT', 'ANIM',
    'V.MSDR', 'INAN', 'V.CVB', 'NFIN',
]
features_in_ote_pol_csb_new = [
    'V', 'IPFV', 'SG', '1', 'PRS', 'PST', '2', '3', 'PFV', 'PRF', 'IRR', 'MASC',
    'N', 'ESS', 'PL', 'INS', 'NOM', 'ACC', 'DAT', 'GEN', 'VOC', 'ADJ', 'FEM',
    'NEUT', 'HUM', 'INAN', 'ANIM', 'FUT', 'IMP', 'V.PTCP', 'ACT', 'V.CVB',
    'COND', 'PASS', 'V.MSDR', 'NFIN',
]
# Report every feature of the "_new" inventory that the other inventory lacks.
for f in features_in_ote_pol_csb_new:
    if f in features_in_ote_pol_csb:
        continue
    print("add feature ", f)
# source: https://aclanthology.org/P15-2111.pdf
# Features grouped by the dimension they belong to; the trailing comment marks
# each dimension as "high" or "low" in the hierarchy.
features_per_dimension = {
    'Parts_of_Speech': ['V', 'N', 'ADJ', 'V.PTCP', 'V.MSDR', 'V.CVB'],  # high
    'Aspect': ['IPFV', 'PFV', 'PRF'],  # high
    'Number': ['SG', 'PL'],  # low
    'Person': ['2', '3', '1'],  # low
    'Mood': ['IRR', 'COND', 'IMP'],  # high
    'Case': ['ESS', 'GEN', 'NOM', 'INS', 'DAT', 'VOC', 'ACC'],  # high, (in front of low)
    'Gender': ['FEM', 'MASC', 'NEUT'],  # low
    'Tense': ['FUT', 'PRS', 'PST'],  # high
    'Animacy': ['HUM', 'ANIM', 'INAN'],  # low
    'Voice': ['PASS', 'ACT'],  # high
    'Finiteness': ['NFIN'],  # high
}

# Reverse lookup: feature -> name of its dimension.
dimension_per_feature = {
    'V': 'Parts_of_Speech', 'IPFV': 'Aspect', 'SG': 'Number', 'PFV': 'Aspect',
    'PRF': 'Aspect', '2': 'Person', '3': 'Person', 'IRR': 'Mood',
    '1': 'Person', 'N': 'Parts_of_Speech', 'ESS': 'Case', 'PL': 'Number',
    'GEN': 'Case', 'NOM': 'Case', 'ADJ': 'Parts_of_Speech', 'INS': 'Case',
    'DAT': 'Case', 'VOC': 'Case', 'FEM': 'Gender', 'FUT': 'Tense',
    'PRS': 'Tense', 'PST': 'Tense', 'COND': 'Mood', 'IMP': 'Mood',
    'ACC': 'Case', 'MASC': 'Gender', 'HUM': 'Animacy', 'NEUT': 'Gender',
    'V.PTCP': 'Parts_of_Speech', 'PASS': 'Voice', 'ACT': 'Voice', 'ANIM': 'Animacy',
    'V.MSDR': 'Parts_of_Speech', 'INAN': 'Animacy', 'V.CVB': 'Parts_of_Speech', 'NFIN': 'Finiteness',
}

# Relative order of the low-hierarchy dimensions.
low_order_priority = {'Person': 1, 'Number': 2, 'Gender': 3, 'Animacy': 4}

# True for low-hierarchy dimensions, False for high-hierarchy ones; the low
# dimensions are exactly the keys of low_order_priority.
low_hierarchy_of_dimension = {
    dimension: dimension in low_order_priority
    for dimension in features_per_dimension
}

# Sort key per dimension, used by hierarchical_schema: smaller values come first.
order_priority = {
    'Parts_of_Speech': 0,  # high
    'Aspect': 1,  # high
    'Number': 4,  # low
    'Person': 3,  # low
    'Mood': 1,  # high
    'Case': 2,  # high, (in front of low)
    'Gender': 5,  # low
    'Tense': 1,  # high
    'Animacy': 6,  # low
    'Voice': 1,  # high
    'Finiteness': 1,  # high
}

def hierarchical_schema (features):
    """Rearrange a ";"-separated feature string into the hierarchical format.

    The features are stably sorted by the ``order_priority`` of their
    dimension and split into high- and low-hierarchy features. The first
    feature of the input is taken as the part of speech:

    * if it contains "V", the result is ``high;...;NOM(low,...)``;
    * if it is "N" or "ADJ", the result is ``high;...(low,...)``;
    * otherwise ``None`` is returned.

    Raises ``KeyError`` for a feature missing from ``dimension_per_feature``.
    """
    tags = features.split(";")
    pos_tag = tags[0]

    def dimension_rank(tag):
        return order_priority[dimension_per_feature[tag]]

    ordered_tags = sorted(tags, key=dimension_rank)

    is_verbal = "V" in pos_tag
    if not is_verbal and pos_tag not in ("N", "ADJ"):
        return None

    # Partition the sorted tags, keeping their sorted order within each group.
    high_tags, low_tags = [], []
    for tag in ordered_tags:
        if low_hierarchy_of_dimension[dimension_per_feature[tag]]:
            low_tags.append(tag)
        else:
            high_tags.append(tag)

    opening = ";NOM(" if is_verbal else "("
    return ";".join(high_tags) + opening + ",".join(low_tags) + ")"

def process_line(x):
    """Convert one ``lemma<TAB>target<TAB>features`` line.

    Surrounding whitespace is stripped, the features are rewritten with
    ``hierarchical_schema``, and the result is returned as
    ``lemma<TAB>converted features<TAB>target`` (features and target swapped).
    Raises ``ValueError`` unless the line has exactly three tab-separated fields.
    """
    lemma, target, features = x.strip().split("\t")
    return "\t".join((lemma, hierarchical_schema(features), target))

# Rewrite every split file into the hierarchical feature format and store the
# result under OUT_DIR with the same file name.
path_data = "data/"
OUT_DIR = "../../2023InflectionST/part1/data/"
LANGS = ["ote","pol","csb"]
for lang in LANGS:
    # Only the test split is converted for csb.
    datasets = ["tst"] if lang == "csb" else ["trn","dev","tst"]
    for dataset in datasets:
        file_name = lang + "." + dataset
        # Explicit encoding: the default is locale-dependent and the data is not ASCII.
        with open(path_data + file_name, "r", encoding="utf-8") as data_file:
            # Convert before the output file is opened, so a malformed line
            # cannot leave a truncated output file behind.
            converted_lines = [process_line(line) for line in data_file]
        with open(OUT_DIR + file_name, "w", encoding="utf-8") as output_file:
            output_file.write("\n".join(converted_lines) + "\n")

In [10]:
# Shell escape: list the contents of the data directory.
!ls data

hierarchical_schema  ote.trn	   ote_trn.args  pol.dev  pol_dev.args
ote		     ote.tst	   ote_tst.args  pol.trn  pol_trn.args
ote.dev		     ote_dev.args  pol		 pol.tst  pol_tst.args
