In [1]:
# -*- coding: utf-8 -*-
import pprint
import csv
import pandas as pd
import pyconll
import re
from collections import defaultdict
import itertools
import os
import json

from sacremoses import MosesDetokenizer

In [2]:
# Characters that may end an Arabic surface token without being part of its last
# morpheme form; align_labels_to_morphemes appends such a trailing character to
# the last morpheme before aligning.
# NOTE(review): presumably tanween/shadda diacritics plus tatweel - confirm the code points.
ARABIC_SPECIAL_CHARS = ['ً', 'ّ', 'ٍ', 'ٌ', 'ـ']

def join_labels(labels, feat):
    """Merge several morpheme labels into one '+'-separated multitag.

    For the 'pos' feature every label is kept as-is. For any other feature the
    'X' placeholder entries are dropped before joining, and 'X' is returned
    when the joined result would be empty.
    """
    if feat == 'pos':
        return '+'.join(labels)
    merged = '+'.join(label for label in labels if label != 'X')
    return merged if merged else 'X'


def dump_labels_to_file(sent_ids, sents_labels, output_path):
    """Write labelled sentences to *output_path* as UTF-8 text.

    Each sentence is written as its id on one line, followed by one
    "<token> <label>" line per (token, label) pair and a terminating blank
    line. Ids and sentences are paired positionally with zip().
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        for sent_id, sent_labels in zip(sent_ids, sents_labels):
            lines = [f'{sent_id}\n']
            lines.extend(f'{tok} {label}\n' for tok, label in sent_labels)
            lines.append('\n')
            out.writelines(lines)


def align_labels_to_morphemes(orig_tokens, morpheme_toks, morpheme_labels, feat='pos'):
    """Apply alignment heuristics that map morpheme labels onto surface tokens (characters).

    Morphemes are consumed left to right. While a morpheme matches the next
    surface characters verbatim, each of those characters receives that
    morpheme's label. As soon as the surface form and the morphemes diverge,
    all remaining surface characters receive the joined label (see
    ``join_labels``) of all remaining morphemes.

    Args:
        orig_tokens: the surface token split into a list of characters.
        morpheme_toks: morpheme forms of that token, in order. Not modified.
        morpheme_labels: one label per entry of ``morpheme_toks``.
        feat: feature name, forwarded to ``join_labels`` ('pos' keeps every label).

    Returns:
        A list of ``(character, label)`` tuples.

    Example::

        align_labels_to_morphemes(['ו', 'כ', 'ש', 'מ', 'ד', 'ב', 'ר', 'י', 'ה', 'ם'],
                                  ['ו', 'כש', 'מ', 'דברים', 'של', 'הם'],
                                  ['CCONJ', 'SCONJ', 'ADP', 'NOUN', 'ADP', 'PRON'],
                                  feat='pos')

        returns

        [('ו', 'CCONJ'),
         ('כ', 'SCONJ'),
         ('ש', 'SCONJ'),
         ('מ', 'ADP'),
         ('ד', 'NOUN+ADP+PRON'),
         ('ב', 'NOUN+ADP+PRON'),
         ('ר', 'NOUN+ADP+PRON'),
         ('י', 'NOUN+ADP+PRON'),
         ('ה', 'NOUN+ADP+PRON'),
         ('ם', 'NOUN+ADP+PRON')]
    """
    # Work on a copy so the trailing-character patch below never leaks into
    # the caller's list.
    morpheme_toks = list(morpheme_toks)

    # Prefix + definite-article pairs whose article is not written in the
    # surface form (Hebrew covert morphemes).
    covert_pairs = (['ב', 'ה'], ['ל', 'ה'], ['כ', 'ה'])

    curr = 0        # index of the morpheme currently being aligned
    orig_curr = 0   # index of the first surface token not yet labelled
    labels_full = []
    tokens_full = []
    prev = None     # (orig index, morpheme index) of a matched morpheme awaiting emission

    # A trailing special character that the last morpheme lacks is attached to
    # it so that the two sequences can still match. Guarded against empty input.
    if (orig_tokens and morpheme_toks
            and orig_tokens[-1] in ARABIC_SPECIAL_CHARS
            and not morpheme_toks[-1].endswith(orig_tokens[-1])):
        morpheme_toks[-1] += orig_tokens[-1]

    def collapse_rest(orig_start, morph_start):
        # Give every remaining surface token the joined label of every
        # remaining morpheme. Tokens and labels use the same slice so the two
        # output lists always stay the same length.
        joint = join_labels(morpheme_labels[morph_start:], feat)
        rest = orig_tokens[orig_start:]
        tokens_full.extend(rest)
        labels_full.extend([joint] * len(rest))

    while curr < len(morpheme_toks):
        if prev is not None:
            # Emit the morpheme matched in the previous iteration.
            pending = morpheme_toks[prev[1]]
            labels_full.extend([morpheme_labels[prev[1]]] * len(pending))
            tokens_full.extend(pending)
            prev = None

        morpheme = morpheme_toks[curr]
        orig_rest = ''.join(orig_tokens[orig_curr:])
        rest_differs = orig_rest != ''.join(morpheme_toks[curr:])

        # Hebrew covert morphemes - join the respective labels into a multitag
        # carried by the single surface character.
        if rest_differs and morpheme_toks[curr:curr + 2] in covert_pairs:
            tokens_full.append(morpheme)
            labels_full.append(join_labels(morpheme_labels[curr:curr + 2], feat))
            orig_curr += 1
            curr += 2
            continue

        window = ''.join(orig_tokens[orig_curr:orig_curr + len(morpheme)])
        if window == morpheme and not rest_differs:
            # The remainders are identical and this morpheme matches: emit it now.
            labels_full.extend([morpheme_labels[curr]] * len(morpheme))
            tokens_full.extend(morpheme)
        elif window == morpheme and window != orig_rest:
            # This morpheme matches but the remainders diverge later on and
            # surface tokens remain after it: defer its emission.
            prev = (orig_curr, curr)
        else:
            # Mismatch, or the surface form ends within this morpheme.
            collapse_rest(orig_curr, curr)
            break
        orig_curr += len(morpheme)
        curr += 1

    if prev is not None:
        # The last morpheme matched but surface tokens remain after it; label
        # them together with it instead of silently dropping them.
        collapse_rest(prev[0], prev[1])

    return list(zip(tokens_full, labels_full))

In [3]:
### Hebrew: every morpheme is visible in the surface form
pprint.pprint(align_labels_to_morphemes(orig_tokens=['ל', 'ה', 'פ', 'ו', 'ע', 'ל'],
                                        morpheme_toks=['ל', 'ה', 'פועל'],
                                        morpheme_labels=['ADP', 'DET', 'NOUN'],
                                        feat='pos'))

### Arabic: the surface form ends with a character missing from the last morpheme
pprint.pprint(align_labels_to_morphemes(orig_tokens=['و', 'م', 'ت', 'س', 'ا', 'ر', 'ع', 'ا', 'ً'],
                                        morpheme_toks=['و', 'متسارعا'],
                                        morpheme_labels=['CCONJ', 'ADJ']))

# ### Turkish (disabled examples)
# pprint.pprint(align_labels_to_morphemes(['t', 'u', 't', 's', 'a', 'ğ', 'ı', 'm'],
#                                         ['tutsak', 'ım'],
#                                         ['ADJ', 'AUX']))

# pprint.pprint(align_labels_to_morphemes(['s', 'a', 'y', 'f', 'a', 'l', 'ı', 'k'],
#                                         ['sayfa', 'lık'],
#                                         ['NOUN', 'ADP']))

[('ל', 'ADP'),
 ('ה', 'DET'),
 ('פ', 'NOUN'),
 ('ו', 'NOUN'),
 ('ע', 'NOUN'),
 ('ל', 'NOUN')]
[('و', 'CCONJ'),
 ('م', 'ADJ'),
 ('ت', 'ADJ'),
 ('س', 'ADJ'),
 ('ا', 'ADJ'),
 ('ر', 'ADJ'),
 ('ع', 'ADJ'),
 ('ا', 'ADJ'),
 ('ً', 'ADJ')]


In [4]:
# Matches a token ID that denotes a range, such as "3-5", and captures both
# ends as the named groups 'start' and 'end'.
span_pattern = re.compile("(?P<start>[0-9]+)-(?P<end>[0-9]+)")

# Label for positions that carry no morphological tag.
# NOTE(review): the code visible in this file writes the literal 'VOID' for
# inter-token spaces instead of referencing this constant.
VOID_TAG = "VOID"


def is_space_after_token(token):
    """Return False only when the token's MISC field has SpaceAfter set to exactly {'No'}."""
    space_after = token.misc.get('SpaceAfter')
    return space_after != {'No'}


def tokenize_chars(text):
    """Split *text* into a list of its individual characters."""
    return list(text)

def get_sentence_toks_and_morphs(sentence, feats):
    """Collect surface tokens, their morphemes and the morpheme labels of a sentence.

    Tokens whose ID contains "." are skipped. A token whose ID is a range
    ("start-end") contributes one surface token covering the morphemes
    start..end. Every token with an integer ID contributes one morpheme, and
    additionally one surface token when it lies outside the most recent range.

    Args:
        sentence: iterable of tokens exposing ``id``, ``form``, ``upos``,
            ``feats`` and ``misc``.
        feats: names of the morphological features to extract.

    Returns:
        A tuple ``(toks, tok_inds, morpheme_toks, morpheme_labels, spaces)``:
        ``toks`` are the surface forms; ``tok_inds`` holds, per surface token,
        the 0-based indices of its morphemes; ``morpheme_toks`` are the
        morpheme forms; ``morpheme_labels`` maps 'pos' and every feature name
        to one label per morpheme ('X' when the feature is absent);
        ``spaces`` tells, per surface token, whether a space follows it (the
        last entry is always forced to False).
    """
    start, end = -1, -1
    toks = []
    tok_inds = []
    morpheme_toks = []
    spaces = []
    morpheme_labels = defaultdict(list)
    for token in sentence:
        if "." in token.id:
            # Arabic parataxis (e.g. 27.1)
            continue
        try:
            token_id = int(token.id)
        except ValueError:
            # This is a span: remember its range and record the surface token.
            start, end = [int(m) for m in span_pattern.match(token.id).groups()]
            spaces.append(is_space_after_token(token))
            tok_inds.append(list(range(start - 1, end)))
            toks.append(token.form)
            continue

        if not start <= token_id <= end:
            # Not covered by the latest span, so it is a surface token itself.
            toks.append(token.form)
            tok_inds.append([token_id - 1])
            spaces.append(is_space_after_token(token))
        morpheme_toks.append(token.form)
        morpheme_labels['pos'].append(token.upos)
        for f in feats:
            morpheme_labels[f].append('+'.join(sorted(token.feats.get(f, {'X'}))))
    if spaces:
        # No space is emitted after the final token. The guard avoids an
        # IndexError when the sentence produced no surface tokens at all.
        spaces[-1] = False
    return toks, tok_inds, morpheme_toks, morpheme_labels, spaces


def preprocess_ud(output_dir, conll_file_path, output_format='segmented', json_char_format=True, feats=None):
    """Convert the train/dev/test CoNLL-U files of a treebank into label files.

    For every split and every feature (plus 'pos') a text file
    ``<feat>_<split>_<output_format>_ud.txt`` is written through
    ``dump_labels_to_file``, together with one JSON file
    ``<split>_<output_format>_<chars|words>_ud.json`` per split.

    Args:
        output_dir: directory that receives the output files (created if missing).
        conll_file_path: path template; ``{}`` is replaced by the split name.
        output_format: 'multitag' gives every character of a multi-morpheme
            token the joined label of all its morphemes; any other value
            ('segmented') aligns the labels to characters with
            ``align_labels_to_morphemes``.
        json_char_format: in 'multitag' mode, whether the JSON output has one
            entry per character (True) or one per token (False).
        feats: feature names to extract in addition to 'pos'. Defaults to none.
    """
    feats = [] if feats is None else list(feats)
    all_feats = feats + ['pos']
    splits = ('train', 'dev', 'test')
    is_multitag = output_format == 'multitag'

    sents = {sp: {f: [] for f in all_feats} for sp in splits}
    json_sents = {sp: [] for sp in splits}
    sent_ids = {}

    for split in splits:
        conll_obj = pyconll.load_from_file(conll_file_path.format(split))
        sent_ids[split] = []
        for sentence in conll_obj:
            sent_id = sentence.id
            sent_ids[split].append(sent_id)
            sent_text = sentence.text
            tags = defaultdict(list)
            json_tags = defaultdict(list)
            toks, tok_inds, morphs, morpheme_labels, spaces = get_sentence_toks_and_morphs(sentence, feats)
            assert len({len(morpheme_labels[f]) for f in all_feats}) == 1, \
                (sent_id, [morpheme_labels[f] for f in all_feats])
            for f in all_feats:
                for ind, full_tok, is_space_after in zip(tok_inds, toks, spaces):
                    tok_chars = list(full_tok)
                    if len(ind) == 1:
                        # Single-morpheme token: every character gets its label.
                        label = morpheme_labels[f][ind[0]]
                        char_tags = [(c, label) for c in tok_chars]
                        tags[f].extend(char_tags)
                        if is_multitag:
                            if json_char_format:
                                json_tags[f].extend(char_tags)
                            else:
                                json_tags[f].append((full_tok, label))
                    elif is_multitag:
                        # Multi-morpheme token: one joined label for the whole token.
                        joint_multitag = join_labels([morpheme_labels[f][i] for i in ind], f)
                        char_tags = [(c, joint_multitag) for c in tok_chars]
                        tags[f].extend(char_tags)
                        if json_char_format:
                            json_tags[f].extend(char_tags)
                        else:
                            json_tags[f].append((full_tok, joint_multitag))
                    else:  # segmented format
                        tags[f].extend(align_labels_to_morphemes(tok_chars,
                                                                 [morphs[i].replace('_', '') for i in ind],
                                                                 [morpheme_labels[f][i] for i in ind], f))
                    if is_space_after:
                        tags[f].append((' ', 'VOID'))
                        if is_multitag and json_char_format:
                            json_tags[f].append((' ', 'VOID'))

            # Every feature must yield exactly one tag per character of the raw text.
            lengths = [len(tags[f]) for f in all_feats] + [len(sent_text)]
            if len(set(lengths)) != 1:
                print(f"======= Problem in sentence id {sent_id}:")
                print(sent_text)
                print(lengths)

            # json_tags is only populated in 'multitag' mode; otherwise the
            # character-level tags are used for the JSON output as well.
            json_source = json_tags if is_multitag else tags
            sent_dict = {}
            for f in all_feats:
                sents[split][f].append(tags[f])
                if f == 'pos':
                    sent_dict['tokens'] = [t[0] for t in json_source[f]]
                sent_dict[f'{f.lower()}_tags'] = [t[1] for t in json_source[f]]
            json_sents[split].append(sent_dict)

    os.makedirs(output_dir, exist_ok=True)

    for sp, sents_labels in sents.items():
        for f, sent_labels in sents_labels.items():
            dump_labels_to_file(sent_ids[sp], sent_labels,
                                os.path.join(output_dir,
                                             f'{f.lower()}_{sp}_{output_format}_ud.txt'))

    json_format = 'chars' if json_char_format else 'words'
    for sp, sent_labels in json_sents.items():
        filename = f'{sp}_{output_format}_{json_format}_ud.json'
        # The context manager guarantees the file is flushed and closed.
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as json_file:
            json_file.write(json.dumps({'format': output_format, 'data': sent_labels}))

In [5]:
from pathlib import Path

# CoNLL-U path template per language code. The "{}" placeholder is filled with
# the split name ('train', 'dev' or 'test') inside preprocess_ud.
ud_paths = {'ar': "raw_data_ar/UD_Arabic-PADT/ar_padt-ud-{}.conllu",
            'he': "raw_data_he/UD_Hebrew-HTB/he_htb-ud-{}.conllu",
            'tr': "raw_data_tr/UD_Turkish-IMST/tr_imst-ud-{}.conllu"}

# Feature names passed to preprocess_ud as `feats` for each language; the
# 'pos' labels are always produced in addition to these.
feats_per_lang = {'ar': ['Abbr',
                         'AdpType',
                         'Aspect',
                         'Case',
                         'ConjType',
                         'Definite',
                         'Foreign',
                         'Gender',
                         'Mood',
                         'Number',
                         'NumForm',
                         'NumValue',
                         'Person',
                         'Polarity',
                         'PronType',
                         'VerbForm',
                         'Voice'],
                  'he': ['Abbr',
                         'Case',
                         'Definite',
                         'Gender',
                         'HebBinyan',
                         'HebExistential',
                         'Mood',
                         'Number',
                         'Person',
                         'Polarity',
                         'Prefix',
                         'PronType',
                         'Reflex',
                         'Tense',
                         'VerbForm',
                         'VerbType',
                         'Voice'],
                  'tr': ['Abbr',
                         'Aspect',
                         'Case',
                         'Definite',
                         'Echo',
                         'Evident',
                         'Mood',
                         'Number',
                         'Number[psor]',
                         'NumType',
                         'Person',
                         'Person[psor]',
                         'Polarity',
                         'Polite',
                         'PronType',
                         'Reflex',
                         'Tense',
                         'VerbForm',
                         'Voice']}

# Only Hebrew and Turkish are processed here; the disabled variant below also
# covered Arabic.
# for lang in ['ar', 'he', 'tr']:
for lang in ['he', 'tr']:
    # Outputs are written next to the raw treebank files of the language.
    treebank_dir = str(Path(ud_paths[lang]).parent)
    for output_format, json_char_format in [('segmented', True), ('multitag', True), ('multitag', False)]:
        granularity = "chars" if json_char_format else "words"
        preprocess_ud(output_dir=os.path.join(treebank_dir, f'{output_format}_{granularity}'),
                      conll_file_path=ud_paths[lang],
                      output_format=output_format,
                      json_char_format=json_char_format,
                      feats=feats_per_lang[lang])

In [None]:
# NOTE(review): presumably the column names of the alignment CSV read below;
# this list is not referenced by the code visible here - confirm before removing.
keys = ['global_global_sent_id', 'global_TOKEN', 
        'sp_orig_sent', 'sp_token_str', 'sp_forms', 'sp_morph_ids', 
        'sp_count', 'ud_orig_sent', 'ud_token_str', 
        'ud_forms', 'ud_morph_ids', 'ud_count', 
        'fixed_token_str', 'fixed_sp_forms', 'fixed_ud_forms', 'comment']

# Sentences of the SPMRL treebank; iterated by preprocess_spmrl.
conll_obj = pyconll.load_from_file(r"spmrl_fixed.conllu")

# Alignment rows grouped by sentence id (kept as the string found in the CSV).
conll_data = defaultdict(list)
# newline='' lets the csv module handle line endings itself, as its
# documentation requires for quoted fields that contain newlines.
with open(r"token_morpheme_alignment_spmrl_ud_with_fixes.csv", 'r', encoding='utf-8', newline='') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        conll_data[row['global_global_sent_id']].append(row)

def preprocess_spmrl(output_dir):
    """Convert the SPMRL sentences in ``conll_obj`` into per-feature label files.

    Surface tokens and their morpheme indices come from the alignment rows in
    ``conll_data``; labels come from the tokens of ``conll_obj``. For every
    split and every feature (plus 'pos') a file
    ``<feat>_<split>_multitag_spmrl.txt`` is written to ``output_dir`` through
    ``dump_labels_to_file``.

    Args:
        output_dir: directory that receives the output files (created if missing).
    """
    detok = MosesDetokenizer(lang='he')
    feats = ['gen', 'num', 'per', 'polar', 'tense', 'HebBinyan', 'suf_num', 'suf_per',
             'suf_gen']
    all_feats = feats + ['pos']
    splits = ('train', 'dev', 'test')

    sents = {sp: {f: [] for f in all_feats} for sp in splits}
    # Sentence ids per split, in the same order as the entries of `sents`;
    # dump_labels_to_file needs them as its first argument.
    sent_ids = {sp: [] for sp in splits}

    output_format = 'multitag'

    for sentence in conll_obj:
        sent_id = int(sentence.id)
        split = sentence._meta['set']
        if sent_id % 100 == 0:
            print(sent_id)
        tags = defaultdict(list)
        morpheme_labels = defaultdict(list)

        # Use one filtered token list for POS and for every feature so that all
        # label lists are guaranteed to have the same length.
        labelled_tokens = [token for token in sentence if getattr(token, 'upos', None) is not None]
        morpheme_labels['pos'] = [token.upos for token in labelled_tokens]
        for f in feats:
            morpheme_labels[f] = ['+'.join(sorted(token.feats.get(f, {'X'}))) for token in labelled_tokens]

        sent_rows = conll_data[str(sent_id)]
        # Manually corrected values take precedence over the original columns.
        morphs = [r['fixed_sp_forms'] if r['fixed_sp_forms'] != '' else r['sp_forms'] for r in sent_rows]
        toks = [r['fixed_token_str'] if r['fixed_token_str'] != '' else r['sp_token_str'] for r in sent_rows]
        sent_text = detok.detokenize(toks)
        # NOTE(review): eval() executes whatever text the 'sp_morph_ids' column
        # holds; only run this on a trusted CSV, or switch to
        # ast.literal_eval if the cells are always plain literals.
        tok_inds = [eval(r['sp_morph_ids']) for r in sent_rows]
        # The ids are shifted by one to index the 0-based label lists.
        tok_inds = [[int(y) - 1 for y in t] for t in tok_inds]
        for f in all_feats:
            for ind, full_tok, tok_morphs in zip(tok_inds, toks, morphs):
                # manual fix 
                if sent_id == 4009 and tok_morphs == 'ב 19.9':
                    tok_morphs = 'ב 19.90'
                    print("Manual Fix!!!")
                if len(ind) == 1:
                    tok_chars = [c for c in full_tok]
                    labels = [morpheme_labels[f][ind[0]] for _ in full_tok]
                    tags[f].extend(list(zip(tok_chars, labels)))
                else:
                    if output_format == 'multitag':
                        joint_multitag = '+'.join([morpheme_labels[f][i] for i in ind])
                        if f != 'pos' and all([morpheme_labels[f][i] == 'X' for i in ind]):
                            joint_multitag = 'X'
                        tags[f].extend(list(zip([c for c in full_tok], 
                                                [joint_multitag for _ in full_tok])))
                    else: # segmented format
                        tags[f].extend(align_labels_to_morphemes([c for c in full_tok], tok_morphs.split(), [morpheme_labels[f][i] for i in ind], f))

            # Insert a VOID tag at every space position of the detokenized
            # text so that the tags line up with its characters.
            for ws_ind in [i.start() for i in re.finditer(' ', sent_text)]:
                tags[f].insert(ws_ind, (' ', 'VOID'))
        if len(set([len(tags[f]) for f in all_feats] + [len(sent_text)])) != 1:
            print(f"======= Problem in sentence id {sent_id}:")
            print(sent_text)
            pprint.pprint(tags['pos'])
            print([len(tags[f]) for f in all_feats] + [len(sent_text)])

        sent_ids[split].append(sent_id)
        for f in all_feats:
            sents[split][f].append(tags[f])

    os.makedirs(output_dir, exist_ok=True)

    for sp, sents_labels in sents.items():
        for f, sent_labels in sents_labels.items():
            dump_labels_to_file(sent_ids[sp], sent_labels, 
                                os.path.join(output_dir, 
                                             f'{f}_{sp}_{output_format}_spmrl.txt'))

In [None]:
# Run the SPMRL preprocessing with 'preprocessed_ud_he' as the output directory.
preprocess_spmrl(r'preprocessed_ud_he')