# Transition-based arc-eager unlabeled dependency parser for Ukrainian

## Read the data

Useful links:
* [UD corpus for Ukrainian](https://github.com/UniversalDependencies/UD_Ukrainian-IU/)
* [Easy-to-use library for parsing UD](https://github.com/EmilStenstrom/conllu)

In [1]:
!pwd

/home/ihor/ds/nlp/module3/students


In [2]:
from collections import OrderedDict
from conllu import parse
from enum import Enum

# Directory that holds the UD_Ukrainian-IU treebank files.
PATH = "UD_Ukrainian-IU"

# The treebank contains Cyrillic text, so the encoding is passed explicitly
# instead of relying on the platform default codec.
with open(PATH + "/uk_iu-ud-train.conllu.txt", "r", encoding="utf-8") as f:
    train_trees = parse(f.read())

# The dev split is what this notebook uses as its held-out test set.
with open(PATH + "/uk_iu-ud-dev.conllu.txt", "r", encoding="utf-8") as f:
    test_trees = parse(f.read())

In [3]:
print(train_trees[0].metadata['text'])
train_trees[0].to_tree().print_tree()

У домі римського патриція Руфіна була прегарна фреска, зображення Венери та Адоніса.
(deprel:root) form:була lemma:бути upos:VERB [6]
    (deprel:obl) form:домі lemma:дім upos:NOUN [2]
        (deprel:case) form:У lemma:у upos:ADP [1]
        (deprel:nmod) form:патриція lemma:патрицій upos:NOUN [4]
            (deprel:amod) form:римського lemma:римський upos:ADJ [3]
            (deprel:flat:title) form:Руфіна lemma:Руфін upos:PROPN [5]
    (deprel:nsubj) form:фреска lemma:фреска upos:NOUN [8]
        (deprel:amod) form:прегарна lemma:прегарний upos:ADJ [7]
        (deprel:appos) form:зображення lemma:зображення upos:NOUN [10]
            (deprel:punct) form:, lemma:, upos:PUNCT [9]
            (deprel:nmod) form:Венери lemma:Венера upos:PROPN [11]
                (deprel:conj) form:Адоніса lemma:Адоніс upos:PROPN [13]
                    (deprel:cc) form:та lemma:та upos:CCONJ [12]
    (deprel:punct) form:. lemma:. upos:PUNCT [14]


## Design actions and the oracle

We will be using a static oracle that reproduces a single valid order of actions.

In [4]:
class Actions(str, Enum):
    """Parser transitions; each member's string value doubles as the training label."""
    # Move the front of the queue onto the stack.
    SHIFT = "shift"
    # Pop the top of the stack.
    REDUCE = "reduce"
    # Attach the front of the queue to the stack top as its dependent, then push it.
    RIGHT = "right"
    # Attach the stack top to the front of the queue as its dependent, then pop it.
    LEFT = "left"
    # Declared for the non-projectivity experiment; the transition loops in this
    # notebook do not implement it.
    SWAP = "swap"

In [5]:
def oracle_arc_eager(stack, queue, relations):
    """
    Choose the next transition for a configuration from the gold heads.

    Args:
        stack: Tokens on the stack, top at ``stack[-1]``. May be empty or None.
        queue: Tokens not yet read, next at ``queue[0]``. May be empty or None.
        relations: ``(child_id, head_id)`` pairs attached so far.

    Returns:
        The ``Actions`` member to apply.

    Raises:
        ValueError: If both the stack and the queue are empty.
    """
    if not stack:
        if not queue:
            raise ValueError("No transition possible: stack and queue are both empty.")
        # With nothing on the stack there is nothing to attach or reduce.
        return Actions.SHIFT

    top_stack = stack[-1]
    # Input exhausted: unwind whatever is left on the stack.
    if not queue:
        return Actions.REDUCE
    top_queue = queue[0]

    # A direct gold dependency between the stack top and the queue front.
    if top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    if top_stack["head"] == top_queue["id"]:
        return Actions.LEFT

    # The stack top may only be reduced once it has been attached to a head,
    # and only if the queue front still has business deeper in the stack:
    # either its head lies to the left of the stack top, or some stack token
    # is one of its dependents.
    top_is_attached = any(rel[0] == top_stack["id"] for rel in relations)
    if top_is_attached and (
        top_queue["head"] < top_stack["id"]
        or any(s["head"] == top_queue["id"] for s in stack)
    ):
        return Actions.REDUCE

    # Default option.
    return Actions.SHIFT


## Feature extraction

Reference: [Dependency Parsing by Kübler, McDonald, and Nivre](https://books.google.com.ua/books?id=k3iiup7HB9UC&pg=PA21&hl=uk&source=gbs_toc_r&cad=4#v=onepage&q&f=false)

In [6]:
def extract_features(stack, queue):
    """
    Describe a parser configuration as a flat dict of features.

    Reads the ``form``, ``lemma`` and ``upostag`` fields of the two topmost
    stack tokens and of up to four tokens at the front of the queue.
    Positions that do not exist simply contribute no keys.

    Args:
        stack: Sequence of tokens, top of the stack last.
        queue: Sequence of tokens, next input token first.

    Returns:
        Dict mapping feature names (``s0-word``, ``q1-tag``, ...) to values.
    """
    feats = {}

    # Stack context: full description of the top, tag only for the one below.
    depth = len(stack)
    if depth >= 1:
        s0 = stack[-1]
        feats.update({
            "s0-word": s0["form"],
            "s0-lemma": s0["lemma"],
            "s0-tag": s0["upostag"],
        })
    if depth >= 2:
        feats["s1-tag"] = stack[-2]["upostag"]

    # Queue context: the detail shrinks with the distance from the front.
    if queue:
        q0 = queue[0]
        feats.update({
            "q0-word": q0["form"],
            "q0-lemma": q0["lemma"],
            "q0-tag": q0["upostag"],
        })
    pending = len(queue)
    if pending >= 2:
        q1 = queue[1]
        feats["q1-word"] = q1["form"]
        feats["q1-tag"] = q1["upostag"]
    if pending >= 3:
        feats["q2-tag"] = queue[2]["upostag"]
    if pending >= 4:
        feats["q3-tag"] = queue[3]["upostag"]

    return feats

## Prepare train and test data

In [7]:
# Artificial root token that sits at the bottom of the stack. It carries the
# same ten fields, in the same order, as a token parsed from the treebank;
# every field not set below stays None.
ROOT = OrderedDict.fromkeys(
    ("id", "form", "lemma", "upostag", "xpostag",
     "feats", "head", "deprel", "deps", "misc")
)
ROOT.update(id=0, form="ROOT", lemma="ROOT", upostag="ROOT")

In [10]:
def get_data(tree, oracle=None, verbose=False):
    """
    Replay the oracle over one sentence and record (features, action) pairs.

    Args:
        tree: List of tokens of one sentence; it is copied, not modified.
        oracle: Callable ``(stack, queue, relations) -> Actions``. Defaults to
            ``oracle_arc_eager``. An empty stack or queue is passed as None.
        verbose: When true, print the configuration, the relations built so
            far and the chosen action at every step.

    Returns:
        ``(features, labels)``: one feature dict and one action-value string
        per transition, in the order the transitions were taken.

    Raises:
        ValueError: If the oracle returns an action this loop cannot apply.
            Such an action leaves the configuration untouched, so carrying on
            would loop forever.
    """
    features, labels = [], []
    stack, queue, relations, relations_human = [ROOT], tree[:], [], []

    if oracle is None:
        oracle = oracle_arc_eager

    while queue or stack:
        # get action
        action = oracle(stack if len(stack) > 0 else None,
                        queue if len(queue) > 0 else None,
                        relations)
        if verbose:
            print("-"*20)
            print(f"Stack: {[t['form'] for t in stack] if stack else []}")
            print(f"Queue: {[t['form'] for t in queue] if queue else [] }")

        # Update train data (features describe the state *before* the action)
        features.append(extract_features(stack, queue))
        labels.append(action.value)

        # Perform action
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            # Stack top becomes a dependent of the queue front: (child, head).
            relations.append((stack[-1]["id"], queue[0]["id"]))
            if verbose:
                relations_human.append((stack[-1]["form"], queue[0]["form"]))
            stack.pop()
        elif action == Actions.RIGHT:
            # Queue front becomes a dependent of the stack top: (child, head).
            relations.append((queue[0]["id"], stack[-1]["id"]))
            if verbose:
                relations_human.append((queue[0]["form"], stack[-1]["form"]))
            stack.append(queue.pop(0))
        else:
            raise ValueError(f"Unknown action: {action!r}")

        if verbose:
            print(f"\t Relation {relations_human}")
            print(f"\t Action: {action}")

    return features, labels

In [11]:
# Algorithm exploration 
def show_oracle_prophecy(tree, oracle=None):
    """
    Print a sentence and trace every oracle step taken while parsing it.

    Only tokens whose ``id`` is a plain int are passed on to ``get_data``;
    ``oracle`` is forwarded unchanged.
    """
    print(tree.metadata['text'])
    word_tokens = [tok for tok in tree if type(tok["id"]) is int]
    get_data(word_tokens, oracle=oracle, verbose=True)

show_oracle_prophecy(train_trees[0])

У домі римського патриція Руфіна була прегарна фреска, зображення Венери та Адоніса.
--------------------
Stack: ['ROOT']
Queue: ['У', 'домі', 'римського', 'патриція', 'Руфіна', 'була', 'прегарна', 'фреска', ',', 'зображення', 'Венери', 'та', 'Адоніса', '.']
	 Relation []
	 Action: shift
--------------------
Stack: ['ROOT', 'У']
Queue: ['домі', 'римського', 'патриція', 'Руфіна', 'була', 'прегарна', 'фреска', ',', 'зображення', 'Венери', 'та', 'Адоніса', '.']
	 Relation [('У', 'домі')]
	 Action: left
--------------------
Stack: ['ROOT']
Queue: ['домі', 'римського', 'патриція', 'Руфіна', 'була', 'прегарна', 'фреска', ',', 'зображення', 'Венери', 'та', 'Адоніса', '.']
	 Relation [('У', 'домі')]
	 Action: shift
--------------------
Stack: ['ROOT', 'домі']
Queue: ['римського', 'патриція', 'Руфіна', 'була', 'прегарна', 'фреска', ',', 'зображення', 'Венери', 'та', 'Адоніса', '.']
	 Relation [('У', 'домі')]
	 Action: shift
--------------------
Stack: ['ROOT', 'домі', 'римського']
Queue: ['патр

In [40]:
def data_extractor(trees, extractor=None):
    """
    Collect the training examples of many sentences into two flat lists.

    Args:
        trees: Iterable of sentences, each an iterable of tokens.
        extractor: Callable taking a token list and returning
            ``(features, labels)``. Defaults to ``get_data``.

    Returns:
        ``(features, labels)`` concatenated over all sentences, in order.
    """
    extract = get_data if extractor is None else extractor
    all_features, all_labels = [], []
    for sentence in trees:
        # Keep only tokens whose id is a plain int; other id types are dropped.
        word_tokens = [tok for tok in sentence if type(tok["id"]) is int]
        sentence_features, sentence_labels = extract(word_tokens)
        all_features.extend(sentence_features)
        all_labels.extend(sentence_labels)
    return all_features, all_labels

In [52]:
# Train data

train_features, train_labels = data_extractor(train_trees)

print(len(train_features), len(train_labels))

190298 190298


In [53]:
# Test data

test_features, test_labels = data_extractor(test_trees)

print(len(test_features), len(test_labels))

25820 25820


## Train a classifier

In [14]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [54]:
# Learn the feature-name -> column mapping from the training feature dicts.
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

# `feature_names_` is the fitted list of feature names, one per column. It is
# used here instead of `get_feature_names()`, which newer scikit-learn
# releases no longer provide.
print("\nTotal number of features: ", len(vec.feature_names_))


Total number of features:  111126


In [55]:
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

# Breaks my computer
# print(len(train_features_vectorized.toarray()), len(test_features_vectorized.toarray()))

In [56]:
lrc = LogisticRegression(random_state=42, solver="saga", multi_class="multinomial", max_iter=600, verbose=1)
lrc.fit(train_features_vectorized, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
convergence after 546 epochs took 184 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.1min finished


LogisticRegression(max_iter=600, multi_class='multinomial', random_state=42,
                   solver='saga', verbose=1)

In [57]:
predicted = lrc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

precision    recall  f1-score   support

        left       0.86      0.87      0.86      6371
      reduce       0.85      0.78      0.81      6875
       right       0.75      0.79      0.77      5996
       shift       0.85      0.87      0.86      6578

    accuracy                           0.83     25820
   macro avg       0.83      0.83      0.83     25820
weighted avg       0.83      0.83      0.83     25820



## Extend classes

In [23]:
for token in train_trees[0]:
    print(token['id'], token['lemma'], token['deprel'])

1 у case
2 дім obl
3 римський amod
4 патрицій nmod
5 Руфін flat:title
6 бути root
7 прегарний amod
8 фреска nsubj
9 , punct
10 зображення appos
11 Венера nmod
12 та cc
13 Адоніс conj
14 . punct


In [44]:
def get_data_with_deps(tree, oracle=None, verbose=False):
    """
    Replay the oracle over one sentence, labelling arcs with their relation.

    Same transition loop as ``get_data``, except that the label of a LEFT or
    RIGHT transition is extended with the ``deprel`` of the dependent token,
    e.g. ``"left_amod"``. SHIFT and REDUCE keep their bare action value.

    Args:
        tree: List of tokens of one sentence; it is copied, not modified.
        oracle: Callable ``(stack, queue, relations) -> Actions``. Defaults to
            ``oracle_arc_eager``. An empty stack or queue is passed as None.
        verbose: When true, print the configuration, the relations built so
            far and the chosen action at every step.

    Returns:
        ``(features, labels)``: one feature dict and one label string per
        transition, in the order the transitions were taken.

    Raises:
        ValueError: If the oracle returns an action this loop cannot apply.
            Such an action leaves the configuration untouched, so carrying on
            would loop forever.
    """
    features, labels = [], []
    stack, queue, relations, relations_human = [ROOT], tree[:], [], []

    if oracle is None:
        oracle = oracle_arc_eager

    while queue or stack:
        # get action
        action = oracle(stack if len(stack) > 0 else None,
                        queue if len(queue) > 0 else None,
                        relations)
        if verbose:
            print("-"*20)
            print(f"Stack: {[t['form'] for t in stack] if stack else []}")
            print(f"Queue: {[t['form'] for t in queue] if queue else [] }")

        # Update train data (features describe the state *before* the action)
        features.append(extract_features(stack, queue))

        # Relation of the arc created by this step, if it creates one.
        deprel = None
        # Perform action
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            # Stack top becomes a dependent of the queue front: (child, head).
            relations.append((stack[-1]["id"], queue[0]["id"]))
            if verbose:
                relations_human.append((stack[-1]["form"], queue[0]["form"]))
            deprel = stack[-1]["deprel"]
            stack.pop()
        elif action == Actions.RIGHT:
            # Queue front becomes a dependent of the stack top: (child, head).
            relations.append((queue[0]["id"], stack[-1]["id"]))
            if verbose:
                relations_human.append((queue[0]["form"], stack[-1]["form"]))
            deprel = queue[0]["deprel"]
            stack.append(queue.pop(0))
        else:
            raise ValueError(f"Unknown action: {action!r}")

        if deprel:
            labels.append(action.value + '_' + deprel)
        else:
            labels.append(action.value)

        if verbose:
            print(f"\t Relation {relations_human}")
            print(f"\t Action: {action}")

    return features, labels

In [48]:
# Re-extract both splits with relation-labelled actions. This rebinds the
# train_*/test_* feature and label variables created by the earlier cells.
test_features, test_labels = data_extractor(test_trees,extractor=get_data_with_deps)
train_features, train_labels = data_extractor(train_trees,extractor=get_data_with_deps)

In [50]:
# Refit the vectorizer on the current training features; `vectorizer` and
# `vec` are rebound here.
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

# Vectorize both splits with the mapping learnt from the training split only.
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

# Same classifier settings as the earlier training cell; `lrc` is rebound.
# NOTE(review): the captured log for this cell says "max_iter reached", so the
# fit appears to have stopped at the iteration cap — confirm before relying
# on it.
lrc = LogisticRegression(random_state=42, solver="saga", multi_class="multinomial", max_iter=600, verbose=1)
lrc.fit(train_features_vectorized, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
max_iter reached after 1312 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 21.9min finished


LogisticRegression(max_iter=600, multi_class='multinomial', random_state=42,
                   solver='saga', verbose=1)

In [51]:
predicted = lrc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

precision    recall  f1-score   support

                 left_acl       0.00      0.00      0.00         1
               left_advcl       0.53      0.22      0.31        37
            left_advcl:sp       0.00      0.00      0.00         1
              left_advmod       0.81      0.91      0.86       528
          left_advmod:det       0.75      0.75      0.75         4
                left_amod       0.88      0.99      0.93       992
                 left_aux       0.70      0.64      0.67        11
                left_case       0.98      0.98      0.98      1150
                  left_cc       0.89      0.91      0.90       443
            left_compound       0.60      0.07      0.12        45
                 left_cop       0.88      0.82      0.85        44
               left_csubj       1.00      0.33      0.50         6
                 left_det       0.94      0.97      0.96       276
          left_det:numgov       0.93      0.93      0.93        14
          left_det:nu

## Change model type

In [61]:
from sklearn import svm
model_svm = svm.SVC(
                kernel="poly",
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=True,
                probability=True,
                max_iter = 500
            )

model_svm.fit(train_features_vectorized, train_labels)

[LibSVM]

SVC(C=0.5, coef0=0, degree=2, gamma=0.2, kernel='poly', max_iter=500,
    probability=True, verbose=True)

In [62]:
predicted = model_svm.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

precision    recall  f1-score   support

        left       0.77      0.78      0.78      6371
      reduce       0.65      0.40      0.50      6875
       right       0.47      0.71      0.57      5996
       shift       0.53      0.48      0.50      6578

    accuracy                           0.59     25820
   macro avg       0.60      0.59      0.59     25820
weighted avg       0.61      0.59      0.58     25820



## Calculate the unlabeled attachment score
UAS (unlabeled attachment score) is the percentage of words in an input that are assigned the correct head, regardless of the dependency label.

In [33]:
def dep_parse(sentence, oracle, vectorizer, log=True):
    """
    Parse one sentence with a trained classifier acting as the oracle.

    Args:
        sentence: List of tokens; it is copied, not modified.
        oracle: Fitted classifier whose ``predict`` returns action labels,
            either bare (``"left"``) or relation-labelled (``"left_amod"``).
        vectorizer: Fitted vectorizer that turns a feature dict into the
            classifier's input.
        log: Unused; kept so existing calls keep working.

    Returns:
        Sorted list of ``(child_id, head_id)`` pairs.

    Raises:
        ValueError: If the predicted label does not name an action this loop
            can apply. Such an action would leave the configuration
            untouched and the loop would never end.
    """
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if not queue:
            # Input exhausted: unwind the stack without asking the classifier.
            stack.pop()
            continue

        features = extract_features(stack, queue)
        predicted = oracle.predict(vectorizer.transform([features]))[0]
        # Relation-labelled classifiers emit "<action>_<deprel>"; only the
        # transition name is needed to build unlabeled arcs.
        action = Actions(predicted.split("_", 1)[0])
        if not stack and action != Actions.SHIFT:
            # LEFT, RIGHT and REDUCE all need a stack top. The classifier can
            # predict them anyway, so fall back to the only applicable move.
            action = Actions.SHIFT

        # actual parsing
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            raise ValueError(f"Unsupported action: {action!r}")
    return sorted(relations)

In [34]:
# Evaluate the parser on the test trees.
# total      - number of tokens scored
# tp         - number of tokens whose predicted (id, head) pair matches gold
# full_match - number of sentences whose predicted arc set equals gold exactly
total, tp, full_match = 0, 0, 0
for tree in test_trees:
    # Keep only tokens whose id is a plain int.
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lrc, vec, log=False)
    total += len(tree)
    # Arcs are compared as sets, so their order does not matter.
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
# Share of sentences parsed without a single attachment error.
print("Full match:", round(full_match/len(test_trees), 2))

Total: 12574
Correctly defined: 8717
UAS: 0.69
Full match: 0.09


## Find non-projective trees

In [37]:
def is_non_projective(tree):
    """
    Tell whether any two arcs of the tree cross each other.

    Every token with a plain int ``id`` contributes one arc between its
    ``id`` and its ``head``. Two arcs cross when exactly one endpoint of the
    second lies strictly inside the span of the first.

    Returns:
        True if at least one crossing pair exists, otherwise False.
    """
    arcs = [[tok['id'], tok['head']] for tok in tree if type(tok["id"]) is int]
    return any(
        left < other_left < right < other_right
        for left, right in map(sorted, arcs)
        for other_left, other_right in map(sorted, arcs)
    )

# Count the non-projective training trees and remember their positions in
# train_trees; later cells look trees up by these indices.
total_non_pr = 0
np_tree_ids = []
for i in range(len(train_trees)):
    if is_non_projective(train_trees[i]):
        total_non_pr += 1
        np_tree_ids.append(i)

print("The percentage of non-projective trees is {} ({} out of {}).".
      format(round(total_non_pr * 100 / len(train_trees), 2), total_non_pr, len(train_trees)))

# Only the first ten indices are shown.
print("IDs:", np_tree_ids[:10])

The percentage of non-projective trees is 7.99 (439 out of 5496).
IDs: [4, 9, 13, 19, 22, 28, 29, 33, 34, 43]


## Nonprojectivity. Swap

In [39]:
# Find the smallest non-projective tree. Why is it non-projective? How can it be fixed?
# 99999 is a starting bound larger than the length of any tree considered;
# -1 means "nothing found yet".
smallest_len = 99999
smallest_np_tree_id = -1

for np_tree_id in np_tree_ids:
    # Strict comparison: among equally short trees the first one is kept.
    if len(train_trees[np_tree_id])<smallest_len:
        smallest_np_tree_id = np_tree_id
        smallest_len = len(train_trees[np_tree_id])
        # print(smallest_np_tree_id)
        # print(train_trees[smallest_np_tree_id].metadata['text'])

print(smallest_np_tree_id)
print(train_trees[smallest_np_tree_id].metadata['text'])

1042
Якось треба жити.


In [21]:
for token in train_trees[1042]:
    print(token['id'], token['lemma'], token['deprel'])

1 якось advmod
2 треба root
3 жити csubj
4 . punct


In [22]:
show_oracle_prophecy(train_trees[1042])

Якось треба жити.
--------------------
Stack: ['ROOT']
Queue: ['Якось', 'треба', 'жити', '.']
	 Relation []
	 Action: shift
--------------------
Stack: ['ROOT', 'Якось']
Queue: ['треба', 'жити', '.']
	 Relation []
	 Action: shift
--------------------
Stack: ['ROOT', 'Якось', 'треба']
Queue: ['жити', '.']
	 Relation [('жити', 'треба')]
	 Action: right
--------------------
Stack: ['ROOT', 'Якось', 'треба', 'жити']
Queue: ['.']
	 Relation [('жити', 'треба')]
	 Action: reduce
--------------------
Stack: ['ROOT', 'Якось', 'треба']
Queue: ['.']
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: right
--------------------
Stack: ['ROOT', 'Якось', 'треба', '.']
Queue: []
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: reduce
--------------------
Stack: ['ROOT', 'Якось', 'треба']
Queue: []
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: reduce
--------------------
Stack: ['ROOT', 'Якось']
Queue: []
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: reduce
---------

In [60]:
def oracle_arc_eager_swap(stack, queue, relations):
    """
    Experimental copy of the arc-eager oracle, meant to be extended with SWAP.

    It currently takes the same decisions as the plain arc-eager oracle and
    prints a marker line whenever it falls through to the default SHIFT, so
    that the configurations where a SWAP might be needed are easy to spot.

    Args:
        stack: Tokens on the stack, top at ``stack[-1]``. May be empty or None.
        queue: Tokens not yet read, next at ``queue[0]``. May be empty or None.
        relations: ``(child_id, head_id)`` pairs attached so far.

    Returns:
        The ``Actions`` member to apply.

    Raises:
        ValueError: If both the stack and the queue are empty.
    """
    if not stack:
        if not queue:
            raise ValueError("No transition possible: stack and queue are both empty.")
        # With nothing on the stack there is nothing to attach or reduce.
        return Actions.SHIFT

    top_stack = stack[-1]
    # Input exhausted: unwind whatever is left on the stack.
    if not queue:
        return Actions.REDUCE
    top_queue = queue[0]

    # A direct gold dependency between the stack top and the queue front.
    if top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    if top_stack["head"] == top_queue["id"]:
        return Actions.LEFT

    # The stack top may only be reduced once it has been attached to a head,
    # and only if the queue front still has business deeper in the stack.
    top_is_attached = any(rel[0] == top_stack["id"] for rel in relations)
    if top_is_attached and (
        top_queue["head"] < top_stack["id"]
        or any(s["head"] == top_queue["id"] for s in stack)
    ):
        return Actions.REDUCE

    # TODO: decide here when to return Actions.SWAP; no condition exists yet.

    # Default option.
    print("***Default***")
    return Actions.SHIFT

show_oracle_prophecy(train_trees[1042],oracle_arc_eager_swap)

Якось треба жити.
***Default***
--------------------
Stack: ['ROOT']
Queue: ['Якось', 'треба', 'жити', '.']
	 Relation []
	 Action: shift
***Default***
--------------------
Stack: ['ROOT', 'Якось']
Queue: ['треба', 'жити', '.']
	 Relation []
	 Action: shift
--------------------
Stack: ['ROOT', 'Якось', 'треба']
Queue: ['жити', '.']
	 Relation [('жити', 'треба')]
	 Action: right
--------------------
Stack: ['ROOT', 'Якось', 'треба', 'жити']
Queue: ['.']
	 Relation [('жити', 'треба')]
	 Action: reduce
--------------------
Stack: ['ROOT', 'Якось', 'треба']
Queue: ['.']
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: right
--------------------
Stack: ['ROOT', 'Якось', 'треба', '.']
Queue: []
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: reduce
--------------------
Stack: ['ROOT', 'Якось', 'треба']
Queue: []
	 Relation [('жити', 'треба'), ('.', 'треба')]
	 Action: reduce
--------------------
Stack: ['ROOT', 'Якось']
Queue: []
	 Relation [('жити', 'треба'), ('.', 'треба')