# Evaluate

In [1]:
import logging
import regex
import unicodecsv as csv
import lemmy
# Configure the root logger: emit everything from DEBUG upwards as "<LEVEL> : <message>".
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.DEBUG)

In [2]:
# CSV with the lemma normalization rules; loaded into `norm_lookup` below.
NORMS_FILE = "./data/norms.csv"
# UD Danish treebank splits (.conllu files) that the lemmatizer is evaluated on.
UD_TRAIN_FILE = "./data/UD_Danish/da-ud-train.conllu"
UD_DEV_FILE = "./data/UD_Danish/da-ud-dev.conllu"

We read the normalization rules we built in the first notebook. When evaluating, we apply these to the lemmas specified in UD. Otherwise we would risk, for example, counting "akvarie" as the incorrect lemma for "akvarier" if UD specified the other spelling, "akvarium".

In [3]:
# Load the normalization table into a dict. `_evaluate` looks up the UD lemma
# as the key (first CSV column) and takes the normalized lemma as the value
# (second column). The first row is skipped as a header, and every remaining
# row must have exactly two fields for dict() to accept it.
# The rows are materialised inside the `with` block because the reader pulls
# from the file lazily; the block then closes the handle instead of leaking it.
with open(NORMS_FILE, 'rb') as norms_file:
    norm_rows = list(csv.reader(norms_file,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                encoding='utf-8',
                                lineterminator='\n'))
norm_lookup = dict(norm_rows[1:])

We apply a few more normalization rules. This is due to DSN and UD not agreeing on the lemmas for certain words.

In [4]:
# Extra UD -> DSN lemma rewrites applied by `_evaluate`.
# Each rule is (pos, lowercased orth, lemma given by UD, lemma to expect instead):
# when a token's POS, lowercased form and UD lemma all match the first three
# fields, the expected lemma is replaced by the fourth.
ud_dsn_normalization = (('PRON', 'det', 'det', 'den'),
                        ('ADJ', 'flere', 'mange', 'flere'),
                        ('ADJ', 'mere', 'meget', 'mere'),
                        ('ADJ', 'meget', 'meget', 'megen'),
                        ('ADJ', 'fleste', 'mange', 'flest'))

## Load The Lemmatizer

In [5]:
# Load the lemmatizer with lemmy's defaults (no arguments); `_evaluate` calls
# its `lemmatize` method for every token.
lemmatizer = lemmy.load()

In [6]:
def _parse_ud_line(line):
    return line.split("\t")[1:4]

def _evaluate(ud_file):
    correct = 0
    incorrect = 0
    ambiguous = 0
    mistakes = {}
    ambiguities = {}
    pos_prev = ""
    for line in open(ud_file).readlines():
        if line.startswith("#") or line.strip() == "":
            pos_prev = ""
            continue

        orth, lemma_expected, pos = _parse_ud_line(line)

        if pos == "NOUN" and lemma_expected in norm_lookup:
            lemma_expected = norm_lookup[lemma_expected]
        else:
            for pos_, orth_, expected_ud, expected_dsn in ud_dsn_normalization:
                if pos != pos_ or orth.lower() != orth_ or lemma_expected != expected_ud:
                    continue            
                lemma_expected = expected_dsn

        lemmas_actual = lemmatizer.lemmatize(pos, orth.lower(), pos_previous=pos_prev)    
        lemma_actual = lemmas_actual[0]

        if len(lemmas_actual) > 1:
            ambiguous += 1
            ambiguities[(pos, orth)] = ambiguities.get((pos, orth), 0) + 1
        elif lemma_actual.lower() == lemma_expected.lower():
            correct += 1
        else:
            mistakes[(pos, orth, lemma_expected, lemma_actual)] = mistakes.get((pos, orth, lemma_expected, lemma_actual), 0) + 1
            incorrect += 1
        pos_prev = pos

    print("* correct:", correct)
    print("* incorrect:", incorrect)
    print("* ambiguous:", ambiguous)
    print("*", correct/(incorrect+ambiguous+correct))
    print("*", (correct+ambiguous)/(incorrect+ambiguous+correct))
    
    return mistakes, ambiguities

## Evaluate on UD Train

In [7]:
# Score the lemmatizer on the training split; keep the per-token mistake and
# ambiguity counts for the inspection cells further down.
mistakes_train, ambiguities_train = _evaluate(UD_TRAIN_FILE)

* correct: 80220
* incorrect: 113
* ambiguous: 45
* 0.9980342879892508
* 0.9985941426758566


## Evaluate on UD Dev

In [8]:
# Score the lemmatizer on the dev split, keeping its mistake and ambiguity counts.
mistakes_dev, ambiguities_dev = _evaluate(UD_DEV_FILE)

* correct: 10246
* incorrect: 82
* ambiguous: 4
* 0.9916763453348819
* 0.9920634920634921


## Mistakes

In [9]:
# Ten most frequent training mistakes as ((pos, orth, expected, actual), count):
# ordered by descending count, then by lowercased orth, with the whole entry as
# the final tie-breaker so the order is deterministic.
sorted(
    mistakes_train.items(),
    key=lambda entry: (-entry[1], entry[0][1].lower(), entry),
)[:10]

[(('PRON', 'de', 'den', 'de'), 20),
 (('NOUN', 'g', 'gram', 'g'), 8),
 (('NOUN', 'aftes', 'aften', 'aftes'), 5),
 (('PRON', 'De', 'den', 'de'), 5),
 (('ADJ', 'megen', 'meget', 'megen'), 5),
 (('DET', 'det', 'det', 'den'), 4),
 (('NOUN', 'forvejen', 'forvej', 'forvejen'), 4),
 (('ADJ', 'øverste', 'øvre', 'øverst'), 4),
 (('VERB', 'bortset', 'bortset', 'bortse'), 3),
 (('ADJ', 'mest', 'meget', 'mest'), 3)]

## Ambiguities

In [10]:
# Ten most frequent ambiguous training tokens as ((pos, orth), count):
# ordered by descending count, then by lowercased orth, with the whole entry as
# the final tie-breaker so the order is deterministic.
sorted(
    ambiguities_train.items(),
    key=lambda entry: (-entry[1], entry[0][1].lower(), entry),
)[:10]

[(('ADJ', 'mange'), 10),
 (('NOUN', 'jorden'), 6),
 (('ADJ', 'Mange'), 6),
 (('ADV', 'stort'), 4),
 (('ADJ', '3.'), 3),
 (('VERB', 'ses'), 3),
 (('NOUN', 'Jorden'), 2),
 (('VERB', 'mindes'), 2),
 (('VERB', 'skændes'), 2),
 (('NOUN', 'tanke'), 2)]