In [None]:
import os
import re
import random
import logging
import pickle

import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import spacy

### Load evaluation data

In [None]:
# Load the prepared evaluation samples; later cells read the "text" and
# "with_labels" entries of every sample.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles from a trusted source.
with open("../data/prepared/20210311_eval.pickle", "rb") as fp:
    eval_data = pickle.load(fp)

### Define function for evaluation

In [None]:
def get_metrics_from_tuplelist(actual_tuples, predicted_tuples):
    """Return sklearn's classification report for two aligned (token, label) lists.

    Args:
        actual_tuples: sequence of tuples whose first element is the token and
            whose second element is the annotated NER label.
        predicted_tuples: sequence of tuples with the same layout holding the
            predicted NER label.

    Returns:
        The result of ``classification_report`` restricted to the labels
        "PER", "LOC" and "ORG".

    Raises:
        NameError: if the token sequences of both lists are not identical.
    """
    # Guard clause: the labels are only comparable position by position when
    # both lists contain exactly the same tokens in the same order.
    # TODO: is more flexibility required (does this raise too often)?
    if [pair[0] for pair in actual_tuples] != [pair[0] for pair in predicted_tuples]:
        raise NameError('tokens in list for actual do not match tokens in list prediction')

    gold_labels = [pair[1] for pair in actual_tuples]
    predicted_labels = [pair[1] for pair in predicted_tuples]
    return classification_report(gold_labels, predicted_labels, labels=["PER", "LOC", "ORG"])

In [None]:
# other functions that might be useful

def get_wrong_pred(actual, predicted, label):
    """Print every (predicted, actual) pair where `label` was predicted but not annotated.

    Args:
        actual: sequence of (token, label) tuples with the annotated labels.
        predicted: sequence of (token, label) tuples with the predicted labels,
            aligned position by position with `actual`.
        label: the label to inspect, compared against the second tuple element.
    """
    print("predicted, but not labelled")
    print("------------------------------")

    false_positives = (
        (pred, gold)
        for pred, gold in zip(predicted, actual)
        if pred[1] == label and gold[1] != label
    )
    for pair in false_positives:
        print(pair)

def get_miss_pred(actual, predicted, label):
    """Print every (predicted, actual) pair where `label` was annotated but not predicted.

    Args:
        actual: sequence of (token, label) tuples with the annotated labels.
        predicted: sequence of (token, label) tuples with the predicted labels,
            aligned position by position with `actual`.
        label: the label to inspect, compared against the second tuple element.
    """
    print("labelled, but not predicted")
    print("------------------------------")

    # Iterate over the arguments; the previous version read the notebook-level
    # globals `predicted_res` / `actual_res` and silently ignored its parameters.
    for p, a in zip(predicted, actual):
        if p[1] != label and a[1] == label:
            print((p, a))

### Load our models and setup

In [None]:
def setup_pipeline(model_path):
    """Build a transformers "ner" pipeline from the checkpoint stored at `model_path`.

    The tokenizer is loaded first (fast implementation requested), then the
    token-classification model; both are handed to ``pipeline``.

    Args:
        model_path: location passed to both ``from_pretrained`` calls.

    Returns:
        The object returned by ``pipeline("ner", ...)``.
    """
    # NOTE(review): newer transformers releases reportedly deprecate
    # `grouped_entities` in favour of `aggregation_strategy` - confirm against
    # the installed version before upgrading.
    return pipeline(
        "ner",
        tokenizer=AutoTokenizer.from_pretrained(model_path, use_fast=True),
        model=AutoModelForTokenClassification.from_pretrained(model_path),
        grouped_entities=True,
    )

In [None]:
# Large model: XLM-RoBERTa checkpoint (per the directory name) loaded from the local models folder.
model_path_lrg = "../models/xlm-roberta-large-finetuned-conll03-german"
nlp_large = setup_pipeline(model_path_lrg)

In [None]:
# Small model: DistilBERT checkpoint (per the directory name, fine-tuned on GermEval14).
model_path_sml = "../models/distilbert-base-german-cased-finetuned-germeval14-german"
nlp_small = setup_pipeline(model_path_sml)

In [None]:
# Alternative small model: DistilBERT checkpoint (per the directory name, fine-tuned on CoNLL-03).
# NOTE: the variable name `nlp_smalll_2` (three "l") is kept as-is because the
# alternative-model evaluation cell refers to it by this exact name.
model_path_sml_alt = "../models/distilbert-base-german-cased-finetuned-conll03-german"
nlp_smalll_2 = setup_pipeline(model_path_sml_alt)

In [None]:
# helper function for preparing tokens 

def prepare_token_transformer(text, nlp_pipeline):
    """Label every token of `text` with the entity group reported by `nlp_pipeline`.

    The text is split on whitespace and on the characters ``! . , ( )``. Each
    entity returned by the pipeline is split on whitespace, and every resulting
    word carries the entity's group. A token receives a group only when it
    matches such a word exactly; all other tokens are labelled "O".

    Args:
        text: the raw input string.
        nlp_pipeline: callable taking the text and returning an iterable of
            mappings that provide a "word" and an "entity_group" entry.

    Returns:
        A list with one single-entry dict ``{token: label}`` per piece produced
        by the split, in text order. Empty strings produced by adjacent
        separators are kept (callers filter them out afterwards).
    """
    # Raw string: the former non-raw literal relied on invalid escape sequences
    # such as "\s", which newer Python versions warn about. The pattern itself
    # is unchanged.
    tokens = re.split(r"\s|!|\.|,|\(|\)", text)

    # Map every word of every detected entity to its group. When a word occurs
    # in several entities the last one wins, matching the previous linear scan.
    label_by_word = {}
    for entity in nlp_pipeline(text):
        word = entity.get("word")
        group = entity.get("entity_group")
        for part in word.split():
            label_by_word[part] = group

    # One dict lookup per token replaces the former scan over all found words.
    return [{token: label_by_word.get(token, "O")} for token in tokens]

In [None]:
# get predictions for large model
predicted = []
actual = []

for sample in tqdm(eval_data):
    temp_act = sample.get("with_labels")
    text_to_anon = sample.get("text")

    try:
        temp_res = prepare_token_transformer(text_to_anon, nlp_large)
    except Exception as e:
        # Skip this sample. The former bare `next` expression was a no-op, so
        # the code below went on with `temp_res` from the previous sample (or
        # failed with a NameError on the very first one).
        print(e)
        continue

    # Turn the single-entry dicts into (token, label) tuples and drop tokens
    # of length <= 1 (empty strings from the split, single characters).
    temp_pred = [
        (token, label)
        for token_dict in temp_res
        for token, label in token_dict.items()
        if len(token) > 1
    ]

    # Samples whose tokenisation does not line up with the annotation cannot
    # be scored position by position; report and skip them.
    if len(temp_act) != len(temp_pred):
        print("unequal length")
        print(len(temp_act))
        print(len(temp_pred))
        print(temp_act)
        print(temp_pred)
        continue

    actual.append(temp_act)
    predicted.append(temp_pred)

In [None]:
# correct for manual "rules" coded in framework for actual labels

# Flatten the per-sample lists of {token: label} dicts into one flat list of
# (token, label) tuples.
temp_actual_res = [
    (token, label)
    for sample in actual
    for token_dict in sample
    for token, label in token_dict.items()
]

# Forms of address are always scored as "O", whatever label they carry.
titles = ("Hr", "Fr", "Frau", "Herr")
actual_res = [
    (pair[0], "O") if pair[0] in titles else pair
    for pair in temp_actual_res
]

In [None]:
# correct for manual "rules" coded in framework for predicted labels
# and labelling errors ::fixme:: later

# Tokens whose predicted label is always overridden with "O": forms of address
# plus the two tokens handled as labelling errors.
forced_outside = ("Hr", "Fr", "Frau", "Herr", "Markt", "XXX")
predicted_res = [
    (pair[0], "O") if pair[0] in forced_outside else pair
    for sample in predicted
    for pair in sample
]

In [None]:
# TODO: discuss whether "Herr", "Frau" etc. should be filtered out here.
# Prints the classification report for the large model's flattened results.
print("large xlm-roberta model")
print(get_metrics_from_tuplelist(actual_res, predicted_res))

### Smaller DistilBERT model

In [None]:
# get predictions for small model
predicted = []
actual = []

for sample in tqdm(eval_data):
    temp_act = sample.get("with_labels")
    text_to_anon = sample.get("text")

    try:
        temp_res = prepare_token_transformer(text_to_anon, nlp_small)
    except Exception as e:
        # Skip this sample. The former bare `next` expression was a no-op, so
        # the code below went on with `temp_res` from the previous sample (or
        # failed with a NameError on the very first one).
        print(e)
        continue

    # Turn the single-entry dicts into (token, label) tuples and drop tokens
    # of length <= 1 (empty strings from the split, single characters).
    temp_pred = [
        (token, label)
        for token_dict in temp_res
        for token, label in token_dict.items()
        if len(token) > 1
    ]

    # Samples whose tokenisation does not line up with the annotation cannot
    # be scored position by position; report and skip them.
    if len(temp_act) != len(temp_pred):
        print("unequal length")
        print(len(temp_act))
        print(len(temp_pred))
        print(temp_act)
        print(temp_pred)
        continue

    actual.append(temp_act)
    predicted.append(temp_pred)

In [None]:
# correct for manual "rules" coded in framework for actual labels

# Flatten the per-sample lists of {token: label} dicts into one flat list of
# (token, label) tuples.
temp_actual_res = [
    (token, label)
    for sample in actual
    for token_dict in sample
    for token, label in token_dict.items()
]

# Forms of address are always scored as "O", whatever label they carry.
titles = ("Hr", "Fr", "Frau", "Herr")
actual_res = [
    (pair[0], "O") if pair[0] in titles else pair
    for pair in temp_actual_res
]

In [None]:
# correct for manual "rules" coded in framework for predicted labels
# and labelling errors ::fixme:: later

# Tokens whose predicted label is always overridden with "O": forms of address
# plus the two tokens handled as labelling errors.
forced_outside = ("Hr", "Fr", "Frau", "Herr", "Markt", "XXX")
predicted_res = [
    (pair[0], "O") if pair[0] in forced_outside else pair
    for sample in predicted
    for pair in sample
]

In [None]:
# TODO: discuss whether "Herr", "Frau" etc. should be filtered out here.
# Prints the classification report for the small model's flattened results.
print("small distillbert model")
print(get_metrics_from_tuplelist(actual_res, predicted_res))

In [None]:
# get_wrong_pred(actual_res, predicted_res, "LOC")

In [None]:
# get_miss_pred(actual_res, predicted_res, "PER")

## alternative model

In [None]:
# get predictions for the alternative small model (pipeline `nlp_smalll_2`)
predicted = []
actual = []

for sample in tqdm(eval_data):
    temp_act = sample.get("with_labels")
    text_to_anon = sample.get("text")

    try:
        temp_res = prepare_token_transformer(text_to_anon, nlp_smalll_2)
    except Exception as e:
        # Skip this sample. The former bare `next` expression was a no-op, so
        # the code below went on with `temp_res` from the previous sample (or
        # failed with a NameError on the very first one).
        print(e)
        continue

    # Turn the single-entry dicts into (token, label) tuples and drop tokens
    # of length <= 1 (empty strings from the split, single characters).
    temp_pred = [
        (token, label)
        for token_dict in temp_res
        for token, label in token_dict.items()
        if len(token) > 1
    ]

    # Samples whose tokenisation does not line up with the annotation cannot
    # be scored position by position; report and skip them.
    if len(temp_act) != len(temp_pred):
        print("unequal length")
        print(len(temp_act))
        print(len(temp_pred))
        print(temp_act)
        print(temp_pred)
        continue

    actual.append(temp_act)
    predicted.append(temp_pred)

In [None]:
# correct for manual "rules" coded in framework for actual labels

# Flatten the per-sample lists of {token: label} dicts into one flat list of
# (token, label) tuples.
temp_actual_res = [
    (token, label)
    for sample in actual
    for token_dict in sample
    for token, label in token_dict.items()
]

# Forms of address are always scored as "O", whatever label they carry.
titles = ("Hr", "Fr", "Frau", "Herr")
actual_res = [
    (pair[0], "O") if pair[0] in titles else pair
    for pair in temp_actual_res
]

In [None]:
# correct for manual "rules" coded in framework for predicted labels
# and labelling errors ::fixme:: later

# Tokens whose predicted label is always overridden with "O": forms of address
# plus the two tokens handled as labelling errors.
forced_outside = ("Hr", "Fr", "Frau", "Herr", "Markt", "XXX")
predicted_res = [
    (pair[0], "O") if pair[0] in forced_outside else pair
    for sample in predicted
    for pair in sample
]

In [None]:
# TODO: discuss whether "Herr", "Frau" etc. should be filtered out here.
# Prints the classification report for the alternative small model's flattened results.
print("small distillbert model conll")
print(get_metrics_from_tuplelist(actual_res, predicted_res))

### evaluate spacy

In [None]:
def prepare_token_spacy(doc, text_splitted):
    """Relabel pre-split tokens with the entity labels found in `doc`.

    Every entity in ``doc.ents`` is split on whitespace and each resulting word
    carries the entity's ``label_``. A token is relabelled only when it matches
    such a word exactly; when a word belongs to several entities the last
    entity wins.

    Args:
        doc: object exposing ``ents``, an iterable of items with ``text`` and
            ``label_`` attributes.
        text_splitted: list of ``{token: label}`` dicts (one entry each is
            expected) in text order.

    Returns:
        A list of the same length holding ``{token: entity_label}`` for matched
        tokens and the original dict for all others.
    """
    # Word -> label lookup; later entities overwrite earlier ones.
    label_by_word = {}
    for entity in doc.ents:
        for part in entity.text.split():
            label_by_word[part] = entity.label_

    new_res = []
    for token_dict in text_splitted:
        for token in token_dict:
            if token in label_by_word:
                labelled = {token: label_by_word[token]}
            else:
                labelled = token_dict
        new_res.append(labelled)

    return new_res

In [None]:
# might need to download model first
# !python -m spacy download de_core_news_md 

In [None]:
# Load the spaCy model "de_core_news_md"; it must be installed beforehand
# (e.g. via `python -m spacy download de_core_news_md`).
nlp = spacy.load("de_core_news_md")

In [None]:
# get predictions for the spaCy model
actual = []
predicted = []

# Labels that are scored; every other label found on an entity is mapped to "O".
known_labels = ("PER", "LOC", "ORG", "O")

for sample in tqdm(eval_data):
    temp_act = sample.get("with_labels")
    text_to_anon = sample.get("text")

    # Raw string: the former non-raw literal relied on invalid escape sequences
    # such as "\s"; the pattern itself is unchanged.
    text_splitted = [{e: "O"} for e in re.split(r"\s|!|\.|,|\(|\)", text_to_anon)]

    doc = nlp(text_to_anon)

    try:
        temp_res = prepare_token_spacy(doc, text_splitted)
    except Exception as e:
        # Skip this sample. The former bare `next` expression was a no-op, so
        # the code below went on with `temp_res` from the previous sample (or
        # failed with a NameError on the very first one).
        print(e)
        continue

    # Turn the single-entry dicts into (token, label) tuples, collapse labels
    # outside `known_labels` to "O" and drop tokens of length <= 1.
    temp_pred = [
        (token, label if label in known_labels else "O")
        for token_dict in temp_res
        for token, label in token_dict.items()
        if len(token) > 1
    ]

    # Samples whose tokenisation does not line up with the annotation cannot
    # be scored position by position; report and skip them.
    if len(temp_act) != len(temp_pred):
        print("unequal length")
        print(len(temp_act))
        print(len(temp_pred))
        print(temp_act)
        print(temp_pred)
        continue

    actual.append(temp_act)
    predicted.append(temp_pred)
    



In [None]:
# correct for manual "rules" coded in framework for actual labels

# Flatten the per-sample lists of {token: label} dicts into one flat list of
# (token, label) tuples.
temp_actual_res = [
    (token, label)
    for sample in actual
    for token_dict in sample
    for token, label in token_dict.items()
]

# Forms of address are always scored as "O", whatever label they carry.
titles = ("Hr", "Fr", "Frau", "Herr")
actual_res = [
    (pair[0], "O") if pair[0] in titles else pair
    for pair in temp_actual_res
]

In [None]:
# correct for manual "rules" coded in framework for predicted labels
# and labelling errors ::fixme:: later

# Tokens whose predicted label is always overridden with "O": forms of address
# plus the two tokens handled as labelling errors.
forced_outside = ("Hr", "Fr", "Frau", "Herr", "Markt", "XXX")
predicted_res = [
    (pair[0], "O") if pair[0] in forced_outside else pair
    for sample in predicted
    for pair in sample
]

In [None]:
# TODO: discuss whether "Herr", "Frau" etc. should be filtered out here.
# Prints the classification report for the spaCy model's flattened results.
print("medium spacy model")
print(get_metrics_from_tuplelist(actual_res, predicted_res))

### lexical approach

In [None]:
# Read the three lexicon files (persons, locations, organisations) used by the
# lexical lookup approach.
df_per = pd.read_csv("../lexical_approach/german_per.csv")
df_loc = pd.read_csv("../lexical_approach/german_loc.csv")
df_org = pd.read_csv("../lexical_approach/german_org.csv")

In [None]:
# Extract the lookup values of each lexicon as arrays.
# NOTE(review): the organisation lexicon is read from a column named "loc",
# while the other two use "per" and "loc" for persons and locations. This may
# be a copy-paste slip for "org" - confirm against the header of german_org.csv.
lookup_names = df_per["per"].values
lookup_locations = df_loc["loc"].values
lookup_orgs = df_org["loc"].values

In [None]:
# tokenize text and label each token by lexicon lookup
actual = []
predicted = []

# Dedicated, seeded generator: ties between several matching lexicons are
# broken randomly, and without a fixed seed the reported metrics would differ
# from run to run.
rng = random.Random(42)

# Sets give constant-time membership tests instead of scanning the full
# lexicon arrays once per token.
names_set = set(lookup_names)
locations_set = set(lookup_locations)
orgs_set = set(lookup_orgs)

for sample in tqdm(eval_data):
    temp_act = sample.get("with_labels")
    text_to_anon = sample.get("text")

    # Raw string: the former non-raw literal relied on invalid escape sequences
    # such as "\s"; the pattern itself is unchanged. Tokens of length <= 1 are
    # dropped.
    text_splitted = [
        e for e in re.split(r"\s|!|\.|,|\(|\)", text_to_anon) if len(e) > 1
    ]

    temp_pred = []
    for token in text_splitted:
        # Collect every label whose lexicon contains the token.
        candidates = []
        if token in names_set:
            candidates.append("PER")
        if token in locations_set:
            candidates.append("LOC")
        if token in orgs_set:
            candidates.append("ORG")

        temp_pred.append({token: rng.choice(candidates) if candidates else "O"})

    # Samples whose tokenisation does not line up with the annotation cannot
    # be scored position by position; report and skip them.
    if len(temp_act) != len(temp_pred):
        print("unequal length")
        print(len(temp_act))
        print(len(temp_pred))
        print(temp_act)
        print(temp_pred)
        continue

    actual.append(temp_act)
    predicted.append(temp_pred)


In [None]:
# correct for manual "rules" coded in framework for actual labels

# Flatten the per-sample lists of {token: label} dicts into one flat list of
# (token, label) tuples.
temp_actual_res = [
    (token, label)
    for sample in actual
    for token_dict in sample
    for token, label in token_dict.items()
]

# Forms of address are always scored as "O", whatever label they carry.
titles = ("Hr", "Fr", "Frau", "Herr")
actual_res = [
    (pair[0], "O") if pair[0] in titles else pair
    for pair in temp_actual_res
]

In [None]:
# correct for manual "rules" coded in framework for predicted labels
# and labelling errors ::fixme:: later

# Flatten the per-sample lists of {token: label} dicts into one flat list of
# (token, label) tuples.
temp_predict_res = [
    (token, label)
    for sample in predicted
    for token_dict in sample
    for token, label in token_dict.items()
]

# Tokens whose predicted label is always overridden with "O": forms of address
# plus the two tokens handled as labelling errors.
forced_outside = ("Hr", "Fr", "Frau", "Herr", "Markt", "XXX")
predicted_res = [
    (pair[0], "O") if pair[0] in forced_outside else pair
    for pair in temp_predict_res
]

In [None]:
# TODO: discuss whether "Herr", "Frau" etc. should be filtered out here.
# Prints the classification report for the lexical approach's flattened results.
print("lexical approach")
print(get_metrics_from_tuplelist(actual_res, predicted_res))