In [29]:
import pandas as pd
import os
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
import spacy
import numpy as np
import distance

In [30]:
def precision(y_true, y_pred, max_distance=6):
    """Return the fraction of predicted keywords that match a gold keyword.

    A prediction counts as correct when it equals a gold entry, or when it
    has the same length as a gold entry and differs from it in at most
    ``max_distance`` character positions (Hamming distance).

    Every prediction is counted individually, so a correct prediction that
    occurs twice in ``y_pred`` contributes twice to both the numerator and
    the denominator.

    Args:
        y_true: Sequence of gold-standard strings.
        y_pred: Sequence of predicted strings.
        max_distance: Largest Hamming distance still accepted as a match.
            The default of 6 is equivalent to the former hard-coded
            ``hamming(...) < 7`` test.

    Returns:
        float: matched predictions / number of predictions, or 0.0 when
        ``y_pred`` is empty.

    NOTE(review): with the default threshold any two equal-length strings of
    up to 6 characters match each other, and longer ones match with as few as
    one or two shared positions. Pass a smaller ``max_distance`` for a
    stricter comparison.
    """
    if len(y_pred) == 0:
        return 0.0

    def _matches(pred, true):
        # Exact match first, so the length check is never needed for it.
        if pred == true:
            return True
        # Hamming distance is only defined for sequences of equal length.
        if len(true) != len(pred):
            return False
        return sum(a != b for a, b in zip(true, pred)) <= max_distance

    hits = sum(
        1 for pred in y_pred
        if any(_matches(pred, true) for true in y_true)
    )
    return float(hits / len(y_pred))


def recall(y_true, y_pred, max_distance=6):
    """Return the fraction of gold keywords that were found by a prediction.

    A gold entry counts as found when some prediction equals it, or has the
    same length and differs from it in at most ``max_distance`` character
    positions (Hamming distance).

    The count is taken over the gold entries (not over the predictions), so
    the result always lies in [0, 1] even when several predictions match the
    same gold entry.

    Args:
        y_true: Sequence of gold-standard strings.
        y_pred: Sequence of predicted strings.
        max_distance: Largest Hamming distance still accepted as a match.
            The default of 6 is equivalent to the former hard-coded
            ``hamming(...) < 7`` test.

    Returns:
        float: found gold entries / number of gold entries, or 0.0 when
        ``y_true`` is empty.

    NOTE(review): with the default threshold any two equal-length strings of
    up to 6 characters match each other. Pass a smaller ``max_distance`` for
    a stricter comparison.
    """
    if len(y_true) == 0:
        return 0.0

    def _matches(pred, true):
        # Exact match first, so the length check is never needed for it.
        if pred == true:
            return True
        # Hamming distance is only defined for sequences of equal length.
        if len(true) != len(pred):
            return False
        return sum(a != b for a, b in zip(true, pred)) <= max_distance

    found = sum(
        1 for true in y_true
        if any(_matches(pred, true) for pred in y_pred)
    )
    return float(found / len(y_true))


def f1(y_true, y_pred):
    """Return the harmonic mean of precision and recall for one document.

    Args:
        y_true: Sequence of gold-standard strings.
        y_pred: Sequence of predicted strings.

    Returns:
        float: ``2 * p * r / (p + r)``, or 0.0 when precision and recall
        sum to zero.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    # Guard the division: both scores being zero would divide by zero.
    if prec + rec != 0:
        return 2 * (prec * rec) / (prec + rec)
    return 0.0
    
def get_PI(filename):
    """Return the document identifier encoded in a file name.

    The identifier is the base name of ``filename`` without its extension,
    truncated at the first underscore if there is one
    (e.g. ``"dir/2000 0310_annotated.xlsx"`` -> ``"2000 0310"``).

    The extension is removed with ``os.path.splitext`` so that names without
    a dot, or with more than one dot, no longer raise ``ValueError``.

    Args:
        filename: File name or path.

    Returns:
        str: the identifier part of the file name.
    """
    stem, _ext = os.path.splitext(os.path.basename(filename))
    # Everything after the first underscore is a suffix, not part of the id.
    return stem.split("_", 1)[0]

In [31]:
# Make the run folder the working directory so that the relative paths used
# in the following cells (gold-standard workbook, prediction folder) resolve
# against it.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing this line.
os.chdir(r"C:\Users\Goegg\OneDrive\Desktop\Durchgänge")

In [32]:
# Open the gold-standard workbook once; the evaluation cells below read one
# sheet per document from this handle via pd.read_excel(goldstandard, <sheet>).
goldstandard = pd.ExcelFile(r"GOLDSTANDARD.xlsx")
# Folder (relative to the working directory) with the prediction workbooks of
# the model combination to evaluate; change this to score another combination.
directory = r"2. TFIDF - FLAIR\MI_Schlagworte_Composed"
extension = ".xlsx"
# German spaCy model; only its lemmatizer is used below to normalise the
# gold-standard entries.
# NOTE(review): `nlp.vocab.morphology.lemmatizer` presumably requires
# spaCy 2.x (as does the `spacy.lemmatizer` import) - confirm the pinned version.
nlp = spacy.load("de_core_news_lg")
lemmatizer = nlp.vocab.morphology.lemmatizer

In [33]:
# Geographic index ("Geographischer Index"): per-document and mean scores.

all_geo_precision = []
all_geo_recall = []
all_geo_f1 = []
for filename in os.listdir(r"C:\Users\Goegg\OneDrive\Desktop\zu annotieren"):

    pi_name = get_PI(filename)
    print("Presseinformation:", pi_name)
    # Predicted entries; `x == x` drops NaN cells (NaN never equals itself).
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_geo_out = df_pred['Geographischer Index:'].tolist()
    pred_geo = [x for x in pred_geo_out if x == x]
    # Gold entries from the sheet named after the document, lemmatized as
    # nouns and lower-cased.
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_geo_out = df_gold['Geographischer Index'].tolist()
    gold_geo = [lemmatizer(x, NOUN)[0].lower() for x in gold_geo_out if x == x]
    print("Goldstandard:", gold_geo)
    print("Predicted:", pred_geo, "\n")
    if len(gold_geo) == 0 and len(pred_geo) == 0:
        # Nothing expected and nothing predicted counts as a perfect result.
        print("both nothing\n __________________________")
        doc_precision = doc_recall = doc_f1 = 1.0
    else:
        # Compute each score once and reuse it for printing and aggregation.
        doc_precision = precision(gold_geo, pred_geo)
        doc_recall = recall(gold_geo, pred_geo)
        doc_f1 = f1(gold_geo, pred_geo)
        print("Precision:", doc_precision)
        print("Recall:", doc_recall)
        print("F1-Score:", doc_f1, "\n___________________________")
    all_geo_precision.append(doc_precision)
    all_geo_recall.append(doc_recall)
    all_geo_f1.append(doc_f1)

# Macro-average over documents; NaN instead of ZeroDivisionError when the
# directory yielded no documents.
n_docs = len(all_geo_f1)
mean_geo_precision = sum(all_geo_precision) / n_docs if n_docs else float("nan")
mean_geo_recall = sum(all_geo_recall) / n_docs if n_docs else float("nan")
mean_geo_f1 = sum(all_geo_f1) / n_docs if n_docs else float("nan")
print(mean_geo_precision, mean_geo_recall, mean_geo_f1)
    
    



Presseinformation: 2000 0310
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2000 0702
Goldstandard: ['backnang']
Predicted: ['backnang'] 

Precision: 1.0
Recall: 1.0
F1-Score: 1.0 
___________________________
Presseinformation: 2000 1009
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2001 0350
Goldstandard: ['genf']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2001 0909
Goldstandard: ['ravensburg']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2002 0114
Goldstandard: ['hildesheim', 'miskolc', 'stuttgart-feuerbach', 'wemerstraße']
Predicted: ['wemerstraße', 'germany', 'stuttgart-feuerbach'] 

Precision: 0.6666666666666666
Recall: 0.5
F1-Score: 0.5714285714285715 
___________________________
Presseinformation: 2002 0403
Goldstandard: []
Predicted: [] 

both nothing
 ______________

In [34]:
# Corporate-body index ("Körperschaftsindex"): per-document and mean scores.

all_org_precision = []
all_org_recall = []
all_org_f1 = []
for filename in os.listdir(r"C:\Users\Goegg\OneDrive\Desktop\zu annotieren"):

    pi_name = get_PI(filename)
    print("Presseinformation:", pi_name)
    # Predicted entries; `x == x` drops NaN cells (NaN never equals itself).
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_org_out = df_pred['Körperschaftsindex:'].tolist()
    pred_org = [x for x in pred_org_out if x == x]
    # Gold entries from the sheet named after the document, lemmatized as
    # nouns and lower-cased.
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_org_out = df_gold['Körperschaftsindex'].tolist()
    gold_org = [lemmatizer(x, NOUN)[0].lower() for x in gold_org_out if x == x]
    print("Goldstandard:", gold_org)
    print("Predicted:", pred_org, "\n")
    if len(gold_org) == 0 and len(pred_org) == 0:
        # Nothing expected and nothing predicted counts as a perfect result.
        print("both nothing\n __________________________")
        doc_precision = doc_recall = doc_f1 = 1.0
    else:
        # Compute each score once and reuse it for printing and aggregation.
        doc_precision = precision(gold_org, pred_org)
        doc_recall = recall(gold_org, pred_org)
        doc_f1 = f1(gold_org, pred_org)
        print("Precision:", doc_precision)
        print("Recall:", doc_recall)
        print("F1-Score:", doc_f1, "\n___________________________")
    all_org_precision.append(doc_precision)
    all_org_recall.append(doc_recall)
    all_org_f1.append(doc_f1)

# Macro-average over documents; NaN instead of ZeroDivisionError when the
# directory yielded no documents.
n_docs = len(all_org_f1)
mean_org_precision = sum(all_org_precision) / n_docs if n_docs else float("nan")
mean_org_recall = sum(all_org_recall) / n_docs if n_docs else float("nan")
mean_org_f1 = sum(all_org_f1) / n_docs if n_docs else float("nan")
print(mean_org_precision, mean_org_recall, mean_org_f1)

Presseinformation: 2000 0310
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2000 0702
Goldstandard: []
Predicted: ['zö', 'zoef'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2000 1009
Goldstandard: ['daimler-chrysler', 'audi', 'bmw', 'cadillac', 'citroen', 'daewoo', 'fiat', 'lancia', 'mitsubishi', 'opel', 'peugeot', 'porsche', 'renault', 'suzuki', 'vauxhall', 'volvo', 'vw']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2001 0350
Goldstandard: ['blaupunkt']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2001 0909
Goldstandard: ['hawera probst gmbh']
Predicted: ['hawera'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2002 0114
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2002 0403
Goldstanda

In [35]:
# Person index ("Personen-Index"): per-document and mean scores.

all_per_precision = []
all_per_recall = []
all_per_f1 = []
for filename in os.listdir(r"C:\Users\Goegg\OneDrive\Desktop\zu annotieren"):

    pi_name = get_PI(filename)
    print("Presseinformation:", pi_name)
    # Predicted entries; `x == x` drops NaN cells (NaN never equals itself).
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_per_out = df_pred['Personen-Index'].tolist()
    pred_per = [x for x in pred_per_out if x == x]
    # Gold entries from the sheet named after the document, lemmatized as
    # nouns and lower-cased.
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_per_out = df_gold['Personen-Index'].tolist()
    gold_per = [lemmatizer(x, NOUN)[0].lower() for x in gold_per_out if x == x]
    print("Goldstandard:", gold_per)
    print("Predicted:", pred_per, "\n")
    if len(gold_per) == 0 and len(pred_per) == 0:
        # Nothing expected and nothing predicted counts as a perfect result.
        print("both nothing\n __________________________")
        doc_precision = doc_recall = doc_f1 = 1.0
    else:
        # Compute each score once and reuse it for printing and aggregation.
        doc_precision = precision(gold_per, pred_per)
        doc_recall = recall(gold_per, pred_per)
        doc_f1 = f1(gold_per, pred_per)
        print("Precision:", doc_precision)
        print("Recall:", doc_recall)
        print("F1-Score:", doc_f1, "\n___________________________")
    all_per_precision.append(doc_precision)
    all_per_recall.append(doc_recall)
    all_per_f1.append(doc_f1)

# Macro-average over documents; NaN instead of ZeroDivisionError when the
# directory yielded no documents.
n_docs = len(all_per_f1)
mean_per_precision = sum(all_per_precision) / n_docs if n_docs else float("nan")
mean_per_recall = sum(all_per_recall) / n_docs if n_docs else float("nan")
mean_per_f1 = sum(all_per_f1) / n_docs if n_docs else float("nan")
print(mean_per_precision, mean_per_recall, mean_per_f1)

Presseinformation: 2000 0310
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2000 0702
Goldstandard: []
Predicted: ['borm1nn'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2000 1009
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2001 0350
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2001 0909
Goldstandard: ['dr. marco lang']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2002 0114
Goldstandard: []
Predicted: ['hans walz', 'robert bosch', 'walz', 'georg zundel'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2002 0403
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinformation: 2002 0710
Goldstandard: []
Predicted: [] 

both nothing
 __________________________
Presseinforma

In [36]:
# Topic index ("Themen-Index"): per-document and mean scores.

all_thema_precision = []
all_thema_recall = []
all_thema_f1 = []
for filename in os.listdir(r"C:\Users\Goegg\OneDrive\Desktop\zu annotieren"):

    pi_name = get_PI(filename)
    print("Presseinformation:", pi_name)
    # Predicted entries; `x == x` drops NaN cells (NaN never equals itself).
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_thema_out = df_pred['Themen-Index'].tolist()
    pred_thema = [x for x in pred_thema_out if x == x]
    # Gold entries from the sheet named after the document, lemmatized as
    # nouns and lower-cased.
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_thema_out = df_gold['Themen-Index'].tolist()
    gold_thema = [lemmatizer(x, NOUN)[0].lower() for x in gold_thema_out if x == x]
    print("Goldstandard:", gold_thema)
    print("Predicted:", pred_thema, "\n")
    if len(gold_thema) == 0 and len(pred_thema) == 0:
        # Nothing expected and nothing predicted counts as a perfect result.
        print("both nothing\n __________________________")
        doc_precision = doc_recall = doc_f1 = 1.0
    else:
        # Compute each score once and reuse it for printing and aggregation.
        doc_precision = precision(gold_thema, pred_thema)
        doc_recall = recall(gold_thema, pred_thema)
        doc_f1 = f1(gold_thema, pred_thema)
        print("Precision:", doc_precision)
        print("Recall:", doc_recall)
        print("F1-Score:", doc_f1, "\n___________________________")
    all_thema_precision.append(doc_precision)
    all_thema_recall.append(doc_recall)
    all_thema_f1.append(doc_f1)

# Macro-average over documents; NaN instead of ZeroDivisionError when the
# directory yielded no documents.
n_docs = len(all_thema_f1)
mean_thema_precision = sum(all_thema_precision) / n_docs if n_docs else float("nan")
mean_thema_recall = sum(all_thema_recall) / n_docs if n_docs else float("nan")
mean_thema_f1 = sum(all_thema_f1) / n_docs if n_docs else float("nan")
print(mean_thema_precision, mean_thema_recall, mean_thema_f1)

Presseinformation: 2000 0310
Goldstandard: ['elektrowerkzeug']
Predicted: ['ahs 6000 pro', 'bild', 'griff', 'ahs 7000   pro'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2000 0702
Goldstandard: ['raumfahrttechnik']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2000 1009
Goldstandard: []
Predicted: ['die fließgepresste mittelelektrode', 'f 8 oper'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2001 0350
Goldstandard: []
Predicted: ['silver plus', 'die spitzenbatterie von bosch'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2001 0909
Goldstandard: ['hammerbohrer', 'diamant']
Predicted: ['handwerker', 'metall'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2002 0114
Goldstandard: ['mitarbeiter', 'kündigung']
Predicted: ['firma', 'jahr', 'ersten weltkr

F1-Score: 0.6666666666666666 
___________________________
Presseinformation: 2019 1103
Goldstandard: ['landwirtschaft', 'effizienz', 'automatisierung', 'vernetzung']
Predicted: ['landwirtschaft', 'landwirt', 'agritechnica innovation award', 'service'] 

Precision: 0.25
Recall: 0.25
F1-Score: 0.25 
___________________________
0.18115942028985504 0.16630434782608694 0.16457902001380262


In [37]:
# Product index ("Produkt-Index"): per-document and mean scores.

all_prod_precision = []
all_prod_recall = []
all_prod_f1 = []
for filename in os.listdir(r"C:\Users\Goegg\OneDrive\Desktop\zu annotieren"):

    pi_name = get_PI(filename)
    print("Presseinformation:", pi_name)
    # Predicted entries; `x == x` drops NaN cells (NaN never equals itself).
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_prod_out = df_pred['Produkt-Index'].tolist()
    pred_prod = [x for x in pred_prod_out if x == x]
    # Gold entries from the sheet named after the document, lemmatized as
    # nouns and lower-cased.
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_prod_out = df_gold['Produkt-Index'].tolist()
    gold_prod = [lemmatizer(x, NOUN)[0].lower() for x in gold_prod_out if x == x]
    print("Goldstandard:", gold_prod)
    print("Predicted:", pred_prod, "\n")
    if len(gold_prod) == 0 and len(pred_prod) == 0:
        # Nothing expected and nothing predicted counts as a perfect result.
        print("both nothing\n __________________________")
        doc_precision = doc_recall = doc_f1 = 1.0
    else:
        # Compute each score once and reuse it for printing and aggregation.
        doc_precision = precision(gold_prod, pred_prod)
        doc_recall = recall(gold_prod, pred_prod)
        doc_f1 = f1(gold_prod, pred_prod)
        print("Precision:", doc_precision)
        print("Recall:", doc_recall)
        print("F1-Score:", doc_f1, "\n___________________________")
    all_prod_precision.append(doc_precision)
    all_prod_recall.append(doc_recall)
    all_prod_f1.append(doc_f1)

# Macro-average over documents; NaN instead of ZeroDivisionError when the
# directory yielded no documents.
n_docs = len(all_prod_f1)
mean_prod_precision = sum(all_prod_precision) / n_docs if n_docs else float("nan")
mean_prod_recall = sum(all_prod_recall) / n_docs if n_docs else float("nan")
mean_prod_f1 = sum(all_prod_f1) / n_docs if n_docs else float("nan")
print(mean_prod_precision, mean_prod_recall, mean_prod_f1)

Presseinformation: 2000 0310
Goldstandard: ['ahs 6000 pro', 'ahs 7000']
Predicted: ['messer', 'anwender'] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2000 0702
Goldstandard: ['leiterplatten']
Predicted: ['leiterplatten'] 

Precision: 1.0
Recall: 1.0
F1-Score: 1.0 
___________________________
Presseinformation: 2000 1009
Goldstandard: ['slk 320', 'zündkerze']
Predicted: [] 

Precision: 0.0
Recall: 0.0
F1-Score: 0.0 
___________________________
Presseinformation: 2001 0350
Goldstandard: ['silver plus', 'powercontrol system', 'esp', 'common rail', 'bea 350', 'zündkerze', 'funline']
Predicted: ['produktvorteil powercontrol system', 'powercontrol system'] 

Precision: 0.5
Recall: 0.14285714285714285
F1-Score: 0.22222222222222224 
___________________________
Presseinformation: 2001 0909
Goldstandard: ['quadro-x', 'sägeblätter']
Predicted: ['qualität'] 

Precision: 1.0
Recall: 0.5
F1-Score: 0.6666666666666666 
___________________________
Pressein

In [38]:
# Spread of the per-document precision values. The label names the list that
# is actually summarised (all_thema_precision = Themen-Index scores).
# NOTE(review): if the product scores were intended instead, pass
# all_prod_precision and label the output accordingly.
print("Standard Deviation of Precision/Themes is ", np.std(all_thema_precision))

Standard Deviation of Precision/Products is  0.31770631578170844


### 