In [1]:
import pandas as pd
import os
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
import spacy
import numpy as np
import itertools

In [2]:
def precision(y_true, y_pred):
    i = set(y_true).intersection(y_pred)
    len1 = len(y_pred)
    if len1 == 0:
        return 0.0
    else:
        return float(len(i) / len1)


def recall(y_true, y_pred):
    if len(y_true) == 0:
        return 0.0
    else:
        i = set(y_true).intersection(y_pred)
        return float(len(i) / len(y_true))


def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    if p + r == 0:
        return 0.0
    else:
        return 2 * (p * r) / (p + r)
    
def get_PI(filename):
    getFilename = os.path.basename(filename)
    filename, ext = getFilename.split(".")
    if "_" in filename:
        filename, rest = filename.split("_", 1)
    return filename

In [3]:
os.chdir(r"C:\Users\Goegg\OneDrive\Desktop\Durchgänge")

In [39]:
# create df out of goldstandard
goldstandard = pd.ExcelFile(r"GOLDSTANDARD.xlsx")
# choose model-combinations for predictions:
directory = r"9. YAKE - BERT\MI_Schlagworte_Decomposed"
extension = ".xlsx"
nlp = spacy.load("de_core_news_lg")
lemmatizer = nlp.vocab.morphology.lemmatizer

In [40]:
#ALL
corpus_p = []
corpus_r = []
corpus_f = []
for filename in os.listdir(r"C:\Users\Goegg\OneDrive\Desktop\zu annotieren"):
    pi_name = get_PI(filename)
    print("Presseinformation:", pi_name)
    #get geo-pred
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_geo_out = df_pred['Geographischer Index:'].tolist()
    pred_geo = [x for x in pred_geo_out if x == x]
    #get geo-gold
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_geo_out = df_gold['Geographischer Index'].tolist()
    gold_geo = [lemmatizer(x, NOUN)[0].lower() for x in gold_geo_out if x == x]
    #get org-pred
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_org_out = df_pred['Körperschaftsindex:'].tolist()
    pred_org = [x for x in pred_org_out if x == x]
    #get org-gold
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_org_out = df_gold['Körperschaftsindex'].tolist()
    gold_org = [lemmatizer(x, NOUN)[0].lower() for x in gold_org_out if x == x]
    #get per-pred
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_per_out = df_pred['Personen-Index'].tolist()
    pred_per = [x for x in pred_per_out if x == x]
    #get per-gold
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_per_out = df_gold['Personen-Index'].tolist()
    gold_per = [lemmatizer(x, NOUN)[0].lower() for x in gold_per_out if x == x]
    #get thema-pred
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_thema_out = df_pred['Themen-Index'].tolist()
    pred_thema = [x for x in pred_thema_out if x == x]
    #get thema-gold
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_thema_out = df_gold['Themen-Index'].tolist()
    gold_thema = [lemmatizer(x, NOUN)[0].lower() for x in gold_thema_out if x == x]
    #get prod-pred
    df_pred = pd.read_excel(os.path.join(directory, pi_name + extension), header=0)
    pred_prod_out = df_pred['Produkt-Index'].tolist()
    pred_prod = [x for x in pred_prod_out if x == x]
    #get prod-gold
    df_gold = pd.read_excel(goldstandard, pi_name)
    gold_prod_out = df_gold['Produkt-Index'].tolist()
    gold_prod = [lemmatizer(x, NOUN)[0].lower() for x in gold_prod_out if x == x]
    
    all_preds = list(itertools.chain(pred_geo, pred_org, pred_per, pred_thema, pred_prod))
    all_gold = list(itertools.chain(gold_geo, gold_org, gold_per, gold_thema, gold_prod))
    corpus_p.append(precision(all_gold, all_preds))
    corpus_r.append(recall(all_gold, all_preds))
    corpus_f.append(f1(all_gold, all_preds))
    print("Goldstandard:", all_gold)
    print("Predicted:", all_preds, "\n")
    print("Precision:", precision(all_gold, all_preds))
    print("Recall:", recall(all_gold, all_preds))
    print("F1-Score:", f1(all_gold, all_preds), "\n___________________________")
mean_corpus_precision = sum(corpus_p) / len(corpus_p)
mean_corpus_recall = sum(corpus_r) / len(corpus_r)
mean_corpus_f1 = sum(corpus_f) / len(corpus_f)    
print(mean_corpus_precision, mean_corpus_recall, mean_corpus_f1)

Presseinformation: 1951 0601
Goldstandard: ['bundesgebiet', 'batterie-prüfdienst', 'batterie', 'auto', 'fahrzeug', 'batterie-station', 'bosch-batterie']
Predicted: ['werbeburo', 'bosch', 'gmbh', 'robert bosch', 'nachdruck', 'nach', 'druck', 'pflege', 'robert', 'dienst', 'arbeit', 'batterie', 'batterie', 'fahrzeug'] 

Precision: 0.14285714285714285
Recall: 0.2857142857142857
F1-Score: 0.19047619047619047 
___________________________
Presseinformation: 1961 0302
Goldstandard: ['abtauen', 'automatie', 'abtau-automatie', 'kühlschrank', 'verdampfer']
Predicted: ['gmbh', 'pressestelle', 'cingcs', 'neuon', 'deehalb', 'robert bosch', 'tauwa ssers', 'desson', 'lhnon', 'wtr', 'haus', 'frau', 'robert', 'technik', 'verdampfer', 'kühlschrank'] 

Precision: 0.125
Recall: 0.4
F1-Score: 0.19047619047619047 
___________________________
Presseinformation: 1965 0102
Goldstandard: ['stuttgart', 'deutschland', 'konzentrationslager sachsenhausen', 'technischen hochschule stuttgart', 'gestapo', 'vereinigung 

Goldstandard: ['baden-württemberg', 'robert bosch industrieausrüstung gmbh', 'rohrleitungen', 'strangfärberei', 'frottier', 'rohrsysteme', 'rohr', 'rieber-rohrleitungssysteme']
Predicted: ['baden-württemberg', 'stuttgart', 'prt', 'robert bosch gmbh', 'robert bosch industrieausrüstung gmbh', 'eckhard d. noelte', 'f. wolfgang knellesseri', 'gerd neermann', 'lrmgard müller', 'bosch', 'bereich', 'färberei', 'industrie', 'ausrüstung', 'robert', 'gmbh', 'presse', 'information', 'weberei', 'betrieb'] 

Precision: 0.1
Recall: 0.25
F1-Score: 0.14285714285714288 
___________________________
Presseinformation: 1974 0607
Goldstandard: ['stuttgart-mühlhausen', 'luft', 'reinhaltung', 'kondensator', 'zink', 'ventilatoren', 'entstaubungsanlagen', 'zinkstaub', 'zinkstaubabscheider', 'zinkspritzautomaten', 'filter']
Predicted: ['mülhausen', 'stuttgart', 'stuttgart-mühlhausen', 'bosch', 'robert bosch gmbh', 'choltitz', 'f_ wolfgang knellessen', 'klaus gennann', 'margarete düker', 'luft', 'robert', 'menge

Goldstandard: ['lärmminderung', 'lärm', 'bosch-hochfrequenz-außenrüttler', 'umformer', 'verdichtung']
Predicted: ['stuttgart', 'bosch', 'robert bosch gmbh', "frank-ulrichbreitsprecher'8'62", 'gerd neermann', 'knellessen', 'beton', 'form', 'verdichtung', 'kva'] 

Precision: 0.1
Recall: 0.2
F1-Score: 0.13333333333333333 
___________________________
Presseinformation: 1984 1101
Goldstandard: ['haushalt', 'elektrogeräte', 'dampfstop-automatik', 'küche', 'wasser', 'wasserkocher']
Predicted: ['hochstraße 17', 'münchen', 'hal be', 'bosch', 'bundespost', 'robert bosch haus gmbh', 'robert bosch hausgeräte gmbh', 'presse', 'liter', 'wasser', 'wasserkocher', 'wasser'] 

Precision: 0.16666666666666666
Recall: 0.3333333333333333
F1-Score: 0.2222222222222222 
___________________________
Presseinformation: 1985 0436
Goldstandard: ['bamberg', 'homburg-ost', 'rodez', 'bosch-werk bamberg', 'fertigung', 'einspritzsysteme', 'einspritzventilen', 'benzineinspritztechnik']
Predicted: ['bamberg', 'homburg', '

Goldstandard: ['prag', 'tschechien', 'eurotel', 'bosch telecom gmbh', 'richtfunktechnik', 'mobilfunk-kommunikation', 'sdh-richtfunkeinrichtungen']
Predicted: ['prag', 'stuttgar', 'stuttgart', 'tschechien', 'bosch', 'bosch telecom', 'bosch telecom gmbh', 'eurotel', 'eurotel praha', 'robert bosch gmbh', 'us west international', 'frank-ulrich breitsprecher', 'betreiber', 'com', 'technik', 'vertrag', 'praha', 'technik', 'mobilfunk', 'richtfunk', 'richtfunk', 'system'] 

Precision: 0.18181818181818182
Recall: 0.5714285714285714
F1-Score: 0.27586206896551724 
___________________________
Presseinformation: 1997 0309
Goldstandard: ['fern-und automatikstart', 'stromerzeuger', 'profiline', 'benzin-modellen']
Predicted: ['stuttgart', 'bosch', 'dvgw verein des gas· und wasserfaches', 'kölner eisenwarenmesse', 'robert bosch gmbh', 'frank-ulrich breitsprecher', 'strom', 'energie', 'modellreihe', 'modell', 'reihe', 'kölner', 'presse', 'information', 'verbraucher', 'stromerzeuger', 'variante', 'kva', 

Goldstandard: ['abstatt', 'heilbronn', 'assel gmbh', 'chassissysteme', 'richtfest', 'techniker']
Predicted: ['absta ti/stuttgart', 'abstau', 'abstatt', 'abstull', 'heilbronn', 'stuttgarl', 'bosch', 'bosch rexroth ag', 'bosch-gruppe', 'chassissystemc', 'roberl bosch gmbh', 'robert bosch gmbh', 'zvw', 'frank-ulrich breitsprecher', 'martin lober', 'robert', 'robert bosch', 'richtfest', 'gmbh', 'etappe', 'ziel', 'kraftfahrzeug', 'technik', 'entwicklung', 'milliarde', 'ingenieur', 'euro'] 

Precision: 0.1111111111111111
Recall: 0.5
F1-Score: 0.1818181818181818 
___________________________
Presseinformation: 2002 1008
Goldstandard: ['junker', 'bosch-thermotechnik', 'junkers/bosch-thermotechnik', 'junkers-website', 'heizung', 'neuer junkers-internetauftritt', 'beratung']
Predicted: ['wernau', 'junkers', 'junkers-pressebild', 'junkers/bosch', 'junkers/bosch-thermotechnik', 'robert bosch gmbh', 'bau', 'herr', 'tipps', 'internet', 'informations', 'medium', 'auftritt', 'konzept', 'design', 'rubri

Goldstandard: ['bosch sicherheitssysteme', 'tonwiedergabe', 'lautsprecher']
Predicted: ['stuttgart', 'bosch', 'bosch sicherheitssysteme gmbh', 'bosch-gruppe', 'robert bosch gmbh', 'robert bosch industrietreuhand kg', 'robert bosch stiftung gmbh', 'erika görge', 'p.o', 'robert bosch', 'uta-micaela dürig', 'robert', 'sprecher', 'ton', 'märz', 'musik', 'gmbh', 'technik', 'euro', 'lautsprecher', 'laut', 'wiedergabe', 'wiedergabe', 'lautsprecher'] 

Precision: 0.041666666666666664
Recall: 0.3333333333333333
F1-Score: 0.07407407407407407 
___________________________
Presseinformation: 2008 1101
Goldstandard: ['brüssel', 'europa', 'esafety-award-komitee', 'fia', 'esafety', 'automobilweltverbands fia', 'europäischen kommission', 'dr. bernd bohr', 'esp®']
Predicted: ['europa', 'köln', 'stuttgart', 'award', 'bosch', 'bosch-gruppe', 'environment', 'europäischen kommission', 'fia', 'road', 'robert bosch gmbh', 'robert bosch industrietreuhand kg;', 'robert bosch stiftung gmbh', 'and', 'esafety awar

Goldstandard: ['stuttgart', 'münchen', 'europa', 'berlin', 'bsh', 'siemens ag', 'volkmar denner', 'hausgeräte']
Predicted: ['europa', 'stuttgart', 'ersten', 'bsh', 'bsh bosch', 'bosch', 'bosch gruppe', 'feinmechanik', 'gmbh', 'robert bosch gmbh', 'robert bosch industrietreuhand kg;', 'robert bosch stiftung gmbh', 'sec', 'siemens', 'siemens ag', 'siemens hausgeräte gmbh', 'stuttgart/münchen', 'venture bsh bosch', 'am gemeinsamen', 'der', 'für', 'und', 'denner', 'raschke', 'robert bosch', 'thomas', 'uta-micaela dürig', 'robert', 'milliarde', 'unternehmen', 'haus', 'euro', 'key', 'milliarden', 'millionen', 'risk', 'information', 'aussage', 'hausgeräte', 'gerät'] 

Precision: 0.125
Recall: 0.625
F1-Score: 0.20833333333333334 
___________________________
Presseinformation: 2015 0405
Goldstandard: ['türkei', 'erenler', 'schwarzmeerküste', 'türk traktör', 'bosell', 'absicherung', 'traktor', 'serie 5000']
Predicted: ['erenler', 'grasbrunn', 'türkei', 'bosch', 'bosch sicherheitssysteme gmbh', '

In [6]:
print("Standard Deviation of Precision/Products is ", np.std(all_thema_precision))

NameError: name 'all_thema_precision' is not defined

### 