In [None]:
# Convert the ground truth data to CSV
import re
import pandas as pd
# Both exports share one layout: semicolon-separated, no index column, no header row.
export_options = dict(index=False, header=False, doublequote=True, sep=";")

# RUB: the ground truth is stored as JSON
temp = pd.read_json("../../../data_mining/ground_truth/RUB_persons.json", encoding="utf-8")
rub_sorted = temp.sort_values(by=["name"])
rub_sorted.to_csv("../results/ground_truth_rub.csv", **export_options)

# UDE: the ground truth is already a CSV; its first line is skipped and the
# six columns are named explicitly before sorting by name.
temp = pd.read_csv("../results/ground_truth.csv", encoding="utf-8", header=None, skiprows=1)
temp.columns = ["id", "name", "email", "homepage", "organisation", "position"]
ude_sorted = temp.sort_values(by=["name"])
ude_sorted.to_csv("../results/ground_truth_ude.csv", **export_options)

In [None]:
# UDE
import re
import pandas as pd
from pandas import DataFrame as df

# Make the "mining" root folder (two levels up) importable
import os, sys
parent_dir = os.path.abspath("../..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# NOTE: the RUB loading cell assigns the same names (truth, found, uni);
# whichever of the two loading cells ran last decides what is evaluated.

# Both CSV files are read the same way: first line skipped, no header row, no index column.
read_options = dict(encoding="utf-8", index_col=False, header=None, skiprows=1)

# Ground truth: found by naive person search by URL
truth = pd.read_csv("../results/ground_truth.csv", **read_options)
truth.columns = ["id", "name", "email", "homepage", "organisation", "position"]

# Result set: found by the people miner
found = pd.read_csv("../results/people-ude.csv", **read_options)
found.columns = ["title", "name", "email", "method", "homepage", "foundIn"]

# Selects the UDE-specific branches and output file names in later cells
uni = "ude"

In [None]:
# RUB
import re
import pandas as pd
from pandas import DataFrame as df

# Make the "mining" root folder (two levels up) importable
import os, sys
parent_dir = os.path.abspath("../..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# NOTE: the UDE loading cell assigns the same names (truth, found, uni);
# whichever of the two loading cells ran last decides what is evaluated.

# Ground truth: found by naive person search by URL
truth = pd.read_json(
    "../../../data_mining/ground_truth/RUB_persons.json",
    encoding="utf-8",
)
# No column renaming here: later cells read the RUB columns
# "name", "mail_arbeit", "mail_2" and "jobtitel" as they come from the JSON.

# Result set: found by the people miner
found = pd.read_csv(
    "../results/people-rub.csv",
    encoding="utf-8",
    index_col=False,
    header=None,
    skiprows=1,
)
found.columns = ["title", "name", "email", "method", "homepage", "foundIn"]

# Selects the RUB-specific branches and output file names in later cells
uni = "rub"

In [None]:
# Display the loaded ground-truth frame for a visual sanity check
truth

In [None]:
# Display the frame of people found by the people miner for a visual sanity check
found

# Duplicates

In [None]:
# Boolean mask over the ground truth: True for every row whose name occurs
# more than once (keep=False marks all occurrences, not only the repeats).
duplicates = truth["name"].duplicated(keep=False)
# Inspect the affected rows with: truth[duplicates]

In [None]:
# Write duplicates in Ground Truth to a file - UDE
# Each line holds the UDE person URL (built from the "id" column) and the name.
# Guarded by `uni` so that running all cells with the RUB data loaded does not
# execute this UDE-specific variant.
if uni == "ude":
    with open(f"result_{uni}_duplicates.txt", "w", encoding="utf-8") as f:
        # The header must end with a newline; otherwise the first entry is
        # appended to the "# Generated by ..." comment line.
        f.write("# Duplicates found in Ground Truth data\n# Generated by evaluation.ipynb\n")
        for _, row in truth[duplicates].sort_values(by=["name"]).iterrows():
            f.write(f"https://www.uni-due.de/person/{row['id']:<6}\t{row['name']}\n")

In [None]:
# Write duplicates in Ground Truth to a file - RUB
# Each line holds only the duplicated name.
# Guarded by `uni` so that running all cells with the UDE data loaded does not
# overwrite the UDE report (same file name pattern) with this names-only variant.
if uni == "rub":
    with open(f"result_{uni}_duplicates.txt", "w", encoding="utf-8") as f:
        # The header must end with a newline; otherwise the first entry is
        # appended to the "# Generated by ..." comment line.
        f.write("# Duplicates found in Ground Truth data\n# Generated by evaluation.ipynb\n")
        for _, row in truth[duplicates].sort_values(by=["name"]).iterrows():
            f.write(f"{row['name']}\n")

# Evaluation

In [None]:
# Preprocessing
from people.name_analysis import full_title_group, is_normalized_name

# 1. Split academic titles off the ground-truth names
def _extract_title(full_name):
    """Return the title matched in `full_name`, or "" when there is none."""
    # The empty alternative "|()" guarantees a match, so .group() never fails.
    # NOTE(review): because the empty alternative already matches at position 0,
    # a title is presumably only captured when it starts the string - confirm
    # this is intended (depends on what full_title_group looks like).
    return re.search(full_title_group + r"|()", full_name).group()


def _strip_title(row):
    """Return the row's name without its title and without surrounding commas/whitespace."""
    remainder = row["name"].replace(row["title"], "")
    return remainder.strip().strip(",").strip()


truth["title"] = truth["name"].apply(_extract_title)
truth["name"] = truth.apply(_strip_title, axis=1)

# 2. Drop entries whose name is not a normalized name, then keep one row per name
has_valid_name = truth["name"].apply(is_normalized_name)
truth = truth[has_valid_name]
truth = truth.drop_duplicates(subset=["name"])
found = found.drop_duplicates(subset=["name"])

# 3. Filtering by "Wissenschaftliche Mitarbeiter" and by academic title is not
#    done here; the metrics cell selects those subsets itself.

In [None]:
from tabulate import tabulate, SEPARATING_LINE

# Utils
def eval(name, filter, base):
    """Build one row of the metrics table.

    Note: the function and its second parameter shadow the builtins ``eval``
    and ``filter``; the names are kept because other cells call it this way.

    Parameters:
        name:   label of the row.
        filter: frame whose rows are counted.
        base:   frame the count is put in relation to.

    Returns:
        A tuple ``(name, count, percentage)`` where ``count`` is the number of
        rows in ``filter`` and ``percentage`` is ``count`` relative to the
        number of rows in ``base``. An empty ``base`` yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    count = len(filter.index)
    total = len(base.index)
    # A share of an empty base is undefined; report 0 % so the table can still be built.
    percentage = (count / total) * 100 if total else 0.0
    return (name, count, percentage)

# Rows of the metrics table. It starts with the sizes of both data sets
# (each reported as 100 %) followed by a separator line.
data = [
    (label, len(frame.index), 100)
    for label, frame in (("Truth total", truth), ("Found total", found))
]
data.append(SEPARATING_LINE)

def add_metrics(f, t, name=None):
    """Compare found names against truth names and append the result rows to ``data``.

    Matching is done on exact equality of the "name" column.

    Parameters:
        f:    frame of found people (needs a "name" column).
        t:    frame of ground-truth people (needs a "name" column).
        name: optional label of the truth subset; when given, a separator line
              and a row with the size of ``t`` relative to the global ``truth``
              frame are appended first.

    Appends, in this order: precision, recall, F1, false positives, false
    negatives. Returns nothing; the module-level list ``data`` is mutated.
    """
    in_truth = f["name"].isin(t["name"])
    true_pos = f[in_truth]                      # X in found and in truth
    false_pos = f[~in_truth]                    # X in found but not in truth
    false_neg = t[~t["name"].isin(f["name"])]   # X in truth but not in found
    precision = eval("True positive (Precision)", true_pos, f)
    recall = eval("Found (Recall)", true_pos, t)
    if name:
        data.append(SEPARATING_LINE)
        data.append(eval(name, t, truth))
    data.append(precision)   # How many results are correct
    data.append(recall)      # How many names were actually found
    # Without any true positive, precision and recall are both 0 and the
    # harmonic mean would be 0/0; report an F1 of 0 instead of raising.
    score_sum = precision[-1] + recall[-1]
    f1 = 2 * ((precision[-1] * recall[-1]) / score_sum) if score_sum else 0.0
    data.append(("F1", None, f1))
    data.append(eval("Not correct (False positive)", false_pos, f))
    data.append(eval("Not found (False negative)", false_neg, t))

# Print metrics
# Each add_metrics call appends one group of rows to `data`; the finished
# table is written to result_<uni>_metrics.txt.

add_metrics(found, truth)
add_metrics(found, truth[truth["title"].str.len()>0], "People with title")
# The e-mail columns differ between the two ground-truth sources.
if uni == "ude":
    add_metrics(found, truth[truth["email"].notnull()], "People with email")
else:
    add_metrics(found, truth[truth["mail_arbeit"].notnull() | truth["mail_2"].notnull()], "People with email")
position_col = 'position' if uni == "ude" else "jobtitel"
# Raw string: "\." is a regex escape, not a Python string escape.
add_metrics(found, truth[truth[position_col].str.contains(r"wiss\.|wissenschaftl", case=False, na=False)], "Wiss. MA")
add_metrics(found, truth[truth[position_col].notnull()], "Mind. 1 position")

# Subsets of names starting with "A"/"a", computed once and reused below
found_start_a = found[found['name'].str.startswith(("A", "a",))]
truth_start_a = truth[truth['name'].str.startswith(("A", "a",))]
add_metrics(found_start_a, truth_start_a, "Names with A")
if uni == "ude":
    add_metrics(found_start_a, truth_start_a[truth_start_a["email"].notnull()], "Names with A and email")
else:
    add_metrics(found_start_a, truth_start_a[truth_start_a["mail_arbeit"].notnull() | truth_start_a["mail_2"].notnull()], "Names with A and email")

# add_metrics(found, truth[truth[position_col].str.contains(r"wiss\.|wissenschaftl", case=False, na=False) & truth["email"].notnull()], "Wiss. MA with email")

s = tabulate(data, floatfmt=",.2f", headers=("Metric", "Count", "%"))
# Explicit encoding, consistent with the other text files this notebook writes
with open(f"result_{uni}_metrics.txt", "w", encoding="utf-8") as f:
    f.write(s)

In [None]:
# CSV layouts used below
names_only_csv = dict(index=False, header=False, doublequote=False)
full_rows_csv = dict(index=False, header=False, doublequote=True, sep=";")

# Sorted name lists of both data sets
truth["name"].sort_values().to_csv(f"result_{uni}_truth.csv", **names_only_csv)
found["name"].sort_values().to_csv(f"result_{uni}_found.csv", **names_only_csv)

# Membership masks, computed once: is a truth name among the found names and vice versa
truth_was_found = truth["name"].isin(found["name"])
found_is_correct = found["name"].isin(truth["name"])

# Full rows of the four truth/found combinations, each sorted by name
for subset, target in (
    (truth[~truth_was_found], f"result_{uni}_not_found.csv"),
    (truth[truth_was_found], f"result_{uni}_truth_found.csv"),
    (found[~found_is_correct], f"result_{uni}_not_correct.csv"),
    (found[found_is_correct], f"result_{uni}_found_correct.csv"),
):
    subset.sort_values(by=["name"]).to_csv(target, **full_rows_csv)

# Saves found names starting with A and truth names starting with A and an email to files
initial_a = ("A", "a",)
found_with_a = found[found["name"].str.startswith(initial_a)]
found_with_a.sort_values(by=["name"]).to_csv(f"Auswertung {uni} Found.csv", **full_rows_csv)

# The e-mail columns differ between the two ground-truth sources.
if uni == "ude":
    truth_has_email = truth["email"].notnull()
else:
    truth_has_email = truth["mail_arbeit"].notnull() | truth["mail_2"].notnull()
truth_with_a_and_email = truth[truth["name"].str.startswith(initial_a) & truth_has_email]
truth_with_a_and_email.sort_values(by=["name"]).to_csv(f"Auswertung {uni} Truth.csv", **full_rows_csv)