In [1]:
import pandas as pd
from error_analysis import main
import os
from IPython.display import display, HTML

In [2]:
# Input locations: the directory the gold files are read from and the directory
# holding the predictions that are compared against them.
gold_dir = "/home/lisa/projects/cross_ling_drug_ner/data/converted/by_language/de/test/"
predicted_dir = "/home/lisa/projects/cross_ling_drug_ner/models_ensembled/mono_de/mono_de_models_with_preds_ensembled/ensemble/"

# Cut predicted_dir at its last "/"; because the path ends with a slash this is
# the same directory without the trailing separator.
results_dir = predicted_dir.rsplit("/", 1)[0]

In [3]:
# Run the error analysis over the gold and predicted directories; it returns one
# frame for strict and one for lenient evaluation.
# NOTE(review): the saved output of this cell is "KeyError: 'T2'". The cause is not
# visible in this notebook -- confirm the inputs before relying on the cells below.
df_strict, df_lenient = main(
    gold_dir,
    predicted_dir,
    verbose=False,
    num_examples="all",
)

KeyError: 'T2'

In [None]:

def highlight_selected_text(entity, text):
    """Return ``text`` as an HTML fragment with every ``entity`` occurrence highlighted.

    The result is meant to be rendered without further escaping, so the plain
    text parts (and the entity itself) are HTML-escaped here; otherwise
    characters such as ``<`` or ``&`` in the source text would be interpreted
    as markup by the browser.

    Args:
        entity: The exact substring to highlight.
        text: The text in which to highlight it.

    Returns:
        The escaped text, with each occurrence of ``entity`` wrapped in a
        green, bold ``<span>``. If ``entity`` is empty, the escaped text is
        returned without any highlighting.
    """
    import html  # local import so this cell does not depend on the import cell

    if not entity:
        # str.replace("", ...) would insert a span between every character.
        return html.escape(text, quote=False)

    highlighted = f'<span style="color: green; font-weight: bold">{html.escape(entity, quote=False)}</span>'
    # Split on the raw entity first and escape afterwards, so that escaping can
    # never create or destroy a match (e.g. an entity "amp" inside "&amp;").
    return highlighted.join(html.escape(part, quote=False) for part in text.split(entity))

def get_unique_entites(false_entities, file_names, text_dir=None, context_chars=200):
    """Collect the distinct entity strings and a per-occurrence context table.

    Args:
        false_entities: One list per file; each element is indexable with the
            entity text at position 0 and its start/end character offsets at
            positions 1 and 2.
        file_names: File names aligned with ``false_entities``. ``".ann"`` in a
            name is replaced by ``".txt"`` to locate the text file.
        text_dir: Directory containing the ``.txt`` files. Defaults to the
            notebook-level ``gold_dir``.
        context_chars: Number of characters of context kept on each side of
            the entity.

    Returns:
        A tuple ``(unique, table)``: the set of entity strings, and a DataFrame
        with columns ``file``, ``entity``, ``start``, ``end``, ``sentence_h``
        (context with the entity highlighted as HTML) and ``sentence`` (plain
        context), sorted by ``file``. The table is empty, but still has these
        columns, when there are no entities at all.
    """
    if text_dir is None:
        text_dir = gold_dir

    columns = ["file", "entity", "start", "end", "sentence_h", "sentence"]
    unique = set()
    errors_per_file = []
    for file_name, entities in zip(file_names, false_entities):
        if not entities:
            continue
        text_path = os.path.join(text_dir, file_name.replace(".ann", ".txt"))
        # NOTE(review): the file is read with the platform default encoding.
        with open(text_path, "r") as read_handle:
            text = read_handle.read()
        for element in entities:
            entity, start, end = element[0], element[1], element[2]
            # Clamp at 0: a negative slice start would count from the end of
            # the text and yield a wrong (usually empty) context for entities
            # located within the first `context_chars` characters.
            window_start = max(0, start - context_chars)
            sentence = "..." + text[window_start: end + context_chars] + "..."
            unique.add(entity)
            errors_per_file.append({"file": file_name,
                                    "entity": entity,
                                    "start": start,
                                    "end": end,
                                    "sentence_h": highlight_selected_text(entity=entity, text=sentence),
                                    "sentence": sentence
                                   })

    # Passing the column names keeps sort_values("file") valid for an empty list.
    errors_df = pd.DataFrame(errors_per_file, columns=columns)
    return unique, errors_df.sort_values("file").reset_index(drop=True)

def clean_up_dfs(df):
    """Sort a result frame by file and spread its 'FP'/'FN' lists over columns.

    Args:
        df: Frame with a ``file`` column and list-valued ``FP`` and ``FN`` columns.

    Returns:
        A tuple ``(frame, unique_fps, unique_fns, fps_per_file, fns_per_file)``:
        the widened frame rounded to two decimals, followed by the unique
        entity sets and per-file tables produced by ``get_unique_entites`` for
        the false positives and the false negatives.
    """
    ordered = df.sort_values("file").reset_index(drop=True)
    files = list(ordered["file"])

    # Per-row entity lists, also turned into the unique sets / per-file tables.
    fp_lists = ordered['FP'].tolist()
    unique_fps, fps_per_file = get_unique_entites(fp_lists, files)
    fn_lists = ordered['FN'].tolist()
    unique_fns, fns_per_file = get_unique_entites(fn_lists, files)

    # One column per list position, named FP0, FP1, ... and FN0, FN1, ...
    fp_columns = pd.DataFrame(fp_lists, index=ordered.index).add_prefix('FP')
    fn_columns = pd.DataFrame(fn_lists, index=ordered.index).add_prefix('FN')

    # Replace the list-valued columns with the widened ones.
    remaining = ordered.drop(columns=['FP', "FN"])
    widened = pd.concat([remaining, fp_columns, fn_columns], axis=1).round(2)

    return widened, unique_fps, unique_fns, fps_per_file, fns_per_file

In [None]:
# Widen both result frames and collect their unique-entity sets and per-file tables.
# NOTE: df_strict / df_lenient are overwritten here. clean_up_dfs reads the 'FP'
# and 'FN' columns and then drops them, so this cell cannot be re-run without
# first re-running the cell that calls main().
(
    df_strict,
    unique_fps_strict,
    unique_fns_strict,
    fps_per_file_strict,
    fns_per_file_strict,
) = clean_up_dfs(df_strict)
(
    df_lenient,
    unique_fps_lenient,
    unique_fns_lenient,
    fps_per_file_lenient,
    fns_per_file_lenient,
) = clean_up_dfs(df_lenient)

In [None]:
# Largest per-file counts in the strict results, printed one per line.
summary_columns = {
    "max #false negatives": "#FN",
    "max #false positives": "#FP",
    "max #gold entities": "#gold_entities",
}
for label, column in summary_columns.items():
    print(f"{label}: {df_strict[column].max()}")

In [None]:
# Two views of the lenient false negatives: one without the HTML-highlighted
# column (used for the Excel export) and one without the plain context column
# (used for display in the notebook).
fns_per_file_lenient_csv = fns_per_file_lenient.drop(columns="sentence_h")
fns_per_file_lenient_html = fns_per_file_lenient.drop(columns="sentence")

# DataFrame.to_html() ignores Styler settings, so the table is rendered through
# the Styler itself to make set_sticky() take effect. The Styler does not escape
# cell contents by default, so the highlight <span> markup is kept.
html_table = fns_per_file_lenient_html.style.set_sticky(axis="index").to_html()
display(HTML(html_table))

In [None]:
# Two views of the lenient false positives: one without the HTML-highlighted
# column (used for the Excel export) and one without the plain context column
# (used for display in the notebook).
fps_per_file_lenient_csv = fps_per_file_lenient.drop(columns="sentence_h")
fps_per_file_lenient_html = fps_per_file_lenient.drop(columns="sentence")

# DataFrame.to_html() ignores Styler settings, so the table is rendered through
# the Styler itself to make set_sticky() take effect. The Styler does not escape
# cell contents by default, so the highlight <span> markup is kept.
html_table = fps_per_file_lenient_html.style.set_sticky(axis="index").to_html()
display(HTML(html_table))

In [None]:
# FN = entities that are in the gold annotations but were not detected by the system.
# Two views of the strict false negatives: one without the HTML-highlighted
# column (used for the Excel export) and one without the plain context column
# (used for display in the notebook).
fns_per_file_strict_csv = fns_per_file_strict.drop(columns="sentence_h")
# Optional CSV export:
# fns_per_file_strict_csv.to_csv(os.path.join(results_dir, "fns_strict.csv"))
fns_per_file_strict_html = fns_per_file_strict.drop(columns="sentence")

# DataFrame.to_html() ignores Styler settings, so the table is rendered through
# the Styler itself to make set_sticky() take effect. The Styler does not escape
# cell contents by default, so the highlight <span> markup is kept.
html_table = fns_per_file_strict_html.style.set_sticky(axis="index").to_html()
display(HTML(html_table))

In [None]:
# FP = entities that were detected by the system but are not in the gold annotations.
# Two views of the strict false positives: one without the HTML-highlighted
# column (used for the Excel export) and one without the plain context column
# (used for display in the notebook).
fps_per_file_strict_csv = fps_per_file_strict.drop(columns="sentence_h")
# Optional CSV export:
# fps_per_file_strict_csv.to_csv(os.path.join(results_dir, "fps_strict.csv"))
fps_per_file_strict_html = fps_per_file_strict.drop(columns="sentence")

# DataFrame.to_html() ignores Styler settings, so the table is rendered through
# the Styler itself to make set_sticky() take effect. The Styler does not escape
# cell contents by default, so the highlight <span> markup is kept.
html_table = fps_per_file_strict_html.style.set_sticky(axis="index").to_html()
display(HTML(html_table))

In [None]:
#table = pd.read_html(html_table)[0] 
# Store the dataframe in Excel file
#table.to_excel("test_table.xlsx")

In [None]:
# Show the strict result frame as returned by clean_up_dfs (sorted by file,
# FP/FN lists spread over FP0.../FN0... columns, values rounded to two decimals).
df_strict

In [None]:
# One column per error set, each holding that set's entity strings in sorted
# order. The sets differ in size, so shorter columns are padded with NaN.
unique_xps = {
    column_name: pd.Series(sorted(entity_set))
    for column_name, entity_set in (
        ("unique_fps_strict", unique_fps_strict),
        ("unique_fps_lenient", unique_fps_lenient),
        ("unique_fns_strict", unique_fns_strict),
        ("unique_fns_lenient", unique_fns_lenient),
    )
}
unique_xps_df = pd.DataFrame(unique_xps)
unique_xps_df

In [None]:
# df_strict.to_csv(os.path.join(results_dir, "results_ner_strict.csv"))

In [None]:
# df_lenient.to_csv(os.path.join(results_dir, "results_ner_lenient.csv"))

- find error groups
- collect drugs that were not detected (these, plus drugs from the list, could be added in a postprocessing step)
- find entities that count as FNs under strict matching but are detected under lenient matching
- check if we can combine words separated by "/"
- get gold annotation and check overlaps with false positives

In [None]:
# write everything to one excel with one sheet per dataframe

def write_to_file():
    """Write all result tables to one Excel workbook, one sheet per DataFrame.

    The workbook ``error_analysis_ner.xlsx`` is created in ``results_dir``
    (which is created first if it does not exist). The frames are read from
    the notebook-level variables at call time.
    """
    os.makedirs(results_dir, exist_ok=True)

    # Sheet name -> frame, in the order the sheets appear in the workbook.
    sheets = {
        'strict_overview': df_strict,
        'lenient_overview': df_lenient,
        'fps_strict': fps_per_file_strict_csv,
        'fns_strict': fns_per_file_strict_csv,
        'fps_lenient': fps_per_file_lenient_csv,
        'fns_lenient': fns_per_file_lenient_csv,
        "unique_fps_fns": unique_xps_df,
    }

    workbook_path = os.path.join(results_dir, 'error_analysis_ner.xlsx')
    # The context manager saves and closes the workbook, also when a sheet
    # fails to write. ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Export every result table to error_analysis_ner.xlsx inside results_dir.
write_to_file()

In [None]:
#unique_fns_strict

In [None]:
# FP = entities that were detected by the system but are not in the gold annotations.
# Shows the set of distinct FP entity strings collected from the strict results.
unique_fps_strict

In [None]:
# Shows the set of distinct FN entity strings collected from the lenient results.
unique_fns_lenient

In [None]:
# Shows the set of distinct FP entity strings collected from the lenient results.
unique_fps_lenient