# Result Report

In [1]:
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

# Name of the dataset collection; used below to pick the sub-directory of the
# output folder that is scanned for per-dataset results.
dataset = "rbo"


def not_equal(df1, df2):
    """Return a boolean mask that is True where *df1* and *df2* differ.

    Cells that are null in both inputs are reported as equal (False), even
    though a plain ``!=`` comparison would flag them as different.
    """
    both_missing = df1.isnull() & df2.isnull()
    differs = df1 != df2
    return differs & ~both_missing


def _truncate_cell(value, limit=53):
    """Return ``value[:limit]`` for sliceable values, else *value* unchanged."""
    try:
        return value[:limit]
    except TypeError:
        # Non-sliceable cells (e.g. NaN, numbers) are passed through as-is.
        return value


def diff_dfs(df1, df2, compare_func=not_equal):
    """Tabulate the cells in which two DataFrames with equal columns differ.

    Args:
        df1: DataFrame whose differing cell values end up in the
            "Outliers" column of the result.
        df2: DataFrame whose corresponding cell values end up in the
            "Normal values" column. It is cast to the dtypes of *df1* when
            the dtypes differ.
        compare_func: Callable taking ``(df1, df2)`` and returning a boolean
            mask that is True where the cells are considered different.

    Returns:
        ``None`` when ``df1.equals(df2)``; otherwise a DataFrame with the
        columns "Outliers" and "Normal values", one row per differing cell
        (row-major order), values truncated to 53 characters, and a fresh
        0..n-1 index.

    Raises:
        AssertionError: If the column labels of the two frames differ.
    """
    import warnings

    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # ``Index.equals`` also copes with a differing number of columns.
    if not df1.columns.equals(df2.columns):
        raise AssertionError("DataFrame column names are different")

    if (df1.dtypes != df2.dtypes).any():
        warnings.warn("Data Types are different, trying to convert", stacklevel=2)
        df2 = df2.astype(df1.dtypes)

    if df1.equals(df2):
        return None

    diff_mask = compare_func(df1, df2)
    # Positional (row, column) coordinates of every differing cell.
    difference_locations = np.where(diff_mask)
    result = pd.DataFrame(
        {
            "Outliers": df1.values[difference_locations],
            "Normal values": df2.values[difference_locations],
        }
    )
    # Keep the table readable by cutting long cell values.
    for column in ("Outliers", "Normal values"):
        result[column] = result[column].apply(_truncate_cell)
    return result


# Locations of the pipeline output and of the raw / cleaned reference data.
# NOTE(review): the data paths hard-code "rbo" while output_path follows
# `dataset` — confirm whether they should track `dataset` as well.
output_path = Path("../../../output") / dataset
raw_path = Path("../../../data/test/rbo/raw")
cleaned_path = Path("../../../data/test/rbo/cleaned")

# dataset name -> table of differing cells between raw and cleaned data
name2diff = {}
# dataset name -> one row of precision / recall / f1-score per step
name2report = defaultdict(pd.DataFrame)
# dataset name -> scores.csv content per step; fixed at 10 slots, indexed by step number
name2debug = defaultdict(lambda: [None] * 10)

for dataset_path in output_path.iterdir():
    if dataset_path.name == "summary":
        continue

    # Read everything as plain strings so raw and cleaned cells compare verbatim.
    raw_df = pd.read_csv(
        raw_path / f"{dataset_path.name}.csv", dtype=str, keep_default_na=False
    )
    cleaned_df = pd.read_csv(
        cleaned_path / f"{dataset_path.name}.csv", dtype=str, keep_default_na=False
    )
    name2diff[dataset_path.name] = diff_dfs(raw_df, cleaned_df)

    # iterdir() yields entries in arbitrary order; visit the steps numerically
    # so the report rows (plotted as a line later) follow the step sequence.
    report_rows = []
    for step_path in sorted(dataset_path.iterdir(), key=lambda p: int(p.name)):
        report_df = pd.read_csv(step_path / "report.csv")
        score_df = pd.read_csv(step_path / "scores.csv")

        # Row of report.csv whose "index" column holds the string "False";
        # .item() below requires exactly one such row.
        report_values = report_df[report_df["index"] == "False"]

        report_rows.append(
            {
                "precision": report_values["precision"].item(),
                "recall": report_values["recall"].item(),
                "f1-score": report_values["f1-score"].item(),
            }
        )

        name2debug[dataset_path.name][int(step_path.name)] = score_df

    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
    if report_rows:
        name2report[dataset_path.name] = pd.DataFrame(report_rows)

In [2]:
from labext.prelude import M, A, W
import matplotlib.pyplot as plt
from IPython.display import HTML, Markdown, display, Javascript

def render(index):
    """Display the report for the dataset at position *index* in ``name2diff``.

    Shows, in order: a heading with the dataset name, a line plot of the
    dataset's entry in ``name2report``, a table of the raw/cleaned
    differences, and a nested slider that pages through the dataset's
    entries in ``name2debug``.
    """
    name = list(name2diff)[index]
    diff_df = name2diff[name]

    display(HTML(f"<h2>Dataset: {name}</h2>"))

    display(HTML("<h3>Result:</h3>"))
    name2report[name].plot(kind="line")
    plt.show()

    display(HTML("<h3>Outlier Examples:</h3>"))
    if diff_df is None:
        # diff_dfs returns None when raw and cleaned data are identical.
        display(HTML("<p>No differences between raw and cleaned data.</p>"))
    else:
        M.DataTable.render(diff_df)

    display(HTML("<h3>Predictions:</h3>"))

    def render_debug(index):
        debug_df = name2debug[name][index]
        if debug_df is None:
            # Slots without a loaded scores.csv keep their None placeholder.
            display(HTML("<p>No predictions available for this step.</p>"))
        else:
            M.DataTable.render(debug_df)

    # name2debug holds 10 slots per dataset, hence slider positions 0..9.
    A.slider(render_debug, max=9)
    
# Top-level slider: one position per dataset collected in name2diff.
A.slider(render, max=len(name2diff) - 1)

HBox(children=(Button(description='Previous', icon='arrow-circle-left', style=ButtonStyle()), Button(descripti…

Output()