# Data Transformation Experiments

## This notebook runs a set of data transformation experiments.
## First, we need to set up the environment and observers.

In [1]:
%load_ext autoreload
%autoreload 2

from beakerx import *
from beakerx.object import beakerx

from sacred import Experiment
from sacred.observers import MongoObserver

from tqdm import tqdm_notebook as tqdm

beakerx.pandas_display_table()

## The following functions support running these experiments.

In [2]:
from pathlib import Path
from datafc.evaluation import Evaluator

# Root directory of the datasets; `run_dataset` joins the dataset name onto it.
# The path is relative, so it depends on the kernel's working directory.
data_folder = Path("../../data/standard")
# Sacred experiment that `run_dataset` is registered on via `@ex.main`.
# NOTE(review): `interactive=True` is presumably what allows Sacred to run
# inside a notebook -- confirm against the Sacred documentation.
ex = Experiment("jupyter_ex", interactive=True)
# Attach a MongoDB observer created with no arguments (library defaults).
# NOTE(review): presumably requires a reachable MongoDB instance -- confirm.
ex.observers.append(MongoObserver.create())


def run_scenario(
    evaluator, scenario_folder, mapping_method, string_similarity, with_flashfill
):
    """Run the active top-k experiment for one scenario folder and return its report.

    Every regular file in ``scenario_folder`` whose name contains ``"input"``,
    ``"transformed"`` or ``"groundtruth"`` is read as UTF-8 text, one value per
    line. Non-ASCII characters are dropped and surrounding whitespace is
    stripped from each value. Entries whose name matches none of the markers,
    and entries that are not regular files, are skipped without being opened,
    so stray files (e.g. binary metadata files) no longer abort the run.

    Args:
        evaluator: Object providing ``run_active_top_k_experiment`` and
            ``generate_scenario_report``.
        scenario_folder: Directory path object (``iterdir()`` and ``name`` are
            used); ``name`` is passed on as the scenario name.
        mapping_method: Not used by this function; kept so existing callers
            keep working.
        string_similarity: Not used by this function; kept so existing callers
            keep working.
        with_flashfill: Forwarded unchanged to ``run_active_top_k_experiment``.

    Returns:
        Whatever ``evaluator.generate_scenario_report`` returns for this scenario.
    """
    original_values = []
    target_values = []
    groundtruth_values = []

    # File-name marker -> list that collects the rows of matching files.
    values_by_marker = (
        ("input", original_values),
        ("transformed", target_values),
        ("groundtruth", groundtruth_values),
    )

    for file in scenario_folder.iterdir():
        # Classify the file once instead of once per row. A name containing
        # several markers feeds several lists, as before.
        destinations = [
            values for marker, values in values_by_marker if marker in file.name
        ]
        if not destinations or not file.is_file():
            continue

        with file.open(encoding="utf-8") as reader:
            for row in reader:
                # Drop every non-ASCII character, then trim whitespace/newline.
                value = row.encode("utf-8").decode("ascii", "ignore").strip()
                for values in destinations:
                    values.append(value)

    # Only the first 1000 values of each list are passed on. The literal 10 is
    # given to both the experiment and the report; presumably the "k" of the
    # top-k experiment -- TODO confirm against the Evaluator API.
    evaluator.run_active_top_k_experiment(
        scenario_folder.name,
        original_values[:1000],
        target_values[:1000],
        groundtruth_values[:1000],
        10,
        with_flashfill=with_flashfill,
    )

    scenario_report = evaluator.generate_scenario_report(scenario_folder.name, 10)
    return scenario_report


@ex.main
def run_dataset(dataset, mapping_method, mapping_features, with_flashfill):
    """Run every scenario of a dataset and return the dataset-level report.

    Each sub-directory of ``data_folder / dataset`` is treated as one scenario
    and handed to ``run_scenario`` with a single shared ``Evaluator``.
    Scenarios are processed in sorted path order so repeated runs visit them
    in the same sequence, and non-directory entries are skipped.

    Args:
        dataset: Name of the dataset folder under ``data_folder``.
        mapping_method: Passed to ``Evaluator`` and on to ``run_scenario``.
        mapping_features: Passed to ``Evaluator`` and on to ``run_scenario``.
        with_flashfill: Forwarded to ``run_scenario``.

    Returns:
        The result of ``evaluator.generate_dataset_report(dataset, 10)`` with
        an added ``"scenarios"`` entry holding the per-scenario reports.
    """
    evaluator = Evaluator(mapping_method, mapping_features)

    # Build a sorted list rather than passing the raw iterdir() generator:
    # the order becomes deterministic, stray files are filtered out, and tqdm
    # can show a real total because the list has a length.
    scenario_folders = sorted(
        entry for entry in (data_folder / f"{dataset}").iterdir() if entry.is_dir()
    )

    scenario_reports = [
        run_scenario(
            evaluator, scenario_folder, mapping_method, mapping_features, with_flashfill
        )
        for scenario_folder in tqdm(scenario_folders)
    ]

    # NOTE(review): 10 matches the literal used in run_scenario; presumably the
    # same top-k setting -- confirm against the Evaluator API.
    dataset_report = evaluator.generate_dataset_report(dataset, 10)
    dataset_report["scenarios"] = scenario_reports
    return dataset_report

## Results are added to MongoDB for experiment reproduction

In [None]:
# Settings for this run; each one becomes a config entry of the Sacred run
# and is passed to `run_dataset` under the same name.
dataset, mapping_method = "museum", "sim"
mapping_features = ["jaccard"]
with_flashfill = False

hyper_params = dict(
    dataset=dataset,
    mapping_method=mapping_method,
    mapping_features=mapping_features,
    with_flashfill=with_flashfill,
)

# Execute the experiment's main function and keep what it returned.
dataset_report = ex.run(config_updates=hyper_params).result

INFO - jupyter_ex - Running command 'run_dataset'
INFO - jupyter_ex - Started run with ID "28"


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
# Tabulate the per-scenario reports, keeping only the three listed columns,
# then round to two decimals for display.
scenarios_df = pd.DataFrame(
    data=dataset_report["scenarios"],
    columns=["name", "running_time", "active_learning_curve"],
)
scenarios_df = scenarios_df.round(2)

scenarios_df