# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime
from ruska import Ruska

In [15]:
# Experiment configuration, read by the experiment cell further down.
c = dict(
    dataset='adult',                  # selects the dirty/clean CSV pair to load
    error_fraction=0.1,               # used to build the dirty file name
    n_rows=None,                      # forwarded to raha.Dataset as n_rows
    labeling_budget=20,               # loop runs until this many tuples are labeled
    feature_generators=['vicinity', 'value', 'domain'],
    vicinity_orders=[1],
    classification_model='ABC',
    n_best_pdeps=5,
    vicinity_feature_generator='pdep',
    run=0,                            # renuver file names use run + 1
)

In [16]:
# --- Locate the dirty and clean CSV files of the configured dataset ---------
# Each dataset family has its own directory layout and file naming scheme.
dataset_name = c["dataset"]

if dataset_name in ("bridges", "cars", "glass", "restaurant"):  # renuver dataset
    # The file name carries the digits after the decimal point of the error
    # fraction (0.1 -> 1) followed by a 1-based run number.
    rate_formatted = int(str(c["error_fraction"]).split(".")[1])
    run = c["run"] + 1
    renuver_dir = f"../datasets/renuver/{dataset_name}"
    data_dict = dict(
        name=dataset_name,
        path=f"{renuver_dir}/{dataset_name}_{rate_formatted}_{run}.csv",
        clean_path=f"{renuver_dir}/clean.csv",
    )
elif dataset_name in ("beers", "flights", "hospital", "tax", "toy"):
    # These datasets ship exactly one dirty.csv / clean.csv pair.
    dataset_dir = f"../datasets/{dataset_name}"
    data_dict = dict(
        name=dataset_name,
        path=f"{dataset_dir}/dirty.csv",
        clean_path=f"{dataset_dir}/clean.csv",
    )
elif dataset_name in ("adult", "breast-cancer", "letter", "nursery"):
    # The dirty file is picked by the error fraction expressed in tenths
    # (0.1 -> dirty_1.csv) inside the MCAR sub-directory.
    rate_formatted = int(c["error_fraction"] * 10)
    dataset_dir = f"../datasets/{dataset_name}"
    data_dict = dict(
        name=dataset_name,
        path=f"{dataset_dir}/MCAR/dirty_{rate_formatted}.csv",
        clean_path=f"{dataset_dir}/clean.csv",
    )
else:
    raise ValueError("Unknown Dataset.")

# --- Load the data; the ground-truth errors are used as the detected cells --
data = raha.Dataset(data_dict, n_rows=c["n_rows"])
actual_errors = data.get_actual_errors_dictionary()
data.detected_cells = dict(actual_errors)

# --- Configure the correction run from the experiment configuration ---------
app = raha.Correction()
app.LABELING_BUDGET = c["labeling_budget"]
app.VERBOSE = False
for attribute, config_key in (
    ("FEATURE_GENERATORS", "feature_generators"),
    ("CLASSIFICATION_MODEL", "classification_model"),
    ("VICINITY_ORDERS", "vicinity_orders"),
    ("VICINITY_FEATURE_GENERATOR", "vicinity_feature_generator"),
    ("N_BEST_PDEPS", "n_best_pdeps"),
):
    setattr(app, attribute, c[config_key])

d = app.initialize_dataset(data)
print('initialized dataset')
app.initialize_models(d)
print('initialized models')

# --- Labeling loop -----------------------------------------------------------
# Every round samples one tuple, labels it from the ground truth, updates the
# models, regenerates features, predicts corrections and prints the resulting
# precision / recall / F1. It ends once the labeling budget is used up.
while len(d.labeled_tuples) < app.LABELING_BUDGET:
    app.sample_tuple(d, random_seed=None)
    app.label_with_ground_truth(d)
    app.update_models(d)
    print('updated')
    app.generate_features(d, synchronous=True)
    print('generated')
    app.predict_corrections(d)

    # Only the last three entries of the evaluation are reported.
    evaluation = d.get_data_cleaning_evaluation(d.corrected_cells)
    p, r, f = evaluation[-3:]
    print({"result": dict(precision=p, recall=r, f1=f)})

initialized dataset
initialized models
updated
generated
{'result': {'precision': 0.8283777434727163, 'recall': 0.2527027027027027, 'f1': 0.3872668225105509}}
updated
generated
{'result': {'precision': 0.6132093497229476, 'recall': 0.39426699426699424, 'f1': 0.4799481568256025}}
updated


KeyboardInterrupt: 