# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    hours, remainder = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    h = f'{int(hours)}h ' if hours > 0 else ''
    m = f'{int(minutes)}m ' if minutes > 0 else ''
    s = f'{int(seconds)}s'
    
    return h+m+s

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    deltas = []
    i = 1
    
    while i+1 <= len(times):
        deltas.append(times[i] - times[i-1])
        i += 1
        
    avg = sum(deltas, datetime.timedelta())/len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [4]:
letter = {"name": "letter",
         "path": "../datasets/letter/dirty.csv",
         "clean_path": "../datasets/letter/clean.csv"}

adult = {
    "name": "adult",
    "path": "../datasets/adult/dirty.csv",
    "clean_path": "../datasets/adult/clean.csv"
}
beers = {
    "name": "beers",
    "path": "../datasets/beers/dirty.csv",
    "clean_path": "../datasets/beers/clean.csv"
}
flights = {
    "name": "flights",
    "path": "../datasets/flights/dirty.csv",
    "clean_path": "../datasets/flights/clean.csv"
}
movies = {
    "name": "movies",
    "path": "../datasets/movies_1/dirty.csv",
    "clean_path": "../datasets/movies_1/clean.csv"
}
rayyan = {
    "name": "rayyan",
    "path": "../datasets/rayyan/dirty.csv",
    "clean_path": "../datasets/rayyan/clean.csv"
}
tax = {
    "name": "tax",
    "path": "../datasets/tax/dirty.csv",
    "clean_path": "../datasets/tax/clean.csv"
}
toy = {
    "name": "toy",
    "path": "../datasets/toy/dirty.csv",
    "clean_path": "../datasets/toy/clean.csv"
}
hospital = {
    "name": "hospital",
    "path": "../datasets/hospital/dirty.csv",
    "clean_path": "../datasets/hospital/clean.csv"
}

In [6]:
datasets = [beers, hospital, rayyan, flights]
results = []
n_pdeps = 4
vicinity_order = [1]
feature_generators = ['pdep', 'naive']
generator_setups = [["value", "domain", "vicinity"]]


for dataset_dictionary in datasets:
    for feat in feature_generators:
        for run in range(10):
            data = raha.Dataset(dataset_dictionary)
            data.detected_cells = dict(data.get_actual_errors_dictionary())
            app = raha.Correction()
            app.LABELING_BUDGET = 20
            app.VERBOSE = False
            app.FEATURE_GENERATORS = ["value", "domain", "vicinity"]
            app.CLASSIFICATION_MODEL = "ABC"
            app.VICINITY_ORDERS = vicinity_order
            app.VICINITY_FEATURE_GENERATOR = feat
            app.N_BEST_PDEPS = n_pdeps

            d = app.initialize_dataset(data)
            app.initialize_models(d)

            while len(d.labeled_tuples) < app.LABELING_BUDGET:
                app.sample_tuple(d, random_seed=None)
                app.label_with_ground_truth(d)
                app.update_models(d)
                app.generate_features(d, synchronous=False)
                app.predict_corrections(d, random_seed=None)

            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            result = {'dataset': d.name, 
                      'run': run,
                      'feature_generator': feat, 
                      'precision': p, 'recall': r, 'f1': f, 'run': run}
            results.append(result)
            print(result)

{'dataset': 'beers', 'run': 0, 'feature_generator': 'pdep', 'precision': 0.9529845586540677, 'recall': 0.9479596515359927, 'f1': 0.9504654637398001}
{'dataset': 'beers', 'run': 1, 'feature_generator': 'pdep', 'precision': 0.8616874135546335, 'recall': 0.8569463548830811, 'f1': 0.8593103448275863}
{'dataset': 'beers', 'run': 2, 'feature_generator': 'pdep', 'precision': 0.9741876008296843, 'recall': 0.969050894085282, 'f1': 0.9716124583381219}
{'dataset': 'beers', 'run': 3, 'feature_generator': 'pdep', 'precision': 0.8959849729983564, 'recall': 0.874828060522696, 'f1': 0.885280129915323}
{'dataset': 'beers', 'run': 4, 'feature_generator': 'pdep', 'precision': 0.981468449448745, 'recall': 0.9591930307198533, 'f1': 0.9702028985507247}
{'dataset': 'beers', 'run': 5, 'feature_generator': 'pdep', 'precision': 0.9699741965751818, 'recall': 0.9479596515359927, 'f1': 0.958840579710145}
{'dataset': 'beers', 'run': 6, 'feature_generator': 'pdep', 'precision': 0.8232311592532842, 'recall': 0.818890

In [7]:
print(results)

[{'dataset': 'beers', 'run': 0, 'feature_generator': 'pdep', 'precision': 0.9529845586540677, 'recall': 0.9479596515359927, 'f1': 0.9504654637398001}, {'dataset': 'beers', 'run': 1, 'feature_generator': 'pdep', 'precision': 0.8616874135546335, 'recall': 0.8569463548830811, 'f1': 0.8593103448275863}, {'dataset': 'beers', 'run': 2, 'feature_generator': 'pdep', 'precision': 0.9741876008296843, 'recall': 0.969050894085282, 'f1': 0.9716124583381219}, {'dataset': 'beers', 'run': 3, 'feature_generator': 'pdep', 'precision': 0.8959849729983564, 'recall': 0.874828060522696, 'f1': 0.885280129915323}, {'dataset': 'beers', 'run': 4, 'feature_generator': 'pdep', 'precision': 0.981468449448745, 'recall': 0.9591930307198533, 'f1': 0.9702028985507247}, {'dataset': 'beers', 'run': 5, 'feature_generator': 'pdep', 'precision': 0.9699741965751818, 'recall': 0.9479596515359927, 'f1': 0.958840579710145}, {'dataset': 'beers', 'run': 6, 'feature_generator': 'pdep', 'precision': 0.8232311592532842, 'recall': 0