# Baran Experiment

In [9]:
%load_ext autoreload
%autoreload 2

In [10]:
import pandas
import IPython.display

import raha
import datetime

In [11]:
def format_delta(delta: datetime.timedelta):
    """Render a timedelta as a compact string such as ``1h 2m 3s``.

    The hour and minute parts are emitted only when they are greater than
    zero; the seconds part is always emitted. Every part is truncated to a
    whole number.
    """
    whole_hours, leftover = divmod(delta.total_seconds(), 3600)
    whole_minutes, secs = divmod(leftover, 60)

    pieces = []
    if whole_hours > 0:
        pieces.append(f'{int(whole_hours)}h ')
    if whole_minutes > 0:
        pieces.append(f'{int(whole_minutes)}m ')
    pieces.append(f'{int(secs)}s')

    return ''.join(pieces)

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    """Build a progress message with the average run time and remaining-time estimate.

    Args:
        times: Timestamps in chronological order. The difference between each
            consecutive pair is treated as the duration of one run.
        current_run_i: Number of the run that just finished.
        total_runs: Total number of runs in the experiment.

    Returns:
        A string of the form
        ``Run i/N. <avg> per run, estimate <remaining> to finish``. When fewer
        than two timestamps are available no duration can be computed, and a
        message saying so is returned instead.
    """
    prefix = f'Run {current_run_i}/{total_runs}.'

    # With fewer than two timestamps there are no intervals to average;
    # dividing by len(deltas) would raise ZeroDivisionError.
    if len(times) < 2:
        return f'{prefix} Not enough timing data to estimate'

    # Pair every timestamp with its successor to get the per-run durations.
    deltas = [later - earlier for earlier, later in zip(times, times[1:])]
    avg = sum(deltas, datetime.timedelta()) / len(deltas)

    # Never report a negative amount of remaining work if the counter
    # overshoots the announced total.
    remaining_runs = max(total_runs - current_run_i, 0)

    return f'{prefix} {format_delta(avg)} per run, estimate {format_delta(avg * remaining_runs)} to finish'

In [12]:
# Dataset descriptors for the five cars variants. Each dict carries the keys
# "name", "path" (dirty CSV) and "clean_path" (clean CSV).
cars_1, cars_2, cars_3, cars_4, cars_5 = (
    {"name": f"cars_{n}",
     "path": f"../datasets/cars_{n}/dirty.csv",
     "clean_path": f"../datasets/cars_{n}/clean.csv"}
    for n in range(1, 6)
)

In [19]:
# Run the correction experiment over every (run, dataset) combination and
# collect one precision/recall/F1 record per combination in `results`.
results = []

# Indices 1..5 for both the dataset variants and the runs.
datasets = range(1,6)
runs = range(1,6)

# Total number of loop iterations; used as the denominator of the progress message.
duration_experiment = len(datasets) * len(runs)
# Seed the timestamp list with the start time so the first iteration already
# yields one interval for the time estimate.
times = [datetime.datetime.now()]
# 1-based counter of completed iterations.
i = 1

for run in runs:
    for dataset in datasets:
        # Each dirty file is addressed as cars_<run>_<dataset>.csv; every
        # combination is compared against the same clean.csv.
        data_dict = {"name": "cars_{0}_{1}".format(run, dataset),
                     "path": "../datasets/renuver/cars/cars_{0}_{1}.csv".format(run, dataset),
                     "clean_path": "../datasets/renuver/cars/clean.csv"
                    }

        data = raha.Dataset(data_dict)
        # NOTE(review): presumably this feeds the ground-truth error cells in as
        # the detection result so only correction is measured — confirm against raha.
        data.detected_cells = dict(data.get_actual_errors_dictionary())
        # Fresh Correction instance per combination, configured via its attributes.
        # NOTE(review): the meaning of the individual options is defined by raha
        # and is not visible here — confirm against the library.
        app = raha.Correction()
        app.LABELING_BUDGET = 20
        app.VERBOSE = False
        app.FEATURE_GENERATORS = ['value', 'domain', 'vicinity']
        app.CLASSIFICATION_MODEL = "ABC"
        app.VICINITY_ORDERS = [1]
        app.VICINITY_FEATURE_GENERATOR = 'pdep'
        app.N_BEST_PDEPS = 5
        app.GPDEP_CORRECTION_SCORE_THRESHOLD = 0


        d = app.initialize_dataset(data)
        app.initialize_models(d)

        # Repeat sample -> label -> update -> featurize -> predict until the
        # number of labeled tuples reaches the labeling budget.
        # NOTE(review): random_seed=None presumably leaves sampling and prediction
        # unseeded, so results may differ between executions — confirm.
        while len(d.labeled_tuples) < app.LABELING_BUDGET:
            app.sample_tuple(d, random_seed=None)
            app.label_with_ground_truth(d)
            app.update_models(d)
            app.generate_features(d, synchronous=True)
            app.predict_corrections(d, random_seed=None)

        # The last three values of the evaluation are unpacked as
        # precision, recall and F1.
        p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
        result = {'dataset': d.name, 
                  'run': run,
                  'precision': p, 'recall': r, 'f1': f}                    
        results.append(result)
        print(result)

        # time estimates for the measurement
        times.append(datetime.datetime.now())
        print(estimate_time_to_finish(times, i, duration_experiment))
        i += 1

{'dataset': 'cars_1_1', 'run': 1, 'precision': 0.9568627450980393, 'recall': 0.9568627450980393, 'f1': 0.9568627450980393}
Run 1/25. 16s per run, estimate 6m 35s to finish
{'dataset': 'cars_1_2', 'run': 1, 'precision': 0.9619921363040629, 'recall': 0.9619921363040629, 'f1': 0.9619921363040629}
Run 2/25. 14s per run, estimate 5m 43s to finish
{'dataset': 'cars_1_3', 'run': 1, 'precision': 0.9698162729658792, 'recall': 0.9698162729658792, 'f1': 0.9698162729658792}
Run 3/25. 15s per run, estimate 5m 36s to finish
{'dataset': 'cars_1_4', 'run': 1, 'precision': 0.9622395833333334, 'recall': 0.9622395833333334, 'f1': 0.9622395833333334}
Run 4/25. 14s per run, estimate 5m 3s to finish
{'dataset': 'cars_1_5', 'run': 1, 'precision': 0.9380764163372859, 'recall': 0.9380764163372859, 'f1': 0.9380764163372859}
Run 5/25. 14s per run, estimate 4m 46s to finish
{'dataset': 'cars_2_1', 'run': 2, 'precision': 0.9296482412060302, 'recall': 0.9296482412060302, 'f1': 0.9296482412060302}
Run 6/25. 15s per 