# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    hours, remainder = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    h = f'{int(hours)}h ' if hours > 0 else ''
    m = f'{int(minutes)}m ' if minutes > 0 else ''
    s = f'{int(seconds)}s'
    
    return h+m+s

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    deltas = []
    i = 1
    
    while i+1 <= len(times):
        deltas.append(times[i] - times[i-1])
        i += 1
        
    avg = sum(deltas, datetime.timedelta())/len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [13]:
results = []

error_rates = range(1,6)
versions = range(1,6)
datasets = ['bridges', 'cars', 'glass', 'restaurant']

duration_experiment = len(datasets) * len(error_rates) * len(versions)
times = [datetime.datetime.now()]
i = 1


for dataset in datasets:
    for error_rate_pct in error_rates:
        for version in versions:
            data_dict = {"name": dataset,
                         "path": "../datasets/renuver/{0}/{0}_{1}_{2}.csv".format(dataset, error_rate_pct, version),
                         "clean_path": "../datasets/renuver/{0}/clean.csv".format(dataset)}

            data = raha.Dataset(data_dict)
            data.detected_cells = dict(data.get_actual_errors_dictionary())
            app = raha.Correction()
            app.LABELING_BUDGET = 20
            app.VERBOSE = False
            app.FEATURE_GENERATORS = ['domain', 'value', 'vicinity']
            app.CLASSIFICATION_MODEL = "ABC"
            app.VICINITY_ORDERS = [1, 2]
            app.VICINITY_FEATURE_GENERATOR = 'pdep'
            app.N_BEST_PDEPS = 5

            d = app.initialize_dataset(data)
            app.initialize_models(d)
            while len(d.labeled_tuples) < app.LABELING_BUDGET:
                app.sample_tuple(d, random_seed=None)
                app.label_with_ground_truth(d)
                app.update_models(d)
                app.generate_features(d, synchronous=True)
                app.predict_corrections(d, random_seed=None)

                p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
                result = {'dataset': dataset, 
                          'error_rate_pct': error_rate_pct,
                          'dataset_version': version,
                          'n_sample': len(d.labeled_tuples),
                          'precision': p, 'recall': r, 'f1': f}                    
                results.append(result)
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Run 1/100. 12s per run, estimate 20m 2s to finish
Run 2/100. 10s per run, estimate 17m 29s to finish
Run 3/100. 10s per run, estimate 16m 23s to finish
Run 4/100. 9s per run, estimate 15m 48s to finish
Run 5/100. 9s per run, estimate 15m 25s to finish
Run 6/100. 9s per run, estimate 15m 14s to finish
Run 7/100. 9s per run, estimate 15m 1s to finish
Run 8/100. 9s per run, estimate 14m 50s to finish
Run 9/100. 9s per run, estimate 14m 37s to finish
Run 10/100. 9s per run, estimate 14m 25s to finish
Run 11/100. 9s per run, estimate 14m 23s to finish
Run 12/100. 9s per run, estimate 14m 26s to finish
Run 13/100. 9s per run, estimate 14m 14s to finish
Run 14/100. 9s per run, estimate 14m 10s to finish
Run 15/100. 9s per run, estimate 13m 58s to finish
Run 16/100. 10s per run, estimate 14m 2s to finish
Run 17/100. 10s per run, estimate 14m 1s to finish
Run 18/100. 10s per run, estimate 14m 4s to finish
Run 19/100. 10s per run, estimate 13m 55s to finish
Run 20/100. 10s per run, estimate 13m 

In [14]:
print(results)

[{'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 1, 'precision': 0.75, 'recall': 0.21428571428571427, 'f1': 0.3333333333333333}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 2, 'precision': 0.8571428571428571, 'recall': 0.42857142857142855, 'f1': 0.5714285714285714}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 3, 'precision': 0.8888888888888888, 'recall': 0.5714285714285714, 'f1': 0.6956521739130435}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 4, 'precision': 0.9, 'recall': 0.6428571428571429, 'f1': 0.75}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 5, 'precision': 0.9090909090909091, 'recall': 0.7142857142857143, 'f1': 0.8}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 6, 'precision': 0.9166666666666666, 'recall': 0.7857142857142857, 'f1': 0.8461538461538461}, {'dataset': 'bridges', 'error