# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    """Render a timedelta as a compact string such as ``1h 2m 3s``.

    The hour part and the minute part are each left out when their value is
    not positive; the seconds part is always present. Every component is
    truncated to an integer with ``int``.
    """
    hours, leftover = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(leftover, 60)

    pieces = []
    if hours > 0:
        pieces.append(f'{int(hours)}h ')
    if minutes > 0:
        pieces.append(f'{int(minutes)}m ')
    pieces.append(f'{int(seconds)}s')

    return ''.join(pieces)

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    """Build a progress line with the average run duration and an ETA.

    Args:
        times: Timestamps in recording order. The experiment loops in this
            notebook seed the list with the start time and append one
            timestamp after every finished run.
        current_run_i: Number of the run that just finished.
        total_runs: Total number of runs planned.

    Returns:
        A string of the form
        ``Run i/N. <avg> per run, estimate <remaining> to finish``.
        When fewer than two timestamps are available no interval can be
        measured, so a progress line without an estimate is returned instead.
    """
    n_intervals = len(times) - 1
    if n_intervals < 1:
        # Without at least one measured interval the average would be a
        # division by zero; report progress only.
        return f'Run {current_run_i}/{total_runs}. Not enough timing data for an estimate yet'

    # The consecutive differences telescope, so their sum is simply
    # last - first; dividing by the interval count gives the mean duration.
    avg = (times[-1] - times[0]) / n_intervals
    remaining = avg * (total_runs - current_run_i)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(remaining)} to finish'

In [6]:
# Experiment: repeated correction runs on the 'letter' dataset. For every run
# the labeling loop is executed until the labeling budget is used up, then the
# precision / recall / F1 of the resulting corrections are recorded.
results = []

runs = range(10)
datasets = ['letter']

# Bookkeeping for the progress / ETA line printed after each run.
duration_experiment = len(datasets) * len(runs)
times = [datetime.datetime.now()]
i = 1


for dataset_name in datasets:
    for run in runs:
        data_dict = {
            "name": dataset_name,
            "path": f"../datasets/{dataset_name}/dirty.csv",
            "clean_path": f"../datasets/{dataset_name}/clean.csv"}

        data = raha.Dataset(data_dict)
        # Use the ground-truth errors as the detected cells, so only the
        # correction step is measured.
        data.detected_cells = dict(data.get_actual_errors_dictionary())
        app = raha.Correction()
        app.LABELING_BUDGET = 20
        app.VERBOSE = False
        app.FEATURE_GENERATORS = ['value', 'domain', 'vicinity']
        app.CLASSIFICATION_MODEL = "ABC"
        app.VICINITY_ORDERS = [1]
        app.VICINITY_FEATURE_GENERATOR = 'pdep'
        app.IMPUTER_CACHE_MODEL = True  # datasets are different, so train different models.
        app.N_BEST_PDEPS = 5
        app.GPDEP_CORRECTION_SCORE_THRESHOLD = 0

        d = app.initialize_dataset(data)
        app.initialize_models(d)

        # Sample, label and re-predict until the labeling budget is spent.
        while len(d.labeled_tuples) < app.LABELING_BUDGET:
            app.sample_tuple(d, random_seed=None)
            app.label_with_ground_truth(d)
            app.update_models(d)
            app.generate_features(d, synchronous=True)
            app.predict_corrections(d, random_seed=None)

        # Score once, after the labeling loop has finished. Evaluating inside
        # the loop repeated the work on every iteration and left p/r/f
        # undefined (or stale from the previous run) whenever the loop body
        # never executed.
        p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
        result = {'dataset': dataset_name,
                  'run': run,
                  'precision': p, 'recall': r, 'f1': f}
        results.append(result)
        print(result)

        # time estimates for the measurement
        times.append(datetime.datetime.now())
        print(estimate_time_to_finish(times, i, duration_experiment))
        i += 1

{'dataset': 'letter', 'run': 0, 'precision': 0.45352387932087457, 'recall': 0.4528324897859626, 'f1': 0.4531779208494798}
Run 1/10. 3m 24s per run, estimate 30m 37s to finish
{'dataset': 'letter', 'run': 1, 'precision': 0.501198738170347, 'recall': 0.48441978169400574, 'f1': 0.49266643926943465}
Run 2/10. 3m 31s per run, estimate 28m 8s to finish
{'dataset': 'letter', 'run': 2, 'precision': 0.49743814046488377, 'recall': 0.4854564302701384, 'f1': 0.4913742554701725}
Run 3/10. 3m 30s per run, estimate 24m 35s to finish
{'dataset': 'letter', 'run': 3, 'precision': 0.4261845234465516, 'recall': 0.4261845234465516, 'f1': 0.4261845234465516}
Run 4/10. 3m 29s per run, estimate 20m 55s to finish
{'dataset': 'letter', 'run': 4, 'precision': 0.44998156568759984, 'recall': 0.4465516190011586, 'f1': 0.4482600312184372}
Run 5/10. 3m 30s per run, estimate 17m 30s to finish
{'dataset': 'letter', 'run': 5, 'precision': 0.4691149834488789, 'recall': 0.45801573266662604, 'f1': 0.46349892008639315}
Run 

In [7]:
# Dump the raw list of per-run metric dicts accumulated in `results`.
print(results)

[{'dataset': 'letter', 'run': 0, 'precision': 0.45352387932087457, 'recall': 0.4528324897859626, 'f1': 0.4531779208494798}, {'dataset': 'letter', 'run': 1, 'precision': 0.501198738170347, 'recall': 0.48441978169400574, 'f1': 0.49266643926943465}, {'dataset': 'letter', 'run': 2, 'precision': 0.49743814046488377, 'recall': 0.4854564302701384, 'f1': 0.4913742554701725}, {'dataset': 'letter', 'run': 3, 'precision': 0.4261845234465516, 'recall': 0.4261845234465516, 'f1': 0.4261845234465516}, {'dataset': 'letter', 'run': 4, 'precision': 0.44998156568759984, 'recall': 0.4465516190011586, 'f1': 0.4482600312184372}, {'dataset': 'letter', 'run': 5, 'precision': 0.4691149834488789, 'recall': 0.45801573266662604, 'f1': 0.46349892008639315}, {'dataset': 'letter', 'run': 6, 'precision': 0.4464906396731508, 'recall': 0.4464906396731508, 'f1': 0.4464906396731508}, {'dataset': 'letter', 'run': 7, 'precision': 0.4926262943206777, 'recall': 0.47868772486127203, 'f1': 0.4855569988247665}, {'dataset': 'let

In [42]:
# Experiment: correction on the renuver dataset variants without running any
# labeling step. Every (dataset, run, version) combination is loaded from its
# own dirty file and scored against the shared clean file of that dataset.
results = []

datasets_versions = range(1,6)
runs = range(1,6)
datasets = ['cars', 'bridges', 'glass', 'restaurant']

# NOTE(review): the three names below are set up as in the 'letter' experiment
# cell, but this cell never prints a progress / ETA line, so they stay unused.
duration_experiment = len(datasets) * len(runs) * len(datasets_versions)
times = [datetime.datetime.now()]
i = 1


for dataset in datasets:
    for run in runs:
        for version in datasets_versions:
            data_dict = {"name": dataset,
                         "path": "../datasets/renuver/{0}/{0}_{1}_{2}.csv".format(dataset, run, version),
                         "clean_path": "../datasets/renuver/{0}/clean.csv".format(dataset)}

            data = raha.Dataset(data_dict)
            # Use the ground-truth errors as the detected cells, so only the
            # correction step is measured.
            data.detected_cells = dict(data.get_actual_errors_dictionary())
            app = raha.Correction()
            app.LABELING_BUDGET = 20
            app.VERBOSE = False
            app.FEATURE_GENERATORS = ['vicinity']
            app.CLASSIFICATION_MODEL = "ABC"
            app.VICINITY_ORDERS = [1, 2]
            app.VICINITY_FEATURE_GENERATOR = 'pdep'
            app.N_BEST_PDEPS = 5
            app.GPDEP_CORRECTION_SCORE_THRESHOLD = 0

            d = app.initialize_dataset(data)
            app.initialize_models(d)

            # The sampling / labeling / model-update steps are disabled here,
            # so features and predictions are produced with no labeled tuples
            # (the recorded 'n_sample' is 0 in the captured output).
            #app.sample_tuple(d, random_seed=None)
            #app.label_with_ground_truth(d)
            #app.update_models(d)
            app.generate_features(d, synchronous=True)
            app.predict_corrections(d, random_seed=None)

            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            result = {'dataset': dataset,
                      'run': run,
                      'dataset_version': version,
                      'n_sample': len(d.labeled_tuples),
                      'precision': p, 'recall': r, 'f1': f}

            # Keep the measurement: `results` was initialised at the top of the
            # cell but never filled, so the metrics were lost after printing.
            results.append(result)
            print(result)

{'dataset': 'cars', 'run': 1, 'dataset_version': 1, 'n_sample': 0, 'precision': 0.03064066852367688, 'recall': 0.01437908496732026, 'f1': 0.0195729537366548}
{'dataset': 'cars', 'run': 1, 'dataset_version': 2, 'n_sample': 0, 'precision': 0.04201680672268908, 'recall': 0.019659239842726082, 'f1': 0.026785714285714288}
{'dataset': 'cars', 'run': 1, 'dataset_version': 3, 'n_sample': 0, 'precision': 0.03089887640449438, 'recall': 0.014435695538057743, 'f1': 0.01967799642218247}
{'dataset': 'cars', 'run': 1, 'dataset_version': 4, 'n_sample': 0, 'precision': 0.024861878453038673, 'recall': 0.01171875, 'f1': 0.01592920353982301}
{'dataset': 'cars', 'run': 1, 'dataset_version': 5, 'n_sample': 0, 'precision': 0.025495750708215296, 'recall': 0.011857707509881422, 'f1': 0.01618705035971223}
{'dataset': 'cars', 'run': 2, 'dataset_version': 1, 'n_sample': 0, 'precision': 0.06666666666666667, 'recall': 0.032663316582914576, 'f1': 0.04384485666104554}
{'dataset': 'cars', 'run': 2, 'dataset_version': 