# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    hours, remainder = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    h = f'{int(hours)}h ' if hours > 0 else ''
    m = f'{int(minutes)}m ' if minutes > 0 else ''
    s = f'{int(seconds)}s'
    
    return h+m+s

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    deltas = []
    i = 1
    
    while i+1 <= len(times):
        deltas.append(times[i] - times[i-1])
        i += 1
        
    avg = sum(deltas, datetime.timedelta())/len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [9]:
results = []

error_rates = range(1,6)
versions = range(1,6)
datasets = ['bridges', 'cars', 'glass']
scoring_strategies = ['new_feature']

duration_experiment = len(datasets) * len(error_rates) * len(versions) * len(scoring_strategies)
times = [datetime.datetime.now()]
i = 1


for dataset in datasets:
    for error_rate_pct in error_rates:
        for version in versions:
            for scoring_strategy in scoring_strategies:
                data_dict = {"name": dataset,
                             "path": "../datasets/renuver/{0}/{0}_{1}_{2}.csv".format(dataset, error_rate_pct, version),
                             "clean_path": "../datasets/renuver/{0}/clean.csv".format(dataset)}

                data = raha.Dataset(data_dict)
                data.detected_cells = dict(data.get_actual_errors_dictionary())
                app = raha.Correction()
                app.LABELING_BUDGET = 20
                app.VERBOSE = False
                app.FEATURE_GENERATORS = ['domain', 'value', 'vicinity']
                app.CLASSIFICATION_MODEL = "ABC"
                app.VICINITY_ORDERS = [1, 2]
                app.VICINITY_FEATURE_GENERATOR = 'pdep'
                app.N_BEST_PDEPS = 5
                app.PDEP_SCORE_STRATEGY = scoring_strategy

                d = app.initialize_dataset(data)
                app.initialize_models(d)
                while len(d.labeled_tuples) < app.LABELING_BUDGET:
                    app.sample_tuple(d, random_seed=None)
                    app.label_with_ground_truth(d)
                    app.update_models(d)
                    app.generate_features(d, synchronous=True)
                    app.predict_corrections(d, random_seed=None)

                    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
                    result = {'dataset': dataset, 
                              'error_rate_pct': error_rate_pct,
                              'dataset_version': version,
                              'n_sample': len(d.labeled_tuples),
                              'scoring_strategy': scoring_strategy,
                              'precision': p, 'recall': r, 'f1': f}                    
                    results.append(result)

                # time estimates for the measurement
                times.append(datetime.datetime.now())
                print(estimate_time_to_finish(times, i, duration_experiment))
                i += 1

Run 1/75. 12s per run, estimate 15m 43s to finish
Run 2/75. 12s per run, estimate 14m 59s to finish
Run 3/75. 11s per run, estimate 13m 49s to finish
Run 4/75. 11s per run, estimate 13m 11s to finish
Run 5/75. 10s per run, estimate 12m 49s to finish
Run 6/75. 10s per run, estimate 12m 35s to finish
Run 7/75. 11s per run, estimate 12m 33s to finish
Run 8/75. 11s per run, estimate 12m 32s to finish
Run 9/75. 11s per run, estimate 12m 31s to finish
Run 10/75. 11s per run, estimate 12m 40s to finish
Run 11/75. 11s per run, estimate 12m 33s to finish
Run 12/75. 12s per run, estimate 12m 42s to finish
Run 13/75. 12s per run, estimate 12m 31s to finish
Run 14/75. 12s per run, estimate 12m 23s to finish
Run 15/75. 12s per run, estimate 12m 6s to finish
Run 16/75. 12s per run, estimate 11m 59s to finish
Run 17/75. 12s per run, estimate 12m 0s to finish
Run 18/75. 12s per run, estimate 11m 56s to finish
Run 19/75. 12s per run, estimate 11m 46s to finish
Run 20/75. 12s per run, estimate 11m 43s t

In [10]:
print(results)

[{'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 1, 'scoring_strategy': 'new_feature', 'precision': 0.6, 'recall': 0.21428571428571427, 'f1': 0.3157894736842105}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 2, 'scoring_strategy': 'new_feature', 'precision': 0.6666666666666666, 'recall': 0.42857142857142855, 'f1': 0.5217391304347826}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 3, 'scoring_strategy': 'new_feature', 'precision': 0.7, 'recall': 0.5, 'f1': 0.5833333333333334}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 4, 'scoring_strategy': 'new_feature', 'precision': 0.7272727272727273, 'recall': 0.5714285714285714, 'f1': 0.64}, {'dataset': 'bridges', 'error_rate_pct': 1, 'dataset_version': 1, 'n_sample': 5, 'scoring_strategy': 'new_feature', 'precision': 0.7692307692307693, 'recall': 0.7142857142857143, 'f1': 0.7407407407407408}, {'dataset': 'bridges', 'er