# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    hours, remainder = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    h = f'{int(hours)}h ' if hours > 0 else ''
    m = f'{int(minutes)}m ' if minutes > 0 else ''
    s = f'{int(seconds)}s'
    
    return h+m+s

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    deltas = []
    i = 1
    
    while i+1 <= len(times):
        deltas.append(times[i] - times[i-1])
        i += 1
        
    avg = sum(deltas, datetime.timedelta())/len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [4]:
beers = {
    "name": "beers",
    "path": "../datasets/beers/dirty.csv",
    "clean_path": "../datasets/beers/clean.csv"
}
flights = {
    "name": "flights",
    "path": "../datasets/flights/dirty.csv",
    "clean_path": "../datasets/flights/clean.csv"
}
movies = {
    "name": "movies",
    "path": "../datasets/movies_1/dirty.csv",
    "clean_path": "../datasets/movies_1/clean.csv"
}
rayyan = {
    "name": "rayyan",
    "path": "../datasets/rayyan/dirty.csv",
    "clean_path": "../datasets/rayyan/clean.csv"
}
tax = {
    "name": "tax",
    "path": "../datasets/tax/dirty.csv",
    "clean_path": "../datasets/tax/clean.csv"
}
toy = {
    "name": "toy",
    "path": "../datasets/toy/dirty.csv",
    "clean_path": "../datasets/toy/clean.csv"
}
hospital = {
    "name": "hospital",
    "path": "../datasets/hospital/dirty.csv",
    "clean_path": "../datasets/hospital/clean.csv"
}

In [5]:
datasets = [beers, 
            hospital,
            rayyan,
            flights]
experiments = ['adder', 'constant', 'ente', 'disable_vicinity']
runs = range(10)
duration_experiment = len(datasets) * len(runs) * len(experiments)

results = []
times = [datetime.datetime.now()]
i = 1
for dataset_dictionary in datasets:
    for run in runs:
        for e in experiments:            
            app_2 = raha.Correction()
            print(f'Starting run {i}/{duration_experiment}.')
            # How many tuples would you label?
            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL == "GBC"

            d = raha.dataset.Dataset(dataset_dictionary)
            d.detected_cells = dict(d.get_actual_errors_dictionary())
            d = app_2.initialize_dataset(d)
            
            d.experiment = e
            
            app_2.initialize_models(d)
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=run)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
                app_2.update_models(d)
                app_2.generate_features(d)
                app_2.predict_corrections(d, random_seed=run)
            
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            results.append({'dataset': d.name, 'experiment': e, 'run': run, 'precision': p, 'recall': r, 'f1': f})
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Starting run 1/160.
Run 1/160. 1m 3s per run, estimate 2h 47m 12s to finish
Starting run 2/160.
Run 2/160. 1m 2s per run, estimate 2h 44m 59s to finish
Starting run 3/160.
Run 3/160. 1m 2s per run, estimate 2h 43m 59s to finish
Starting run 4/160.
Run 4/160. 59s per run, estimate 2h 34m 51s to finish
Starting run 5/160.
Run 5/160. 1m 0s per run, estimate 2h 36m 11s to finish
Starting run 6/160.
Run 6/160. 1m 0s per run, estimate 2h 35m 59s to finish
Starting run 7/160.
Run 7/160. 1m 1s per run, estimate 2h 36m 2s to finish
Starting run 8/160.
Run 8/160. 59s per run, estimate 2h 31m 31s to finish
Starting run 9/160.
Run 9/160. 1m 0s per run, estimate 2h 31m 40s to finish
Starting run 10/160.
Run 10/160. 1m 0s per run, estimate 2h 31m 18s to finish
Starting run 11/160.
Run 11/160. 1m 0s per run, estimate 2h 30m 49s to finish
Starting run 12/160.
Run 12/160. 59s per run, estimate 2h 27m 36s to finish
Starting run 13/160.
Run 13/160. 1m 0s per run, estimate 2h 27m 35s to finish
Starting ru

In [6]:
print(results)

[{'dataset': 'beers', 'experiment': 'adder', 'run': 0, 'precision': 0.8847831076428735, 'recall': 0.8837689133425034, 'f1': 0.8842757196926252}, {'dataset': 'beers', 'experiment': 'constant', 'run': 0, 'precision': 0.9924108272198331, 'recall': 0.8993580926180651, 'f1': 0.9435959110042093}, {'dataset': 'beers', 'experiment': 'ente', 'run': 0, 'precision': 0.9738684087727485, 'recall': 0.9569005043558001, 'f1': 0.9653098982423682}, {'dataset': 'beers', 'experiment': 'disable_vicinity', 'run': 0, 'precision': 0.8582278481012658, 'recall': 0.8548830811554333, 'f1': 0.8565521993798094}, {'dataset': 'beers', 'experiment': 'adder', 'run': 1, 'precision': 0.8762035763411279, 'recall': 0.8762035763411279, 'f1': 0.8762035763411279}, {'dataset': 'beers', 'experiment': 'constant', 'run': 1, 'precision': 0.9926253687315634, 'recall': 0.9257221458046767, 'f1': 0.9580071174377224}, {'dataset': 'beers', 'experiment': 'ente', 'run': 1, 'precision': 0.9570666047806916, 'recall': 0.9454378725355341, 'f1

## Analyse was passiert, wenn ich alle Modelle deaktiviere außer das Vicinity Model -- wo kommen die Korrekturen her?

In [31]:
len(d.detected_cells)

948

In [32]:
overlap = 0
for key in d.corrected_cells.keys():
    if key in d.labeled_cells.keys():
        overlap  = overlap + 1

In [33]:
print(len(d.labeled_cells))

220


In [34]:
overlap

61

In [35]:
print(len(d.corrected_cells))

61
