# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    """Render a timedelta as a compact string such as ``'3h 55m 14s'``.

    The hour and minute parts are emitted only when they are greater than
    zero, so a duration of one hour and five seconds renders as ``'1h 5s'``.
    The seconds part is always present and is truncated to a whole number.

    Args:
        delta: The duration to format.

    Returns:
        The formatted duration string.
    """
    whole_hours, leftover = divmod(delta.total_seconds(), 3600)
    whole_minutes, secs = divmod(leftover, 60)

    parts = []
    if whole_hours > 0:
        parts.append(f'{int(whole_hours)}h ')
    if whole_minutes > 0:
        parts.append(f'{int(whole_minutes)}m ')
    # Seconds are always shown, even when they are zero.
    parts.append(f'{int(secs)}s')

    return ''.join(parts)

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    """Build a progress message with the mean run duration and the estimated time left.

    Args:
        times: Checkpoints (``datetime.datetime``) in the order they were taken.
            ``times[0]`` is the start; every later entry marks the end of one run.
        current_run_i: Number of the run that just finished (1-based).
        total_runs: Total number of runs that are planned.

    Returns:
        ``'Run i/N. <avg> per run, estimate <remaining> to finish'``. When
        ``times`` holds fewer than two checkpoints no run duration is known
        yet, so a message saying so is returned instead of raising
        ``ZeroDivisionError``.
    """
    if len(times) < 2:
        return f'Run {current_run_i}/{total_runs}. No completed run yet, cannot estimate time to finish'

    # The sum of consecutive differences telescopes to (last - first), so the
    # mean run duration does not require walking the whole list on each call.
    avg = (times[-1] - times[0]) / (len(times) - 1)

    # Never report a negative amount of remaining work.
    remaining_runs = max(total_runs - current_run_i, 0)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*remaining_runs)} to finish'

In [4]:
# Dataset descriptors. Each one carries a short name plus the relative paths
# of the dirty CSV and of its clean counterpart.
beers = dict(
    name="beers",
    path="../datasets/beers/dirty.csv",
    clean_path="../datasets/beers/clean.csv",
)
flights = dict(
    name="flights",
    path="../datasets/flights/dirty.csv",
    clean_path="../datasets/flights/clean.csv",
)
# The "movies" files live in the movies_1 directory.
movies = dict(
    name="movies",
    path="../datasets/movies_1/dirty.csv",
    clean_path="../datasets/movies_1/clean.csv",
)
rayyan = dict(
    name="rayyan",
    path="../datasets/rayyan/dirty.csv",
    clean_path="../datasets/rayyan/clean.csv",
)
tax = dict(
    name="tax",
    path="../datasets/tax/dirty.csv",
    clean_path="../datasets/tax/clean.csv",
)
toy = dict(
    name="toy",
    path="../datasets/toy/dirty.csv",
    clean_path="../datasets/toy/clean.csv",
)
hospital = dict(
    name="hospital",
    path="../datasets/hospital/dirty.csv",
    clean_path="../datasets/hospital/clean.csv",
)

In [6]:
# Experiment grid: every dataset is combined with every experiment variant and
# repeated for every run index (4 datasets * 10 runs * 5 experiments = 200).
datasets = [beers, 
            hospital,
            rayyan,
            flights]
experiments = ['pdep', 'adder', 'constant', 'ente', 'disable_vicinity']
runs = range(10)
duration_experiment = len(datasets) * len(runs) * len(experiments)

# One dict per finished combination (dataset, experiment, run, precision, recall, f1).
results = []
# Seeded with the start time so that the first delta measures the first run.
times = [datetime.datetime.now()]
# 1-based counter over all combinations, used only for the progress output.
i = 1
for dataset_dictionary in datasets:
    for run in runs:
        for e in experiments:            
            # A fresh Correction object is created for every combination.
            app_2 = raha.Correction()
            print(f'Starting run {i}/{duration_experiment}.')

            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "GBC"
            # presumably selects the experiment variant inside raha.Correction — verify there
            app_2.EXPERIMENT = e

            d = raha.dataset.Dataset(dataset_dictionary)
            # Uses a copy of the actual-errors dictionary as the detected cells;
            # presumably this simulates a perfect error detection step — TODO confirm.
            d.detected_cells = dict(d.get_actual_errors_dictionary())
            d = app_2.initialize_dataset(d)
            
            app_2.initialize_models(d)
            # Repeat sample/label/update/predict until the number of labeled
            # tuples reaches the labeling budget.
            # NOTE(review): labeling only happens when d.has_ground_truth is set;
            # whether the loop can terminate without it depends on raha — confirm.
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                # The run index doubles as the random seed for sampling and prediction.
                app_2.sample_tuple(d, random_seed=run)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
                app_2.update_models(d)
                app_2.generate_features(d)
                app_2.predict_corrections(d, random_seed=run)
            
            # The last three entries of the evaluation result are stored as
            # precision, recall and F1.
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            results.append({'dataset': d.name, 'experiment': e, 'run': run, 'precision': p, 'recall': r, 'f1': f})
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Starting run 1/200.
Run 1/200. 1m 10s per run, estimate 3h 55m 14s to finish
Starting run 2/200.
Run 2/200. 1m 11s per run, estimate 3h 55m 17s to finish
Starting run 3/200.
Run 3/200. 1m 10s per run, estimate 3h 51m 56s to finish
Starting run 4/200.
Run 4/200. 1m 9s per run, estimate 3h 46m 27s to finish
Starting run 5/200.
Run 5/200. 1m 5s per run, estimate 3h 34m 23s to finish
Starting run 6/200.
Run 6/200. 1m 6s per run, estimate 3h 36m 15s to finish
Starting run 7/200.
Run 7/200. 1m 7s per run, estimate 3h 37m 29s to finish
Starting run 8/200.
Run 8/200. 1m 7s per run, estimate 3h 37m 8s to finish
Starting run 9/200.
Run 9/200. 1m 7s per run, estimate 3h 35m 26s to finish
Starting run 10/200.
Run 10/200. 1m 6s per run, estimate 3h 29m 28s to finish
Starting run 11/200.
Run 11/200. 1m 6s per run, estimate 3h 30m 28s to finish
Starting run 12/200.
Run 12/200. 1m 7s per run, estimate 3h 30m 48s to finish
Starting run 13/200.
Run 13/200. 1m 7s per run, estimate 3h 30m 37s to finish
St

In [7]:
# Dump the raw list of per-run result dicts collected by the experiment loop.
print(results)

[{'dataset': 'beers', 'experiment': 'pdep', 'run': 0, 'precision': 0.9292105263157895, 'recall': 0.8094910591471802, 'f1': 0.8652291105121295}, {'dataset': 'beers', 'experiment': 'adder', 'run': 0, 'precision': 0.7315789473684211, 'recall': 0.6373223292067859, 'f1': 0.6812055868659643}, {'dataset': 'beers', 'experiment': 'constant', 'run': 0, 'precision': 0.7315789473684211, 'recall': 0.6373223292067859, 'f1': 0.6812055868659643}, {'dataset': 'beers', 'experiment': 'ente', 'run': 0, 'precision': 0.9321256038647343, 'recall': 0.8846859238881247, 'f1': 0.9077864031992472}, {'dataset': 'beers', 'experiment': 'disable_vicinity', 'run': 0, 'precision': 0.9321256038647343, 'recall': 0.8846859238881247, 'f1': 0.9077864031992472}, {'dataset': 'beers', 'experiment': 'pdep', 'run': 1, 'precision': 0.9443565400843882, 'recall': 0.8209536909674461, 'f1': 0.8783419180770173}, {'dataset': 'beers', 'experiment': 'adder', 'run': 1, 'precision': 0.7544831223628692, 'recall': 0.6558917927556167, 'f1': 0

## Analyse: Was passiert, wenn ich alle Modelle außer dem Vicinity-Modell deaktiviere – wo kommen die Korrekturen her?

In [31]:
# Number of entries in d.detected_cells.
# NOTE(review): `d` is leftover kernel state from an earlier cell; the execution
# counts jump from 7 to 31, so confirm which run produced it.
len(d.detected_cells)

948

In [32]:
# Count the corrected cells that are also present among the labeled cells.
overlap = sum(
    1
    for cell in d.corrected_cells.keys()
    if cell in d.labeled_cells.keys()
)

In [33]:
# Number of entries in d.labeled_cells.
print(len(d.labeled_cells))

220


In [34]:
# Show how many corrected cells are also labeled cells. In the saved output this
# equals len(d.corrected_cells), i.e. every corrected cell was a labeled cell.
overlap

61

In [35]:
# Number of entries in d.corrected_cells, for comparison with `overlap`.
print(len(d.corrected_cells))

61
