# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    hours, remainder = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    h = f'{int(hours)}h ' if hours > 0 else ''
    m = f'{int(minutes)}m ' if minutes > 0 else ''
    s = f'{int(seconds)}s'
    
    return h+m+s

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    deltas = []
    i = 1
    
    while i+1 <= len(times):
        deltas.append(times[i] - times[i-1])
        i += 1
        
    avg = sum(deltas, datetime.timedelta())/len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [4]:
beers = {
    "name": "beers",
    "path": "../datasets/beers/dirty.csv",
    "clean_path": "../datasets/beers/clean.csv"
}
flights = {
    "name": "flights",
    "path": "../datasets/flights/dirty.csv",
    "clean_path": "../datasets/flights/clean.csv"
}
movies = {
    "name": "movies",
    "path": "../datasets/movies_1/dirty.csv",
    "clean_path": "../datasets/movies_1/clean.csv"
}
rayyan = {
    "name": "rayyan",
    "path": "../datasets/rayyan/dirty.csv",
    "clean_path": "../datasets/rayyan/clean.csv"
}
tax = {
    "name": "tax",
    "path": "../datasets/tax/dirty.csv",
    "clean_path": "../datasets/tax/clean.csv"
}
toy = {
    "name": "toy",
    "path": "../datasets/toy/dirty.csv",
    "clean_path": "../datasets/toy/clean.csv"
}
hospital = {
    "name": "hospital",
    "path": "../datasets/hospital/dirty.csv",
    "clean_path": "../datasets/hospital/clean.csv"
}

In [6]:
datasets = [beers, 
            hospital,
            rayyan,
            flights]
experiments = ['pdep', 'adder', 'constant', 'ente', 'disable_vicinity']
runs = range(3)
duration_experiment = len(datasets) * len(runs) * len(experiments)

results = []
times = [datetime.datetime.now()]
i = 1
for dataset_dictionary in datasets:
    for run in runs:
        for e in experiments:            
            app_2 = raha.Correction()
            print(f'Starting run {i}/{duration_experiment}.')

            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "DTC"
            app_2.EXPERIMENT = e

            d = raha.dataset.Dataset(dataset_dictionary)
            d.detected_cells = dict(d.get_actual_errors_dictionary())
            d = app_2.initialize_dataset(d)
            
            app_2.initialize_models(d)
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=run)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
                app_2.update_models(d)
                app_2.generate_features(d)
                app_2.predict_corrections(d, random_seed=run)
            
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            results.append({'dataset': d.name, 'experiment': e, 'run': run, 'precision': p, 'recall': r, 'f1': f})
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Starting run 1/60.
Run 1/60. 2m 36s per run, estimate 2h 33m 33s to finish
Starting run 2/60.
Run 2/60. 2m 35s per run, estimate 2h 29m 51s to finish
Starting run 3/60.
Run 3/60. 2m 36s per run, estimate 2h 28m 41s to finish
Starting run 4/60.
Run 4/60. 2m 34s per run, estimate 2h 23m 48s to finish
Starting run 5/60.
Run 5/60. 2m 24s per run, estimate 2h 12m 46s to finish
Starting run 6/60.
Run 6/60. 2m 26s per run, estimate 2h 12m 2s to finish
Starting run 7/60.
Run 7/60. 2m 28s per run, estimate 2h 10m 55s to finish
Starting run 8/60.
Run 8/60. 2m 28s per run, estimate 2h 8m 35s to finish
Starting run 9/60.
Run 9/60. 2m 27s per run, estimate 2h 5m 45s to finish
Starting run 10/60.
Run 10/60. 2m 24s per run, estimate 2h 49s to finish
Starting run 11/60.
Run 11/60. 2m 26s per run, estimate 1h 59m 44s to finish
Starting run 12/60.
Run 12/60. 2m 27s per run, estimate 1h 58m 1s to finish
Starting run 13/60.
Run 13/60. 2m 27s per run, estimate 1h 55m 46s to finish
Starting run 14/60.
Run 1

In [7]:
datasets = [beers, 
            hospital,
            rayyan,
            flights]
experiments = ['pdep', 'adder', 'constant', 'ente', 'disable_vicinity']
runs = range(3, 10)
duration_experiment = len(datasets) * len(runs) * len(experiments)

times = [datetime.datetime.now()]
i = 1
for dataset_dictionary in datasets:
    for run in runs:
        for e in experiments:            
            app_2 = raha.Correction()
            print(f'Starting run {i}/{duration_experiment}.')

            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "DTC"
            app_2.EXPERIMENT = e

            d = raha.dataset.Dataset(dataset_dictionary)
            d.detected_cells = dict(d.get_actual_errors_dictionary())
            d = app_2.initialize_dataset(d)
            
            app_2.initialize_models(d)
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=run)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
                app_2.update_models(d)
                app_2.generate_features(d)
                app_2.predict_corrections(d, random_seed=run)
            
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            results.append({'dataset': d.name, 'experiment': e, 'run': run, 'precision': p, 'recall': r, 'f1': f})
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Starting run 1/140.
Run 1/140. 2m 27s per run, estimate 5h 41m 36s to finish
Starting run 2/140.
Run 2/140. 2m 26s per run, estimate 5h 37m 55s to finish
Starting run 3/140.
Run 3/140. 2m 25s per run, estimate 5h 31m 18s to finish
Starting run 4/140.
Run 4/140. 2m 21s per run, estimate 5h 21m 41s to finish
Starting run 5/140.
Run 5/140. 2m 15s per run, estimate 5h 4m 12s to finish
Starting run 6/140.
Run 6/140. 2m 17s per run, estimate 5h 6m 33s to finish
Starting run 7/140.
Run 7/140. 2m 18s per run, estimate 5h 7m 22s to finish
Starting run 8/140.
Run 8/140. 2m 19s per run, estimate 5h 5m 54s to finish
Starting run 9/140.
Run 9/140. 2m 18s per run, estimate 5h 2m 8s to finish
Starting run 10/140.
Run 10/140. 2m 15s per run, estimate 4h 53m 24s to finish
Starting run 11/140.
Run 11/140. 2m 16s per run, estimate 4h 53m 20s to finish
Starting run 12/140.
Run 12/140. 2m 17s per run, estimate 4h 52m 56s to finish
Starting run 13/140.
Run 13/140. 2m 17s per run, estimate 4h 51m 18s to fini

In [9]:
print(results)

[{'dataset': 'beers', 'experiment': 'pdep', 'run': 0, 'precision': 0.8536126147328783, 'recall': 0.8314993122420908, 'f1': 0.842410869817675}, {'dataset': 'beers', 'experiment': 'adder', 'run': 0, 'precision': 0.8489056248529065, 'recall': 0.8269142595139845, 'f1': 0.837765648589014}, {'dataset': 'beers', 'experiment': 'constant', 'run': 0, 'precision': 0.849105461393597, 'recall': 0.8269142595139845, 'f1': 0.8378629500580721}, {'dataset': 'beers', 'experiment': 'ente', 'run': 0, 'precision': 0.8494921514312096, 'recall': 0.8436497019715726, 'f1': 0.8465608465608465}, {'dataset': 'beers', 'experiment': 'disable_vicinity', 'run': 0, 'precision': 0.8541089566020313, 'recall': 0.848234754699679, 'f1': 0.8511617207269381}, {'dataset': 'beers', 'experiment': 'pdep', 'run': 1, 'precision': 0.84375, 'recall': 0.8356258596973866, 'f1': 0.8396682791983414}, {'dataset': 'beers', 'experiment': 'adder', 'run': 1, 'precision': 0.6898148148148148, 'recall': 0.6831728564878496, 'f1': 0.68647777009905

## Analyse was passiert, wenn ich alle Modelle deaktiviere außer das Vicinity Model -- wo kommen die Korrekturen her?

In [31]:
len(d.detected_cells)

948

In [32]:
overlap = 0
for key in d.corrected_cells.keys():
    if key in d.labeled_cells.keys():
        overlap  = overlap + 1

In [33]:
print(len(d.labeled_cells))

220


In [34]:
overlap

61

In [35]:
print(len(d.corrected_cells))

61
