# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
import pandas
import IPython.display

import raha
import datetime
from ruska import Ruska

In [14]:
rksa = Ruska(name='2022-05-30-adult-jenga-diligent',
             description='''In der letzten Messung auf dem Letter-Datensatz hat sich abgezeichnet, dass MCAR
             auf einem Datensatz mit komplexen FDs zu besseren Cleaning-Ergebnissen führt, als die naive Strategie.
             Das möchte ich in dieser Messung bestätigen.''',
             commit=)

TypeError: __init__() missing 7 required positional arguments: 'name', 'description', 'commit', 'config', 'ranges', 'runs', and 'save_path'

In [3]:
def format_delta(delta: datetime.timedelta):
    hours, remainder = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    h = f'{int(hours)}h ' if hours > 0 else ''
    m = f'{int(minutes)}m ' if minutes > 0 else ''
    s = f'{int(seconds)}s'
    
    return h+m+s

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    deltas = []
    i = 1
    
    while i+1 <= len(times):
        deltas.append(times[i] - times[i-1])
        i += 1
        
    avg = sum(deltas, datetime.timedelta())/len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [7]:
results = []

samplings = ['MCAR', 'MAR', 'MNAR']
error_fractions = [.03, .05, .1, .2 .3, .4, .5]
runs = range(3)

duration_experiment = len(runs) * len(samplings) * len(error_fractions)
times = [datetime.datetime.now()]
i = 1

dataset = 'letter'
score_strategy = 'multiply'

for sampling in samplings:
    for error_fraction in error_fractions:
        for run in range(3):
            rate_formatted = str(error_fraction).split('.')[1]
            data_dict = {"name": dataset,
                         "path": f"../datasets/{dataset}/{sampling}/dirty_{rate_formatted}.csv",
                         "clean_path": f"../datasets/{dataset}/clean.csv"}

            data = raha.Dataset(data_dict)
            data.detected_cells = dict(data.get_actual_errors_dictionary())
            app = raha.Correction()
            app.LABELING_BUDGET = 20
            app.VERBOSE = False
            app.FEATURE_GENERATORS = ['domain', 'value', 'vicinity']
            app.CLASSIFICATION_MODEL = "ABC"
            app.VICINITY_ORDERS = [1]
            app.VICINITY_FEATURE_GENERATOR = 'naive'
            app.N_BEST_PDEPS = 5
            app.PDEP_SCORE_STRATEGY = score_strategy

            d = app.initialize_dataset(data)
            app.initialize_models(d)
            while len(d.labeled_tuples) < app.LABELING_BUDGET:
                app.sample_tuple(d, random_seed=None)
                app.label_with_ground_truth(d)
                app.update_models(d)
                app.generate_features(d, synchronous=True)
                app.predict_corrections(d, random_seed=None)

                p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            result = {'dataset': dataset,
                      'run': run,
                      'sampling_strategy': sampling,
                      'error_fraction': error_fraction,
                      'pdep_score_strategy': score_strategy,
                      'precision': p, 'recall': r, 'f1': f}                    
            results.append(result)

            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Run 1/27. 2m 1s per run, estimate 52m 44s to finish
Run 2/27. 2m 1s per run, estimate 50m 31s to finish
Run 3/27. 2m 1s per run, estimate 48m 38s to finish
Run 4/27. 2m 6s per run, estimate 48m 40s to finish
Run 5/27. 2m 9s per run, estimate 47m 37s to finish
Run 6/27. 2m 11s per run, estimate 46m 9s to finish
Run 7/27. 2m 22s per run, estimate 47m 26s to finish
Run 8/27. 2m 30s per run, estimate 47m 34s to finish
Run 9/27. 6m 50s per run, estimate 2h 3m 13s to finish
Run 10/27. 6m 27s per run, estimate 1h 49m 53s to finish
Run 11/27. 6m 8s per run, estimate 1h 38m 19s to finish
Run 12/27. 5m 52s per run, estimate 1h 28m 9s to finish
Run 13/27. 5m 44s per run, estimate 1h 20m 27s to finish
Run 14/27. 5m 38s per run, estimate 1h 13m 15s to finish
Run 15/27. 5m 32s per run, estimate 1h 6m 28s to finish
Run 16/27. 5m 30s per run, estimate 1h 34s to finish
Run 17/27. 5m 24s per run, estimate 54m 0s to finish
Run 18/27. 5m 18s per run, estimate 47m 43s to finish
Run 19/27. 5m 6s per run, es

## result pdep

In [6]:
print(results)

[{'dataset': 'letter', 'run': 0, 'sampling_strategy': 'MCAR', 'error_fraction': 0.3, 'pdep_score_strategy': 'multiply', 'precision': 0.8643590259609579, 'recall': 0.8643590259609579, 'f1': 0.8643590259609579}, {'dataset': 'letter', 'run': 1, 'sampling_strategy': 'MCAR', 'error_fraction': 0.3, 'pdep_score_strategy': 'multiply', 'precision': 0.952304286576776, 'recall': 0.952304286576776, 'f1': 0.952304286576776}, {'dataset': 'letter', 'run': 2, 'sampling_strategy': 'MCAR', 'error_fraction': 0.3, 'pdep_score_strategy': 'multiply', 'precision': 0.999798752264037, 'recall': 0.999798752264037, 'f1': 0.999798752264037}, {'dataset': 'letter', 'run': 0, 'sampling_strategy': 'MCAR', 'error_fraction': 0.4, 'pdep_score_strategy': 'multiply', 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, {'dataset': 'letter', 'run': 1, 'sampling_strategy': 'MCAR', 'error_fraction': 0.4, 'pdep_score_strategy': 'multiply', 'precision': 0.8748492159227985, 'recall': 0.8748492159227985, 'f1': 0.8748492159227986}, {'dat

## result naive

In [9]:
print(results)

[{'dataset': 'letter', 'run': 0, 'sampling_strategy': 'MCAR', 'error_fraction': 0.3, 'pdep_score_strategy': 'multiply', 'precision': 0.6015294827933185, 'recall': 0.6015294827933185, 'f1': 0.6015294827933185}, {'dataset': 'letter', 'run': 1, 'sampling_strategy': 'MCAR', 'error_fraction': 0.3, 'pdep_score_strategy': 'multiply', 'precision': 0.7585027168444355, 'recall': 0.7585027168444355, 'f1': 0.7585027168444355}, {'dataset': 'letter', 'run': 2, 'sampling_strategy': 'MCAR', 'error_fraction': 0.3, 'pdep_score_strategy': 'multiply', 'precision': 0.8088146508351781, 'recall': 0.8088146508351781, 'f1': 0.8088146508351781}, {'dataset': 'letter', 'run': 0, 'sampling_strategy': 'MCAR', 'error_fraction': 0.4, 'pdep_score_strategy': 'multiply', 'precision': 0.9473763570566948, 'recall': 0.9473763570566948, 'f1': 0.9473763570566948}, {'dataset': 'letter', 'run': 1, 'sampling_strategy': 'MCAR', 'error_fraction': 0.4, 'pdep_score_strategy': 'multiply', 'precision': 0.7264776839565742, 'recall': 0