# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime
from ruska import Ruska

In [3]:
def run_baran(dataset, sampling, error_fraction, labeling_budget, feature_generators, classification_model, vicinity_orders, vicinity_feature_generator, n_best_pdeps, score_strategy, n_rows, run):
    """Run one Baran correction pass on a dirty dataset and score the result.

    The dirty and clean CSVs are loaded from ``../datasets/<dataset>/``, the
    ground-truth errors are used as the detected cells, and the corrector is
    trained by repeatedly sampling and labeling tuples until
    ``labeling_budget`` tuples have been labeled.

    Args:
        dataset: Dataset name; also the directory name under ``../datasets``.
        sampling: Sub-directory of the dataset that holds the dirty files.
        error_fraction: Error rate; the digits after the decimal point in
            ``str(error_fraction)`` select the file ``dirty_<digits>.csv``.
        labeling_budget: Number of labeled tuples to reach before stopping.
        feature_generators: Assigned to ``Correction.FEATURE_GENERATORS``.
        classification_model: Assigned to ``Correction.CLASSIFICATION_MODEL``.
        vicinity_orders: Assigned to ``Correction.VICINITY_ORDERS``.
        vicinity_feature_generator: Assigned to
            ``Correction.VICINITY_FEATURE_GENERATOR``.
        n_best_pdeps: Assigned to ``Correction.N_BEST_PDEPS``.
        score_strategy: Assigned to ``Correction.PDEP_SCORE_STRATEGY``.
        n_rows: Forwarded to ``raha.Dataset`` as ``n_rows``.
        run: Not used by this function; presumably the repetition index
            supplied by the experiment runner (TODO confirm).

    Returns:
        dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``, taken
        from the last three values of ``get_data_cleaning_evaluation``.

    Raises:
        IndexError: If ``str(error_fraction)`` contains no ``'.'`` (e.g. an
            integer was passed), because no fractional digits can be extracted.
    """
    # File names carry only the fractional digits: 0.1 -> dirty_1.csv,
    # 0.99 -> dirty_99.csv.
    rate_formatted = str(error_fraction).split('.')[1]
    data_dict = {"name": dataset,
                 "path": f"../datasets/{dataset}/{sampling}/dirty_{rate_formatted}.csv",
                 "clean_path": f"../datasets/{dataset}/clean.csv"}

    data = raha.Dataset(data_dict, n_rows=n_rows)
    # Skip error detection: the corrector is handed the actual errors directly.
    data.detected_cells = dict(data.get_actual_errors_dictionary())

    app = raha.Correction()
    app.LABELING_BUDGET = labeling_budget
    app.VERBOSE = False
    app.FEATURE_GENERATORS = feature_generators
    app.CLASSIFICATION_MODEL = classification_model
    app.VICINITY_ORDERS = vicinity_orders
    app.VICINITY_FEATURE_GENERATOR = vicinity_feature_generator
    app.N_BEST_PDEPS = n_best_pdeps
    app.PDEP_SCORE_STRATEGY = score_strategy

    d = app.initialize_dataset(data)
    app.initialize_models(d)
    while len(d.labeled_tuples) < app.LABELING_BUDGET:
        app.sample_tuple(d, random_seed=None)
        app.label_with_ground_truth(d)
        app.update_models(d)
        app.generate_features(d, synchronous=True)
        app.predict_corrections(d, random_seed=None)

    # Evaluate once after the loop: only the final scores are returned, and
    # this keeps p, r, f bound even when the loop body never executes
    # (labeling_budget <= 0).
    # NOTE(review): assumes get_data_cleaning_evaluation has no side effects
    # that later iterations depended on - confirm against the raha version used.
    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    return {'precision': p, 'recall': r, 'f1': f}

In [4]:
# Experiment series using the 'pdep' vicinity feature generator.
# `config` holds the baseline keyword arguments for run_baran; `ranges` lists
# the error fractions to sweep. The recorded run log counts 30 runs in total
# (10 error fractions x 3 runs).
# NOTE(review): save_path is an absolute, machine-specific directory.
rsk = Ruska(
    name='2022-06-05-nursery-simple-mcar',
    description='''Ich wiederhole die Messung auf dem Datensatz mit
             Corruptors simple_mcar Korruption.''',
    commit='ac5596853f8d581640cbd3eced08e98e42eb07a3',
    config={
        'dataset': 'nursery',
        'sampling': 'MCAR',
        'error_fraction': 0.1,
        'labeling_budget': 20,
        'feature_generators': ['value', 'domain', 'vicinity'],
        'classification_model': 'ABC',
        'vicinity_orders': [1, 2],
        'vicinity_feature_generator': 'pdep',
        'n_best_pdeps': 5,
        'score_strategy': 'multiply',
        'n_rows': None,
    },
    ranges={
        'error_fraction': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99],
    },
    runs=3,
    save_path='/Users/philipp/code/experimente/2022W22-Konsolidierung/',
)

In [5]:
# Start the experiment series configured on `rsk`, passing run_baran as the
# experiment callable. Long-running: the recorded log reports 30 runs at
# several minutes per run.
rsk.run(experiment=run_baran)

Run 1/30. 3m 59s per run, estimate 1h 55m 48s to finish.
Run 2/30. 3m 52s per run, estimate 1h 48m 32s to finish.
Run 3/30. 3m 59s per run, estimate 1h 47m 43s to finish.
Run 4/30. 4m 42s per run, estimate 2h 2m 17s to finish.
Run 5/30. 5m 8s per run, estimate 2h 8m 33s to finish.
Run 6/30. 5m 28s per run, estimate 2h 11m 27s to finish.
Run 7/30. 6m 21s per run, estimate 2h 26m 3s to finish.
Run 8/30. 7m 5s per run, estimate 2h 36m 7s to finish.
Run 9/30. 7m 39s per run, estimate 2h 40m 46s to finish.
Run 10/30. 8m 30s per run, estimate 2h 50m 8s to finish.
Run 11/30. 9m 22s per run, estimate 2h 58m 10s to finish.
Run 12/30. 10m 0s per run, estimate 3h 14s to finish.
Run 13/30. 10m 58s per run, estimate 3h 6m 41s to finish.
Run 14/30. 11m 53s per run, estimate 3h 10m 12s to finish.
Run 15/30. 12m 26s per run, estimate 3h 6m 40s to finish.
Run 16/30. 13m 8s per run, estimate 3h 3m 54s to finish.
Run 17/30. 13m 36s per run, estimate 2h 56m 50s to finish.
Run 18/30. 14m 5s per run, estima

# Ruska experiment: naive vicinity feature generator

In [6]:
# Experiment series using the 'naive' vicinity feature generator with
# vicinity order 1 only. `config` holds the baseline keyword arguments for
# run_baran; `ranges` lists the error fractions to sweep. The recorded run log
# counts 30 runs in total (10 error fractions x 3 runs).
# NOTE(review): save_path is an absolute, machine-specific directory.
rsk_naive = Ruska(
    name='2022-06-05-nursery-simple-mcar-naive',
    description='''Ich wiederhole die Messung auf dem Datensatz mit
             Corruptors simple_mcar Korruption.''',
    commit='ac5596853f8d581640cbd3eced08e98e42eb07a3',
    config={
        'dataset': 'nursery',
        'sampling': 'MCAR',
        'error_fraction': 0.1,
        'labeling_budget': 20,
        'feature_generators': ['value', 'domain', 'vicinity'],
        'classification_model': 'ABC',
        'vicinity_orders': [1],
        'vicinity_feature_generator': 'naive',
        'n_best_pdeps': 5,
        'score_strategy': 'multiply',
        'n_rows': None,
    },
    ranges={
        'error_fraction': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99],
    },
    runs=3,
    save_path='/Users/philipp/code/experimente/2022W22-Konsolidierung/',
)

In [7]:
# Start the experiment series configured on `rsk_naive`, passing run_baran as
# the experiment callable. Long-running: the recorded log reports 30 runs at
# several minutes per run.
rsk_naive.run(experiment=run_baran)

Run 1/30. 2m 37s per run, estimate 1h 15m 57s to finish.
Run 2/30. 2m 35s per run, estimate 1h 12m 44s to finish.
Run 3/30. 2m 36s per run, estimate 1h 10m 37s to finish.
Run 4/30. 3m 23s per run, estimate 1h 28m 14s to finish.
Run 5/30. 3m 52s per run, estimate 1h 36m 57s to finish.
Run 6/30. 4m 15s per run, estimate 1h 42m 17s to finish.
Run 7/30. 4m 51s per run, estimate 1h 51m 49s to finish.
Run 8/30. 5m 24s per run, estimate 1h 59m 6s to finish.
Run 9/30. 5m 46s per run, estimate 2h 1m 26s to finish.
Run 10/30. 6m 29s per run, estimate 2h 9m 41s to finish.
Run 11/30. 6m 59s per run, estimate 2h 12m 49s to finish.
Run 12/30. 7m 25s per run, estimate 2h 13m 46s to finish.
Run 13/30. 8m 6s per run, estimate 2h 17m 42s to finish.
Run 14/30. 8m 42s per run, estimate 2h 19m 26s to finish.
Run 15/30. 9m 8s per run, estimate 2h 17m 3s to finish.
Run 16/30. 9m 44s per run, estimate 2h 16m 17s to finish.
Run 17/30. 10m 19s per run, estimate 2h 14m 17s to finish.
Run 18/30. 10m 48s per run, 