# Baran Experiment

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# re-imported before each cell execution and edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime
from ruska import Ruska

In [3]:
# Experiment series using the 'pdep' vicinity feature generator.
# English gist of the (German) description string below: the previous measurement
# on the Letter dataset suggested that MCAR on a dataset with complex FDs leads to
# better cleaning results than the naive strategy; this measurement is meant to
# confirm that.
# NOTE(review): the experiment name says "adult" while the configured dataset is
# 'letter' -- confirm which one is intended.
rksa = Ruska(
    name='2022-05-30-adult-jenga-diligent',
    description='''In der letzten Messung auf dem Letter-Datensatz hat sich abgezeichnet, dass MCAR
             auf einem Datensatz mit komplexen FDs zu besseren Cleaning-Ergebnissen führt, als die naive Strategie.
             Das möchte ich in dieser Messung bestätigen.''',
    commit='ca6f5e8a4904fdaa2f7c862590cda9a7b09dfb84',
    # Baseline parameters; the keys mirror the keyword parameters of `run_baran`.
    config=dict(
        dataset='letter',
        sampling='MCAR',
        error_fraction=0.03,
        labeling_budget=20,
        feature_generators=['value', 'domain', 'vicinity'],
        classification_model='ABC',
        vicinity_orders=[1, 2],
        vicinity_feature_generator='pdep',
        n_best_pdeps=5,
        score_strategy='multiply',
        n_rows=20000,
    ),
    # Parameters that are varied: 3 sampling schemes x 7 error fractions.
    # Presumably combined with `runs=3` this gives the 63 runs shown in the log.
    ranges=dict(
        sampling=['MCAR', 'MAR', 'MNAR'],
        error_fraction=[0.03, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    ),
    runs=3,
    save_path='/Users/philipp/code/experimente/2022W21-Jenga-Structured-Approach/',
)

In [4]:
def run_baran(dataset, sampling, error_fraction, labeling_budget, feature_generators,
              classification_model, vicinity_orders, vicinity_feature_generator,
              n_best_pdeps, score_strategy, n_rows, run):
    """Run one Baran correction experiment and return its cleaning metrics.

    The dirty dataset is read from
    ``../datasets/{dataset}/{sampling}/dirty_{rate}.csv`` where ``rate`` is the
    fractional part of ``error_fraction`` as printed by ``str`` (``0.03`` ->
    ``"03"``, ``0.1`` -> ``"1"``). The clean reference is read from
    ``../datasets/{dataset}/clean.csv``. All actual errors are handed to the
    corrector as detected cells, so only the correction step is measured.

    Args:
        dataset: Name of the dataset directory below ``../datasets``.
        sampling: Sub-directory holding the dirty files (e.g. ``'MCAR'``).
        error_fraction: Float error rate; selects the dirty file (see above).
        labeling_budget: Number of tuples to label before stopping.
        feature_generators: Assigned to ``Correction.FEATURE_GENERATORS``.
        classification_model: Assigned to ``Correction.CLASSIFICATION_MODEL``.
        vicinity_orders: Assigned to ``Correction.VICINITY_ORDERS``.
        vicinity_feature_generator: Assigned to
            ``Correction.VICINITY_FEATURE_GENERATOR``.
        n_best_pdeps: Assigned to ``Correction.N_BEST_PDEPS``.
        score_strategy: Assigned to ``Correction.PDEP_SCORE_STRATEGY``.
        n_rows: Forwarded to ``raha.Dataset`` as ``n_rows``.
        run: Index of the repetition; accepted for the caller's benefit but
            not used inside this function.

    Returns:
        dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``, taken
        from the last three entries of ``get_data_cleaning_evaluation``.

    Raises:
        IndexError: If ``str(error_fraction)`` contains no ``'.'`` (e.g. an
            ``int`` or a float printed in exponent notation such as ``1e-05``).
    """
    # Fractional digits of the error rate select the dirty file to load.
    rate_formatted = str(error_fraction).split('.')[1]
    data_dict = {"name": dataset,
                 "path": f"../datasets/{dataset}/{sampling}/dirty_{rate_formatted}.csv",
                 "clean_path": f"../datasets/{dataset}/clean.csv"}

    data = raha.Dataset(data_dict, n_rows=n_rows)
    # Use the ground-truth errors as the detection result.
    data.detected_cells = dict(data.get_actual_errors_dictionary())

    app = raha.Correction()
    app.LABELING_BUDGET = labeling_budget
    app.VERBOSE = False
    app.FEATURE_GENERATORS = feature_generators
    app.CLASSIFICATION_MODEL = classification_model
    app.VICINITY_ORDERS = vicinity_orders
    app.VICINITY_FEATURE_GENERATOR = vicinity_feature_generator
    app.N_BEST_PDEPS = n_best_pdeps
    app.PDEP_SCORE_STRATEGY = score_strategy

    d = app.initialize_dataset(data)
    app.initialize_models(d)
    # NOTE(review): random_seed=None means the sampling is presumably not
    # reproducible between runs -- confirm this is intended.
    while len(d.labeled_tuples) < app.LABELING_BUDGET:
        app.sample_tuple(d, random_seed=None)
        app.label_with_ground_truth(d)
        app.update_models(d)
        app.generate_features(d, synchronous=True)
        app.predict_corrections(d, random_seed=None)

    # Evaluate once, after the loop: only the final state is reported, and the
    # metrics are defined even when the loop body never executes (budget of 0).
    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    return {'precision': p, 'recall': r, 'f1': f}

In [5]:
# Start the experiment series, passing `run_baran` as the experiment callable.
# The progress log below counts 63 runs (3 sampling schemes x 7 error fractions x 3 runs).
rksa.run(experiment=run_baran)

Run 1/63. 6m 9s per run, estimate 6h 15m 23s to finish
Run 2/63. 6m 10s per run, estimate 6h 10m 16s to finish
Run 3/63. 6m 4s per run, estimate 5h 58m 43s to finish
Run 4/63. 6m 2s per run, estimate 5h 50m 40s to finish
Run 5/63. 6m 1s per run, estimate 5h 43m 8s to finish
Run 6/63. 5m 58s per run, estimate 5h 34m 8s to finish
Run 7/63. 5m 56s per run, estimate 5h 26m 43s to finish
Run 8/63. 5m 55s per run, estimate 5h 19m 45s to finish
Run 9/63. 5m 54s per run, estimate 5h 13m 0s to finish
Run 10/63. 5m 56s per run, estimate 5h 8m 58s to finish
Run 11/63. 5m 58s per run, estimate 5h 4m 25s to finish
Run 12/63. 5m 59s per run, estimate 4h 59m 31s to finish
Run 13/63. 6m 1s per run, estimate 4h 55m 2s to finish
Run 14/63. 6m 2s per run, estimate 4h 50m 15s to finish
Run 15/63. 6m 4s per run, estimate 4h 45m 12s to finish
Run 16/63. 6m 6s per run, estimate 4h 40m 41s to finish
Run 17/63. 6m 8s per run, estimate 4h 36m 2s to finish
Run 18/63. 6m 9s per run, estimate 4h 31m 13s to finish


# Ruska experiment: naive vicinity feature generator

In [6]:
# Counterpart experiment series using the 'naive' vicinity feature generator
# (with vicinity order 1 only).
# English gist of the (German) description string below: the previous measurement
# on the Letter dataset suggested that MCAR on a dataset with complex FDs leads to
# better cleaning results than the naive strategy; this measurement is meant to
# confirm that, and here the naive feature generator is switched on.
# NOTE(review): the experiment name says "adult" while the configured dataset is
# 'letter' -- confirm which one is intended.
rsk_naive = Ruska(
    name='2022-05-31-adult-jenga-diligent-naive',
    description='''In der letzten Messung auf dem Letter-Datensatz hat sich abgezeichnet, dass MCAR
             auf einem Datensatz mit komplexen FDs zu besseren Cleaning-Ergebnissen führt, als die naive Strategie.
             Das möchte ich in dieser Messung bestätigen. In dieser Messung habe ich den naive feature-generator
             angestellt.''',
    commit='ca6f5e8a4904fdaa2f7c862590cda9a7b09dfb84',
    # Baseline parameters; the keys mirror the keyword parameters of `run_baran`.
    config=dict(
        dataset='letter',
        sampling='MCAR',
        error_fraction=0.03,
        labeling_budget=20,
        feature_generators=['value', 'domain', 'vicinity'],
        classification_model='ABC',
        vicinity_orders=[1],
        vicinity_feature_generator='naive',
        n_best_pdeps=5,
        score_strategy='multiply',
        n_rows=20000,
    ),
    # Parameters that are varied: 3 sampling schemes x 7 error fractions.
    # Presumably combined with `runs=3` this gives the 63 runs shown in the log.
    ranges=dict(
        sampling=['MCAR', 'MAR', 'MNAR'],
        error_fraction=[0.03, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    ),
    runs=3,
    save_path='/Users/philipp/code/experimente/2022W21-Jenga-Structured-Approach/',
)

In [7]:
# Start the naive-generator series with the same `run_baran` experiment callable.
# The progress log below counts 63 runs (3 sampling schemes x 7 error fractions x 3 runs).
rsk_naive.run(experiment=run_baran)

Run 1/63. 19s per run, estimate 20m 11s to finish
Run 2/63. 19s per run, estimate 19m 45s to finish
Run 3/63. 19s per run, estimate 19m 27s to finish
Run 4/63. 21s per run, estimate 20m 47s to finish
Run 5/63. 22s per run, estimate 21m 28s to finish
Run 6/63. 23s per run, estimate 21m 46s to finish
Run 7/63. 24s per run, estimate 22m 23s to finish
Run 8/63. 25s per run, estimate 22m 41s to finish
Run 9/63. 25s per run, estimate 22m 51s to finish
Run 10/63. 31s per run, estimate 27m 25s to finish
Run 11/63. 36s per run, estimate 30m 50s to finish
Run 12/63. 40s per run, estimate 33m 27s to finish
Run 13/63. 44s per run, estimate 36m 38s to finish
Run 14/63. 48s per run, estimate 38m 58s to finish
Run 15/63. 52s per run, estimate 40m 50s to finish
Run 16/63. 56s per run, estimate 43m 40s to finish
Run 17/63. 1m 1s per run, estimate 46m 1s to finish
Run 18/63. 1m 5s per run, estimate 47m 50s to finish
Run 19/63. 1m 11s per run, estimate 51m 26s to finish
Run 20/63. 1m 17s per run, estimat