# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime
from ruska import Ruska

In [3]:
def run_baran(c: dict):
    """Run one Baran correction experiment described by ``c`` and score it.

    The dirty and clean CSV paths are built from the config, the ground-truth
    errors are installed as the detected cells (so error detection itself is
    not part of the experiment), a ``raha.Correction`` instance is configured
    from the config, and tuples are sampled and labeled with the ground truth
    until the labeling budget is reached.

    Args:
        c: Experiment configuration. The keys read here are ``dataset``,
            ``sampling``, ``error_fraction``, ``n_rows``, ``labeling_budget``,
            ``feature_generators``, ``classification_model``,
            ``vicinity_orders``, ``vicinity_feature_generator``,
            ``n_best_pdeps`` and ``score_strategy``.

    Returns:
        A dict with two entries: ``'result'`` holds ``precision``, ``recall``
        and ``f1`` (the last three values returned by
        ``get_data_cleaning_evaluation``), and ``'config'`` is ``c`` itself.

    Raises:
        KeyError: If one of the keys listed above is missing from ``c``.
        IndexError: If ``str(c['error_fraction'])`` contains no ``'.'``
            (for example an integer fraction), because the file name is
            derived from the digits after the decimal point.
    """
    # The dirty file is named after the digits behind the decimal point,
    # e.g. 0.1 -> "dirty_1.csv" and 0.99 -> "dirty_99.csv".
    rate_formatted = str(c['error_fraction']).split('.')[1]
    data_dict = {"name": c['dataset'],
                 "path": f"../datasets/{c['dataset']}/{c['sampling']}/dirty_{rate_formatted}.csv",
                 "clean_path": f"../datasets/{c['dataset']}/clean.csv"}

    data = raha.Dataset(data_dict, n_rows=c['n_rows'])
    # Use the actual errors as detection result; only correction is measured.
    data.detected_cells = dict(data.get_actual_errors_dictionary())

    app = raha.Correction()
    app.LABELING_BUDGET = c['labeling_budget']
    app.VERBOSE = False
    app.FEATURE_GENERATORS = c['feature_generators']
    app.CLASSIFICATION_MODEL = c['classification_model']
    app.VICINITY_ORDERS = c['vicinity_orders']
    app.VICINITY_FEATURE_GENERATOR = c['vicinity_feature_generator']
    app.N_BEST_PDEPS = c['n_best_pdeps']
    app.PDEP_SCORE_STRATEGY = c['score_strategy']

    d = app.initialize_dataset(data)
    app.initialize_models(d)
    while len(d.labeled_tuples) < app.LABELING_BUDGET:
        # NOTE(review): random_seed=None means repeated runs are not seeded here.
        app.sample_tuple(d, random_seed=None)
        app.label_with_ground_truth(d)
        app.update_models(d)
        app.generate_features(d, synchronous=True)
        app.predict_corrections(d, random_seed=None)

    # Score once after the loop: only the final scores are returned, and the
    # names stay defined even if the loop body never runs (budget of zero).
    # Assumes initialize_dataset leaves d.corrected_cells set in that case and
    # that the evaluation has no side effects - TODO confirm against raha.
    precision, recall, f1 = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    return {'result': {'precision': precision, 'recall': recall, 'f1': f1},
            'config': c}

In [4]:
# Sweep definition for the 'pdep' vicinity feature generator on breast-cancer.
# Presumably `config` is the base configuration handed to the experiment and
# each key in `ranges` is varied over the listed values - confirm against Ruska.
# NOTE(review): save_path is an absolute, machine-specific directory.
rsk = Ruska(
    name='20220610-breast-cancer-parallel',
    description='A test if parallel computation works.',
    commit='test',
    config=dict(
        dataset='breast-cancer',
        sampling='MCAR',
        error_fraction=0.1,
        labeling_budget=20,
        feature_generators=['value', 'domain', 'vicinity'],
        classification_model='ABC',
        vicinity_orders=[1, 2],
        vicinity_feature_generator='pdep',
        n_best_pdeps=5,
        score_strategy='multiply',
        n_rows=None,
    ),
    ranges=dict(
        error_fraction=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99],
    ),
    runs=3,
    save_path='/Users/philipp/code/experimente/2022W23-parallel-ruska',
)

In [5]:
# Execute the sweep held by `rsk`, with run_baran as the experiment callback.
# NOTE(review): the output recorded for this cell shows ForkPoolWorker
# tracebacks, so this parallel run apparently did not finish cleanly.
rsk.run(
    parallel=True,
    experiment=run_baran,
)

Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib64/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib64/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib64/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib64/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib64/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib64/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib64/python3.8/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "/usr/lib64/python3.8

In [4]:
# Sweep definition for the 'naive' vicinity feature generator on breast-cancer,
# using vicinity order 1 only and the same error-fraction values as `rsk`.
# Presumably `config` is the base configuration handed to the experiment and
# each key in `ranges` is varied over the listed values - confirm against Ruska.
# NOTE(review): save_path is an absolute, machine-specific directory.
rsk_naive = Ruska(
    name='20220610-breast-cancer-naive-parallel',
    description='A test if parallel computation works.',
    commit='test',
    config=dict(
        dataset='breast-cancer',
        sampling='MCAR',
        error_fraction=0.1,
        labeling_budget=20,
        feature_generators=['value', 'domain', 'vicinity'],
        classification_model='ABC',
        vicinity_orders=[1],
        vicinity_feature_generator='naive',
        n_best_pdeps=5,
        score_strategy='multiply',
        n_rows=None,
    ),
    ranges=dict(
        error_fraction=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99],
    ),
    runs=3,
    save_path='/Users/philipp/code/experimente/2022W23-parallel-ruska',
)

In [5]:
# Execute the naive-generator sweep, with run_baran as the experiment callback.
rsk_naive.run(
    parallel=True,
    experiment=run_baran,
)

Measurement finished
