# Baran Experiment

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import datetime
import itertools

import IPython.display
import pandas

import raha

In [4]:
def format_delta(delta: datetime.timedelta):
    """Render a timedelta as a compact ``'<H>h <M>m <S>s'`` string.

    The hour and minute components are only included when they are greater
    than zero; the seconds component is always present. Fractional seconds
    are truncated, not rounded.

    Args:
        delta: The time span to format.

    Returns:
        A string such as ``'1h 2m 5s'``, ``'2m 0s'`` or ``'59s'``.
    """
    whole_hours, leftover = divmod(delta.total_seconds(), 3600)
    whole_minutes, secs = divmod(leftover, 60)

    pieces = []
    if whole_hours > 0:
        pieces.append(f'{int(whole_hours)}h ')
    if whole_minutes > 0:
        pieces.append(f'{int(whole_minutes)}m ')
    # Seconds are always shown so that the result is never an empty string.
    pieces.append(f'{int(secs)}s')

    return ''.join(pieces)

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    """Build a progress message with the average run time and remaining time.

    Args:
        times: Chronological timestamps (``datetime.datetime``); the first
            entry marks the start and each further entry marks the end of
            one run.
        current_run_i: 1-based index of the run that just finished.
        total_runs: Total number of runs in the experiment.

    Returns:
        A human-readable status string. When ``times`` holds fewer than two
        timestamps no interval has been measured yet, so a message without
        an estimate is returned instead of raising ``ZeroDivisionError``.
    """
    measured_intervals = len(times) - 1
    if measured_intervals < 1:
        # Nothing to average over yet; previously this divided by zero.
        return f'Run {current_run_i}/{total_runs}. No completed run yet, cannot estimate time to finish'

    # The sum of all consecutive differences telescopes to last - first, so
    # the per-run average does not require rebuilding every delta each call.
    avg = (times[-1] - times[0]) / measured_intervals

    # Clamp so that an index past the end never yields a negative duration,
    # which format_delta would render misleadingly.
    remaining_runs = max(total_runs - current_run_i, 0)

    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*remaining_runs)} to finish'

In [5]:
# Dataset descriptors: each one names a dataset and points at its dirty CSV
# and the matching clean (ground-truth) CSV, relative to this notebook.
letter = dict(
    name="letter",
    path="../datasets/letter/dirty.csv",
    clean_path="../datasets/letter/clean.csv",
)

adult = dict(
    name="adult",
    path="../datasets/adult/dirty.csv",
    clean_path="../datasets/adult/clean.csv",
)

beers = dict(
    name="beers",
    path="../datasets/beers/dirty.csv",
    clean_path="../datasets/beers/clean.csv",
)

flights = dict(
    name="flights",
    path="../datasets/flights/dirty.csv",
    clean_path="../datasets/flights/clean.csv",
)

# Note: the "movies" dataset lives in the "movies_1" folder.
movies = dict(
    name="movies",
    path="../datasets/movies_1/dirty.csv",
    clean_path="../datasets/movies_1/clean.csv",
)

rayyan = dict(
    name="rayyan",
    path="../datasets/rayyan/dirty.csv",
    clean_path="../datasets/rayyan/clean.csv",
)

tax = dict(
    name="tax",
    path="../datasets/tax/dirty.csv",
    clean_path="../datasets/tax/clean.csv",
)

toy = dict(
    name="toy",
    path="../datasets/toy/dirty.csv",
    clean_path="../datasets/toy/clean.csv",
)

hospital = dict(
    name="hospital",
    path="../datasets/hospital/dirty.csv",
    clean_path="../datasets/hospital/clean.csv",
)

In [6]:
# Collected metrics, one dict per finished run (see `result` below).
results = []

# Axes of the experiment grid.
feature_generators_sets = [['value', 'domain', 'vicinity', 'imputer'], ['value', 'domain', 'vicinity'], ['value', 'domain', 'imputer']]
vicinity_feature_generators = ['naive', 'pdep']
vicinity_orders_sets = [[1], [2], [1, 2]]
datasets = [letter, beers, flights, hospital]
runs = range(3)

# Despite its name this is the total NUMBER of runs in the grid (not a time
# span); it is only used for the progress message.
duration_experiment = len(feature_generators_sets) * len(vicinity_feature_generators) * len(vicinity_orders_sets) * len(datasets) * len(runs)
times = [datetime.datetime.now()]

# itertools.product varies the right-most axis fastest, which reproduces the
# order of the equivalent nested loops: vicinity order (outermost), dataset,
# vicinity feature generator, feature generator set, run (innermost).
experiment_grid = itertools.product(
    vicinity_orders_sets,
    datasets,
    vicinity_feature_generators,
    feature_generators_sets,
    runs,
)

for i, (vicinity_order, dataset, vicinity_feature_generator, feature_generator, run) in enumerate(experiment_grid, start=1):
    data = raha.Dataset(dataset)
    # Use the actual errors as the detected cells, so only the correction
    # step is evaluated here.
    data.detected_cells = dict(data.get_actual_errors_dictionary())

    app = raha.Correction()
    app.LABELING_BUDGET = 20
    app.VERBOSE = False
    app.FEATURE_GENERATORS = feature_generator
    app.CLASSIFICATION_MODEL = "ABC"
    app.VICINITY_ORDERS = vicinity_order
    app.VICINITY_FEATURE_GENERATOR = vicinity_feature_generator
    app.N_BEST_PDEPS = 4

    d = app.initialize_dataset(data)
    app.initialize_models(d)

    # Label one sampled tuple per iteration until the budget is used up;
    # after each label the models, features and predictions are refreshed.
    # NOTE(review): random_seed=None presumably means unseeded sampling, so
    # repeated runs are expected to differ — confirm against raha.
    while len(d.labeled_tuples) < app.LABELING_BUDGET:
        app.sample_tuple(d, random_seed=None)
        app.label_with_ground_truth(d)
        app.update_models(d)
        app.generate_features(d, synchronous=False)
        app.predict_corrections(d, random_seed=None)

    # The last three entries of the evaluation are stored as
    # precision, recall and F1.
    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    result = {'dataset': d.name,
              'vicinity_order': vicinity_order,
              'vicinity_feature_generator': vicinity_feature_generator,
              'feature_generator': feature_generator,
              'run': run,
              'precision': p, 'recall': r, 'f1': f}
    results.append(result)
    print(result)

    # time estimates for the measurement
    times.append(datetime.datetime.now())
    print(estimate_time_to_finish(times, i, duration_experiment))

		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Switch to `multi:softproba` instead
		multi:softmax doesn't support `predict_proba`.  Swit

Traceback (most recent call last):
  File "/Users/philipp/code/python-envs/raha/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3524, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/hf/v_gw8jb14hl89x0x8g88y9pr0000gn/T/ipykernel_42727/2439955131.py", line 36, in <module>
    app.generate_features(d, synchronous=False)
  File "/Users/philipp/code/raha/raha/correction.py", line 654, in generate_features
    feature_generation_results = pool.map(self._feature_generator_process, process_args_list)
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/pool.py", line 651, in get
    self.wait(timeout)
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/pool.py", line 648, in wait
    self._event.wait(timeout)
  File "/Us

  File "/Users/philipp/code/python-envs/raha/lib/python3.7/site-packages/autogluon/tabular/learner/abstract_learner.py", line 164, in predict_proba
    y_pred_proba = self.load_trainer().predict_proba(self.transform_features(X), model=model)
  File "/Users/philipp/code/python-envs/raha/lib/python3.7/site-packages/autogluon/core/trainer/abstract_trainer.py", line 529, in predict_proba
    return self._predict_proba_model(X, model)
  File "/Users/philipp/code/python-envs/raha/lib/python3.7/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1551, in _predict_proba_model
    X = self.get_inputs_to_model(model=model, X=X, model_pred_proba_dict=model_pred_proba_dict, fit=False)
  File "/Users/philipp/code/python-envs/raha/lib/python3.7/site-packages/autogluon/core/trainer/abstract_trainer.py", line 554, in get_inputs_to_model
    model_pred_proba_dict = self.get_model_pred_proba_dict(X=X, models=model_set, model_pred_proba_dict=model_pred_proba_dict, fit=fit)
  File "/Users/phil

TypeError: object of type 'NoneType' has no len()