# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha
import datetime

In [3]:
def format_delta(delta: datetime.timedelta):
    """Render a timedelta as a compact ``'<H>h <M>m <S>s'`` string.

    The hour and the minute component are only emitted when they are
    greater than zero; the seconds component is always emitted. Every
    component is converted with ``int`` (fractions are dropped).
    """
    whole_hours, leftover = divmod(delta.total_seconds(), 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)

    rendered = ''
    if whole_hours > 0:
        rendered += f'{int(whole_hours)}h '
    if whole_minutes > 0:
        rendered += f'{int(whole_minutes)}m '
    return rendered + f'{int(leftover_seconds)}s'

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    """Report progress and extrapolate the remaining duration of an experiment.

    Args:
        times: ``datetime.datetime`` timestamps; the differences between
            consecutive entries are treated as the durations of single runs.
        current_run_i: Number of the run that has just finished.
        total_runs: Total number of runs in the experiment.

    Returns:
        A message with the average duration per run and the estimated time
        for the remaining ``total_runs - current_run_i`` runs. When ``times``
        holds fewer than two timestamps there is no interval to average, so a
        message without an estimate is returned (instead of failing with
        ``ZeroDivisionError``).
    """
    n_intervals = len(times) - 1
    if n_intervals < 1:
        return f'Run {current_run_i}/{total_runs}. Not enough timing data for an estimate yet'

    # The consecutive differences telescope to (last - first), so their mean is
    # the total elapsed time divided by the number of intervals.
    avg = (times[-1] - times[0]) / n_intervals
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [11]:
# Dataset descriptors: each one names a dataset and points at its dirty CSV
# ("path") and its clean counterpart ("clean_path"), relative to this notebook.
beers = dict(
    name="beers",
    path="../datasets/beers/dirty.csv",
    clean_path="../datasets/beers/clean.csv",
)
flights = dict(
    name="flights",
    path="../datasets/flights/dirty.csv",
    clean_path="../datasets/flights/clean.csv",
)
# Note: the "movies" dataset lives in the movies_1 directory.
movies = dict(
    name="movies",
    path="../datasets/movies_1/dirty.csv",
    clean_path="../datasets/movies_1/clean.csv",
)
rayyan = dict(
    name="rayyan",
    path="../datasets/rayyan/dirty.csv",
    clean_path="../datasets/rayyan/clean.csv",
)
tax = dict(
    name="tax",
    path="../datasets/tax/dirty.csv",
    clean_path="../datasets/tax/clean.csv",
)
toy = dict(
    name="toy",
    path="../datasets/toy/dirty.csv",
    clean_path="../datasets/toy/clean.csv",
)
hospital = dict(
    name="hospital",
    path="../datasets/hospital/dirty.csv",
    clean_path="../datasets/hospital/clean.csv",
)

In [12]:
# One correction run per dataset in `datasets` (here only `hospital`), with the
# higher-order feature generator set to 'pdep'.
# NOTE(review): the saved output of this cell is a ValueError ("Expected 2D array,
# got 1D array instead"); the arrays it prints have different lengths (15, 15 and 19).
# The saved traceback is truncated, so which call below raised is not visible here.
datasets = [hospital]

for dataset_dictionary in datasets:
    app_2 = raha.Correction()

    # Run configuration, set as attributes on the Correction instance.
    app_2.LABELING_BUDGET = 20  # the labeling loop below runs until this many tuples are labeled
    app_2.VERBOSE = False
    app_2.CLASSIFICATION_MODEL = "ABC"
    app_2.EXPERIMENT = 'adder'
    app_2.VICINITY_ONLY = True
    app_2.VICINITY_ORDER = 2
    app_2.HIGHER_ORDER_FEATURE_GENERATOR = 'pdep'
    app_2.N_BEST_PDEPS = 5

    # n_rows=250 presumably restricts the run to 250 rows -- confirm in raha.dataset.Dataset.
    d = raha.dataset.Dataset(dataset_dictionary, n_rows=250)
    # The detected cells are taken directly from get_actual_errors_dictionary(); no separate
    # detection step runs in this cell (presumably these are the ground-truth errors -- TODO confirm).
    d.detected_cells = dict(d.get_actual_errors_dictionary())
    d = app_2.initialize_dataset(d)

    app_2.initialize_models(d)
    # Sample tuples until LABELING_BUDGET tuples are labeled; labels come from the ground truth.
    # NOTE(review): when d.has_ground_truth is False nothing in this loop labels the sampled
    # tuple; whether the loop still terminates then is not visible from here.
    while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
        app_2.sample_tuple(d, random_seed=0)
        if d.has_ground_truth:
            app_2.label_with_ground_truth(d)
    # The model update is disabled in this cell; features are generated and corrections
    # predicted once, after the labeling loop.
    #app_2.update_models(d)
    app_2.generate_features_synchronously(d)
    app_2.predict_corrections(d, random_seed=0)

    # Only the last three values of the evaluation result are used; they are reported
    # as precision, recall and F1.
    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    print({'dataset': d.name, 'experiment': app_2.EXPERIMENT, 'precision': p, 'recall': r, 'f1': f})

  array = np.asarray(array, order=order, dtype=dtype)


ValueError: Expected 2D array, got 1D array instead:
array=[array([1.        , 1.        , 1.        , 1.        , 0.78571429,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ])
 array([0.        , 0.        , 0.        , 0.        , 0.21428571,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ])
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])                                                           ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [13]:
# Debug output: dump the `vicinity_corrections` attribute of whatever dataset object
# `d` is currently bound to in the kernel.
print(d.vicinity_corrections)

[{'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-1': 0.1323529411764706, 'al_ami-2': 0.16176470588235295, 'al_ami-3': 0.14705882352941177, 'al_ami-4': 0.14705882352941177, 'al_ami-5': 0.1323529411764706, 'al_ami-7a': 0.14705882352941177, 'al_ami-8a': 0.1323529411764706}, {'al_scip-card-2': 0.02631578947368421, 'al_scip-inf-1': 0.02631578947368421, 'al_scip-inf-2': 0.02631578947368421, 'al_scip-inf-3': 0.02631578947368421, 'al_scip-inf-4': 0.18421052631578946, 'al_scip-inf-6': 0.02631578947368421, 'al_scip-vte-1': 0.02631578947368421, 'al_scip-vte-2': 0.02631578947368421, 'al_ami-7a': 0.2631578947368421, 'al_ami-8a': 0.15789473684210525, 'al_ami-4': 0.10526315789473684, 'al_ami-3': 0.07894736842105263, 'al_ami-2': 0.02631578947368421}, {'al_scip-card-2': 0.02631578947368421, 'al_scip-inf-1': 0.02631578947368421, 'al_scip-inf-2': 0.02631578947368421, 'al_scip-inf-3': 0.02631578947368421, 'al_scip-inf-4': 0.18421052631578946, 'al_scip-inf-6': 0.02631578947368421, 'al_scip-vte-1': 0.0263157

In [14]:
# Debug output: dump the `vc_order_n_corrections` attribute of whatever dataset object
# `d` is currently bound to in the kernel.
print(d.vc_order_n_corrections)

[{'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}, {'al_ami-5': 1.0}]


In [5]:
# One correction run per dataset in `datasets` (here only `hospital`) with the "DTC"
# classification model and the higher-order feature generator disabled.
# The dataset object `d` stays bound after the cell and is read by later cells.
datasets = [hospital]

for dataset_dictionary in datasets:
    app_2 = raha.Correction()

    # Run configuration, set as attributes on the Correction instance.
    app_2.LABELING_BUDGET = 20  # the labeling loop below runs until this many tuples are labeled
    app_2.VERBOSE = False
    app_2.CLASSIFICATION_MODEL = "DTC"
    app_2.EXPERIMENT = 'adder'
    app_2.VICINITY_ONLY = True
    app_2.HIGHER_ORDER_FEATURE_GENERATOR = ''  # disable

    # n_rows=250 presumably restricts the run to 250 rows -- confirm in raha.dataset.Dataset.
    d = raha.dataset.Dataset(dataset_dictionary, n_rows=250)
    # The detected cells are taken directly from get_actual_errors_dictionary(); no separate
    # detection step runs in this cell (presumably these are the ground-truth errors -- TODO confirm).
    d.detected_cells = dict(d.get_actual_errors_dictionary())
    d = app_2.initialize_dataset(d)

    app_2.initialize_models(d)
    # Sample tuples until LABELING_BUDGET tuples are labeled; labels come from the ground truth.
    while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
        app_2.sample_tuple(d, random_seed=0)
        if d.has_ground_truth:
            app_2.label_with_ground_truth(d)
    # The model update is disabled in this cell; features are generated and corrections
    # predicted once, after the labeling loop.
    #app_2.update_models(d)
    app_2.generate_features(d)
    app_2.predict_corrections(d, random_seed=0)

    # Only the last three values of the evaluation result are used; they are reported
    # as precision, recall and F1.
    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    print({'dataset': d.name, 'experiment': app_2.EXPERIMENT, 'precision': p, 'recall': r, 'f1': f})

{'dataset': 'hospital', 'experiment': 'adder', 'precision': 0.9572649572649573, 'recall': 0.8682170542635659, 'f1': 0.910569105691057}


In [7]:
# Spot check: number of entries stored in `d.pair_features` under key (13, 1) for the value '10019'.
# NOTE(review): both keys are hard-coded; what (13, 1) denotes is not visible here
# (presumably a pair of column indices -- confirm).
len(d.pair_features[(13,1)]['10019'])

20

In [6]:
from raha import pdep

# Evaluate pdep.calc_gpdep for every combination of a left-hand side (each key of
# d.pdep_counts_dict) and a right-hand side (each column index of the dataframe).
# Result layout: gpdeps[lhs][rhs] -> value returned by calc_gpdep.
lhss = set(d.pdep_counts_dict.keys())
rhss = list(range(d.dataframe.shape[1]))
gpdeps = {}
for lhs in lhss:
    gpdeps[lhs] = {
        rhs: pdep.calc_gpdep(d.dataframe, d.pdep_counts_dict, lhs, rhs)
        for rhs in rhss
    }

In [23]:
# Re-key the scores by right-hand side: inverse_gpdeps[rhs][lhs] == gpdeps[lhs][rhs].
inverse_gpdeps = {rhs: {} for rhs in range(d.dataframe.shape[1])}
for lhs, scores_by_rhs in gpdeps.items():
    for rhs, score in scores_by_rhs.items():
        inverse_gpdeps[rhs][lhs] = score

# Rebuild every inner mapping ordered by score, highest first
# (dicts preserve insertion order, so the order survives).
for rhs, scores_by_lhs in inverse_gpdeps.items():
    inverse_gpdeps[rhs] = dict(
        sorted(scores_by_lhs.items(), key=lambda pair: pair[1], reverse=True)
    )

In [49]:
# For every right-hand side keep the first ten (lhs, score) pairs of its mapping in
# inverse_gpdeps, in that mapping's current order. Columns without an entry keep [].
top_ten_pdeps = {rhs: [] for rhs in range(d.dataframe.shape[1])}
top_ten_pdeps.update(
    (rhs, list(scores_by_lhs.items())[:10])
    for rhs, scores_by_lhs in inverse_gpdeps.items()
)

In [7]:
# Display the complete nested mapping gpdeps[lhs][rhs] as the cell result.
# NOTE(review): this prints every lhs/rhs combination; the saved output is long and truncated.
gpdeps

{(8, 14, 15): {0: 0.010000000000000009,
  1: -0.020000000000000018,
  2: 0.0,
  3: -0.010000000000000009,
  4: 0.010000000000000009,
  5: 0.010000000000000009,
  6: -0.06000000000000005,
  7: -0.040000000000000036,
  8: -0.181,
  9: -0.020000000000000018,
  10: -0.040000000000000036,
  11: -0.020000000000000018,
  12: -0.030000000000000027,
  13: -0.020000000000000018,
  14: -0.247,
  15: -0.030399999999999996,
  16: -0.010000000000000009,
  17: 0.010000000000000009,
  18: -0.010000000000000009,
  19: -0.010000000000000009},
 (1, 2, 8): {0: 0.0022222222222221255,
  1: -0.188,
  2: -0.1948,
  3: 0.6896909090909091,
  4: 0.010000000000000009,
  5: 0.010000000000000009,
  6: 0.6549818181818181,
  7: 0.04348484848484846,
  8: -0.181,
  9: 0.6716060606060605,
  10: 0.6748060606060606,
  11: 0.031672727272727186,
  12: 0.49076969696969697,
  13: 0.12746060606060605,
  14: 0.0978831168831169,
  15: -0.0244929292929294,
  16: -0.02618787878787887,
  17: 0.0644242424242426,
  18: 0.085046464646

In [38]:
# Print the names of columns 6 and 9, an arrow, and the name of column 1, one per line.
print(
    d.dataframe.columns[6],
    d.dataframe.columns[9],
    '->',
    d.dataframe.columns[1],
    sep='\n',
)

City
CountyName
->
ProviderNumber


In [6]:
# Benchmark: every dataset x 10 seeds x the listed experiments (4 * 10 * 1 = 40 runs).
# One result dict per run is appended to `results`; `times` collects a timestamp per
# finished run and feeds the progress estimate printed after each run.
datasets = [beers, 
            hospital,
            rayyan,
            flights]
experiments = ['adder']
runs = range(10)
duration_experiment = len(datasets) * len(runs) * len(experiments)  # total number of runs

results = []
times = [datetime.datetime.now()]  # first entry is the start time of the whole benchmark
i = 1  # 1-based counter of the current run, used only for progress messages
for dataset_dictionary in datasets:
    for run in runs:
        for e in experiments:            
            app_2 = raha.Correction()
            print(f'Starting run {i}/{duration_experiment}.')

            # Run configuration, set as attributes on the Correction instance.
            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "ABC"
            app_2.EXPERIMENT = e
            app_2.IMPUTER_FEATURE_GENERATOR = False

            # No n_rows argument is passed in this cell.
            d = raha.Dataset(dataset_dictionary)
            # The detected cells are taken directly from get_actual_errors_dictionary(); no separate
            # detection step runs here (presumably these are the ground-truth errors -- TODO confirm).
            d.detected_cells = dict(d.get_actual_errors_dictionary())
            d = app_2.initialize_dataset(d)
            
            app_2.initialize_models(d)
            # Sample tuples until LABELING_BUDGET tuples are labeled; `run` doubles as the random seed.
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=run)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
            # Models are updated, features generated and corrections predicted once, after the loop.
            app_2.update_models(d)
            app_2.generate_features(d)
            app_2.predict_corrections(d, random_seed=run)
                
            # Only the last three values of the evaluation result are used; they are stored
            # as precision, recall and F1.
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            results.append({'dataset': d.name, 'experiment': e, 'run': run, 'precision': p, 'recall': r, 'f1': f})
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Starting run 1/40.
Run 1/40. 43s per run, estimate 28m 23s to finish
Starting run 2/40.
Run 2/40. 41s per run, estimate 26m 26s to finish
Starting run 3/40.
Run 3/40. 40s per run, estimate 25m 4s to finish
Starting run 4/40.
Run 4/40. 40s per run, estimate 24m 20s to finish
Starting run 5/40.
Run 5/40. 40s per run, estimate 23m 38s to finish
Starting run 6/40.
Run 6/40. 40s per run, estimate 22m 47s to finish
Starting run 7/40.
Run 7/40. 40s per run, estimate 22m 6s to finish
Starting run 8/40.
Run 8/40. 40s per run, estimate 21m 29s to finish
Starting run 9/40.
Run 9/40. 40s per run, estimate 20m 50s to finish
Starting run 10/40.
Run 10/40. 40s per run, estimate 20m 10s to finish
Starting run 11/40.
Run 11/40. 41s per run, estimate 19m 59s to finish
Starting run 12/40.
Run 12/40. 42s per run, estimate 19m 38s to finish
Starting run 13/40.
Run 13/40. 43s per run, estimate 19m 31s to finish
Starting run 14/40.
Run 14/40. 44s per run, estimate 19m 24s to finish
Starting run 15/40.
Run 15

In [8]:
# Dump the raw list of per-run result dicts held in `results`.
# NOTE(review): `pandas` is imported by this notebook; `pandas.DataFrame(results)` would
# render the same records as a table -- consider switching.
print(results)

[{'dataset': 'beers', 'experiment': 'adder', 'run': 0, 'precision': 0.7274850765637166, 'recall': 0.6425951398441082, 'f1': 0.6824102251978089}, {'dataset': 'beers', 'experiment': 'adder', 'run': 1, 'precision': 0.7274850765637166, 'recall': 0.6425951398441082, 'f1': 0.6824102251978089}, {'dataset': 'beers', 'experiment': 'adder', 'run': 2, 'precision': 0.7280041526083572, 'recall': 0.6430536451169189, 'f1': 0.6828971393791845}, {'dataset': 'beers', 'experiment': 'adder', 'run': 3, 'precision': 0.7274850765637166, 'recall': 0.6425951398441082, 'f1': 0.6824102251978089}, {'dataset': 'beers', 'experiment': 'adder', 'run': 4, 'precision': 0.7280041526083572, 'recall': 0.6430536451169189, 'f1': 0.6828971393791845}, {'dataset': 'beers', 'experiment': 'adder', 'run': 5, 'precision': 0.7280041526083572, 'recall': 0.6430536451169189, 'f1': 0.6828971393791845}, {'dataset': 'beers', 'experiment': 'adder', 'run': 6, 'precision': 0.7274850765637166, 'recall': 0.6425951398441082, 'f1': 0.6824102251

In [29]:
# Benchmark: every dataset x 3 seeds x 5 experiments (4 * 3 * 5 = 60 runs).
# One result dict per run is appended to `results`; `times` collects a timestamp per
# finished run and feeds the progress estimate printed after each run.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt during run 1.
datasets = [beers, 
            hospital,
            rayyan,
            flights]
experiments = ['pdep', 'adder', 'constant', 'ente', 'disable_vicinity']
runs = range(3)
duration_experiment = len(datasets) * len(runs) * len(experiments)  # total number of runs

results = []
times = [datetime.datetime.now()]  # first entry is the start time of the whole benchmark
i = 1  # 1-based counter of the current run, used only for progress messages
for dataset_dictionary in datasets:
    for run in runs:
        for e in experiments:            
            app_2 = raha.Correction()
            print(f'Starting run {i}/{duration_experiment}.')

            # Run configuration, set as attributes on the Correction instance.
            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "DTC"
            app_2.EXPERIMENT = e

            # n_rows=500 presumably restricts the run to 500 rows -- confirm in raha.dataset.Dataset.
            d = raha.dataset.Dataset(dataset_dictionary, n_rows=500)
            # The detected cells are taken directly from get_actual_errors_dictionary(); no separate
            # detection step runs here (presumably these are the ground-truth errors -- TODO confirm).
            d.detected_cells = dict(d.get_actual_errors_dictionary())
            d = app_2.initialize_dataset(d)
            
            app_2.initialize_models(d)
            # Sample tuples until LABELING_BUDGET tuples are labeled; `run` doubles as the random seed.
            # NOTE(review): update_models / generate_features / predict_corrections are inside
            # this loop, so they execute after every sampled tuple. The 10-run 'adder' benchmark
            # cell calls them once after its loop -- confirm which variant is intended.
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=run)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
                app_2.update_models(d)
                app_2.generate_features(d)
                app_2.predict_corrections(d, random_seed=run)
                
            # Only the last three values of the evaluation result are used; they are stored
            # as precision, recall and F1.
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            results.append({'dataset': d.name, 'experiment': e, 'run': run, 'precision': p, 'recall': r, 'f1': f})
            
            # time estimates for the measurement
            times.append(datetime.datetime.now())
            print(estimate_time_to_finish(times, i, duration_experiment))
            i += 1

Starting run 1/60.


Process ForkPoolWorker-31:
Process ForkPoolWorker-35:
Process ForkPoolWorker-28:
Process ForkPoolWorker-36:
Process ForkPoolWorker-33:
Process ForkPoolWorker-34:


KeyboardInterrupt: 

Process ForkPoolWorker-29:
Process ForkPoolWorker-27:
Process ForkPoolWorker-32:
Process ForkPoolWorker-26:
Process ForkPoolWorker-25:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/philipp/.pyenv/versions/3.7.10/lib/python3.7/multiprocessing/process.py", line 29

In [7]:
# Dump the raw list of per-run result dicts held in `results`.
# NOTE(review): the saved output of this cell lists finished runs for several experiments,
# while the saved output of the benchmark cell before it stops at a KeyboardInterrupt in
# run 1 -- the two outputs come from different executions; re-run both before relying on them.
print(results)

[{'dataset': 'beers', 'experiment': 'pdep', 'run': 0, 'precision': 0.8536126147328783, 'recall': 0.8314993122420908, 'f1': 0.842410869817675}, {'dataset': 'beers', 'experiment': 'adder', 'run': 0, 'precision': 0.8489056248529065, 'recall': 0.8269142595139845, 'f1': 0.837765648589014}, {'dataset': 'beers', 'experiment': 'constant', 'run': 0, 'precision': 0.849105461393597, 'recall': 0.8269142595139845, 'f1': 0.8378629500580721}, {'dataset': 'beers', 'experiment': 'ente', 'run': 0, 'precision': 0.8494921514312096, 'recall': 0.8436497019715726, 'f1': 0.8465608465608465}, {'dataset': 'beers', 'experiment': 'disable_vicinity', 'run': 0, 'precision': 0.8541089566020313, 'recall': 0.848234754699679, 'f1': 0.8511617207269381}, {'dataset': 'beers', 'experiment': 'pdep', 'run': 1, 'precision': 0.84375, 'recall': 0.8356258596973866, 'f1': 0.8396682791983414}, {'dataset': 'beers', 'experiment': 'adder', 'run': 1, 'precision': 0.6898148148148148, 'recall': 0.6831728564878496, 'f1': 0.68647777009905