# Baran Experiment

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas
import IPython.display

import raha
import datetime

In [2]:
def format_delta(delta: datetime.timedelta):
    """Render a duration as a compact string such as ``'1h 2m 5s'``.

    The hour and minute parts are only emitted when they are greater than
    zero; the seconds part is always emitted. Fractional seconds are
    truncated by the ``int`` conversion.
    """
    hours, leftover = divmod(delta.total_seconds(), 3600)
    minutes, seconds = divmod(leftover, 60)

    pieces = []
    if hours > 0:
        pieces.append(f'{int(hours)}h ')
    if minutes > 0:
        pieces.append(f'{int(minutes)}m ')
    pieces.append(f'{int(seconds)}s')

    return ''.join(pieces)

def estimate_time_to_finish(times: list, current_run_i: int, total_runs: int):
    """Summarise experiment progress and extrapolate the remaining runtime.

    Args:
        times: Timestamps in recording order. The difference between each
            consecutive pair must be a ``datetime.timedelta`` (e.g. a list of
            ``datetime.datetime`` values); each difference counts as the
            duration of one run.
        current_run_i: Number of the current run.
        total_runs: Total number of runs planned.

    Returns:
        A message stating the average duration per run and the estimate for
        the remaining ``total_runs - current_run_i`` runs. With fewer than
        two timestamps no duration can be measured yet, so the message only
        reports the progress.
    """
    deltas = [times[i] - times[i - 1] for i in range(1, len(times))]

    if not deltas:
        # Without this guard the average below divides by zero on the first run.
        return f'Run {current_run_i}/{total_runs}. Not enough timestamps to estimate time to finish'

    avg = sum(deltas, datetime.timedelta()) / len(deltas)
    return f'Run {current_run_i}/{total_runs}. {format_delta(avg)} per run, estimate {format_delta(avg*(total_runs - current_run_i))} to finish'

In [3]:
def make_dataset_dictionary(name: str, directory=None) -> dict:
    """Build the dictionary that describes one dataset for ``raha.Dataset``.

    Args:
        name: Dataset name stored under the ``"name"`` key.
        directory: Folder below ``../datasets/`` that holds ``dirty.csv`` and
            ``clean.csv``. Defaults to ``name``.

    Returns:
        A dict with the keys ``"name"``, ``"path"`` (dirty CSV) and
        ``"clean_path"`` (clean CSV), in that order.
    """
    if directory is None:
        directory = name
    return {
        "name": name,
        "path": f"../datasets/{directory}/dirty.csv",
        "clean_path": f"../datasets/{directory}/clean.csv",
    }


letter = make_dataset_dictionary("letter")
adult = make_dataset_dictionary("adult")
beers = make_dataset_dictionary("beers")
flights = make_dataset_dictionary("flights")
# The movies dataset is the only one whose folder name differs from its name.
movies = make_dataset_dictionary("movies", directory="movies_1")
rayyan = make_dataset_dictionary("rayyan")
tax = make_dataset_dictionary("tax")
toy = make_dataset_dictionary("toy")
hospital = make_dataset_dictionary("hospital")

In [5]:
# Experiment: five runs of the correction pipeline on the hospital dataset with
# the 'naive' vicinity feature generator and vicinity order [1].
# NOTE(review): datasets, n_pdeps and vicinity_orders are assigned here but the
# loop below never reads them; it hard-codes hospital, [1] and None instead.
datasets = [hospital]
results_no_update = []
n_pdeps = [4]
vicinity_orders = [[1]]

for run in range(5):
    dataset_dictionary = hospital
    data = raha.Dataset(dataset_dictionary)
    # Seed detected_cells with a copy of what get_actual_errors_dictionary() returns
    # (presumably the ground-truth error cells, so no separate detection step is run — confirm in raha).
    data.detected_cells = dict(data.get_actual_errors_dictionary())
    app_2 = raha.Correction()
    app_2.LABELING_BUDGET = 20
    app_2.VERBOSE = False
    app_2.CLASSIFICATION_MODEL = "ABC"
    app_2.VICINITY_ONLY = False
    app_2.IMPUTER_FEATURE_GENERATOR = False
    app_2.VICINITY_ORDERS = [1]
    app_2.VICINITY_FEATURE_GENERATOR = 'naive'
    app_2.N_BEST_PDEPS = None

    d = app_2.initialize_dataset(data)
    app_2.initialize_models(d)

    # One iteration per labeled tuple until LABELING_BUDGET tuples are labeled.
    # Unlike the later grid cells, models, features and predictions are refreshed
    # inside the loop, i.e. after every newly labeled tuple.
    # NOTE(review): the results therefore do not match the name results_no_update.
    while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
        # The run index is passed as random_seed, so each run uses a different seed.
        app_2.sample_tuple(d, random_seed=run)
        app_2.label_with_ground_truth(d)
        app_2.update_models(d)
        app_2.generate_features(d, synchronous=False)
        app_2.predict_corrections(d, random_seed=run)

    # The last three entries of the evaluation result are used as precision, recall and F1.
    p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
    result = {'dataset': d.name, 'n_pdep': None, 'vicinity_order': [1], 'precision': p, 'recall': r, 'f1': f}
    results_no_update.append(result)
    print(result)

{'dataset': 'hospital', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.975, 'recall': 0.8428290766208252, 'f1': 0.9041095890410958}
{'dataset': 'hospital', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.8783783783783784, 'recall': 0.2554027504911591, 'f1': 0.39573820395738196}
{'dataset': 'hospital', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.9316037735849056, 'recall': 0.7760314341846758, 'f1': 0.8467309753483387}
{'dataset': 'hospital', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.8952702702702703, 'recall': 0.5206286836935167, 'f1': 0.6583850931677018}
{'dataset': 'hospital', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.9517102615694165, 'recall': 0.9292730844793713, 'f1': 0.9403578528827037}


In [26]:
# Sanity check on the dataset object `d` left behind by the last executed
# experiment cell: print the size of d.labeled_cells.
print(len(d.labeled_cells))

400


In [36]:
# Debug probe: length of the entry stored under key '10018' in d.pair_features[(13, 1)].
# NOTE(review): (13, 1) looks like a cell coordinate and '10018' like a candidate
# correction value — confirm against raha before relying on this reading.
len(d.pair_features[(13,1)]['10018'])

28

In [33]:
# Debug probe, identical to the preceding cell: length of the entry stored under
# key '10018' in d.pair_features[(13, 1)].
# NOTE(review): the execution counters (In [33] here, In [36] on the preceding cell)
# show the two probes were run out of order and they recorded different lengths
# (29 vs 28); the result presumably depends on which experiment last populated `d`.
len(d.pair_features[(13,1)]['10018'])

29

In [34]:
# Debug check: length of a hand-pasted vector of 28 values.
# NOTE(review): presumably copied from d.pair_features to compare against the
# probe cells — TODO confirm, or delete this scratch cell.
# NOTE(review): numpy is imported here in the middle of the notebook; consider
# moving the import to the import cell at the top.
import numpy as np
len(np.array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.02569373, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.25      ,
        0.        , 0.        , 0.03022975, 0.02671756, 0.15337423,
        0.02639916, 0.02601457, 0.02520161, 0.02520161, 0.02631579,
        0.02631579, 0.02702703, 0.        ]))

28

In [5]:
# Experiment: grid over vicinity_orders x n_pdeps on the letter dataset with the
# 'pdep' vicinity feature generator. One result dict is collected per combination.
datasets = [letter]
results_no_update = []
n_pdeps = [4]
vicinity_orders = [[1], [2], [1,2]]

for dataset_dictionary in datasets:
    for vicinity_order in vicinity_orders:
        for n_pdep in n_pdeps:
            # Fresh Dataset and Correction objects for every parameter combination.
            data = raha.Dataset(dataset_dictionary)
            # Seed detected_cells with a copy of what get_actual_errors_dictionary() returns
            # (presumably the ground-truth error cells — confirm in raha).
            data.detected_cells = dict(data.get_actual_errors_dictionary())
            app_2 = raha.Correction()
            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "ABC"
            app_2.VICINITY_ONLY = False
            app_2.VICINITY_ORDERS = vicinity_order
            app_2.VICINITY_FEATURE_GENERATOR = 'pdep'
            app_2.N_BEST_PDEPS = n_pdep

            d = app_2.initialize_dataset(data)
            app_2.initialize_models(d)

            # Labeling phase only: sample and label until the budget is reached.
            # NOTE(review): if d.has_ground_truth is false nothing in this loop labels
            # the sampled tuple, so the loop may never terminate — confirm.
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=0)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
            # Models, features and predictions are computed once, after all tuples are
            # labeled (no per-tuple update, matching the name results_no_update).
            app_2.update_models(d)
            app_2.generate_features(d, synchronous=False)
            app_2.predict_corrections(d, random_seed=0)

            # The last three entries of the evaluation result are used as precision, recall and F1.
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            result = {'dataset': d.name, 'n_pdep': n_pdep, 'vicinity_order': vicinity_order, 'precision': p, 'recall': r, 'f1': f}
            results_no_update.append(result)
            print(result)

{'dataset': 'letter', 'n_pdep': 4, 'vicinity_order': [1], 'precision': 0.38482035413443355, 'recall': 0.27300445149094454, 'f1': 0.31940926764884237}
{'dataset': 'letter', 'n_pdep': 4, 'vicinity_order': [2], 'precision': 0.47416413373860183, 'recall': 0.31392158058418196, 'f1': 0.37775168770179046}
{'dataset': 'letter', 'n_pdep': 4, 'vicinity_order': [1, 2], 'precision': 0.4801023308698124, 'recall': 0.3089822550155497, 'f1': 0.3759878306682002}


In [6]:
# Dump the list of result dicts collected by the most recently executed experiment cell.
print(results_no_update)

[{'dataset': 'letter', 'n_pdep': 4, 'vicinity_order': [1], 'precision': 0.38482035413443355, 'recall': 0.27300445149094454, 'f1': 0.31940926764884237}, {'dataset': 'letter', 'n_pdep': 4, 'vicinity_order': [2], 'precision': 0.47416413373860183, 'recall': 0.31392158058418196, 'f1': 0.37775168770179046}, {'dataset': 'letter', 'n_pdep': 4, 'vicinity_order': [1, 2], 'precision': 0.4801023308698124, 'recall': 0.3089822550155497, 'f1': 0.3759878306682002}]


In [7]:
# Experiment: same grid as the 'pdep' cell on the letter dataset, but with the
# 'naive' vicinity feature generator and N_BEST_PDEPS set to None.
datasets = [letter]
results_no_update = []
#n_pdeps = [1, 2, 3, 4, 5]
n_pdeps = [None]
vicinity_orders = [[1], [2], [1,2]]

for dataset_dictionary in datasets:
    for vicinity_order in vicinity_orders:
        for n_pdep in n_pdeps:
            # Fresh Dataset and Correction objects for every parameter combination.
            data = raha.Dataset(dataset_dictionary)
            # Seed detected_cells with a copy of what get_actual_errors_dictionary() returns
            # (presumably the ground-truth error cells — confirm in raha).
            data.detected_cells = dict(data.get_actual_errors_dictionary())
            app_2 = raha.Correction()
            app_2.LABELING_BUDGET = 20
            app_2.VERBOSE = False
            app_2.CLASSIFICATION_MODEL = "ABC"
            app_2.VICINITY_ONLY = False
            app_2.VICINITY_ORDERS = vicinity_order
            app_2.VICINITY_FEATURE_GENERATOR = 'naive'
            app_2.N_BEST_PDEPS = n_pdep

            d = app_2.initialize_dataset(data)
            app_2.initialize_models(d)

            # Labeling phase only: sample and label until the budget is reached.
            # NOTE(review): if d.has_ground_truth is false nothing in this loop labels
            # the sampled tuple, so the loop may never terminate — confirm.
            while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
                app_2.sample_tuple(d, random_seed=0)
                if d.has_ground_truth:
                    app_2.label_with_ground_truth(d)
            # Models, features and predictions are computed once, after all tuples are
            # labeled (no per-tuple update, matching the name results_no_update).
            app_2.update_models(d)
            app_2.generate_features(d, synchronous=False)
            app_2.predict_corrections(d, random_seed=0)

            # The last three entries of the evaluation result are used as precision, recall and F1.
            p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
            result = {'dataset': d.name, 'n_pdep': n_pdep, 'vicinity_order': vicinity_order, 'precision': p, 'recall': r, 'f1': f}
            results_no_update.append(result)
            print(result)

{'dataset': 'letter', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.28730077992539843, 'recall': 0.20665894261845236, 'f1': 0.24039723355204823}
{'dataset': 'letter', 'n_pdep': None, 'vicinity_order': [2], 'precision': 0.26977143470329434, 'recall': 0.15330203061162265, 'f1': 0.19550509370868652}
{'dataset': 'letter', 'n_pdep': None, 'vicinity_order': [1, 2], 'precision': 0.2943998081304713, 'recall': 0.14970425025916215, 'f1': 0.19848007114560595}


In [8]:
# Dump the list of result dicts collected by the most recently executed experiment cell.
print(results_no_update)

[{'dataset': 'letter', 'n_pdep': None, 'vicinity_order': [1], 'precision': 0.28730077992539843, 'recall': 0.20665894261845236, 'f1': 0.24039723355204823}, {'dataset': 'letter', 'n_pdep': None, 'vicinity_order': [2], 'precision': 0.26977143470329434, 'recall': 0.15330203061162265, 'f1': 0.19550509370868652}, {'dataset': 'letter', 'n_pdep': None, 'vicinity_order': [1, 2], 'precision': 0.2943998081304713, 'recall': 0.14970425025916215, 'f1': 0.19848007114560595}]


# Validate: single Baran run on the rayyan dataset

In [6]:
# Validation run: one pass of the correction pipeline on a single dataset, with the
# 'naive' vicinity feature generator and vicinity order [1]. LABELING_BUDGET and
# the other Correction settings not assigned below keep whatever raha.Correction()
# sets by default.
dataset_name = 'rayyan'
dataset_dictionary = {
    "name": dataset_name,
    "path": f"../datasets/{dataset_name}/dirty.csv",
    "clean_path": f"../datasets/{dataset_name}/clean.csv",
    }
d = raha.dataset.Dataset(dataset_dictionary)
# Seed detected_cells with a copy of what get_actual_errors_dictionary() returns
# (presumably the ground-truth error cells — confirm in raha).
d.detected_cells = dict(d.get_actual_errors_dictionary())
app = raha.Correction()

app.VICINITY_ONLY = False
app.VICINITY_ORDERS = [1]
app.VICINITY_FEATURE_GENERATOR = 'naive'
app.N_BEST_PDEPS = None

# NOTE(review): `d` is rebound here from the raw Dataset to whatever
# initialize_dataset returns; re-running only the lines below would start from
# the already-initialized object.
d = app.initialize_dataset(d)
app.initialize_models(d)
# Per-tuple loop: sample, label from ground truth, then update models, regenerate
# features and predict corrections after every labeled tuple.
# NOTE(review): random_seed=None is passed, so presumably this run is not
# reproducible — confirm how raha treats a None seed.
while len(d.labeled_tuples) < app.LABELING_BUDGET:
    app.sample_tuple(d, random_seed=None)
    if d.has_ground_truth:
        app.label_with_ground_truth(d)
    app.update_models(d)
    # Called without the synchronous argument that the experiment cells pass explicitly.
    app.generate_features(d)
    app.predict_corrections(d, random_seed=None)
# The last three entries of the evaluation result are used as precision, recall and F1.
p, r, f = d.get_data_cleaning_evaluation(d.corrected_cells)[-3:]
print({'dataset': d.name, 'precision': p, 'recall': r, 'f1': f})

{'dataset': 'rayyan', 'precision': 0.31949685534591193, 'recall': 0.2679324894514768, 'f1': 0.291451520367183}
