# End-to-End Data Cleaning Pipeline with Raha and Baran (Minimal and Sequential)
We build an end-to-end data cleaning pipeline with our configuration-free error detection and correction systems, Raha and Baran.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import IPython.display

import raha

## Error Detection with Raha

### 1. Instantiating the Detection Class
We first instantiate the `Detection` class.

In [3]:
# Create the Raha error detection application.
app_1 = raha.Detection()

# Show the logs while the application runs.
app_1.VERBOSE = True

# Number of tuples to label.
app_1.LABELING_BUDGET = 20

### 2. Instantiating the Dataset
We next load and instantiate the dataset object.

In [6]:
# Describe the dataset to load: a name, the dirty CSV to clean, and the clean
# CSV that serves as ground truth. The name must match the files being loaded
# (the hospital dataset); it was previously mislabeled as "flights".
# NOTE(review): the name is presumably what `d.name` reports in the final
# evaluation message -- confirm against raha's dataset class.
dataset_dictionary = {
    "name": "hospital",
    "path": "../datasets/hospital/dirty.csv",
    "clean_path": "../datasets/hospital/clean.csv"
}
# Build the dataset object through the detection application and preview it.
d = app_1.initialize_dataset(dataset_dictionary)
d.dataframe.head()

Unnamed: 0,index,provider_number,name,address_1,address_2,address_3,city,state,zip,county,phone,type,owner,emergency_service,condition,measure_code,measure_name,score,sample,state_average
0,1,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs c...,empty,empty,al_scip-card-2
1,2,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic ...,empty,empty,al_scip-inf-1
2,3,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kind...,empty,empty,al_scip-inf-2
3,4,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birminghxm,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics ...,empty,empty,al_scip-inf-3
4,5,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (...,empty,empty,al_scip-inf-4


## Error Correction with Baran

### 1. Instantiating the Correction Class
We first instantiate the `Correction` class.

In [7]:
# Create the Baran error correction application.
app_2 = raha.Correction()

# Show the logs while the application runs.
app_2.VERBOSE = True

# Number of tuples to label.
app_2.LABELING_BUDGET = 20

### 2. Initializing the Dataset Object
We next initialize the dataset object.

In [8]:
# Pass the existing dataset object through Baran's dataset initialization.
d = app_2.initialize_dataset(d)

# Take the dataset's actual-errors dictionary as the detected cells, copied
# into a fresh dict so the dataset owns its own mapping.
actual_errors = d.get_actual_errors_dictionary()
d.detected_cells = dict(actual_errors)

# Preview the first rows of the data frame.
d.dataframe.head()

Unnamed: 0,index,provider_number,name,address_1,address_2,address_3,city,state,zip,county,phone,type,owner,emergency_service,condition,measure_code,measure_name,score,sample,state_average
0,1,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs c...,empty,empty,al_scip-card-2
1,2,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic ...,empty,empty,al_scip-inf-1
2,3,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kind...,empty,empty,al_scip-inf-2
3,4,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birminghxm,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics ...,empty,empty,al_scip-inf-3
4,5,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (...,empty,empty,al_scip-inf-4


### 3. Initializing the Error Corrector Models
Baran initializes the error corrector models.

In [None]:
# Initialize Baran's error corrector models for dataset `d`.
# NOTE(review): the recorded output of this cell is an ipdb session stopped at
# set_trace() calls inside raha/correction.py (initialize_models). While those
# breakpoints remain in the library, this call waits for debugger input
# instead of running unattended.
app_2.initialize_models(d)

> [0;32m/home/philipp/code/raha/raha/correction.py[0m(403)[0;36minitialize_models[0;34m()[0m
[0;32m    401 [0;31m            [0mvicinity_list[0m [0;34m=[0m [0;34m[[0m[0mcv[0m [0;32mif[0m [0;34m([0m[0mi[0m[0;34m,[0m [0mcj[0m[0;34m)[0m [0;32mnot[0m [0;32min[0m [0md[0m[0;34m.[0m[0mdetected_cells[0m [0;32melse[0m [0mself[0m[0;34m.[0m[0mIGNORE_SIGN[0m [0;32mfor[0m [0mcj[0m[0;34m,[0m [0mcv[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mrow[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    402 [0;31m            [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 403 [0;31m            [0;32mfor[0m [0mj[0m[0;34m,[0m [0mvalue[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mrow[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    404 [0;31m                [0;32mif[0m [0;34m([0m[0mi[0m[0;34m,[0m [0mj[0m[0;34m)[0m [0;32mnot[0m [0;32min[0m [0md[0m[

ipdb>  print(r)


Pandas(Index=0, index='1', provider_number='10018', name='callahan eye foundation hospital', address_1='1720 university blvd', address_2='empty', address_3='empty', city='birmingham', state='al', zip='35233', county='jefferson', phone='2053258100', type='acute care hospitals', owner='voluntary non-profit - private', emergency_service='yes', condition='surgical infection prevention', measure_code='scip-card-2', measure_name='surgery patients who were taking heart drugs caxxed beta bxockers before coming to the hospitax who were kept on the beta bxockers during the period just before and after their surgery', score='empty', sample='empty', state_average='al_scip-card-2')


ipdb>  print(r[0])


0


ipdb>  print(r[1:])


('1', '10018', 'callahan eye foundation hospital', '1720 university blvd', 'empty', 'empty', 'birmingham', 'al', '35233', 'jefferson', '2053258100', 'acute care hospitals', 'voluntary non-profit - private', 'yes', 'surgical infection prevention', 'scip-card-2', 'surgery patients who were taking heart drugs caxxed beta bxockers before coming to the hospitax who were kept on the beta bxockers during the period just before and after their surgery', 'empty', 'empty', 'al_scip-card-2')


ipdb>  print(detected_cells)


*** NameError: name 'detected_cells' is not defined


ipdb>  print(d.detected_cells)


{(13, 1): '10019', (45, 1): '10005', (81, 1): '10006', (213, 1): '10011', (244, 1): '10015', (253, 1): '10015', (273, 1): '10016', (314, 1): '10038', (360, 1): '10086', (408, 1): '10108', (559, 1): '10024', (608, 1): '10027', (627, 1): '10029', (639, 1): '10029', (645, 1): '10032', (655, 1): '10032', (704, 1): '10034', (721, 1): '10035', (724, 1): '10035', (755, 1): '10036', (780, 1): '10039', (789, 1): '10039', (793, 1): '10039', (859, 1): '10044', (878, 1): '10045', (888, 1): '10045', (933, 1): '10047', (955, 1): '10049', (26, 2): 'southeast alabama medical center', (131, 2): 'crenshaw community hospital', (171, 2): 'marshall medical center north', (243, 2): 'dekalb regional medical center', (337, 2): 'flowers hospital', (384, 2): 'univ of south alabama medical center', (385, 2): 'univ of south alabama medical center', (448, 2): 'alaska regional hospital', (532, 2): 'baptist medical center south', (560, 2): 'jackson hospital & clinic inc', (654, 2): 'wedowee hospital', (697, 2): 'com

ipdb>  print(vicinity_list)


['1', '10018', 'callahan eye foundation hospital', '1720 university blvd', 'empty', 'empty', 'birmingham', 'al', '35233', 'jefferson', '2053258100', 'acute care hospitals', 'voluntary non-profit - private', 'yes', 'surgical infection prevention', 'scip-card-2', '<<<IGNORE_THIS_VALUE>>>', 'empty', 'empty', 'al_scip-card-2']


ipdb>  continue


> [0;32m/home/philipp/code/raha/raha/correction.py[0m(413)[0;36minitialize_models[0;34m()[0m
[0;32m    411 [0;31m                    }
[0m[0;32m    412 [0;31m                    [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 413 [0;31m                    [0mself[0m[0;34m.[0m[0m_vicinity_based_models_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mvicinity_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    414 [0;31m                    [0mself[0m[0;34m.[0m[0m_domain_based_model_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mdomain_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    415 [0;31m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mVERBOSE[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  print(temp_vicinity_list)


['<<<IGNORE_THIS_VALUE>>>', '10018', 'callahan eye foundation hospital', '1720 university blvd', 'empty', 'empty', 'birmingham', 'al', '35233', 'jefferson', '2053258100', 'acute care hospitals', 'voluntary non-profit - private', 'yes', 'surgical infection prevention', 'scip-card-2', '<<<IGNORE_THIS_VALUE>>>', 'empty', 'empty', 'al_scip-card-2']


ipdb>  continue


> [0;32m/home/philipp/code/raha/raha/correction.py[0m(412)[0;36minitialize_models[0;34m()[0m
[0;32m    410 [0;31m                        [0;34m"vicinity"[0m[0;34m:[0m [0mtemp_vicinity_list[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    411 [0;31m                    }
[0m[0;32m--> 412 [0;31m                    [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    413 [0;31m                    [0mself[0m[0;34m.[0m[0m_vicinity_based_models_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mvicinity_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    414 [0;31m                    [0mself[0m[0;34m.[0m[0m_domain_based_model_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mdomain_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  print(temp_vicinity_list)


['1', '<<<IGNORE_THIS_VALUE>>>', 'callahan eye foundation hospital', '1720 university blvd', 'empty', 'empty', 'birmingham', 'al', '35233', 'jefferson', '2053258100', 'acute care hospitals', 'voluntary non-profit - private', 'yes', 'surgical infection prevention', 'scip-card-2', '<<<IGNORE_THIS_VALUE>>>', 'empty', 'empty', 'al_scip-card-2']


ipdb>  continue


> [0;32m/home/philipp/code/raha/raha/correction.py[0m(413)[0;36minitialize_models[0;34m()[0m
[0;32m    411 [0;31m                    }
[0m[0;32m    412 [0;31m                    [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 413 [0;31m                    [0mself[0m[0;34m.[0m[0m_vicinity_based_models_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mvicinity_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    414 [0;31m                    [0mself[0m[0;34m.[0m[0m_domain_based_model_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mdomain_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    415 [0;31m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mVERBOSE[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  print(temp_vicinity_list)


['1', '10018', '<<<IGNORE_THIS_VALUE>>>', '1720 university blvd', 'empty', 'empty', 'birmingham', 'al', '35233', 'jefferson', '2053258100', 'acute care hospitals', 'voluntary non-profit - private', 'yes', 'surgical infection prevention', 'scip-card-2', '<<<IGNORE_THIS_VALUE>>>', 'empty', 'empty', 'al_scip-card-2']


ipdb>  continue


> [0;32m/home/philipp/code/raha/raha/correction.py[0m(412)[0;36minitialize_models[0;34m()[0m
[0;32m    410 [0;31m                        [0;34m"vicinity"[0m[0;34m:[0m [0mtemp_vicinity_list[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    411 [0;31m                    }
[0m[0;32m--> 412 [0;31m                    [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    413 [0;31m                    [0mself[0m[0;34m.[0m[0m_vicinity_based_models_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mvicinity_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    414 [0;31m                    [0mself[0m[0;34m.[0m[0m_domain_based_model_updater[0m[0;34m([0m[0md[0m[0;34m.[0m[0mdomain_models[0m[0;34m,[0m [0mupdate_dictionary[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


### 4. Interactive Tuple Sampling, Labeling, Model updating, Feature Generation, and Correction Prediction
Baran then iteratively samples a tuple. We should label data cells of each sampled tuple. It then updates the models accordingly and generates a feature vector for each pair of a data error and a correction candidate. Finally, it trains and applies a classifier to each data column to predict the final correction of each data error. Since we already labeled tuples for Raha, we use the same labeled tuples and do not label new tuples here.

In [None]:
# Sample, label, and correct until the labeling budget configured on app_2 is
# used up. (The previous condition was a hard-coded debug limit of 2 followed
# by a stray character, which made the cell a SyntaxError.)
while len(d.labeled_tuples) < app_2.LABELING_BUDGET:
    app_2.sample_tuple(d)
    if d.has_ground_truth:
        # Ground truth is available, so no user input is needed.
        app_2.label_with_ground_truth(d)
    else:
        # Ask the user for the correct value of every cell in the sampled tuple.
        print("Label the dirty cells in the following sampled tuple.")
        sampled_tuple = pandas.DataFrame(data=[d.dataframe.iloc[d.sampled_tuple, :]], columns=d.dataframe.columns)
        IPython.display.display(sampled_tuple)
        for j in range(d.dataframe.shape[1]):
            cell = (d.sampled_tuple, j)
            value = d.dataframe.iloc[cell]
            correction = input("What is the correction for value '{}'? Type in the same value if it is not erronous.\n".format(value))
            # Label 1 marks a cell whose entered correction differs from its current value.
            user_label = 1 if value != correction else 0
            d.labeled_cells[cell] = [user_label, correction]
        d.labeled_tuples[d.sampled_tuple] = 1
    # Fold the new labels into the models, rebuild features, and predict corrections.
    app_2.update_models(d)
    app_2.generate_features_synchronously(d)
    app_2.predict_corrections(d)

# Replay every labeled tuple: select it as the sampled tuple, then update the
# models, generate features, and predict corrections for it.
for labeled_index in d.labeled_tuples:
    d.sampled_tuple = labeled_index
    app_2.update_models(d)
    app_2.generate_features(d)
    app_2.predict_corrections(d)

### 5. Storing Results
Baran can also store the error correction results.

In [None]:
# Store the error correction results for dataset `d`.
# NOTE(review): the output location and format are decided inside raha and are
# not visible from this notebook.
app_2.store_results(d)

### 6. Evaluating the Error Correction Task
We can finally evaluate our error correction task.

In [None]:
# Evaluate the corrected cells; the last three entries of the result are
# reported below as precision, recall, and F1.
evaluation = d.get_data_cleaning_evaluation(d.corrected_cells)
precision, recall, f1 = evaluation[-3:]
print("Baran's performance on {}:\nPrecision = {:.2f}\nRecall = {:.2f}\nF1 = {:.2f}".format(d.name, precision, recall, f1))