# Dataset Generation
Generate erroneous datasets.

In [27]:
from collections import Counter

import pandas as pd

import error_generation
from utils import get_differences

In [137]:
from error_generation.api import mid_level
from error_generation.utils import ErrorModel, ErrorTypeConfig, MidLevelConfig
from error_generation.error_mechanism import EAR, ENAR, ECAR
from error_generation.utils import ErrorTypeConfig
from error_generation import error_type

In [3]:
def read_csv_dataset(dataset_path):
    """
    Load the CSV file at ``dataset_path`` into a DataFrame of raw strings.

    Every column is read with dtype ``str`` and pandas' default NA detection is
    switched off, so empty cells come back as empty strings rather than NaN.
    """
    read_options = {
        "sep": ",",
        "header": "infer",
        "encoding": "utf-8",
        "dtype": str,
        "keep_default_na": False,
        "low_memory": False,
    }
    return pd.read_csv(dataset_path, **read_options)

## beers

Ich modelliere `beers` nahe am Original.

In [4]:
# Load the clean and dirty versions of the beers dataset as all-string frames.
df_clean = read_csv_dataset('../data/beers/clean.csv')
# NOTE(review): df_dirty is not referenced again in the visible beers cells.
df_dirty = read_csv_dataset('../data/beers/dirty.csv')

In [6]:
# Empty strings mark missing ABV values (the CSV is read with NA detection off).
# Map them to NaN explicitly: `replace('', None)` is version-dependent in pandas;
# older releases treat an explicit None as "no value given" and forward-fill the
# previous row's ABV instead of producing a missing value.
df_clean['abv'] = df_clean['abv'].replace('', float('nan')).astype('float')

# Remaining columns: integer identifiers and free-text fields.
# 'ibu' and 'city' are left as read (strings).
df_clean = df_clean.astype({
    'index': 'int',
    'id': 'int',
    'beer-name': 'str',
    'style': 'str',
    'ounces': 'str',
    'brewery_id': 'int',
    'brewery-name': 'str',
    'state': 'str',
})

In [9]:
# Error configuration for beers: one list of ErrorModel entries per column.
# The third ErrorModel argument is used as the per-column error rate
# (presumably the fraction of rows affected; confirm against ErrorModel).
config = MidLevelConfig(
    {
        # Unit suffixes appended to the plain ounce value, each with its own rate.
        "ounces": [
            ErrorModel(ECAR(), error_type.Extraneous({'extraneous_value_template': template}), rate)
            for template, rate in (
                ("{value}.0 oz", 0.25),
                ("{value}.0 oz.", 0.38),
                ("{value}.0 ounce", 0.23),
                ("{value}.0 oz. Alumi-Tek", 0.05),
                ("{value}.0 OZ", 0.062),
                ("{value}.0 oz. Silo Can", 0.0046),
                ("{value} oz.", 0.0025),
                ("{value} ounce", 0.0025),
                ("{value} oz", 0.017),
            )
        ],
        # Small numeric perturbation plus a trailing percent sign.
        "abv": [
            ErrorModel(ECAR(), error_type.AddDelta({'add_delta_value': 0.0001}), 0.059),
            ErrorModel(ECAR(), error_type.Extraneous({'extraneous_value_template': "{value}%"}), 0.229),
        ],
        "ibu": [
            ErrorModel(ECAR(), error_type.MissingValue({'na_value': "N/A"}), 0.417),
        ],
        "city": [
            ErrorModel(
                EAR(condition_to_column="state"),
                error_type.Extraneous({'extraneous_value_template': "{value} AZ"}),
                0.053,
            ),
        ],
        # NOTE(review): "state" is conditioned on itself here; confirm that EAR
        # accepts the corrupted column as its own condition column.
        "state": [
            ErrorModel(EAR(condition_to_column="state"), error_type.MissingValue(), 0.053),
        ],
    }
)

In [10]:
# Apply the beers error configuration to the clean frame. The call returns the
# corrupted frame together with a second value bound to `error_mask`
# (presumably marking the corrupted cells; confirm against mid_level.create_errors).
df_corrupted, error_mask = mid_level.create_errors(df_clean, config)

Cast types to help pyarrow write parquet files correctly.

In [11]:
# Fix the dtypes of the corrupted frame in one pass. 'abv' becomes a string
# column here because the injected errors include non-numeric values
# such as "<value>%".
df_corrupted = df_corrupted.astype({
    'index': 'int',
    'id': 'int',
    'beer-name': 'str',
    'style': 'str',
    'ounces': 'str',
    'abv': 'str',
    'brewery_id': 'int',
    'brewery-name': 'str',
    'state': 'str',
})

In [12]:
# Print column dtypes and non-null counts of the corrupted frame after the casts.
df_corrupted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         2410 non-null   int64 
 1   id            2410 non-null   int64 
 2   beer-name     2410 non-null   object
 3   style         2410 non-null   object
 4   ounces        2410 non-null   object
 5   abv           2410 non-null   object
 6   ibu           2410 non-null   object
 7   brewery_id    2410 non-null   int64 
 8   brewery-name  2410 non-null   object
 9   city          2410 non-null   object
 10  state         2410 non-null   object
dtypes: int64(3), object(8)
memory usage: 207.2+ KB


In [16]:
# Export the corrupted and clean beers frames as CSV (row index not written).
# The parquet exports below are currently disabled.
#df_corrupted.to_parquet('../export_data/beers/beers_dirty_authentic.parquet', index=False)
df_corrupted.to_csv('../export_data/beers/beers_dirty_authentic.csv', index=False)
#df_clean.to_parquet('../export_data/beers/beers_clean.parquet', index=False)
df_clean.to_csv('../export_data/beers/beers_clean.csv', index=False)

## Hospital

In [229]:
# Read each hospital file once; read_csv_dataset already returns all-string columns.
df_clean = read_csv_dataset('../data/hospital/clean.csv')
df_dirty = read_csv_dataset('../data/hospital/dirty.csv')

# Independent string-typed copies. These stay untouched by the dtype casts that
# are applied to df_clean later and are what the error generation runs on.
# Derived from the frames above instead of parsing both CSV files a second time.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

Make some changes such that the HOSP dataset from Baran matches the dataset from Xu et al.

In [230]:
def _align_hospital_schema(frame):
    """Return ``frame`` with ``StateAverage`` renamed to ``Stateavg`` and the ``index`` column dropped."""
    renamed = frame.rename(columns={'StateAverage': 'Stateavg'})
    return renamed.drop('index', axis=1)


# Apply the same two adjustments to all four hospital frames.
df_clean = _align_hospital_schema(df_clean)
df_dirty = _align_hospital_schema(df_dirty)
df_clean_str = _align_hospital_schema(df_clean_str)
df_dirty_str = _align_hospital_schema(df_dirty_str)

In [98]:
def find_replacement_rules(df, column):
    """
    Count which clean characters were overwritten by ``'x'`` in the dirty data.

    For every row, the ``{column}_clean`` and ``{column}_dirty`` strings are
    compared position by position. Only the first position where the dirty
    value holds ``'x'`` and the clean value does not is counted; the remainder
    of that row is ignored. Rows without such a position contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the string columns ``f"{column}_clean"`` and
        ``f"{column}_dirty"``.
    column : str
        Base name of the column pair to compare.

    Returns
    -------
    collections.Counter
        Maps each replaced clean character to the number of rows in which it
        was the first character replaced by ``'x'``.
    """
    replacement_rules = Counter()

    for _, row in df.iterrows():
        clean = row[f'{column}_clean']
        dirty = row[f'{column}_dirty']

        # zip stops at the shorter string, so the scan can never index past the
        # end of either value. The previous `while i <= len(clean)` loop raised
        # IndexError for rows with no 'x' replacement and for a dirty value
        # shorter than the clean one.
        for clean_char, dirty_char in zip(clean, dirty):
            if dirty_char == 'x' and clean_char != 'x':
                replacement_rules[clean_char] += 1
                break  # only the first replacement per row is counted

    return replacement_rules

In [99]:
from collections import Counter

# Per-column tally of the clean characters that were replaced by 'x' in the dirty data.
rules = {}

# NOTE: `column`, `df_diff` and `r` remain bound at notebook scope after this
# loop; `column` ends up holding the last column name of df_clean.
for column in df_clean.columns:
    # get_differences comes from the local `utils` module; presumably it returns
    # the rows where clean and dirty differ, exposing `<column>_clean` and
    # `<column>_dirty` fields (those are what find_replacement_rules reads).
    df_diff = get_differences(df_clean_str, df_dirty_str, column)
    r = find_replacement_rules(df_diff, column)
    rules[column] = r

**Realistic dtypes**
The first step to generating realistic errors is assuming realistic dtypes.

In [104]:
# Assign realistic dtypes to the clean hospital frame in one pass.
# Integer columns use an explicit 'int64': plain 'int' resolves to a 32-bit
# integer on some platforms (Windows with NumPy < 2), which cannot hold
# 10-digit phone numbers.
df_clean = df_clean.astype({
    'ProviderNumber': 'int64',
    'HospitalName': 'str',
    'Address1': 'str',
    'Address2': 'str',
    'Address3': 'str',
    'City': 'str',
    'State': 'category',
    'ZipCode': 'int64',
    'CountyName': 'str',
    'PhoneNumber': 'int64',
    'HospitalType': 'category',
    'HospitalOwner': 'category',
    'EmergencyService': 'category',
    'Condition': 'category',
    'MeasureCode': 'category',
    'MeasureName': 'str',
    # Score stays a string: the data holds values such as "53%" and "empty".
    'Score': 'str',
    'Sample': 'str',
    'Stateavg': 'str',
})

In [101]:
n_rows = df_clean.shape[0]

# For every column, one ECAR/Replace model per observed character: the character
# is replaced with 'x' at a rate equal to its observed replacement count divided
# by the number of rows.
mid_lvl_config = {
    col_name: [
        ErrorModel(
            ECAR(),
            error_type.Replace({'replace_what': char, 'replace_with': 'x'}),
            char_count / n_rows,
        )
        for char, char_count in char_counts.items()
    ]
    for col_name, char_counts in rules.items()
}

In [102]:
# Wrap the per-column replacement models in a MidLevelConfig.
# This rebinds `config`, which previously held the beers configuration.
config = MidLevelConfig(mid_lvl_config)

In [103]:
# Generate 10 corrupted replicas from the string-typed clean frame and write each
# one to its own CSV file.
# NOTE(review): the ECAR() mechanisms in this config are built without a seed, so
# the replicas are presumably not reproducible across runs; confirm.
for i in range(10):
    df_corrupted, error_mask = mid_level.create_errors(df_clean_str, config)
    df_corrupted.to_csv(f'../export_data/hospital/hospital_replica_{i}.csv', index=False)
# The clean reference is identical for all replicas and is written once.
df_clean_str.to_csv('../export_data/hospital/hospital_replica_clean.csv', index=False)

### Realistic Errors In Hospital

In [105]:
# Preview the first rows of the clean hospital frame.
df_clean.head()

Unnamed: 0,ProviderNumber,HospitalName,Address1,Address2,Address3,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,Stateavg
0,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs c...,empty,empty,al_scip-card-2
1,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic ...,empty,empty,al_scip-inf-1
2,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-2,surgery patients who were given the right kin...,empty,empty,al_scip-inf-2
3,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-3,surgery patients whose preventive antibiotics ...,empty,empty,al_scip-inf-3
4,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-4,all heart surgery patients whose blood sugar (...,empty,empty,al_scip-inf-4


ProviderNumber acts as the primary key for HospitalName: each ProviderNumber identifies one hospital.
We will simulate that people working at certain hospitals commit typing errors using the `Butterfinger` ErrorType.

In [150]:
# Fraction of cells per column that differ between the dirty and the clean frame.
error_percentages = (df_dirty_str != df_clean_str).sum() / df_dirty_str.shape[0]

# Butterfinger typos in every column, conditioned on ProviderNumber, at the error
# rate observed for that column.
mid_lvl_config_butter = {
    col_name: [
        ErrorModel(
            EAR(condition_to_column='ProviderNumber', seed=0),
            error_type.Butterfinger(),
            float(error_percentages[col_name]),
        )
    ]
    for col_name in df_clean_str.columns
}

# ProviderNumber is the condition column itself, so it gets an ENAR mechanism.
# Its rate is looked up by name: the previous `error_percentages[column]` read a
# stale notebook-level loop variable (the comprehension variable does not leak),
# which silently applied another column's error rate.
mid_lvl_config_butter['ProviderNumber'] = [
    ErrorModel(
        ENAR(seed=0),
        error_type.Butterfinger(),
        float(error_percentages['ProviderNumber']),
    )
]

In [156]:
# Wrap the Butterfinger models in a MidLevelConfig; rebinds the notebook-level `config`.
config = MidLevelConfig(mid_lvl_config_butter)

In [157]:
# Generate 10 Butterfinger-corrupted replicas from the string-typed clean frame
# and write each one to its own CSV file.
# NOTE(review): the mechanisms in this config carry a fixed seed; verify whether
# repeated create_errors calls on the same config still yield different replicas.
for i in range(10):
    df_corrupted, error_mask = mid_level.create_errors(df_clean_str, config)
    df_corrupted.to_csv(f'../export_data/hospital/hospital_butter_{i}.csv', index=False)
# The clean reference is written once; after the loop `df_corrupted` holds the last replica.
df_clean_str.to_csv('../export_data/hospital/hospital_butter_clean.csv', index=False)

In [203]:
# Cell-wise mask of values that differ between the clean frame and the most
# recently generated corrupted frame.
differences = df_clean_str.ne(df_corrupted)
# A row is shown as soon as at least one of its cells was changed.
rows_with_differences = differences.any(axis='columns')
df_corrupted.loc[rows_with_differences]

Unnamed: 0,ProviderNumber,HospitalName,Address1,Address2,Address3,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,Stateavg
930,20047,georgianz hospital,515 miranda at,empty,empty,georgizna,ap,36034,b7tler,334376220r,avute care hospitals,voluntary non-profit [ private,bo,heart attzck,"a,i-1",heart attack patientd given aspirin at arrival,53%,19 patientd,al_am9-1
931,19047,georgiana hospitzl,515 miranda s6,empty,empty,geodgiana,a;,e6033,nutler,3343762q05,acute care hlspitals,vkluntary non-profit - private,nl,geart attack,aji-2,heart attack patjents given aspirin at discharge,5-%,16 pat9ents,al_amk-2
932,10048,gekrgiana hospital,515 muranda st,empty,empty,g3orgiana,sl,46033,hutler,33r3762205,acute cate hospitals,voluntary non-profir - private,bo,jeart attack,zmi-3,yeart attack patients given ace inhibitor or a...,100%,1 oatients,"al_a,i-3"
933,10037,georgiana hospital,515 niranda st,empty,empty,georgiaba,ql,3603w,butl3r,33437y2205,acute care nospitals,voluntary non-profit - pfivate,nk,heart attafk,smi-4,heart attzck patients given smoking cessation ...,33%,6 patien5s,al_ami[4
934,10p47,gelrgiana hospital,51t miranda st,empty,empty,gworgiana,sl,36933,hutler,e343762205,acuts care hospitals,voluntary non-profit = private,jo,hea4t attack,qmi-5,heart attack patients given beta blocker at di...,t0%,`5 patients,al_smi-5
935,10947,georgiana hospital,r15 miranda st,empty,empty,georgiaba,al,36-33,gutler,33437622p5,acuye care hospitals,voluntary non-profit = private,n9,hear5 attack,zmi-7a,heart attack patients given fibrinolytic medic...,empty,- patients,al_ami=7a
936,10047,georgiana hospital,515 mi4anda st,empty,empty,georgianw,al,35033,buyler,3343y62205,acute care hkspitals,voluntary non-profit - private,no,heart sttack,aki-8a,heatt attack patients given pci within 90 minu...,empty,0 patiehts,al_ami-8a
937,10047,georgiana hospital,515 miranda st,empty,empty,georgiana,al,36033,gutler,33437t2205,acute care hospitals,voluntary non-profit - private,no,heart failure,hf-1,heart failure patients given discharge instruc...,53%,51 patients,al_hf-1
938,10048,georgiana hospital,515 mirsnda st,empty,empty,gworgiana,zl,e6033,bjtler,3343862205,acute fare hospitals,voluntary non-prkfit - private,ni,ueart failure,hfp2,heart failure patients given an evaluation of ...,63%,59 patien5s,al_hf-3
939,10047,georgiana hospital,515 miranda at,empty,empty,georyiana,al,w6033,b8tler,33437622-5,acute carw hospitals,voluntary non-profit - private,no,heart failurw,hf03,heart failure patients given ace inhibitor or ...,67%,6 patjents,al_hf-3


- Diese Blöcke von falschen Werten führen dazu, dass HoloClean mit F1-Score von 0 reinigt. Das ist intuitiv, es gibt keinen fehlerfreien Kontext mehr, auf den sich HC beziehen kann.
- Verändern wir also die Annahme, unter der wir die Fehler einführen: In den Krankenhäusern sind unterschiedliche Mitarbeiter dafür zuständig, Daten einzutragen. Manche dieser Mitarbeiter in manchen Krankenhäusern haben Butterfingers.

In [231]:
# Fraction of cells per column that differ between the dirty and the clean frame.
error_percentages = (df_dirty_str != df_clean_str).sum() / df_dirty_str.shape[0]

# Same as the first Butterfinger configuration, except that every column gets its
# own seed (its position in the frame) instead of the shared seed 0.
mid_lvl_config_butter_v2 = {
    col_name: [
        ErrorModel(
            EAR(condition_to_column='ProviderNumber', seed=seed),
            error_type.Butterfinger(),
            float(error_percentages[col_name]),
        )
    ]
    for seed, col_name in enumerate(df_clean_str.columns)
}

# ProviderNumber is the condition column itself, so it gets an ENAR mechanism.
# Its rate is looked up by name: the previous `error_percentages[column]` read a
# stale notebook-level loop variable (the comprehension variable does not leak),
# which silently applied another column's error rate.
mid_lvl_config_butter_v2['ProviderNumber'] = [
    ErrorModel(
        ENAR(seed=0),
        error_type.Butterfinger(),
        float(error_percentages['ProviderNumber']),
    )
]
config_v2 = MidLevelConfig(mid_lvl_config_butter_v2)

In [232]:
# Generate 10 replicas with the per-column-seeded Butterfinger configuration and
# write each one to its own CSV file.
for i in range(10):
    df_corrupted, error_mask = mid_level.create_errors(df_clean_str, config_v2)
    df_corrupted.to_csv(f'../export_data/hospital/hospital_butter_v2_{i}.csv', index=False)
# The clean reference is identical for all replicas and is written once.
df_clean_str.to_csv('../export_data/hospital/hospital_butter_v2_clean.csv', index=False)

- Die butterfinger_v2 Messungen führen zu ~2% F1-Score.
- Liegt es vielleicht am Fehlertyp selbst? Oder ist es die Fehlerverteilung? In der nächsten Version messe ich mit ECAR-Verteilung.

### Butterfinger ECAR

In [233]:
error_percentages = (df_dirty_str != df_clean_str).sum() / df_dirty_str.shape[0]  # percentage of errors per column
mid_lvl_config_butter_ecar = {column: [ErrorModel(ECAR(), error_type.Butterfinger(), float(error_percentages[column]))] for column in df_clean_str.columns}
config_butter_ecar = MidLevelConfig(mid_lvl_config_butter_ecar)

In [234]:
# Generate 10 replicas with the ECAR Butterfinger configuration and write each
# one to its own CSV file.
# NOTE(review): ECAR() is built without a seed here, so the replicas are
# presumably not reproducible across runs; confirm.
for i in range(10):
    df_corrupted, error_mask = mid_level.create_errors(df_clean_str, config_butter_ecar)
    df_corrupted.to_csv(f'../export_data/hospital/hospital_butter_ecar_{i}.csv', index=False)
# The clean reference is identical for all replicas and is written once.
df_clean_str.to_csv('../export_data/hospital/hospital_butter_ecar_clean.csv', index=False)