# Dataset Generation
Generate erroneous datasets.

In [1]:
import pandas as pd
import error_generation

In [2]:
from error_generation.api import mid_level
from error_generation.utils import ErrorModel, ErrorTypeConfig, MidLevelConfig
from error_generation.error_mechanism import EAR, ENAR, ECAR
from error_generation.utils import ErrorTypeConfig
from error_generation import error_type

In [3]:
def read_csv_dataset(dataset_path):
    """
    Load the CSV file at ``dataset_path`` into a pandas DataFrame.

    Every column is read as ``str`` and pandas' default NA markers are
    switched off, so empty cells come back as empty strings and tokens such
    as "NA" are kept verbatim instead of being turned into NaN.
    """
    read_options = {
        "sep": ",",
        "header": "infer",
        "encoding": "utf-8",
        "dtype": str,
        "keep_default_na": False,
        "low_memory": False,
    }
    return pd.read_csv(dataset_path, **read_options)

## beers

I model the errors for `beers` closely on the original dataset.

In [4]:
# Load the clean beers table and its dirty counterpart from the data folder.
df_clean = read_csv_dataset(dataset_path='../data/beers/clean.csv')
df_dirty = read_csv_dataset(dataset_path='../data/beers/dirty.csv')

In [5]:
# Convert `abv` from text to float, with empty strings becoming NaN.
# `Series.mask` is used instead of `replace('', None)`: pandas < 1.4 silently
# ignored an explicit `None` there and forward-filled the previous row's value,
# which would invent abv values instead of marking them as missing.
df_clean['abv'] = df_clean['abv'].mask(df_clean['abv'] == '').astype('float')

In [6]:
# Error configuration for the beers table: column name -> list of error models.
# NOTE(review): the trailing number of each ErrorModel is presumably the share of
# rows to corrupt with that model — confirm against the error_generation docs.
config = MidLevelConfig(
    {
        # Nine unit-suffix variants for `ounces`; their numbers add up to 0.9986.
        "ounces": [
            ErrorModel(ECAR(), error_type.Extraneous({'extraneous_value_template': template}), rate)
            for template, rate in (
                ("{value}.0 oz", 0.25),
                ("{value}.0 oz.", 0.38),
                ("{value}.0 ounce", 0.23),
                ("{value}.0 oz. Alumi-Tek", 0.05),
                ("{value}.0 OZ", 0.062),
                ("{value}.0 oz. Silo Can", 0.0046),
                ("{value} oz.", 0.0025),
                ("{value} ounce", 0.0025),
                ("{value} oz", 0.017),
            )
        ],
        "abv": [
            ErrorModel(
                ECAR(),
                error_type.AddDelta({'add_delta_value': 0.0001}),
                0.059,
            ),
            ErrorModel(
                ECAR(),
                error_type.Extraneous({'extraneous_value_template': "{value}%"}),
                0.229,
            ),
        ],
        "ibu": [
            ErrorModel(
                ECAR(),
                error_type.MissingValue({'na_value': "N/A"}),
                0.417,
            ),
        ],
        "city": [
            ErrorModel(
                EAR(condition_to_column="state"),
                error_type.Extraneous({'extraneous_value_template': "{value} AZ"}),
                0.053,
            ),
        ],
        # NOTE(review): `state` is conditioned on itself here — confirm that EAR
        # (rather than ENAR) is the intended mechanism.
        "state": [
            ErrorModel(
                EAR(condition_to_column="state"),
                error_type.MissingValue(),
                0.053,
            ),
        ],
    }
)

In [7]:
# Apply the configured error models to the clean frame. The call returns two
# values, unpacked here as the corrupted frame and an error mask.
# NOTE(review): no random seed is set in the visible cells, so the result
# presumably differs between runs — confirm and seed if reproducibility matters.
df_corrupted, error_mask = mid_level.create_errors(df_clean, config)

In [9]:
# Show the corrupted frame as this cell's output.
df_corrupted

Unnamed: 0,index,id,beer-name,style,ounces,abv,ibu,brewery_id,brewery-name,city,state
0,1,1436,Pub Beer,American Pale Lager,12.0 oz.,0.05,,408,10 Barrel Brewing Company,Bend,OR
1,2,2265,Devil's Cup,American Pale Ale (APA),12.0 oz.,0.066,,177,18th Street Brewery,Gary,IN
2,3,2264,Rise of the Phoenix,American IPA,12.0 oz.,0.071,,177,18th Street Brewery,Gary,IN
3,4,2263,Sinister,American Double / Imperial IPA,12.0 oz,0.09,,177,18th Street Brewery,Gary,IN
4,5,2262,Sex and Candy,American IPA,12.0 oz.,0.075,,177,18th Street Brewery,Gary,IN
...,...,...,...,...,...,...,...,...,...,...,...
2405,2406,928,Belgorado,Belgian IPA,12.0 ounce,0.067,45,424,Wynkoop Brewing Company,Denver,CO
2406,2407,807,Rail Yard Ale,American Amber / Red Ale,12.0 oz.,0.052%,,424,Wynkoop Brewing Company,Denver,CO
2407,2408,620,B3K Black Lager,Schwarzbier,12.0 ounce,0.0551,,424,Wynkoop Brewing Company,Denver,CO
2408,2409,145,Silverback Pale Ale,American Pale Ale (APA),12.0 ounce,0.055,40,424,Wynkoop Brewing Company,Denver,CO
