# Dataset Generation
Generate erroneous datasets.

In [1]:
import pandas as pd
import error_generation

In [2]:
from error_generation.api import mid_level
from error_generation.utils import ErrorModel, ErrorTypeConfig, MidLevelConfig
from error_generation.error_mechanism import EAR, ENAR, ECAR
from error_generation.utils import ErrorTypeConfig
from error_generation import error_type

In [3]:
def read_csv_dataset(dataset_path):
    """
    Load the comma-separated file at ``dataset_path`` into a DataFrame.

    Every column is read as ``str`` and pandas' default NA markers are
    switched off, so cells such as ``""`` or ``"NA"`` come back as the
    literal strings found in the file.
    """
    read_options = {
        "sep": ",",
        "header": "infer",
        "encoding": "utf-8",
        "dtype": str,
        "keep_default_na": False,
        "low_memory": False,
    }
    return pd.read_csv(dataset_path, **read_options)

## beers

I model `beers` close to the original.

In [4]:
# Read the clean and the dirty version of the beers data in one step.
df_clean, df_dirty = (
    read_csv_dataset('../data/beers/clean.csv'),
    read_csv_dataset('../data/beers/dirty.csv'),
)

In [5]:
# Summarise the columns, non-null counts and dtypes of the dirty frame.
df_dirty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         2410 non-null   object
 1   id            2410 non-null   object
 2   beer-name     2410 non-null   object
 3   style         2410 non-null   object
 4   ounces        2410 non-null   object
 5   abv           2410 non-null   object
 6   ibu           2410 non-null   object
 7   brewery_id    2410 non-null   object
 8   brewery-name  2410 non-null   object
 9   city          2410 non-null   object
 10  state         2410 non-null   object
dtypes: object(11)
memory usage: 207.2+ KB


In [6]:
# Give the clean frame real dtypes before errors are injected; every column
# was read as str, so numeric columns have to be converted explicitly.
#
# Empty `abv` cells are turned into NaN with an explicit NaN replacement.
# The former `.replace('', None)` depends on the pandas version: releases
# before 1.4 silently ignored the explicit None and forward-filled the empty
# cells from the previous row instead of marking them as missing.
df_clean = (
    df_clean
    .assign(abv=df_clean['abv'].replace('', float('nan')))
    .astype({
        'index': 'int',
        'id': 'int',
        'beer-name': 'str',
        'style': 'str',
        'ounces': 'str',
        'abv': 'float',
        'brewery_id': 'int',
        'brewery-name': 'str',
        'state': 'str',
    })
)

In [7]:
# Show the clean `abv` column after the cast to float.
df_clean['abv']

0       0.050
1       0.066
2       0.071
3       0.090
4       0.075
        ...  
2405    0.067
2406    0.052
2407    0.055
2408    0.055
2409    0.052
Name: abv, Length: 2410, dtype: float64

In [8]:
# Show the dirty `abv` column for comparison; it is still made of strings and
# some values carry a trailing '%'.
df_dirty['abv']

0         0.05
1        0.066
2        0.071
3        0.09%
4        0.075
         ...  
2405     0.067
2406     0.052
2407    0.055%
2408    0.055%
2409     0.052
Name: abv, Length: 2410, dtype: object

In [9]:
def _extraneous(template):
    """Build an Extraneous error type that uses `template` as its value template."""
    return error_type.Extraneous({'extraneous_value_template': template})


# (template, rate) pairs for the `ounces` column, in the order they are applied.
# The numeric value is passed as the third ErrorModel argument (presumably the
# error rate — confirm against the error_generation docs).
ounces_variants = [
    ("{value}.0 oz", 0.25),
    ("{value}.0 oz.", 0.38),
    ("{value}.0 ounce", 0.23),
    ("{value}.0 oz. Alumi-Tek", 0.05),
    ("{value}.0 OZ", 0.062),
    ("{value}.0 oz. Silo Can", 0.0046),
    ("{value} oz.", 0.0025),
    ("{value} ounce", 0.0025),
    ("{value} oz", 0.017),
]

# Per-column error models for the beers dataset.
config = MidLevelConfig(
    {
        "ounces": [
            ErrorModel(ECAR(), _extraneous(template), rate)
            for template, rate in ounces_variants
        ],
        "abv": [
            ErrorModel(ECAR(), error_type.AddDelta({'add_delta_value': 0.0001}), 0.059),
            ErrorModel(ECAR(), _extraneous("{value}%"), 0.229),
        ],
        "ibu": [
            ErrorModel(ECAR(), error_type.MissingValue({'na_value': "N/A"}), 0.417),
        ],
        "city": [
            ErrorModel(EAR(condition_to_column="state"), _extraneous("{value} AZ"), 0.053),
        ],
        # NOTE(review): this model conditions `state` on itself — confirm that
        # EAR is intended here rather than ENAR.
        "state": [
            ErrorModel(EAR(condition_to_column="state"), error_type.MissingValue(), 0.053),
        ],
    }
)

In [10]:
# Apply the error configuration to the clean frame. The call returns two
# objects, bound here to `df_corrupted` and `error_mask`.
# NOTE(review): no random seed is set in the visible cells, so the generated
# errors presumably differ between runs — confirm whether a seed can be passed.
df_corrupted, error_mask = mid_level.create_errors(df_clean, config)

Cast types to help pyarrow write parquet files correctly.

In [11]:
# Target dtype per column of the corrupted frame; columns that are not listed
# keep the dtype they already have.
corrupted_dtypes = {
    'index': 'int',
    'id': 'int',
    'beer-name': 'str',
    'style': 'str',
    'ounces': 'str',
    'abv': 'str',
    'brewery_id': 'int',
    'brewery-name': 'str',
    'state': 'str',
}
df_corrupted = df_corrupted.astype(corrupted_dtypes)

In [12]:
# Check the dtypes of the corrupted frame before it is exported.
df_corrupted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         2410 non-null   int64 
 1   id            2410 non-null   int64 
 2   beer-name     2410 non-null   object
 3   style         2410 non-null   object
 4   ounces        2410 non-null   object
 5   abv           2410 non-null   object
 6   ibu           2410 non-null   object
 7   brewery_id    2410 non-null   int64 
 8   brewery-name  2410 non-null   object
 9   city          2410 non-null   object
 10  state         2410 non-null   object
dtypes: int64(3), object(8)
memory usage: 207.2+ KB


In [16]:
# Write the corrupted and the clean frame as CSV, without the index column.
# The parquet export (same targets with a .parquet suffix) is switched off.
csv_exports = (
    (df_corrupted, '../export_data/beers/beers_dirty_authentic.csv'),
    (df_clean, '../export_data/beers/beers_clean.csv'),
)
for frame, target_path in csv_exports:
    frame.to_csv(target_path, index=False)