# SkData Steps

The OpenRefine JSON file that records operation steps was used as a reference:

```json

[
  {
    "op": "core/text-transform",
    "description": "Text transform on cells in column Sex using expression grel:if(value == 'male', 1, 0)",
    "engineConfig": {
      "mode": "row-based",
      "facets": []
    },
    "columnName": "Sex",
    "expression": "grel:if(value == 'male', 1, 0)",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10
  }
]

```

**Proposed SkData Steps format:**

```jsonc
[
  {
    "data-set": "data_set_name", // required
    "operation": "categorize|text-transform|fill-na|drop-na|drop-unique", // required
    "column": "column_name",  // optional
    "new-column": "new_column_name", // optional
    "expression": "dict|inline-if" // required
  }
]
```

In [1]:
from IPython.display import display
from functools import reduce
# local
from skdata.cleaning import *
from skdata.data import summary

import pandas as pd

You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))


In [2]:
def replace(value: str, replace_dict: dict):
    """
    Apply every substitution listed in ``replace_dict`` to ``value``.

    The keys are processed one after another in the dictionary's iteration
    order; each occurrence of a key is replaced by its mapped string. Because
    the substitutions are chained, a later key also matches text produced by
    an earlier replacement.

    Anything that is not a ``str`` (for example a missing value) is returned
    unchanged.
    """
    if isinstance(value, str):
        result = value
        for old in replace_dict:
            result = result.replace(old, replace_dict[old])
        return result

    return value

In [3]:
def expr(dataset: pd.DataFrame, command: dict):
    """
    Apply a single SkData step to ``dataset`` and return the data set.

    ``command`` is a mapping with these keys:

    - ``'operation'`` (required): one of ``'text-transform'``,
      ``'categorize'``, ``'fill-na'`` or ``'drop-na'``. Any other value is
      ignored and the data set is returned untouched.
    - ``'expression'`` (required): interpreted per operation, see below.
    - ``'column'`` (optional): the column the operation reads.
    - ``'new-column'`` (optional): the column that receives the result;
      defaults to ``'column'``.

    Operations:

    - ``text-transform``: the expression is the body of
      ``lambda value: <expression>`` and is applied to every cell of
      ``'column'``; the result is stored in the target column.
    - ``categorize``: the expression is evaluated and passed as
      ``categories`` to ``categorize`` together with the data set and the
      column name; ``new_col_name`` is passed only when ``'new-column'`` is
      given.
    - ``fill-na``: missing values of ``'column'`` are filled with the
      column's mean/max/min/median when the expression is one of those
      names, otherwise with the expression string itself.
    - ``drop-na``: ``dropna_<expression>`` is evaluated with the data set
      available under the name ``data`` (e.g. ``'rows(data)'``).

    Returns the ``dataset`` object that was passed in.

    NOTE(security): expressions are executed with ``eval``. Only run
    commands that come from a trusted source.
    """
    op = command['operation']
    column = command.get('column')
    target = command.get('new-column', column)
    expression = command['expression']

    if op == 'text-transform':
        # eval on command text -- see the security note in the docstring.
        transform = eval('lambda value: %s' % expression)
        dataset[target] = dataset[column].apply(transform)
    elif op == 'categorize':
        params = dict(
            data=dataset, col_name=column, categories=eval(expression)
        )

        if 'new-column' in command:
            params['new_col_name'] = target

        categorize(**params)
    elif op == 'fill-na':
        fill = expression

        if expression in ('mean', 'max', 'min', 'median'):
            # Call the aggregate on the Series directly so that column names
            # which are not valid identifiers (spaces, dashes) also work.
            fill = getattr(dataset[column], expression)()

        # Assign the filled column back instead of calling
        # ``fillna(..., inplace=True)`` on the selected Series: the chained
        # in-place form is not guaranteed to write through to ``dataset``.
        dataset[column] = dataset[column].fillna(fill)
    elif op == 'drop-na':
        # ``data`` is the name drop-na expressions use for the data set; it
        # is picked up by eval() from this function's local namespace.
        data = dataset
        # The helper's return value is deliberately not used -- presumably
        # the dropna_* helpers modify ``data`` in place; confirm against
        # skdata.cleaning.
        eval('dropna_%s' % expression)

    return dataset

In [4]:
# Steps to run, in order. Every step names the data set it targets through
# 'data-set' and is interpreted by expr().
commands = [
    # Capitalise the labels in the Sex column.
    {
        'data-set': 'data',
        'operation': 'text-transform',
        'column': 'Sex',
        'expression': 'value.title()',
    },
    # Map the numeric Pclass codes to names in a new column.
    {
        'data-set': 'data',
        'operation': 'categorize',
        'column': 'Pclass',
        'new-column': 'Pclass-name',
        'expression': '{1: "High Class", 2: "Middle Class", 3: "Low Class"}',
    },
    # Expand the one-letter Embarked codes.
    {
        'data-set': 'data',
        'operation': 'text-transform',
        'column': 'Embarked',
        'expression': (
            "replace(value, "
            "{'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'})"
        ),
    },
    # Fill missing Age values with the column mean.
    {
        'data-set': 'data',
        'operation': 'fill-na',
        'column': 'Age',
        'expression': 'mean',
    },
    # Evaluated by expr() as dropna_columns(data, max_na_values=0.1).
    {
        'data-set': 'data',
        'operation': 'drop-na',
        'expression': 'columns(data, max_na_values=0.1)',
    },
    # Evaluated by expr() as dropna_rows(data).
    {
        'data-set': 'data',
        'operation': 'drop-na',
        'expression': 'rows(data)',
    },
]

In [5]:
data = pd.read_csv('../data/train.csv')

# Summary of the raw data, before any step is applied.
display(summary(data))

# Each step names its target data set; look that name up in the module
# namespace and rebind it to whatever expr() returns. globals() is used
# rather than locals() because writing through locals() only takes effect
# at module scope, where it is the same mapping as globals().
for c in commands:
    globals()[c['data-set']] = expr(globals()[c['data-set']], c)

# First rows and summary after all steps have run.
display(data.head())
display(summary(data))

Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
PassengerId,int64,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",891,891,0
Survived,int64,"[0, 1]",2,891,0
Pclass,int64,"[1, 2, 3]",3,891,0
Name,object,"['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...",891,891,0
Sex,object,"['female', 'male']",2,891,0
Age,float64,"[0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ...",88,714,177
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,891,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,891,0
Ticket,object,"['110152', '110413', '110465', '110564', '1108...",681,891,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",248,891,0


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Pclass-name
0,1,0,3,"Braund, Mr. Owen Harris",Male,22.0,1,0,A/5 21171,7.25,Southampton,Low Class
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Female,38.0,1,0,PC 17599,71.2833,Cherbourg,High Class
2,3,1,3,"Heikkinen, Miss. Laina",Female,26.0,0,0,STON/O2. 3101282,7.925,Southampton,Low Class
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Female,35.0,1,0,113803,53.1,Southampton,High Class
4,5,0,3,"Allen, Mr. William Henry",Male,35.0,0,0,373450,8.05,Southampton,Low Class


Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
PassengerId,int64,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",889,889,0
Survived,int64,"[0, 1]",2,889,0
Pclass,int64,"[1, 2, 3]",3,889,0
Name,object,"['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...",889,889,0
Sex,object,"['Female', 'Male']",2,889,0
Age,float64,"[0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ...",89,889,0
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,889,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,889,0
Ticket,object,"['110152', '110413', '110465', '110564', '1108...",680,889,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",247,889,0
