 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="#SkData---Data-Specification" data-toc-modified-id="SkData---Data-Specification-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>SkData - Data Specification</a></span><ul class="toc-item"><li><span><a href="#Importing-data" data-toc-modified-id="Importing-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Importing data</a></span></li><li><span><a href="#Data-preparing-and-cleaning" data-toc-modified-id="Data-preparing-and-cleaning-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Data preparing and cleaning</a></span></li></ul></li></ul></div>

# SkData - Data Specification

SkData provides a data class to structure and organize data preprocessing.

The data is stored in **hdf5** format. The original data is kept unchanged, and
every preprocessing step is recorded and applied on demand.

To import data from a *csv* source:

```python
from skdata import SkData

sd = SkData('filename.h5')
sd.import_from(source='filename.csv')
```

In [1]:
from skdata import SkData

You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))


## Importing data

In [2]:
# SkData store backed by the HDF5 file at this path (the intro states that
# SkData keeps its data in hdf5 format).
sd = SkData('/tmp/titanic.h5')

# Import the Titanic training CSV and register it under the id 'train';
# later cells address it as sd['train'].
# 'PassengerId' is used as the index (see the row labels in the head() output).
# NOTE(review): target_col presumably marks 'Survived' as the prediction
# target — confirm against the SkData API.
sd.import_from(
    source='../data/train.csv', 
    index_col='PassengerId',
    target_col='Survived',
    dset_id='train'
)

In [3]:
# Per-column overview: dtype, distinct values and their count, number of
# observations and number of NaN entries (columns of the output table).
# NOTE(review): compute=True presumably applies the pending steps before
# summarising — confirm.
sd['train'].summary(compute=True)

Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
Survived,int64,"[0, 1]",2,891,0
Pclass,int64,"[1, 2, 3]",3,891,0
Name,object,"['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...",891,891,0
Sex,object,"['female', 'male']",2,891,0
Age,float64,"[0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ...",88,714,177
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,891,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,891,0
Ticket,object,"['110152', '110413', '110465', '110564', '1108...",681,891,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",248,891,0
Cabin,object,"['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2...",147,204,687


In [4]:
# Peek at the first rows of the data set's current result frame.
sd['train'].result.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data preparing and cleaning

In [5]:
# Rewrite the raw codes of 'Sex' and 'Embarked' as readable labels.
# NOTE(review): the recorded output shows 'feMale' instead of 'Female', so the
# 'male' rule appears to match inside 'female' (substring replacement?).
# Confirm SkData's replace semantics and fix the mapping accordingly — simply
# reordering the keys would not help if matching is by substring, because
# 'Female' itself contains 'male'.
sd['train']['Sex'].replace({'male': 'Male', 'female': 'Female'})
sd['train']['Embarked'].replace({
    'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'
})

# Apply the steps registered above, then show the first transformed rows.
sd['train'].compute()
sd['train'].result.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",Male,22.0,1,0,A/5 21171,7.25,,Southampton
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",feMale,38.0,1,0,PC 17599,71.2833,C85,Cherbourg
3,1,3,"Heikkinen, Miss. Laina",feMale,26.0,0,0,STON/O2. 3101282,7.925,,Southampton
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",feMale,35.0,1,0,113803,53.1,C123,Southampton
5,0,3,"Allen, Mr. William Henry",Male,35.0,0,0,373450,8.05,,Southampton


In [6]:
# Integer code -> label mappings for the two coded columns.
survived_dict = {0: 'Died', 1: 'Survived'}
pclass_dict = {1: 'Upper Class', 2: 'Middle Class', 3: 'Lower Class'}

# Convert the columns to categorical dtype (the summary output reports
# 'category' for all four). With a mapping, the codes are shown as its labels;
# without one, the existing values become the categories.
sd['train']['Pclass'].categorize(categories=pclass_dict)
sd['train']['Survived'].categorize(categories=survived_dict)
sd['train']['Sex'].categorize()
sd['train']['Embarked'].categorize()

# Summarise again to check the new dtypes and category values.
sd['train'].summary(compute=True)

Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
Survived,category,"['Died', 'Survived']",2,891,0
Pclass,category,"['Lower Class', 'Middle Class', 'Upper Class']",3,891,0
Name,object,"['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...",891,891,0
Sex,category,"['Male', 'feMale']",2,891,0
Age,float64,"[0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ...",88,714,177
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,891,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,891,0
Ticket,object,"['110152', '110413', '110465', '110564', '1108...",681,891,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",248,891,0
Cabin,object,"['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2...",147,204,687


In [7]:
# First rows after categorisation: codes are now displayed as labels.
sd['train'].result.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,Died,Lower Class,"Braund, Mr. Owen Harris",Male,22.0,1,0,A/5 21171,7.25,,Southampton
2,Survived,Upper Class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",feMale,38.0,1,0,PC 17599,71.2833,C85,Cherbourg
3,Survived,Lower Class,"Heikkinen, Miss. Laina",feMale,26.0,0,0,STON/O2. 3101282,7.925,,Southampton
4,Survived,Upper Class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",feMale,35.0,1,0,113803,53.1,C123,Southampton
5,Died,Lower Class,"Allen, Mr. William Henry",Male,35.0,0,0,373450,8.05,,Southampton


In [8]:
# Drop columns with too many missing values. In the recorded output 'Age'
# (177 NaN of 891) and 'Cabin' (687 NaN) disappear while 'Embarked' (2 NaN)
# stays.
# NOTE(review): max_na_values=0.1 is presumably the maximum tolerated NaN
# fraction per column — confirm.
sd['train'].drop_columns(max_na_values=0.1)
sd['train'].summary(compute=True)

Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
Survived,category,"['Died', 'Survived']",2,891,0
Pclass,category,"['Lower Class', 'Middle Class', 'Upper Class']",3,891,0
Name,object,"['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...",891,891,0
Sex,category,"['Male', 'feMale']",2,891,0
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,891,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,891,0
Ticket,object,"['110152', '110413', '110465', '110564', '1108...",681,891,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",248,891,0
Embarked,category,"['Cherbourg', 'Queenstown', 'Southampton']",3,889,2


In [9]:
# Drop the remaining rows that contain NaN (recorded as a 'drop-na' step with
# axis 0); the summary goes from 891 to 889 observations.
sd['train'].dropna()
sd['train'].summary(compute=True)

Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
Survived,category,"['Died', 'Survived']",2,889,0
Pclass,category,"['Lower Class', 'Middle Class', 'Upper Class']",3,889,0
Name,object,"['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ...",889,889,0
Sex,category,"['Male', 'feMale']",2,889,0
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,889,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,889,0
Ticket,object,"['110152', '110413', '110465', '110564', '1108...",680,889,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",247,889,0
Embarked,category,"['Cherbourg', 'Queenstown', 'Southampton']",3,889,0


In [10]:
# Drop columns with too many distinct values. In the recorded output 'Name'
# (889 unique of 889) and 'Ticket' (680) disappear while 'Fare' (247) stays.
# NOTE(review): max_unique_values=0.3 is presumably the maximum tolerated
# ratio of unique values to observations — confirm.
sd['train'].drop_columns(max_unique_values=0.3)
sd['train'].summary(compute=True)

Unnamed: 0,Types,Set Values,Count Set,# Observations,# NaN
Survived,category,"['Died', 'Survived']",2,889,0
Pclass,category,"['Lower Class', 'Middle Class', 'Upper Class']",3,889,0
Sex,category,"['Male', 'feMale']",2,889,0
SibSp,int64,"[0, 1, 2, 3, 4, 5, 8]",7,889,0
Parch,int64,"[0, 1, 2, 3, 4, 5, 6]",7,889,0
Fare,float64,"[0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495...",247,889,0
Embarked,category,"['Cherbourg', 'Queenstown', 'Southampton']",3,889,0


In [11]:
# Show the preprocessing log stored with the data set: the output lists one
# entry per operation registered above, in the order the calls were made.
print('STEPS:')
sd['train'].attr_load('steps')

STEPS:


[{'column': 'Sex',
  'data-set': 'train',
  'expression': "replace(value, {'male': 'Male', 'female': 'Female'})",
  'operation': 'text-transform'},
 {'column': 'Embarked',
  'data-set': 'train',
  'expression': "replace(value, {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'})",
  'operation': 'text-transform'},
 {'column': 'Pclass',
  'data-set': 'train',
  'expression': "{1: 'Upper Class', 2: 'Middle Class', 3: 'Lower Class'}",
  'operation': 'categorize'},
 {'column': 'Survived',
  'data-set': 'train',
  'expression': "{0: 'Died', 1: 'Survived'}",
  'operation': 'categorize'},
 {'column': 'Sex',
  'data-set': 'train',
  'expression': 'None',
  'operation': 'categorize'},
 {'column': 'Embarked',
  'data-set': 'train',
  'expression': 'None',
  'operation': 'categorize'},
 {'data-set': 'train',
  'expression': '{"max_na_values":0.1, "axis": 1}',
  'operation': 'drop-na'},
 {'data-set': 'train', 'expression': '{"axis": 0}', 'operation': 'drop-na'},
 {'data-set': 'train',
  'exp