## Titanic Dataset

In [1]:
import datalabframework as dlf

In [7]:
# Create the datalabframework logger; it is used in the last cell of this
# section to emit the dataset statistics record.
logger = dlf.log.logger()

Each log record carries the following fields:

 - logtime
 - level
 - username
 - git hash
 - git last commit author
 - git last commit email
 - git last commit time
 - git last commit dirty
 - git repo
 - git origin
 - file path
 - file basename
 - msg (either dict or text)

## Extract Data

### Train dataset

In [37]:
# Read the training set through the pandas engine. '.elements.raw.train' is a
# metadata alias (presumably resolved by datalabframework to the raw file --
# verify against the project metadata).
train = dlf.engines.get('pandas').read('.elements.raw.train')

In [38]:
# Preview ten random rows. The fixed random_state makes the preview identical
# on every Restart & Run All instead of showing different rows each time.
train.sample(10, random_state=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
384,385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S
566,567,0,3,"Stoytcheff, Mr. Ilia",male,19.0,0,0,349205,7.8958,,S
346,347,1,2,"Smith, Miss. Marion Elsie",female,40.0,0,0,31418,13.0,,S
232,233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59.0,0,0,237442,13.5,,S
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
80,81,0,3,"Waelens, Mr. Achille",male,22.0,0,0,345767,9.0,,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S
519,520,0,3,"Pavlovic, Mr. Stefo",male,32.0,0,0,349242,7.8958,,S
320,321,0,3,"Dennis, Mr. Samuel",male,22.0,0,0,A/5 21172,7.25,,S
249,250,0,2,"Carter, Rev. Ernest Courtenay",male,54.0,1,0,244252,26.0,,S


In [94]:
# Tally how many columns share each dtype.
# NOTE(review): the saved output below lists only the dtypes, not their
# counts -- it looks stale; re-run the cell to refresh it.
train.dtypes.value_counts()

Index([object, int64, float64], dtype='object')

In [95]:
# Number of null cells in each column of the training set.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [41]:
# Per-column count of cells equal to 0, the empty string or None.
# In the saved output the NaN cells reported by isnull() (Age, Cabin,
# Embarked) are not matched here, so this only surfaces literal zeros.
train.isin([0,'',None]).sum()

PassengerId      0
Survived       549
Pclass           0
Name             0
Sex              0
Age              0
SibSp          608
Parch          678
Ticket           0
Fare            15
Cabin            0
Embarked         0
dtype: int64

### Test Dataset

In [42]:
# Read the test set through the pandas engine. '.elements.raw.test' is a
# metadata alias (presumably resolved by datalabframework to the raw file --
# verify against the project metadata).
test = dlf.engines.get('pandas').read('.elements.raw.test')

In [43]:
# Preview ten random rows. The fixed random_state makes the preview identical
# on every Restart & Run All instead of showing different rows each time.
test.sample(10, random_state=0)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
98,990,3,"Braf, Miss. Elin Ester Maria",female,20.0,0,0,347471,7.8542,,S
220,1112,2,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C
105,997,3,"Holthen, Mr. Johan Martin",male,28.0,0,0,C 4001,22.525,,S
308,1200,1,"Hays, Mr. Charles Melville",male,55.0,1,1,12749,93.5,B69,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
341,1233,3,"Lundstrom, Mr. Thure Edvin",male,32.0,0,0,350403,7.5792,,S
236,1128,1,"Warren, Mr. Frank Manley",male,64.0,1,0,110813,75.25,D37,C
367,1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22.0,0,0,3101295,39.6875,,S
184,1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene B...",female,27.0,1,1,PC 17558,247.5208,B58 B60,C
309,1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45.0,1,0,350026,14.1083,,S


In [44]:
# Print a summary of the test frame: index range, per-column non-null counts,
# dtypes and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [45]:
# Number of null cells in each column of the test set.
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [46]:
# Per-column count of cells equal to 0, the empty string or None.
# In the saved output the NaN cells reported by isnull() (Age, Fare, Cabin)
# are not matched here, so this only surfaces literal zeros.
test.isin([0,'',None]).sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp          283
Parch          324
Ticket           0
Fare             2
Cabin            0
Embarked         0
dtype: int64

### Write out Extracted Data from Raw

If a format conversion is needed, it should be done here. In this particular example we assume that CSV is our raw data format, and from here on all data is stored in the binary HDF5 format. HDF5 allows storing multiple pandas DataFrames in the same file. This can be achieved by mapping multiple metadata aliases to the same data resource.

In [47]:
import warnings
# Silence every Python warning from this point on.
# NOTE(review): this is a blanket filter for the rest of the kernel session;
# presumably it is meant for warnings raised by the write in the next cell --
# confirm, and consider narrowing it to that warning category.
warnings.filterwarnings('ignore')

In [48]:
# Persist both extracted frames through the pandas engine under the metadata
# aliases 'train' and 'test' (target format and location are decided by the
# project metadata, not by this cell).
dlf.engines.get('pandas').write(train, 'train')
dlf.engines.get('pandas').write(test,  'test')

### Metrics on Raw Data 

 A table with all the datasets being read:
 
 - Number of Entries
 - Null Values
 - Type Purity

In [49]:
# Fetch the pandas engine's context object. The cells below use it like the
# pandas module itself (pd.DataFrame).
pd = dlf.engines.get("pandas").context()

In [50]:
# Null counts per attribute, one row per dataset. An attribute that exists in
# only one dataset has no value in the other dataset's row.
index = ['train', 'test']
data = [frame.isnull().sum() for frame in (train, test)]
metrics = pd.DataFrame(data=data, index=index, dtype='Int64')

# Row count of each dataset, so the null counts can be read against the total.
metrics['_total'] = [len(frame) for frame in (train, test)]

In [51]:
# Display the null-count table: one row per dataset, one column per attribute,
# plus the '_total' number of rows in each dataset.
metrics

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_total
train,177,687,2,0,0,0,0,0,0,0,0.0,0,891
test,86,327,0,1,0,0,0,0,0,0,,0,418


In [52]:
# Dtype of every attribute, one row per dataset. An attribute that exists in
# only one dataset has no dtype in the other dataset's row.
index = ['train', 'test']
data = [frame.dtypes for frame in (train, test)]
schema = pd.DataFrame(data=data, index=index)

# Row count of each dataset, kept alongside the schema for reference.
schema['_total'] = [len(frame) for frame in (train, test)]

In [53]:
# Display the extracted schema: the dtype of each attribute per dataset, plus
# the '_total' number of rows in each dataset.
schema

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_total
train,float64,object,object,float64,object,int64,int64,int64,object,int64,int64,object,891
test,float64,object,object,float64,object,int64,int64,int64,object,int64,,object,418


coerce (to be done)

### High Level summary metrics

These metrics are independent of the column names and of the number of columns.

In [54]:
from collections import Counter

# One row of dtypes per dataset. Rows are aligned by position, not by column
# name, so the dataset with fewer columns is padded (with None in the saved run).
data = [train.dtypes.values, test.dtypes.values]
df  =  pd.DataFrame(data)

# Flatten the table and count how often each dtype (or padding value) occurs.
l = [item for sb in df.values.tolist() for item in sb]
d = dict(Counter(l))

# Number of column slots missing between 'train' and 'test'.
# .get() returns 0 instead of raising KeyError when both datasets have the
# same number of columns and no padding was inserted.
d.get(None, 0)

1

#### For each data set
Collect the shape, the number of zero, empty, None and null elements, and the column types.

In [55]:
# Start the statistics record for the training set; later cells add to it.
d = dict(id='train')

In [56]:
# Record the frame dimensions: rows are samples, columns are features.
t = list(train.shape)
d['size'] = dict(samples=t[0], features=t[1])

In [57]:
# Whole-frame cell counts for the training set: literal zeros, empty strings,
# None values and nulls as reported by isnull().
# .sum().sum() yields NumPy integer scalars; they are cast to plain int so the
# record prints cleanly when logged (no `np.int64(...)` wrappers) and stays
# serialisable.
d.update({
    'stats': {
        'zero': int(train.isin([0]).sum().sum()),
        'empty': int(train.isin(['']).sum().sum()),
        'none': int(train.isin([None]).sum().sum()),
        'null': int(train.isnull().sum().sum())
    }
})

In [110]:
# Number of columns per dtype, keyed by the dtype's `.str` code
# (e.g. '<f8', '<i8', '|O' in the saved output).
d['types'] = {
    dtype.str: count
    for dtype, count in train.dtypes.value_counts().items()
}

In [111]:
# Wrap the statistics in the message envelope that gets logged:
# the payload plus the record type and the task that produced it.
dd = dict(data=d, type='dataset', task='stats')

In [112]:
# NOTE(review): the commented-out calls below are not executed; presumably
# they sketch a planned higher-level dlf.data API -- confirm or remove.
#dlf.data.stats('train', engine="current")
#dlf.data.info('train', engine="current")
#dlf.data.read('train', engine="current")
#dlf.data.write('train', obj=df, engine="current")

# Display the statistics collected for the training set.
d

{'id': 'train',
 'size': {'features': 12, 'samples': 891},
 'stats': {'empty': 0, 'none': 0, 'null': 866, 'zero': 1850},
 'types': {'<f8': 2, '<i8': 5, '|O': 5}}

In [113]:
# Emit the statistics record at INFO level; the dict is passed as the message.
logger.info(dd)

<LogRecord: root, 20, <ipython-input-113-1b3234256952>, 1, "{'data': {'id': 'train', 'size': {'samples': 891, 'features': 12}, 'stats': {'zero': 1850, 'empty': 0, 'none': 0, 'null': 866}, 'types': {'|O': 5, '<i8': 5, '<f8': 2}}, 'type': 'dataset', 'task': 'stats'}">
2018-02-28 10:56:27,625 - INFO - natbusa - ec81da3 - natbusa - natalino.busa@gmail.com - 1519636878 - True - https://github.com/natbusa/dsp-titanic.git - dsp-titanic - extract.ipynb - elements/extract - {'data': {'id': 'train', 'size': {'samples': 891, 'features': 12}, 'stats': {'zero': 1850, 'empty': 0, 'none': 0, 'null': 866}, 'types': {'|O': 5, '<i8': 5, '<f8': 2}}, 'type': 'dataset', 'task': 'stats'}
