## Titanic Dataset

In [1]:
import pandas as pd

In [1]:
import datalabframework as dlf

## Extract Data

### Train dataset

In [2]:
# Read the dataset registered under the name 'train' through
# datalabframework's 'pandas' engine.
# NOTE(review): how 'train' maps to a physical file is defined outside this
# notebook — confirm against the project metadata.
train = dlf.engines.get('pandas').read('train')

In [3]:
# Preview 10 random rows of the training set. The fixed seed keeps the
# preview identical on every Restart & Run All instead of reshuffling.
train.sample(10, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
325,326,1,1,"Young, Miss. Marie Grice",female,36.0,0,0,PC 17760,135.6333,C32,C
756,757,0,3,"Carlsson, Mr. August Sigfrid",male,28.0,0,0,350042,7.7958,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
600,601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
508,509,0,3,"Olsen, Mr. Henry Margido",male,28.0,0,0,C 4001,22.525,,S
280,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q
253,254,0,3,"Lobb, Mr. William Arthur",male,30.0,1,0,A/5. 3336,16.1,,S
397,398,0,2,"McKane, Mr. Peter David",male,46.0,0,0,28403,26.0,,S
434,435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,E44,S


In [4]:
# Print the per-column dtype, non-null count and the overall memory usage
# of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Number of missing (NaN) entries in each column of the training set.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Per-column count of cells whose value is 0, the empty string, or None.
# NOTE(review): in the recorded output Age and Cabin report 0 here although
# isnull() counts 177 and 687 missing values for them, so NaN cells are
# evidently not matched by this check.
train.isin([0,'',None]).sum()

PassengerId      0
Survived       549
Pclass           0
Name             0
Sex              0
Age              0
SibSp          608
Parch          678
Ticket           0
Fare            15
Cabin            0
Embarked         0
dtype: int64

### Test Dataset

In [7]:
# Read the dataset registered under the name 'test' through
# datalabframework's 'pandas' engine.
# NOTE(review): how 'test' maps to a physical file is defined outside this
# notebook — confirm against the project metadata.
test = dlf.engines.get('pandas').read('test')

In [8]:
# Preview 10 random rows of the test set. The fixed seed keeps the preview
# identical on every Restart & Run All instead of reshuffling.
test.sample(10, random_state=42)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
256,1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q
13,905,2,"Howard, Mr. Benjamin",male,63.0,1,0,24065,26.0,,S
37,929,3,"Cacic, Miss. Manda",female,21.0,0,0,315087,8.6625,,S
205,1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C
321,1213,3,"Krekorian, Mr. Neshan",male,25.0,0,0,2654,7.2292,F E57,C
345,1237,3,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,,S
370,1262,2,"Giles, Mr. Edgar",male,21.0,1,0,28133,11.5,,S
323,1215,1,"Rowe, Mr. Alfred G",male,33.0,0,0,113790,26.55,,S
56,948,3,"Cor, Mr. Bartol",male,35.0,0,0,349230,7.8958,,S
405,1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20.0,0,0,SC/PARIS 2166,13.8625,D38,C


In [9]:
# Print the per-column dtype, non-null count and the overall memory usage
# of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [10]:
# Number of missing (NaN) entries in each column of the test set.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [11]:
# Per-column count of cells whose value is 0, the empty string, or None.
# NOTE(review): in the recorded output Age and Cabin report 0 here although
# isnull() counts 86 and 327 missing values for them, so NaN cells are
# evidently not matched by this check.
test.isin([0,'',None]).sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp          283
Parch          324
Ticket           0
Fare             2
Cabin            0
Embarked         0
dtype: int64

### Write out Clean Data

If a format conversion is needed, it should be done here. In this particular example we assume that CSV is our raw data format and that, from here on, all data is stored compactly in binary form using HDF5. HDF5 allows multiple pandas DataFrames to be stored in the same file. This can be achieved by mapping multiple metadata aliases to the same data resource.

In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Write both frames out through the 'pandas' engine under the resource names
# '.elements.clean.train' and '.elements.clean.test'.
# NOTE(review): the output format and location are decided by project
# metadata that is not visible here — confirm it is the HDF5 store described
# in the text above.
dlf.engines.get('pandas').write(train, '.elements.clean.train')
dlf.engines.get('pandas').write(test,  '.elements.clean.test')

### Metrics on Raw Data 

 A table summarizing all the datasets that were read:
 
 - Number of Entries
 - Null Values
 - Type Purity

In [13]:
# One row per dataset, one column per feature: the number of missing values.
index = ['train', 'test']
data = [frame.isnull().sum() for frame in (train, test)]
metrics = pd.DataFrame(data=data, index=index, dtype='Int64')

# Row count of each dataset, so the null counts can be read in proportion.
metrics['_total'] = [len(frame) for frame in (train, test)]

NameError: name 'pd' is not defined

In [18]:
# Display the missing-value table (one row per dataset, plus `_total` rows).
metrics

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_total
train,177,687,2,0,0,0,0,0,0,0,0.0,0,891
test,86,327,0,1,0,0,0,0,0,0,,0,418


In [19]:
# One row per dataset, one column per feature: the pandas dtype of the column.
index = ['train', 'test']
data = [frame.dtypes for frame in (train, test)]
schema = pd.DataFrame(data=data, index=index)

# Row count of each dataset, appended as an extra column.
schema['_total'] = [len(frame) for frame in (train, test)]

In [20]:
# Display the dtype table (one row per dataset, plus `_total` rows).
schema

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_total
train,float64,object,object,float64,object,int64,int64,int64,object,int64,int64,object,891
test,float64,object,object,float64,object,int64,int64,int64,object,int64,,object,418


coerce (to be done)

### High Level summary metrics

These metrics are independent of the column names and of the number of columns.

In [70]:
# Dtype table built from the bare `.values` arrays: the column labels are
# dropped, so each row simply lists the dtypes of one dataset in order.
index = ['train', 'test']
data = [frame.dtypes.values for frame in (train, test)]
df = pd.DataFrame(data=data, index=index)

In [74]:
from collections import Counter

# Tally how many columns of each dtype occur across both datasets.
# The cells are streamed straight into Counter, so no temporary list is
# bound to a global name in the kernel. Positions that exist in only one
# dataset appear as None in the tally.
Counter(dtype for row in df.values.tolist() for dtype in row)

Counter({dtype('int64'): 9, dtype('float64'): 4, None: 1, dtype('O'): 10})

In [77]:
# (number of rows, number of columns) of the training frame.
train.shape

(891, 12)

In [85]:
# Grand total, over all columns of the training set, of cells whose value
# is 0, the empty string, or None.
train.isin([0,'',None]).sum().sum()

1850