## Titanic Dataset

In [1]:
import datalabframework as dlf

In [2]:
# Display the project root path as resolved by datalabframework.
dlf.project.rootpath()

'/home/natbusa/Projects/dsp-titanic/src'

## Data

In [3]:
# Load the project metadata (engines, loggers, providers, resources)
# and display it as the cell output.
metadata = dlf.params.metadata()
metadata

{'engines': {'spark': {'config': {'jobname': 'default', 'master': 'local[1]'},
   'context': 'spark'}},
 'loggers': {'kafka': {'enable': False,
   'hosts': ['localhost:29092'],
   'severity': 'info',
   'topic': 'dlf'},
  'stream': {'enable': True, 'severity': 'info'}},
 'providers': {'local': {'rootpath': '../data', 'service': 'fs'}},
 'resources': {'.etl.clean.test': {'format': 'parquet',
   'path': 'datasets/clean/test',
   'provider': 'local'},
  '.etl.clean.train': {'format': 'parquet',
   'path': 'datasets/clean/train',
   'provider': 'local'},
  '.etl.extract.test': {'format': 'parquet',
   'path': 'datasets/extract/test',
   'provider': 'local'},
  '.etl.extract.train': {'format': 'parquet',
   'path': 'datasets/extract/train',
   'provider': 'local'},
  '.etl.features.test': {'format': 'parquet',
   'path': 'datasets/features/test',
   'provider': 'local'},
  '.etl.features.train': {'format': 'parquet',
   'path': 'datasets/features/train',
   'provider': 'local'},
  '.etl.raw

#### Init Spark

In [4]:
# Look up the engine registered under the name 'spark' and obtain its context;
# both `engine` and `spark` are reused by the cells below.
engine = dlf.engines.get('spark')
spark = engine.context()

In [5]:
# Show the engine context name and the Spark version as "<context>:<version>".
'%s:%s' % (engine.info['context'], spark.sparkSession.version)

'spark:2.3.1'

## Train

#### Load Dataset

In [6]:
# Load the raw training dataset. `header` and `inferSchema` are passed to
# engine.read (presumably forwarded to the underlying Spark reader).
df = engine.read('.etl.raw.train', header=True, inferSchema=True)

# Print one line per column: name, type (without the "Type" suffix), nullable.
for column in df.schema:
    # Derive the name from the type's class instead of slicing str(dataType):
    # the string form can carry arguments or parentheses (e.g. DecimalType(10,2)),
    # in which case a fixed [:-4] slice does not remove the "Type" suffix.
    type_name = type(column.dataType).__name__
    if type_name.endswith('Type'):
        type_name = type_name[:-len('Type')]
    print('{:<12} {:<10} {}'.format(column.name, type_name, column.nullable))

PassengerId  Integer    True
Survived     Integer    True
Pclass       Integer    True
Name         String     True
Sex          String     True
Age          Double     True
SibSp        Integer    True
Parch        Integer    True
Ticket       String     True
Fare         Double     True
Cabin        String     True
Embarked     String     True


#### Save as parquet

In [7]:
# Persist the training dataframe to the '.etl.extract.train' resource,
# replacing any previous content.
engine.write(
    df,
    '.etl.extract.train',
    mode='overwrite',
)

#### Explore the dataset

Check for Null or NaN values, and count them per column

In [8]:
from pyspark.sql.functions import isnan, when, count, col


def _missing_condition(name, dtype):
    """Return a boolean Column that is true where column `name` is null or NaN.

    NaN only exists for floating-point columns, so isnan() is applied only to
    'float' and 'double' columns. Calling it on other types (e.g. strings)
    relies on an implicit cast to double, which is not valid for every type.
    """
    is_null = col(name).isNull()
    if dtype in ('float', 'double'):
        return isnan(col(name)) | is_null
    return is_null


# One output column per input column, holding the number of missing values.
# df.dtypes yields (column name, type string) pairs.
df.select([
    count(when(_missing_condition(c, t), c)).alias(c)
    for c, t in df.dtypes
]).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



## Test

#### Load Dataset

In [9]:
# Load the raw test dataset. `header` and `inferSchema` are passed to
# engine.read (presumably forwarded to the underlying Spark reader).
# NOTE: this rebinds `df`, which previously held the training data.
df = engine.read('.etl.raw.test', header=True, inferSchema=True)

# Print one line per column: name, type (without the "Type" suffix), nullable.
for column in df.schema:
    # Derive the name from the type's class instead of slicing str(dataType):
    # the string form can carry arguments or parentheses (e.g. DecimalType(10,2)),
    # in which case a fixed [:-4] slice does not remove the "Type" suffix.
    type_name = type(column.dataType).__name__
    if type_name.endswith('Type'):
        type_name = type_name[:-len('Type')]
    print('{:<12} {:<10} {}'.format(column.name, type_name, column.nullable))

PassengerId  Integer    True
Pclass       Integer    True
Name         String     True
Sex          String     True
Age          Double     True
SibSp        Integer    True
Parch        Integer    True
Ticket       String     True
Fare         Double     True
Cabin        String     True
Embarked     String     True


#### Save as parquet

In [10]:
# Persist the test dataframe to the '.etl.extract.test' resource,
# replacing any previous content.
engine.write(
    df,
    '.etl.extract.test',
    mode='overwrite',
)

#### Explore the dataset

In [11]:
# Print one line per column of `df`: name, type (without the "Type" suffix), nullable.
for column in df.schema:
    # Derive the name from the type's class instead of slicing str(dataType):
    # the string form can carry arguments or parentheses (e.g. DecimalType(10,2)),
    # in which case a fixed [:-4] slice does not remove the "Type" suffix.
    type_name = type(column.dataType).__name__
    if type_name.endswith('Type'):
        type_name = type_name[:-len('Type')]
    print('{:<12} {:<10} {}'.format(column.name, type_name, column.nullable))

PassengerId  Integer    True
Pclass       Integer    True
Name         String     True
Sex          String     True
Age          Double     True
SibSp        Integer    True
Parch        Integer    True
Ticket       String     True
Fare         Double     True
Cabin        String     True
Embarked     String     True


Check for Null or NaN values, and count them per column

In [12]:
from etl.features.features import describe_all

importing Jupyter notebook from /home/natbusa/Projects/dsp-titanic/src/etl/features/features.ipynb


In [13]:
# Show Spark's built-in summary statistics (count, mean, stddev, ...) for each column.
df.describe().show()

+-------+------------------+------------------+--------------------+------+------------------+------------------+------------------+------------------+------------------+-----+--------+
|summary|       PassengerId|            Pclass|                Name|   Sex|               Age|             SibSp|             Parch|            Ticket|              Fare|Cabin|Embarked|
+-------+------------------+------------------+--------------------+------+------------------+------------------+------------------+------------------+------------------+-----+--------+
|  count|               418|               418|                 418|   418|               332|               418|               418|               418|               417|   91|     418|
|   mean|            1100.5|2.2655502392344498|                null|  null|30.272590361445783|0.4473684210526316|0.3923444976076555|223850.98986486485|  35.6271884892086| null|    null|
| stddev|120.81045760473994|0.8418375519640503|                null|  

In [14]:
# Show the project's extended summary from etl.features.features; the output
# includes per-column NaN counts, null counts and column types.
describe_all(df).show()

+--------+------------------+------------------+--------------------+------+------------------+------------------+------------------+------------------+------------------+------+--------+
| summary|       PassengerId|            Pclass|                Name|   Sex|               Age|             SibSp|             Parch|            Ticket|              Fare| Cabin|Embarked|
+--------+------------------+------------------+--------------------+------+------------------+------------------+------------------+------------------+------------------+------+--------+
|     nan|                 0|                 0|                   0|     0|                 0|                 0|                 0|                 0|                 0|     0|       0|
|  isnull|                 0|                 0|                   0|     0|                86|                 0|                 0|                 0|                 1|   327|       0|
|    type|           Integer|           Integer|            