## Titanic Dataset

In [31]:
!pip install git+https://github.com/natbusa/datalabframework.git

Collecting git+https://github.com/natbusa/datalabframework.git
  Cloning https://github.com/natbusa/datalabframework.git to /tmp/pip-lveumzrv-build
You are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [1]:
import datalabframework as dlf

In [2]:
# Display the value returned by dlf.project.rootpath() (the project root path)
dlf.project.rootpath()

'/home/jovyan/src'

## Data

In [3]:
# List every data-resource alias declared in the project metadata,
# pretty-printed as indented JSON.
import json

metadata = dlf.params.metadata()
resource_aliases = metadata['data']['resources']
print(json.dumps(resource_aliases, indent=2))

{
  ".elements.extract.train": {
    "path": "extract/data.hdf",
    "format": "hdf",
    "provider": "local"
  },
  ".etl.extract.train": {
    "path": "datasets/extract/train",
    "provider": "local"
  },
  ".etl.extract.test": {
    "path": "datasets/extract/test",
    "provider": "local"
  },
  ".elements.raw.test": {
    "path": "raw/test.csv",
    "format": "csv",
    "provider": "local"
  },
  ".elements.extract.test": {
    "path": "extract/data.hdf",
    "format": "hdf",
    "provider": "local"
  },
  ".etl.raw.test": {
    "path": "datasets/raw/test.csv",
    "format": "csv",
    "provider": "local"
  },
  ".etl.raw.train": {
    "path": "datasets/raw/train.csv",
    "format": "csv",
    "provider": "local"
  },
  ".bla": {
    "oh": "la la"
  },
  ".elements.raw.train": {
    "path": "raw/train.csv",
    "format": "csv",
    "provider": "local"
  }
}


#### Init Spark

In [10]:
# Fetch the 'spark' engine from datalabframework and get its context object
# (later cells use spark.sparkSession and engine.read)
engine = dlf.engines.get('spark')
spark = engine.context()

In [11]:
# Display the engine context name and the Spark version as "<name>:<version>"
context_name = engine.info['context']
spark_version = spark.sparkSession.version
'{}:{}'.format(context_name, spark_version)

'spark:2.3.0'

#### Load Dataset

In [22]:
# Read the resource behind the '.etl.raw.train' alias, then list its column names
df = engine.read('.etl.raw.train')
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

Remove the `Ticket` and `Cabin` columns.

In [23]:
# Discard both unwanted columns in a single call
df = df.drop('Ticket', 'Cabin')

#### Save as parquet

In [26]:
# Write the trimmed frame as Parquet to the path behind the
# '.etl.extract.train' alias, overwriting any existing output.
extract_path = dlf.data.path('.etl.extract.train')
df.write.mode('overwrite').parquet(extract_path)

#### Explore the dataset

In [27]:
# Preview the first 10 rows; truncate=True shortens long values (see the Name column below)
df.show(10, truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0| 8.4583|       Q|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|51.8625|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1| 21.07

In [28]:
# Print one line per column: name, Spark type (without the "Type" suffix), nullability.
for column in df.schema:
    # Take the name from the class instead of slicing str(dataType) with [:-4]:
    # parameterized types (e.g. DecimalType(10,2)) do not end in "Type", and the
    # string form is not guaranteed to stay the same across PySpark versions.
    type_name = type(column.dataType).__name__
    if type_name.endswith('Type'):
        type_name = type_name[:-len('Type')]
    print('{:<12} {:<10} {}'.format(column.name, type_name, column.nullable))

PassengerId  Integer    True
Survived     Integer    True
Pclass       Integer    True
Name         String     True
Sex          String     True
Age          Double     True
SibSp        Integer    True
Parch        Integer    True
Fare         Double     True
Embarked     String     True
