## Titanic Dataset

In [1]:
%%datalabframework getfilename

In [2]:
import datalabframework as dlf
# Notebook-wide logger created through datalabframework; the Kafka topic and
# server list are passed straight to initLogger.
logger = dlf.log.initLogger(
    __name__,
    kafka_topic="datalab",
    kafka_servers="kafka:9092",
)

2017-04-30 14:04:29,428 - jovyan - __main__ - INFO - init - {'notebook': {'filename': 'extract.ipynb', 'filepath': '/home/jovyan/work/notebooks/extract'}, 'project': {'rootpath': '/home/jovyan/work/notebooks', 'main': 'main.ipynb'}, 'datalab': {'framework': '0.1'}}


#### Load data (local)

In [3]:
# EXPORT

def load_csv(spark, path):
    """Read the CSV file at ``path`` through ``spark`` and return the result.

    The reader is configured with the ``header`` and ``inferSchema`` options,
    both set to ``"true"``, before the ``csv`` format is loaded.

    Args:
        spark: object exposing a ``read`` attribute (the ``SparkSession``
            in this notebook).
        path: location of the CSV file, handed unchanged to ``load``.

    Returns:
        Whatever ``load(path)`` yields; with a ``SparkSession`` this is a
        Spark DataFrame (not a pandas one).
    """
    reader = spark.read
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    reader = reader.format("csv")
    return reader.load(path)

def clean_dataset(df):
    """Return ``df`` with the ``Ticket`` and ``Cabin`` columns dropped.

    This is the place for all join, select and filter steps of the
    cleaning stage.

    Args:
        df: frame-like object whose ``drop(name)`` returns a new frame.

    Returns:
        The frame obtained after dropping both columns, in that order.
    """
    for unwanted in ('Ticket', 'Cabin'):
        df = df.drop(unwanted)

    return df

### Testing and Exploration

#### Params

In [4]:
# Root directory that both dataset paths below are built from.
DATA_ROOT = "/home/jovyan/work/data"

# Run settings: sample values and source paths for the input CSV and the
# output parquet location.
d = dict(
    input_sample=1.0,
    input_source=DATA_ROOT + '/titanic/data/raw/train.csv',
    output_sample=1.0,
    output_source=DATA_ROOT + '/titanic/data/set/clean/train.parquet',
)

# Config object built from the dict; later cells read its entries as
# attributes (p.input_source, p.output_source).
p = dlf.params.config_fromdict(d)

#### Init Spark

In [5]:
from pyspark.sql import SparkSession
# Get (or create) the session, named after dlf.project.filename().
spark = (
    SparkSession.builder
    .appName(dlf.project.filename())
    .getOrCreate()
)
# Bare last expression so the cell displays the Spark version.
spark.version

'2.1.0'

#### Load Dataset

In [6]:
# Read the raw CSV from the configured input path and run it through
# clean_dataset in one step.
df = clean_dataset(load_csv(spark, p.input_source))

#### Save as parquet

In [7]:
# Write the cleaned frame as parquet to the configured output path,
# overwriting whatever is already there.
df.write.mode('overwrite').parquet(p.output_source)

#### Explore the dataset

In [8]:
# Preview the first 10 rows with long cell values truncated.
df.show(n=10, truncate=True)

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0| 8.4583|       Q|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|51.8625|       S|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|    3|    1| 21.07

In [9]:
# Print one line per column: name, short type name and nullability.
for column in df.schema:
    # Use the type's class name instead of slicing str(dataType): the string
    # form of parameterised types (e.g. "ArrayType(IntegerType,true)") does
    # not end in "Type", so a blind [:-4] would cut off the wrong characters.
    type_name = type(column.dataType).__name__
    if type_name.endswith('Type'):
        type_name = type_name[:-len('Type')]
    print('{:<12} {:<10} {}'.format(column.name, type_name, column.nullable))

PassengerId  Integer    True
Survived     Integer    True
Pclass       Integer    True
Name         String     True
Sex          String     True
Age          Double     True
SibSp        Integer    True
Parch        Integer    True
Fare         Double     True
Embarked     String     True
