# Titanic Survival Prediction with RayDP

In [None]:
import ray
import os
import re
import pandas as pd, numpy as np

from pyspark.sql.functions import *

from tensorflow import keras

import raydp
from raydp.tf import TFEstimator
from raydp.utils import random_split

## Initialize or connect to an existing Ray cluster

First, initialize a Ray cluster or connect to an existing one. Note that for Ray 0.8.7 you should set `include_java` to `True`.

In [None]:
# Called with no arguments, so Ray applies its default initialization settings.
ray.init()

After initializing the Ray cluster, you can use the RayDP API to get a Spark session.  
`init_spark` takes 4 required parameters and 1 optional parameter:  
1. app_name: the application name
2. num_executors: the number of executors for the Spark application
3. cores_per_executor: the number of cores for each executor
4. executor_memory: the memory size for each executor
5. config (optional): extra configuration for Spark

In [None]:
# Settings for the Spark session requested from RayDP.
app_name = "Titanic survival prediction with RayDp"
num_executors = 4
cores_per_executor = 1
memory_per_executor = "1GB"
# Arguments are positional: application name, executor count, cores per
# executor, memory per executor.
spark = raydp.init_spark(app_name, num_executors, cores_per_executor, memory_per_executor)

## Distributed data preprocessing with pyspark

You can then use the PySpark API for distributed data preprocessing with the Spark session.

Download the dataset from https://www.kaggle.com/c/titanic/data first.  
Read it with the Spark session to get a DataFrame:

In [None]:
# Load the Titanic training CSV; the first row holds the column names and
# the column types are inferred from the data.
train = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/mnt/DP_disk8/dataset/titanic_train.csv")
)

Next, do the data processing and feature engineering with Spark.

In [None]:
def fill_na(data):
    """Fill missing values in the ``Embarked``, ``Fare`` and ``Age`` columns.

    Null ``Embarked`` values become ``"S"``; null ``Fare`` and ``Age`` values
    are replaced by the mean of the respective column.

    Args:
        data: Spark DataFrame containing the three columns above.

    Returns:
        A new DataFrame with the nulls filled in.
    """
    data = data.fillna({"Embarked": "S"})

    # The two means are independent of each other, so compute them in one
    # aggregation: Spark then scans the data once instead of running a
    # separate job per column.
    means = data.select(
        mean(col("Fare")).alias("Fare"),
        mean(col("Age")).alias("Age"),
    ).first()

    # A mean is None when a column has no non-null value at all. Leave such
    # a column untouched rather than handing None to na.fill.
    replacements = {
        name: means[name]
        for name in ("Fare", "Age")
        if means[name] is not None
    }
    if replacements:
        data = data.na.fill(replacements)

    return data

In [None]:
def do_features(data):
    """Add engineered features and integer-encode the categorical columns.

    New columns: ``name_length``, ``has_cabin``, ``family_size``,
    ``is_alone`` and ``Title``. The existing ``Sex``, ``Embarked``, ``Fare``
    and ``Age`` columns are overwritten with integer codes.

    Null ``Embarked``, ``Fare`` or ``Age`` values produce null codes instead
    of failing the Spark job, so fill missing values beforehand if the
    consumer cannot accept nulls. Titles outside the known set are encoded
    as 0.

    Args:
        data: Spark DataFrame with the columns ``Name``, ``Cabin``,
            ``SibSp``, ``Parch``, ``Sex``, ``Embarked``, ``Fare`` and ``Age``.

    Returns:
        A new DataFrame with the added and re-encoded columns.
    """
    # Simple derived features.
    data = data.withColumn("name_length", length("Name"))
    data = data.withColumn("has_cabin", col("Cabin").isNotNull().cast('int'))

    data = data.withColumn("family_size", col("SibSp") + col("Parch") + 1)
    data = data.withColumn("is_alone", (col("family_size") == 1).cast('int'))

    # Titles that are grouped together under the single label 'Rare'.
    rare_titles = {'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'}

    # Extract the passenger's title: the word directly before the first
    # period that follows a space.
    @udf("string")
    def get_title(name):
        if name is None:
            return ''
        # Raw string: '\.' in a normal string literal is an invalid escape.
        title_match = re.search(r' ([A-Za-z]+)\.', name)
        if not title_match:
            return ''
        title = title_match.group(1)
        return 'Rare' if title in rare_titles else title

    data = data.withColumn("Title", get_title(col("Name")))
    # Fold the French/alternative spellings into their English equivalents.
    data = data.withColumn("Title", regexp_replace("Title", "Mlle|Ms", "Miss"))
    data = data.withColumn("Title", regexp_replace("Title", "Mme", "Mrs"))

    # Encode column Sex: female -> 0, anything else -> 1.
    # Declaring the return type avoids a string round trip and a later cast.
    sex_udf = udf(lambda x: 0 if x == "female" else 1, "int")
    data = data.withColumn("Sex", sex_udf(col("Sex")))

    # Encode column Title; unknown or empty titles map to 0 instead of
    # raising KeyError inside the executor.
    title_map = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    title_udf = udf(lambda x: title_map.get(x, 0), "int")
    data = data.withColumn("Title", title_udf(col("Title")))

    # Encode column Embarked; null or unknown ports become null.
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    embarked_udf = udf(lambda x: embarked_map.get(x), "int")
    data = data.withColumn("Embarked", embarked_udf(col("Embarked")))

    # Categorize column Fare into four buckets.
    @udf("int")
    def fare_map(fare):
        if fare is None:
            return None
        if fare <= 7.91:
            return 0
        if fare <= 14.454:
            return 1
        if fare <= 31:
            return 2
        return 3

    data = data.withColumn("Fare", fare_map(col("Fare")))

    # Categorize column Age into five buckets.
    @udf("int")
    def age_map(age):
        if age is None:
            return None
        if age <= 16:
            return 0
        if age <= 32:
            return 1
        if age <= 48:
            return 2
        if age <= 64:
            return 3
        return 4

    data = data.withColumn("Age", age_map(col("Age")))

    return data

In [None]:
def drop_cols(data):
    """Return ``data`` without the columns that are not used as features.

    Args:
        data: Spark DataFrame to prune.

    Returns:
        A new DataFrame lacking ``PassengerId``, ``Name``, ``Ticket``,
        ``Cabin`` and ``SibSp``.
    """
    unused = ("PassengerId", "Name", "Ticket", "Cabin", "SibSp")
    for column_name in unused:
        data = data.drop(column_name)
    return data

In [None]:
# Preprocessing pipeline: fill missing values, build the features, then
# drop the columns that are no longer needed.
train = drop_cols(do_features(fill_na(train)))

In [None]:
# Print the first 5 rows to inspect the preprocessed data.
train.show(5)

## Distributed model training

In [None]:
# The hold-out split (random_split(train, [0.95, 0.05])) is left disabled,
# so every row is used for training.
# Every column except the label is a model feature.
features = [name for name in train.columns if name != "Survived"]

Define the Keras model.  
Each feature is treated as a separate input with shape (1,).

In [None]:
# One scalar input tensor per feature column.
inTensor = [keras.Input((1,)) for _ in features]

# Join the per-feature inputs into a single feature vector.
concatenated = keras.layers.concatenate(inTensor)

# Hidden stack: 32 -> 32 -> dropout -> 16 -> dropout.
fc1 = keras.layers.Dense(32, activation='relu')(concatenated)
fc2 = keras.layers.Dense(32, activation='relu')(fc1)
dp1 = keras.layers.Dropout(0.25)(fc2)
fc3 = keras.layers.Dense(16, activation='relu')(dp1)
dp2 = keras.layers.Dropout(0.25)(fc3)

# Single sigmoid unit for the binary "Survived" output.
fc4 = keras.layers.Dense(1, activation='sigmoid')(dp2)

model = keras.models.Model(inputs=inTensor, outputs=fc4)

Define the optimizer and loss function,  
then create the TensorFlow estimator provided by RayDP.

In [None]:
rmsp = keras.optimizers.RMSprop()
loss = keras.losses.BinaryCrossentropy()

# The batch size also determines how many steps make up one epoch.
batch_size = 32
steps_per_epoch = train.count() // batch_size

estimator = TFEstimator(
    num_workers=4,
    model=model,
    optimizer=rmsp,
    loss=loss,
    metrics=["binary_accuracy"],
    feature_columns=features,
    label_columns="Survived",
    batch_size=batch_size,
    num_epochs=100,
    config={"fit_config": {"steps_per_epoch": steps_per_epoch}},
)

In [None]:
# Train the estimator on the preprocessed Spark DataFrame.
# NOTE(review): the second argument is presumably the evaluation DataFrame,
# with None meaning no evaluation set — confirm against the RayDP API.
estimator.fit_on_spark(train, None)

In [None]:
# Clean up: stop the Spark session obtained through RayDP first, then shut
# down Ray itself.
raydp.stop_spark()
ray.shutdown()