In [1]:
from __future__ import division

# import necessary libs
import numpy  as np
import pandas as pd

# general spark modules
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import lit

# spark ml modules 
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# classification 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
import time
import itertools

## Load Data

In [3]:
# Read the Titanic train/test CSV files into Spark DataFrames. Both files have
# a header row; no schema inference is requested here, so numeric columns are
# cast explicitly in a later cell.
train_df = spark.read.option('header', True).csv('/FileStore/tables/titanic/train.csv')
test_df  = spark.read.option('header', True).csv('/FileStore/tables/titanic/test.csv')

In [4]:
# List the distinct values present in the Parch column.
parch_values = train_df.select('Parch').distinct()
parch_values.show()

In [5]:
# Preview the first 10 rows of the raw training frame.
train_df.show(n=10)

In [6]:
# prepare data for models

# Helper: convert the data type of a set of DataFrame columns.
def convertColumn(df, names, newType):
    """Return ``df`` with every column listed in ``names`` cast to ``newType``.

    Args:
        df: object supporting ``df[name]`` and ``df.withColumn(name, column)``.
        names: iterable of column names to cast.
        newType: target type passed to each column's ``cast``.

    Returns:
        The frame produced by applying the casts one after another. When
        ``names`` is empty, ``df`` itself is returned unchanged.
    """
    converted = df
    for column_name in names:
        # Replace the column in place (same name) with its cast version.
        casted = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, casted)
    return converted

In [7]:
# Cast the numeric feature columns (and, for the training frame, the label)
# to float so they can be fed to the ML stages.
# numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare']
numerical_cols = ['Fare', 'Age']
label_col = ['Survived']
train_df  = convertColumn(train_df,  numerical_cols + label_col, FloatType())
test_df   = convertColumn(test_df,   numerical_cols, FloatType())

# Drop (not fill) rows that have a missing value in a column the models
# actually consume. A bare dropna() would also discard every row with a null
# in an unused column, needlessly shrinking both frames.
# NOTE(review): for the Titanic data this is presumably the sparsely populated
# Cabin column -- verify against the CSVs.
# The categorical names are repeated here because categorical_cols is only
# defined in the feature-pipeline cell below; keep the two lists in sync.
required_cols = numerical_cols + ['Embarked', 'Sex']
train_df  = train_df.dropna(subset=required_cols + label_col)
test_df   = test_df.dropna(subset=required_cols)

In [8]:
categorical_cols = ['Embarked', 'Sex']

# One StringIndexer per categorical column, writing to "<column>_index".
# handleInvalid='error' makes the indexer raise instead of skipping bad values.
indexers = [
    StringIndexer(inputCol=col_name,
                  outputCol=col_name + "_index",
                  handleInvalid='error')
    for col_name in categorical_cols
]

# The feature vector is the numeric columns followed by the indexed categoricals.
indexed_cols = [col_name + "_index" for col_name in categorical_cols]
assembler_inputs = numerical_cols + indexed_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# Indexers must run before the assembler, which reads their output columns.
stages = indexers + [assembler]

## Model Selection

In [10]:
# Bundle the indexing and assembling stages into a single Pipeline.
pipeline = Pipeline(stages=stages)

# Fit on the training frame only; the fitted model is then reused unchanged
# for the test frame so both share the same string-to-index mappings.
pipelineModel = pipeline.fit(train_df)

# Apply the fitted stages to both frames (train first, then test).
train, test = (pipelineModel.transform(frame) for frame in (train_df, test_df))

In [11]:
# Preview the first 10 transformed rows, including the "*_index" columns and
# the assembled "features" vector.
train.show(n=10)

In [12]:
# Split the labelled, transformed rows into training and validation sets
# using 0.8 / 0.2 weights; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train_data, val_data = train.randomSplit(split_weights, seed=1234)

### Logistic regression

In [14]:
# Hyper-parameter grid for logistic regression.
regParam        = [0.1, 0.5, 2.0]
elasticNetParam = [0.0,  0.5, 1.0]
maxIter         = [10, 50, 100]

# Cartesian product: one (regParam, elasticNetParam, maxIter) tuple per experiment.
experiments     = list(itertools.product(regParam, elasticNetParam, maxIter))

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(experiments))

In [15]:
# Grid-search logistic regression: fit one model per tuple in `experiments`
# on train_data and score it on val_data with BinaryClassificationEvaluator.

# The evaluator does not depend on the hyper-parameters, so build it once.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Survived')

best_lr_auc    = None
best_lr_params = None
best_lrModel   = None

for ind, experiment in enumerate(experiments):
    # Unpack into fresh names so the grid lists regParam / elasticNetParam /
    # maxIter defined in the previous cell are not overwritten with scalars.
    reg_param, elastic_net_param, max_iter = experiment

    start_time = time.time()
    print(ind)
    print('params:  %s %s %s' % (reg_param, elastic_net_param, max_iter))

    lr = LogisticRegression(labelCol="Survived",
                            featuresCol="features",
                            regParam=reg_param,
                            elasticNetParam=elastic_net_param,
                            maxIter=max_iter
                            )

    # Train model with Training Data
    candidate_model = lr.fit(train_data)

    # Make predictions on validation data using the transform() method.
    predictions = candidate_model.transform(val_data)

    # evaluate predictions
    auc = evaluator.evaluate(predictions)

    # Remember the best-scoring model so that later cells do not silently use
    # whichever experiment happened to run last.
    if best_lr_auc is None or auc > best_lr_auc:
        best_lr_auc, best_lr_params, best_lrModel = auc, experiment, candidate_model

    print('AUC:  %s' % auc)
    print("--- %s seconds ---" % (time.time() - start_time))
    print('')

# Expose the best model under the name the following cell expects.
if best_lrModel is not None:
    lrModel = best_lrModel
    print('best params:  %s  AUC:  %s' % (best_lr_params, best_lr_auc))

In [16]:
# Score the transformed test frame with the logistic regression model.
# NOTE(review): lrModel is whatever the grid-search cell above left bound to
# that name -- check that it is the model you intend to submit.
predictions = lrModel.transform(test)
# Export step, currently disabled:
# predictions.select('PassengerId', 'prediction').coalesce(1).write.csv('result.csv')

### Decision Tree Classifier

In [18]:
# Hyper-parameter grid for the decision tree classifier.
maxDepth = [15, 30]
maxBins  = [10, 60, 80]

# Cartesian product: one (maxDepth, maxBins) tuple per experiment.
experiments     = list(itertools.product(maxDepth, maxBins))

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(experiments))

In [19]:
# Grid-search a decision tree classifier: fit one model per tuple in
# `experiments` on train_data and score it on val_data.

# The evaluator does not depend on the hyper-parameters, so build it once.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Survived')

best_dt_auc    = None
best_dt_params = None
best_dtModel   = None

for ind, experiment in enumerate(experiments):
    # Unpack into fresh names so the grid lists maxDepth / maxBins defined in
    # the previous cell are not overwritten with scalars. Unpacking also fails
    # fast if `experiments` still holds tuples from a different model's grid.
    max_depth, max_bins = experiment

    start_time = time.time()
    print(ind)
    print('params:  %s %s' % (max_depth, max_bins))

    # Create initial Decision Tree Model
    dt = DecisionTreeClassifier(labelCol="Survived", featuresCol="features", maxDepth=max_depth, maxBins=max_bins)

    # Train model with Training Data
    candidate_model = dt.fit(train_data)

    # Make predictions on validation data using the fitted tree's transform().
    predictions = candidate_model.transform(val_data)

    # evaluate predictions
    auc = evaluator.evaluate(predictions)

    # Remember the best-scoring model instead of keeping only the last one.
    if best_dt_auc is None or auc > best_dt_auc:
        best_dt_auc, best_dt_params, best_dtModel = auc, experiment, candidate_model

    print('AUC:  %s' % auc)
    print("--- %s seconds ---" % (time.time() - start_time))
    print('')

# Leave the best model bound to dtModel for any later cell that uses it.
if best_dtModel is not None:
    dtModel = best_dtModel
    print('best params:  %s  AUC:  %s' % (best_dt_params, best_dt_auc))

### Random Forest

In [21]:
# Hyper-parameter grid for the random forest classifier.
numTrees         = [5, 10, 50, 100]
subsamplingRate  = [0.8]
maxDepth         = [10, 15]

# Cartesian product: one (numTrees, maxDepth, subsamplingRate) tuple per experiment.
experiments      = list(itertools.product(numTrees, maxDepth, subsamplingRate))

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(experiments))

In [22]:
# Grid-search a random forest classifier: fit one model per tuple in
# `experiments` on train_data and score it on val_data.

# The evaluator does not depend on the hyper-parameters, so build it once.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Survived')

best_rf_auc    = None
best_rf_params = None
best_rfModel   = None

for ind, experiment in enumerate(experiments):
    # Unpack into fresh names so the grid lists numTrees / maxDepth /
    # subsamplingRate defined in the previous cell are not overwritten with
    # scalars. Unpacking also fails fast if `experiments` still holds tuples
    # from a different model's grid.
    num_trees, max_depth, subsampling_rate = experiment

    start_time = time.time()
    print(ind)
    print('params:  %s %s %s' % (num_trees, max_depth, subsampling_rate))

    # Create an initial RandomForest model.
    rf = RandomForestClassifier(labelCol="Survived", featuresCol="features",
                                numTrees=num_trees,
                                maxDepth=max_depth,
                                subsamplingRate=subsampling_rate)

    # Train model with Training Data
    candidate_model = rf.fit(train_data)

    # Make predictions on validation data using the fitted forest's transform().
    predictions = candidate_model.transform(val_data)

    # evaluate predictions
    auc = evaluator.evaluate(predictions)

    # Remember the best-scoring model instead of keeping only the last one.
    if best_rf_auc is None or auc > best_rf_auc:
        best_rf_auc, best_rf_params, best_rfModel = auc, experiment, candidate_model

    print('AUC:  %s' % auc)
    print("--- %s seconds ---" % (time.time() - start_time))
    print('')

# Leave the best model bound to rfModel for any later cell that uses it.
if best_rfModel is not None:
    rfModel = best_rfModel
    print('best params:  %s  AUC:  %s' % (best_rf_params, best_rf_auc))