In [1]:
%pylab inline

import numpy as np
# from pyspark import SparkConf

# import plotly 
# import plotly.plotly as py
# from plotly.graph_objs import Layout
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# plotly.offline.init_notebook_mode()

Populating the interactive namespace from numpy and matplotlib


## Initialize SQLContext


In [2]:
# Configure a CSV reader that treats the first row as a header and infers column types.
csv_reader = (sqlContext.read
              .format('csv')
              .option('header', 'true')
              .option('inferSchema', 'true'))

# Load the training set, mark it for caching (several later cells reuse it),
# and print the inferred schema.
train_data_df = csv_reader.load('./data/train.csv')
train_data_df.cache()
train_data_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



## Data Exploration

In [3]:
# Summary statistics for every column; the per-column 'count' row reveals
# which columns contain missing values.
summary_df = train_data_df.describe()
summary_df.show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [4]:
train_data_df
# A bare expression on the last line of a cell is rendered as the cell output,
# same as display(train_data_df); here that is the DataFrame's column name/type summary.

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

## Data Cleaning
What is something that you noticed that could cause issues?
Hopefully, you noticed that some data objects (objects here are rows which represent a passenger) have a null age. This is going to cause some issues. How can we fix these non-values without affecting any population properties of the data? Let's use some [**domain knowledge**](http://www.simafore.com/blog/the-value-of-domain-knowledge-in-data-science) to do some more data exploration. Let's see if age follows any trends based off of class and/or ticket fare.

In [5]:
# Passengers sorted by age (youngest first).
explore_age_df = train_data_df.sort('Age')

# Mean age per passenger class, computed only over rows where Age is present.
known_age_df = explore_age_df.filter(explore_age_df.Age.isNotNull())
avg_age_df = known_age_df.groupBy('Pclass').agg({'Age': 'avg'})

# Give the aggregate column a plain name: columns become (Pclass, Age).
avg_age_df = avg_age_df.withColumnRenamed('avg(Age)', 'Age')
avg_age_df

DataFrame[Pclass: int, Age: double]

## Fill missing Age values with the average age per class

In [6]:
from pyspark.sql.functions import when, col

# groupBy() gives no guarantee about the order of the collected rows, so positional
# access (avg_age_list[0] == class 1, ...) can silently assign the wrong class average.
# Sort explicitly and key the averages by Pclass instead.
avg_age_list = avg_age_df.orderBy('Pclass').collect()
avg_age_by_class = {row['Pclass']: row['Age'] for row in avg_age_list}

# Build one expression that replaces a null Age with the average age of the
# passenger's class. Rows that already have an age (or whose class has no known
# average) keep their original Age value.
filled_age = col('Age')
for pclass, avg_age in avg_age_by_class.items():
    filled_age = (when(col('Age').isNull() & (col('Pclass') == pclass), avg_age)
                  .otherwise(filled_age))

# withColumn on an existing name replaces it in place, so the column order is unchanged.
data_with_age_df = train_data_df.withColumn('Age', filled_age)

## Index Sex

In [7]:
from pyspark.sql.types import IntegerType
from pyspark.sql import functions
def sex_to_int(sex):
  """Encode a passenger's sex as an integer.

  Args:
    sex: The sex label as a string (compared case-insensitively), or None
      when the value is missing.

  Returns:
    0 for 'male', 1 for any other string, and None for a missing value so
    that a null input stays null instead of raising AttributeError.
  """
  if sex is None:
    return None
  return 0 if sex.lower() == 'male' else 1
# Wrap sex_to_int as a Spark UDF that produces an integer column.
sex_classify = functions.udf(sex_to_int, IntegerType())

# Append the encoded values as a temporary IntSex column ...
int_sex_column = sex_classify(data_with_age_df['Sex']).alias('IntSex')
sex_int_df = data_with_age_df.select('*', int_sex_column)

# ... then overwrite Sex with it, drop the temporary column and cache the result.
data_sex_indexed_df = (
    sex_int_df
    .withColumn('Sex', sex_int_df['IntSex'])
    .drop('IntSex')
    .cache()
)

data_sex_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

## Index Cabin

In [8]:
def cabin_to_int(cabin):
    """Encode a cabin string by its first character.

    Returns 0 for a missing or empty cabin; otherwise the 1-based offset of the
    first character from 'A' (A -> 1, B -> 2, C -> 3, D -> 4, ...).
    """
    if not cabin:
        return 0
    deck_letter = cabin[0]
    return ord(deck_letter) - ord('A') + 1
# Wrap cabin_to_int as a Spark UDF that produces an integer column.
cabin_classify = functions.udf(cabin_to_int, IntegerType())

# Append the encoded values as a temporary IntCabin column ...
int_cabin_column = cabin_classify(data_sex_indexed_df['Cabin']).alias('IntCabin')
cabin_int_df = data_sex_indexed_df.select('*', int_cabin_column)

# ... then overwrite Cabin with it, drop the temporary column and cache the result.
data_cabin_indexed_df = (
    cabin_int_df
    .withColumn('Cabin', cabin_int_df['IntCabin'])
    .drop('IntCabin')
    .cache()
)

data_cabin_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: string]

##  Index Embarked

In [9]:
def embarked_to_int(embarked):
    """Encode the embarkation code as an integer.

    Returns 0 for a missing or empty value; otherwise the code point of the
    single-character code (ord() requires a string of length one).
    """
    if not embarked:
        return 0
    return ord(embarked)  # TBD: raw code point, not a compact index
# Wrap embarked_to_int as a Spark UDF that produces an integer column.
embarked_classify = functions.udf(embarked_to_int, IntegerType())

# Take the Embarked column from the DataFrame being selected on
# (data_cabin_indexed_df). Referencing the older data_sex_indexed_df only
# resolves because the two frames share lineage, and it breaks as soon as the
# cabin step is changed or reordered.
embarked_int_df = data_cabin_indexed_df.select(
    '*', embarked_classify(data_cabin_indexed_df['Embarked']).alias('IntEmbarked'))

# Overwrite Embarked with the encoded values, drop the temporary column and cache.
data_embarked_indexed_df = embarked_int_df.withColumn('Embarked', embarked_int_df['IntEmbarked']).drop('IntEmbarked').cache()

data_embarked_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: int]

## Machine Learning
### Prepare data

In [66]:
from pyspark.ml.feature import VectorAssembler
# Combine the chosen feature columns into the single vector column that the
# Spark ML estimators consume.
assembler = VectorAssembler(
    inputCols=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'], outputCol='features'
)

# Keep only the id, the target (renamed to 'label') and the assembled features.
data = assembler.transform(data_embarked_indexed_df).select(col('PassengerId'),col('Survived').alias('label'),'features')

# Fix the seed so the 80/20 split, and therefore every accuracy reported below,
# is the same on each run; an unseeded split makes the numbers drift between runs.
splits = data.randomSplit([0.8, 0.2], seed=42)
train = splits[0].cache() #significant ~30% improvement to fitting
test = splits[1].cache()
train, test

(DataFrame[PassengerId: int, label: int, features: vector],
 DataFrame[PassengerId: int, label: int, features: vector])

## Questions 4
### build at least two of {Naïve Bayes, Logistic Regression, random forests, support vector machines, neural networks} using the libraries of Spark.MLLib only. 
Please refer to the code below.
### Explain your choice; 
### plot learning curves; 
### explain observed behavior; 
### investigate which features are most informative; 

As the baseline, the following features are chosen based on common sense ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']. 

Then the individual features are removed to measure the (negative) impact to the prediction performance.



**Accuracy**|**Baseline**|**-'Pclass'**|**- 'Sex'**|**- 'Age'**|**-'SibSp'**|**-'Parch'**|**-'Fare'**|**-'Embarked'**
:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:
NaiveBayes                    |0.72826087|0.710382514|0.646153846|0.674157303|0.683544304|0.67816092|0.803108808|0.709677419
LogisticRegression            |0.804347826|0.808743169|0.687179487|0.792134831|0.797468354|0.827586207|0.808290155|0.779569892
RandomForestClassifier        |0.831521739|0.825136612|0.733333333|0.814606742|0.816455696|0.844827586|0.808290155|0.790322581
MultilayerPerceptronClassifier|0.809782609|0.759562842|0.687179487|0.837078652|0.689873418|0.643678161|0.792746114|0.784946237


Based on the results above, below is the ranking of the most informative features:

* Sex
* Age
* Pclass
* SibSp
* Embarked
* Parch
* Fare






### do at least one round of error analysis to maximize your chosen metric (F1, accuracy, weighted F1); 

The prediction accuracy using features: 

    Features: 'Cabin','Sex','Age','SibSp','Parch','Fare','Embarked'
    NaiveBayes                    	0.710382513661
    LogisticRegression            	0.808743169399
    RandomForestClassifier        	0.825136612022
    MultilayerPerceptronClassifier	0.75956284153

Replacing Cabin with Pclass improves the accuracy of every model except LogisticRegression, which drops slightly:

    Features: 'Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'
    NaiveBayes                    	0.728260869565
    LogisticRegression            	0.804347826087
    RandomForestClassifier        	0.83152173913
    MultilayerPerceptronClassifier	0.809782608696

### explain your choice of metric.
Accuracy is the number of correct predictions divided by the total number of predictions made. It is chosen because it is a good indicator of prediction performance and correlates well with the F1/weighted F1 score.

In [67]:
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fixed seed for the estimators that expose one, so accuracies are repeatable across runs.
MODEL_SEED = 42

# Number of MLP inputs = the size of the feature vectors. This does not depend on
# the classifier, so compute it once instead of inside the loop.
features_size = data.select("features").first()[0].size

# A single evaluator serves every model.
# Other metricName options: f1|weightedPrecision|weightedRecall
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

for classifier in (NaiveBayes, LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier):

    if classifier is MultilayerPerceptronClassifier:
        # layers = [inputs, hidden units, outputs]; outputs = the total number of labels (2).
        model = classifier(layers=[features_size, 10, 2], seed=MODEL_SEED)
    elif classifier is RandomForestClassifier:
        model = classifier(seed=MODEL_SEED)
    else:
        model = classifier()

    # Fit on the training split and score the held-out test split.
    model_trained = model.fit(train)
    test_predicted = model_trained.transform(test)
    #test_predicted.show(10)

    accuracy = evaluator.evaluate(test_predicted)
    print(classifier.__name__.ljust(30) + '\t' + str(accuracy))

#     print('Wrong predictions for error analysis')
#     test_predicted.filter(test_predicted['prediction'] != test_predicted['label']).show(5)

NaiveBayes                    	0.632768361582
LogisticRegression            	0.80790960452
RandomForestClassifier        	0.80790960452
MultilayerPerceptronClassifier	0.655367231638


## References