In [1]:
%pylab inline

import numpy as np
# from pyspark import SparkConf

# import plotly 
# import plotly.plotly as py
# from plotly.graph_objs import Layout
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# plotly.offline.init_notebook_mode()

Populating the interactive namespace from numpy and matplotlib


##### SQLContext


In [2]:
# Load the training CSV through the session's SQLContext. The first line of
# the file is treated as a header and column types are inferred from the data.
train_data_df = (sqlContext.read
                 .format('csv')
                 .option('header', 'true')
                 .option('inferSchema', 'true')
                 .load('./data/train.csv'))

# Mark the DataFrame for caching (later cells reuse it), then print the inferred schema.
train_data_df.cache()
train_data_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



#### Data Exploration

In [3]:
# Summary statistics for every column; the 'count' row exposes columns with missing values.
train_data_df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [4]:
# A bare DataFrame as the last expression of a cell shows its repr
# (column names and types), not the rows themselves.
train_data_df
# Same as display(train_data_df)

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

#### Data Cleaning
Did you notice anything that could cause issues?
Hopefully, you noticed that some rows (each row represents one passenger) have a null age: `describe()` reports only 714 non-null `Age` values out of 891 rows. These nulls are going to cause problems later on. How can we fill in the missing values without distorting the overall properties of the data? Let's use some [**domain knowledge**](http://www.simafore.com/blog/the-value-of-domain-knowledge-in-data-science) to do some more data exploration. Let's see if age follows any trends based on passenger class and/or ticket fare.

In [5]:
explore_age_df = train_data_df.orderBy('Age', ascending=True)

# Mean age per passenger class, computed only over rows whose Age is known.
avg_age_df = (explore_age_df
              .where(explore_age_df['Age'].isNotNull())
              .groupBy('Pclass')
              .avg('Age'))

# Rename the aggregate column to 'Age' and sort by Pclass. groupBy() gives no
# ordering guarantee, so the explicit sort makes the row order deterministic
# (ascending Pclass) whenever this DataFrame is shown or collected.
avg_age_df = (avg_age_df
              .select('Pclass', avg_age_df['avg(Age)'].alias('Age'))
              .orderBy('Pclass'))
avg_age_df

DataFrame[Pclass: int, Age: double]

###### Average Age vs. Class
We've barely done any exploration, and we can already see a possible correlation between a passenger's age and their passenger class. We could simply use these per-class averages to fill in the missing ages, or we could keep looking for better predictors of age to reduce the bias that this kind of imputation introduces.

##### Fill missing Age values with the average age per class

In [7]:
from pyspark.sql.functions import when, col

avg_age_list = avg_age_df.collect()

# Key the per-class averages by Pclass instead of by position in the collected
# list: the rows of a groupBy() result come back in no guaranteed order, so
# avg_age_list[0] is not necessarily first class.
avg_age_by_class = {row['Pclass']: row['Age'] for row in avg_age_list}

# Build a chained CASE WHEN expression: a null Age is replaced by the average
# age of the passenger's class; a known Age (or a class with no computed
# average) is left as it is.
filled_age = col('Age')
for pclass, avg_age in avg_age_by_class.items():
    filled_age = (when(col('Age').isNull() & (col('Pclass') == pclass), avg_age)
                  .otherwise(filled_age))

# withColumn() on an existing column name replaces it in place, so the column
# order of the DataFrame is unchanged and no temporary helper column is needed.
data_with_age_df = train_data_df.withColumn('Age', filled_age)

###### Index Sex

In [8]:
from pyspark.sql.types import IntegerType
from pyspark.sql import functions
def sex_to_int(sex):
    """Encode a passenger's sex as an integer: 0 for 'male', 1 for anything else.

    The comparison is case-insensitive. A missing value (None) is returned as
    None instead of raising AttributeError on ``None.lower()``, so a null cell
    stays null rather than failing the whole UDF evaluation.
    """
    if sex is None:
        return None
    return 0 if sex.lower() == 'male' else 1
# Wrap sex_to_int as a Spark UDF producing an integer column.
sex_classify = functions.udf(sex_to_int, IntegerType())

# Append the encoded value as a temporary 'IntSex' column ...
sex_int_df = data_with_age_df.withColumn('IntSex', sex_classify(data_with_age_df['Sex']))

# ... then overwrite 'Sex' with it, drop the helper column and cache the result.
data_sex_indexed_df = (sex_int_df
                       .withColumn('Sex', sex_int_df['IntSex'])
                       .drop('IntSex')
                       .cache())

data_sex_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

###### Index Cabin

In [9]:
def cabin_to_int(cabin):
    """Encode a cabin string by its first character.

    Returns the 1-based offset of the first character from 'A'
    (A -> 1, B -> 2, C -> 3, D -> 4, ...), or 0 when the cabin is
    missing (None) or empty.
    """
    if not cabin:
        return 0
    first_char = cabin[0]
    return ord(first_char) - ord('A') + 1
# Wrap cabin_to_int as a Spark UDF producing an integer column.
cabin_classify = functions.udf(cabin_to_int, IntegerType())

# Append the encoded value as a temporary 'IntCabin' column ...
cabin_int_df = data_sex_indexed_df.withColumn(
    'IntCabin', cabin_classify(data_sex_indexed_df['Cabin']))

# ... then overwrite 'Cabin' with it, drop the helper column and cache the result.
data_cabin_indexed_df = (cabin_int_df
                         .withColumn('Cabin', cabin_int_df['IntCabin'])
                         .drop('IntCabin')
                         .cache())

data_cabin_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: string]

###### Index Embarked

In [10]:
def embarked_to_int(embarked):
    """Encode the port of embarkation as an integer.

    Returns the code point of the first character of ``embarked``
    (e.g. 'S' -> 83), or 0 when the value is missing (None) or empty.
    Only the first character is used, so a value longer than one character
    no longer makes ``ord()`` raise TypeError.
    """
    if not embarked:
        return 0
    # TODO: the raw code point is a placeholder encoding (marked TBD originally).
    return ord(embarked[0])
# Wrap embarked_to_int as a Spark UDF producing an integer column.
embarked_classify = functions.udf(embarked_to_int, IntegerType())

# Append the encoded value as a temporary 'IntEmbarked' column. The input
# column is taken from data_cabin_indexed_df, the DataFrame being selected
# from, rather than from an earlier DataFrame in the pipeline.
embarked_int_df = data_cabin_indexed_df.select(
    '*',
    embarked_classify(data_cabin_indexed_df['Embarked']).alias('IntEmbarked'))

# Overwrite 'Embarked' with the encoded value, drop the helper column and cache the result.
data_embarked_indexed_df = (embarked_int_df
                            .withColumn('Embarked', embarked_int_df['IntEmbarked'])
                            .drop('IntEmbarked')
                            .cache())

data_embarked_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: int]

In [11]:
# Show the column names and types of the fully indexed DataFrame once more before modelling.
data_embarked_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: int]

## Machine Learning

In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'],
)

# Keep only what the classifier needs: 'Survived' renamed to 'label', plus the feature vector.
training = (assembler
            .transform(data_embarked_indexed_df)
            .select(col('Survived').alias('label'), 'features'))
training

DataFrame[label: int, features: vector]

## Naive Bayes

In [13]:
from pyspark.ml.classification import NaiveBayes

# Fit a Naive Bayes classifier on the assembled training set. The column names
# are the estimator's defaults, spelled out here for readability.
nb = NaiveBayes(featuresCol='features', labelCol='label')
model1 = nb.fit(training)
