<br><br><br><br><br><h1 style="font-size:2em;color:#2467C0">Predict survival on the Titanic and get familiar with ML basics</h1><br><br><br>

In [139]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
import pyspark.sql.functions as F

In [140]:
sqlContext = SQLContext(sc)

# Single place to edit when the Kaggle files live somewhere else.
DATA_DIR = 'file:///home/vdnguyen/kaggle/kaggle-titanic/data'


def loadCsv(fileName):
    """Read ``DATA_DIR/fileName`` into a DataFrame.

    The first line of the file is treated as the header and column types
    are inferred from the data.
    """
    return sqlContext.read.load(DATA_DIR + '/' + fileName,
                                format='com.databricks.spark.csv',
                                header='true', inferSchema='true')


df = loadCsv('train.csv')                    # labelled training passengers
testDF = loadCsv('test.csv')                 # passengers to score
resultDF = loadCsv('gender_submission.csv')  # joined with the predictions in the final cell
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [141]:
# Number of rows in the training DataFrame.
df.count()

891

In [142]:
# Print the column names, inferred types and nullability of the training DataFrame.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [143]:
def addSexInt(frame):
    """Return `frame` with an integer `SexInt` column derived from `Sex`.

    `SexInt` is 1 when the substring 'female' does not occur in `Sex`
    (instr returns 0) and 0 when it does.
    """
    lacksFemale = F.instr(frame['Sex'], 'female') == 0
    return frame.withColumn('SexInt', lacksFemale.cast('int'))


# Apply the same encoding to the training and the test frame.
newDF = addSexInt(df)
newTestDF = addSexInt(testDF)
newDF.printSchema()
newDF.show(5)

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexInt: integer (nullable = true)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|SexInt|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|     1|
|         

In [144]:
# Summary statistics (count, mean, stddev, min, max) collected to pandas and
# flipped so that each source column becomes one row of the displayed table.
summaryPdf = df.describe().toPandas()
summaryPdf.T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
PassengerId,891,446.0,257.3538420152301,1,891
Survived,891,0.3838383838383838,0.48659245426485753,0,1
Pclass,891,2.308641975308642,0.8360712409770491,1,3
Name,891,,,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""","van Melkebeke, Mr. Philemon"
Sex,891,,,female,male
Age,714,29.69911764705882,14.526497332334035,0.42,80.0
SibSp,891,0.5230078563411896,1.1027434322934315,0,8
Parch,891,0.38159371492704824,0.8060572211299488,0,6
Ticket,891,260318.54916792738,471609.26868834975,110152,WE/P 5735


In [145]:
# Columns fed to the model, in the order they appear in the feature vector.
featureColumns = [
    'Pclass',
    'SexInt',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [146]:
# Remove columns that are not used as features. PassengerId stays on the test
# frame because the final cell selects it alongside the prediction.
newDF = newDF.drop('Cabin', 'Ticket', 'PassengerId')
newTestDF = newTestDF.drop('Cabin', 'Ticket')

# Training rows containing a null in any remaining column are discarded.
newDF = newDF.na.drop()

# Test rows must NOT be discarded for a missing value: a dropped passenger gets
# no prediction and then silently disappears in the inner join of the final
# cell. Age is missing for part of the training file (714 of 891 present), so
# the test file presumably has gaps too -- fill Age/Fare with the training means.
imputeCols = ['Age', 'Fare']
imputeValues = newDF.agg(*[F.mean(name).alias(name) for name in imputeCols]).first().asDict()
newTestDF = newTestDF.na.fill(imputeValues)

# Safety net: the remaining feature columns are not imputed, so only rows
# still lacking one of them are removed.
newTestDF = newTestDF.na.drop(subset=featureColumns)

newDF.count(), len(newDF.columns)

(712, 10)

In [147]:
# Pack the columns listed in featureColumns into one "features" vector column,
# using the same assembler for the training and the test frame.
assembler = VectorAssembler(outputCol="features", inputCols=featureColumns)
assembled, testAssembled = [assembler.transform(frame) for frame in (newDF, newTestDF)]

# Preview the first rows of each assembled frame.
for preview in (assembled, testAssembled):
    preview.show(5)

+--------+------+--------------------+------+----+-----+-----+-------+--------+------+--------------------+
|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|SexInt|            features|
+--------+------+--------------------+------+----+-----+-----+-------+--------+------+--------------------+
|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|     1|[3.0,1.0,22.0,1.0...|
|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|     0|[1.0,0.0,38.0,1.0...|
|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|     0|[3.0,0.0,26.0,0.0...|
|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|     0|[1.0,0.0,35.0,1.0...|
|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|     1|[3.0,1.0,35.0,0.0...|
+--------+------+--------------------+------+----+-----+-----+-------+--------+------+--------------------+
only showing top 5 rows

+--

In [148]:
# Decision tree that learns "Survived" from the assembled "features" vector.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Survived",
    impurity="gini",
    maxDepth=5,
    minInstancesPerNode=20,
)

In [149]:
# The pipeline has a single stage: the decision tree classifier.
stages = [dt]
pipeline = Pipeline(stages=stages)

# Fit on the assembled training frame.
model = pipeline.fit(assembled)

In [150]:
# Score the assembled test frame with the fitted pipeline model.
predictions = model.transform(testAssembled)

In [151]:
# Inspect the columns of the scored test frame, including those added by the model.
predictions.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexInt: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [153]:
# Keep only the passenger id and the predicted class of each scored row.
extractedPredictions = predictions.select("PassengerId", "prediction")

# Inner join on PassengerId: only passengers present in both frames are kept,
# and the result carries resultDF's columns next to the model's prediction.
output = resultDF.join(extractedPredictions, ['PassengerId'], "inner")

# mode='overwrite' replaces a previous result; the default save mode raises
# when the path already exists, which made this cell fail on every re-run.
output.write.save(path='file:///home/vdnguyen/kaggle/kaggle-titanic/data/result.csv',
                  format='com.databricks.spark.csv',
                  header='true',
                  mode='overwrite')