In [1]:
%pylab inline

import numpy as np
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions
from pyspark.ml.feature import VectorAssembler

Populating the interactive namespace from numpy and matplotlib


## Intitialize SQLContext


In [2]:
train_data_df = (sqlContext
                 .read
                 .format('csv')
                 .options(header='true', inferSchema='true')
                 .load('./data/train.csv'))
train_data_df.cache()
train_data_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



## Data Exploration

In [3]:
train_data_df # Same as display(train_data_df) 

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

## Data Cleaning


In [4]:
explore_age_df = train_data_df.orderBy('Age', ascending=True)

avg_age_df = explore_age_df.where(explore_age_df['Age'].isNotNull()).groupBy('Pclass').avg('Age')
avg_age_df = avg_age_df.select('Pclass', avg_age_df['avg(Age)'].alias('Age'))
avg_age_df

DataFrame[Pclass: int, Age: double]

## Fill missing Age values with the average age per class

In [5]:

avg_age_list = avg_age_df.collect()

# Replace null values with the average age values from our passenger class list
data_with_age_df = (train_data_df
                     .select('*', 
                             when(train_data_df['Age'].isNull() & (train_data_df['Pclass'] == 1), 
                                  avg_age_list[0].Age)
                             .otherwise(when(train_data_df['Age'].isNull() & (train_data_df['Pclass'] == 2), 
                                             avg_age_list[1].Age)
                                        .otherwise(when(train_data_df['Age'].isNull() & (train_data_df['Pclass'] == 3), 
                                                        avg_age_list[2].Age)
                                                   .otherwise(col('Age')))).alias('FilledAge')))

# Replace the Age column values with those from our FilledAge column and then drop FilledAge.
data_with_age_df = data_with_age_df.withColumn('Age', data_with_age_df['FilledAge']).drop('FilledAge')

## Index Sex

In [6]:

def sex_to_int(sex):
  if(sex.lower() == 'male'):
    return 0
  else:
    return 1
sex_classify = functions.udf(sex_to_int, IntegerType())
sex_int_df = data_with_age_df.select('*', sex_classify(data_with_age_df['Sex']).alias('IntSex'))
data_sex_indexed_df = sex_int_df.withColumn('Sex', sex_int_df['IntSex']).drop('IntSex').cache()

data_sex_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

## Index Cabin

In [7]:
def cabin_to_int(cabin):
    if cabin:
        return ord(cabin[0])-ord('A')+1 #A:1; B:2; C:3; D:4; None:0
    else:
        return 0
cabin_classify = functions.udf(cabin_to_int, IntegerType())

cabin_int_df = data_sex_indexed_df.select('*', cabin_classify(data_sex_indexed_df['Cabin']).alias('IntCabin'))
data_cabin_indexed_df = cabin_int_df.withColumn('Cabin', cabin_int_df['IntCabin']).drop('IntCabin').cache()

data_cabin_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: string]

##  Index Embarked

In [8]:
def embarked_to_int(embarked):
    if embarked:
        return ord(embarked) #TBD
    else:
        return 0
embarked_classify = functions.udf(embarked_to_int, IntegerType())

embarked_int_df = data_cabin_indexed_df.select('*', embarked_classify(data_sex_indexed_df['Embarked']).alias('IntEmbarked'))
data_embarked_indexed_df = embarked_int_df.withColumn('Embarked', embarked_int_df['IntEmbarked']).drop('IntEmbarked').cache()

data_embarked_indexed_df

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: int, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: int, Embarked: int]

## Machine Learning
### Prepare data

In [9]:
assembler = VectorAssembler(
    inputCols=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'], outputCol='features'
)

In [10]:
data = assembler.transform(data_embarked_indexed_df).select(col('PassengerId'),col('Survived').alias('label'),'features')

In [11]:
splits = data.randomSplit([0.8, 0.2])
train = splits[0].cache() #significant ~30% improvement to fitting
test = splits[1].cache()
train, test

(DataFrame[PassengerId: int, label: int, features: vector],
 DataFrame[PassengerId: int, label: int, features: vector])

### Compare various classifiers

In [12]:
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

for classifier in (NaiveBayes, LogisticRegression, RandomForestClassifier, MultilayerPerceptronClassifier): # '[]' won't work
    
    if classifier != MultilayerPerceptronClassifier:
        model = classifier()
    else:
        #Number of inputs = the size of feature vectors. Number of outputs = the total number of labels.
        features_size = data.select("features").first()[0].size
        model = classifier(layers=[features_size,10,2]) 
    model_trained = model.fit(train)

    test_predicted = model_trained.transform(test)

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction") 
    accuracy = evaluator.evaluate(test_predicted, {evaluator.metricName: "accuracy"}) # f1|weightedPrecision|weightedRecall|accuracy
    print(""+classifier.__name__.ljust(30) + '\t' + str(accuracy))
#     test_predicted.show(10)

#     print('Wrong predictions for error analysis')
#     test_predicted.filter(test_predicted['prediction'] != test_predicted['label']).show(5)

NaiveBayes                    	0.689024390244
LogisticRegression            	0.817073170732
RandomForestClassifier        	0.835365853659
MultilayerPerceptronClassifier	0.682926829268


### Proceed with Random Forest

In [13]:
rf_model = RandomForestClassifier()
rf_model_trained = rf_model.fit(train)
rf_test_predicted = rf_model_trained.transform(test)
rf_test_predicted.show(5)
# rf_test_predicted.count()

+-----------+-----+--------------------+--------------------+--------------------+----------+
|PassengerId|label|            features|       rawPrediction|         probability|prediction|
+-----------+-----+--------------------+--------------------+--------------------+----------+
|          8|    0|[3.0,0.0,2.0,3.0,...|[13.1683130699088...|[0.65841565349544...|       0.0|
|         27|    0|[3.0,0.0,29.87763...|[17.9464697911859...|[0.89732348955929...|       0.0|
|         30|    0|[3.0,0.0,29.87763...|[17.9687937434503...|[0.89843968717251...|       0.0|
|         36|    0|[1.0,0.0,42.0,1.0...|[11.6113412997694...|[0.58056706498847...|       0.0|
|         52|    0|[3.0,0.0,21.0,0.0...|[17.2287344609829...|[0.86143672304914...|       0.0|
+-----------+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [22]:
def extract_survival_prob(probability):
    return probability.values[1].item() 

extract_survival_prob_udf = functions.udf(extract_survival_prob, FloatType())

survival_prob_udf_df = rf_test_predicted.select('*', extract_survival_prob_udf(rf_test_predicted['probability']).alias('survival_prob_udf'))
survival_prob_df = survival_prob_udf_df.withColumn('probability', survival_prob_udf_df['survival_prob_udf']).drop('survival_prob_udf').cache()
survival_prob_df

DataFrame[PassengerId: int, label: int, features: vector, rawPrediction: vector, probability: float, prediction: double]

In [23]:
likely_survivors_df = survival_prob_df.filter("probability > 0.8")
likely_survivors_df.show(5)

+-----------+-----+--------------------+--------------------+-----------+----------+
|PassengerId|label|            features|       rawPrediction|probability|prediction|
+-----------+-----+--------------------+--------------------+-----------+----------+
|         59|    1|[2.0,1.0,5.0,1.0,...|[1.58862351008527...|  0.9205688|       1.0|
|         99|    1|[2.0,1.0,34.0,0.0...|[1.88721744848173...|  0.9056391|       1.0|
|        137|    1|[1.0,1.0,19.0,0.0...|[1.83923146069322...| 0.90803844|       1.0|
|        167|    1|[1.0,1.0,38.23344...|[0.99442140724819...| 0.95027894|       1.0|
|        238|    1|[2.0,1.0,8.0,0.0,...|[1.48862351008527...|  0.9255688|       1.0|
+-----------+-----+--------------------+--------------------+-----------+----------+
only showing top 5 rows



In [21]:
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture().setK(2).setSeed(538009335)
gmm_model = gmm.fit(likely_survivors_df)

print("Gaussians shown as a DataFrame: ")
gmm_model.gaussiansDF.show(truncate=False)

IllegalArgumentException: u'requirement failed: Column prediction already exists.'