In [29]:
# check spark context
sc

In [None]:
# import sys
# !{sys.executable} -m pip install pandas

In [30]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit, isnan, count, isnull
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

In [31]:
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, \
                                        RandomForestClassificationModel, GBTClassifier, LinearSVC, NaiveBayes

# Read and Inspect Dataframe

In [32]:
# read dataframe into distributed memory
train_path = "gs://kueppers/titanic/train_titanic.csv"
titanic_df = spark.read.csv(train_path,header = 'True',inferSchema='True')

In [34]:
titanic_df.toPandas().head(5)  # not recommended
titanic_df.limit(5).toPandas()  # push as much logic as possible to spark

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
display(titanic_df)

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

In [37]:
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [38]:
titanic_df.count()

891

In [40]:
# selecting columns, retrieve only the head
titanic_df.select("Survived", "Pclass", "Embarked").limit(10).toPandas()

Unnamed: 0,Survived,Pclass,Embarked
0,0,3,S
1,1,1,C
2,1,3,S
3,1,1,S
4,0,3,S
5,0,3,Q
6,0,1,S
7,0,3,S
8,1,3,S
9,1,2,C


# EDA
## Target Understanding

In [41]:
titanic_df.groupBy("Survived").count().toPandas()

Unnamed: 0,Survived,count
0,1,342
1,0,549


In [42]:
titanic_df.groupBy("Sex", "Survived").count().toPandas()

Unnamed: 0,Sex,Survived,count
0,male,0,468
1,female,1,233
2,female,0,81
3,male,1,109


In [43]:
# distinct values
titanic_df.select("Survived").distinct().toPandas()

Unnamed: 0,Survived
0,1
1,0


## Null Values

In [44]:
# check single column for NaNs
titanic_df.where(col("age").isNull()).count()

177

In [45]:
# check all columns for Null Values
titanic_df.select([count(when(isnull(c), c)).alias(c) for c in titanic_df.columns]).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,0,0,177,0,0,0,0,687,2


In [46]:
# idea of this check:
print(titanic_df.columns)
c = titanic_df.columns[5]
titanic_df.select(when(isnan(c), c)).count()

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


891

## Pre-processing
### Imputation of Age based on the Name Feature (Initials)

In [47]:
titanic_df.select("name").limit(10).toPandas()

Unnamed: 0,name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
5,"Moran, Mr. James"
6,"McCarthy, Mr. Timothy J"
7,"Palsson, Master. Gosta Leonard"
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
9,"Nasser, Mrs. Nicholas (Adele Achem)"


In [48]:
# Extract the initial and add as column
titanic_df = titanic_df.withColumn("Initial", 
                                   regexp_extract(col("Name"),
                                                  "([A-Za-z]+)\.",
                                                  1)
                                  )

In [49]:
titanic_df.limit(3).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Initial
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss


In [50]:
titanic_df.select("Initial").distinct().limit(10).toPandas()

Unnamed: 0,Initial
0,Don
1,Miss
2,Countess
3,Col
4,Rev
5,Lady
6,Master
7,Mme
8,Capt
9,Mr


In [51]:
# average age per initial
titanic_df.groupby('Initial').avg('Age').toPandas()

Unnamed: 0,Initial,avg(Age)
0,Don,40.0
1,Miss,21.773973
2,Countess,33.0
3,Col,58.0
4,Rev,43.166667
5,Lady,48.0
6,Master,4.574167
7,Mme,24.0
8,Capt,70.0
9,Mr,32.36809


In [52]:
titanic_df.where(col("Age").isNull()).count()

177

In [53]:
# we will do the imputation via pandas / python since this is not large data
avg_per_initial = titanic_df.groupby('Initial').avg('Age').toPandas()
avg_per_initial = avg_per_initial.set_index('Initial')
avg_per_initial

Unnamed: 0_level_0,avg(Age)
Initial,Unnamed: 1_level_1
Don,40.0
Miss,21.773973
Countess,33.0
Col,58.0
Rev,43.166667
Lady,48.0
Master,4.574167
Mme,24.0
Capt,70.0
Mr,32.36809


In [54]:
# initials (only where missing)
missing_initials = titanic_df.where(col("age").isNull()).select("Initial").distinct().toPandas()
missing_initials

Unnamed: 0,Initial
0,Miss
1,Master
2,Mr
3,Dr
4,Mrs


In [55]:
# imputation - if age is null set the average age of respective initial

for initial in missing_initials['Initial']:
    print(initial)
    titanic_df = titanic_df.withColumn("Age",
                          when((titanic_df["Initial"] == initial) & 
                               (titanic_df["Age"].isNull()), 
                               avg_per_initial.loc[initial, 'avg(Age)']).otherwise(titanic_df["Age"])
                         )



Miss
Master
Mr
Dr
Mrs


In [56]:
titanic_df.limit(10).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Initial
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr
5,6,0,3,"Moran, Mr. James",male,32.36809,0,0,330877,8.4583,,Q,Mr
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs


In [58]:
titanic_df.where(col("Age").isNull()).count()

0

### Imputation for Embarked

In [59]:
titanic_df.groupBy("Embarked").count().toPandas()

Unnamed: 0,Embarked,count
0,Q,77
1,,2
2,C,168
3,S,644


In [60]:
# set missing values to S (most frwquent port of embarkation)
titanic_df = titanic_df.na.fill({"Embarked" : 'S'})

### Drop Cabin

In [61]:
titanic_df.where(col("Cabin").isNull()).count()

687

In [63]:
titanic_df = titanic_df.drop("Cabin")

### Create feature family-size

In [64]:
titanic_df = titanic_df.withColumn("Family_Size",col('SibSp')+col('Parch'))

In [65]:
titanic_df.limit(2).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Initial,Family_Size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs,1


In [66]:
titanic_df = titanic_df.withColumn('Alone',lit(0))  # create column with literal value 0

In [67]:
titanic_df = titanic_df.withColumn("Alone",when(titanic_df["Family_Size"] == 0, 1).otherwise(titanic_df["Alone"]))

In [68]:
titanic_df.limit(5).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Initial,Family_Size,Alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr,0,1


### Convert Sex, Embarked & Initial Columns via StringIndexer

In [69]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(titanic_df) for column in ["Sex","Embarked","Initial"]]
pipeline = Pipeline(stages=indexers)
titanic_df = pipeline.fit(titanic_df).transform(titanic_df)

In [70]:
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = false)
 |-- Initial: string (nullable = true)
 |-- Family_Size: integer (nullable = true)
 |-- Alone: integer (nullable = false)
 |-- Sex_index: double (nullable = false)
 |-- Embarked_index: double (nullable = false)
 |-- Initial_index: double (nullable = false)



### Drop columns which are not required

In [71]:
titanic_df = titanic_df.drop("PassengerId","Name","Ticket","Cabin","Embarked","Sex","Initial")

# Machine Learning Preparation
## Put features into vector

In [72]:
feature = VectorAssembler(inputCols=titanic_df.columns[1:],outputCol="features")
feature_vector= feature.transform(titanic_df)

In [73]:
feature_vector.limit(10).toPandas()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Alone,Sex_index,Embarked_index,Initial_index,features
0,0,3,22.0,1,0,7.25,1,0,0.0,0.0,0.0,"(3.0, 22.0, 1.0, 0.0, 7.25, 1.0, 0.0, 0.0, 0.0..."
1,1,1,38.0,1,0,71.2833,1,0,1.0,1.0,2.0,"[1.0, 38.0, 1.0, 0.0, 71.2833, 1.0, 0.0, 1.0, ..."
2,1,3,26.0,0,0,7.925,0,1,1.0,0.0,1.0,"[3.0, 26.0, 0.0, 0.0, 7.925, 0.0, 1.0, 1.0, 0...."
3,1,1,35.0,1,0,53.1,1,0,1.0,0.0,2.0,"[1.0, 35.0, 1.0, 0.0, 53.1, 1.0, 0.0, 1.0, 0.0..."
4,0,3,35.0,0,0,8.05,0,1,0.0,0.0,0.0,"(3.0, 35.0, 0.0, 0.0, 8.05, 0.0, 1.0, 0.0, 0.0..."
5,0,3,32.36809,0,0,8.4583,0,1,0.0,2.0,0.0,"(3.0, 32.368090452261306, 0.0, 0.0, 8.4583, 0...."
6,0,1,54.0,0,0,51.8625,0,1,0.0,0.0,0.0,"(1.0, 54.0, 0.0, 0.0, 51.8625, 0.0, 1.0, 0.0, ..."
7,0,3,2.0,3,1,21.075,4,0,0.0,0.0,3.0,"[3.0, 2.0, 3.0, 1.0, 21.075, 4.0, 0.0, 0.0, 0...."
8,1,3,27.0,0,2,11.1333,2,0,1.0,0.0,2.0,"[3.0, 27.0, 0.0, 2.0, 11.1333, 2.0, 0.0, 1.0, ..."
9,1,2,14.0,1,0,30.0708,1,0,1.0,1.0,2.0,"[2.0, 14.0, 1.0, 0.0, 30.0708, 1.0, 0.0, 1.0, ..."


## Train-test-split

In [74]:
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 11)

## Logistic Regression
### Train Model

In [75]:
lr = LogisticRegression(labelCol="Survived", featuresCol="features")
#Training 
lrModel = lr.fit(trainingData)
lr_prediction = lrModel.transform(testData)
lr_prediction.select("prediction", "Survived", "features").show()
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|[1.0,19.0,3.0,2.0...|
|       1.0|       0|[1.0,27.0,0.0,2.0...|
|       0.0|       0|(10,[0,1,4,6],[1....|
|       1.0|       0|[1.0,28.0,1.0,0.0...|
|       0.0|       0|(10,[0,1,4,6],[1....|
|       0.0|       0|(10,[0,1,4,6,8],[...|
|       0.0|       0|(10,[0,1,6],[1.0,...|
|       1.0|       0|[1.0,40.0,0.0,0.0...|
|       1.0|       0|[1.0,42.0,0.0,0.0...|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|[1.0,51.0,0.0,1.0...|
|       0.0|       0|(10,[0,1,4,6,8],[...|
|       0.0|       0|(10,[0,1,4,6,8],[...|
|       0.0|       0|(10,[0,1,4,6],[2....|
|       0.0|       0|(10,[0,1,4,6],[2....|
|       0.0|       0|(10,[0,1,4,6],[2....|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|(10,[0,1,4,6],[2....|
+----------

### Evaluate Model

In [76]:
lr_accuracy = evaluator.evaluate(lr_prediction)
print("Accuracy of LogisticRegression is = %g"% (lr_accuracy))
print("Test Error of LogisticRegression = %g " % (1.0 - lr_accuracy))

Accuracy of LogisticRegression is = 0.830409
Test Error of LogisticRegression = 0.169591 


## Decision Tree
### Train Model

In [None]:
dt = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "Survived", "features").show()

### Evaluate

In [None]:
dt_accuracy = evaluator.evaluate(dt_prediction)
print("Accuracy of DecisionTreeClassifier is = %g"% (dt_accuracy))
print("Test Error of DecisionTreeClassifier = %g " % (1.0 - dt_accuracy))

## RandomForest
### Train Model

In [None]:
rf = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
rf_model = rf.fit(trainingData)
rf_prediction = rf_model.transform(testData)
rf_prediction.select("prediction", "Survived", "features").show()

### Evaluate

In [None]:
rf_accuracy = evaluator.evaluate(rf_prediction)
print("Accuracy of RandomForestClassifier is = %g"% (rf_accuracy))
print("Test Error of RandomForestClassifier  = %g " % (1.0 - rf_accuracy))

## Gradient-boosted tree classifier
### Train Model

In [None]:
gbt = GBTClassifier(labelCol="Survived", featuresCol="features",maxIter=10)
gbt_model = gbt.fit(trainingData)
gbt_prediction = gbt_model.transform(testData)
gbt_prediction.select("prediction", "Survived", "features").show()

### Evaluate

In [None]:
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Accuracy of Gradient-boosted tree classifie is = %g"% (gbt_accuracy))
print("Test Error of Gradient-boosted tree classifie %g"% (1.0 - gbt_accuracy))

## NaiveBayes
### Train Model

In [None]:
nb = NaiveBayes(labelCol="Survived", featuresCol="features")
nb_model = nb.fit(trainingData)
nb_prediction = nb_model.transform(testData)
nb_prediction.select("prediction", "Survived", "features").show()

### Evaluate

In [None]:
nb_accuracy = evaluator.evaluate(nb_prediction)
print("Accuracy of NaiveBayes is  = %g"% (nb_accuracy))
print("Test Error of NaiveBayes  = %g " % (1.0 - nb_accuracy))

## SupportVectorMachine
### Train Model

In [None]:
svm = LinearSVC(labelCol="Survived", featuresCol="features")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)
svm_prediction.select("prediction", "Survived", "features").show()

### Evaluate Model

In [None]:
svm_accuracy = evaluator.evaluate(svm_prediction)
print("Accuracy of Support Vector Machine is = %g"% (svm_accuracy))
print("Test Error of Support Vector Machine = %g " % (1.0 - svm_accuracy))

# Simulate horizontal scaling
## Create large dataset from small one

In [20]:
original_csv = "gs://kueppers/titanic/train_titanic.csv"
titanic_df = spark.read.csv(original_csv, header = 'True',inferSchema='True')

In [21]:
df = titanic_df.toPandas()

In [22]:
new_df = df.copy()

for i in range(500):
    new_df = new_df.append(df.copy())
    
len(new_df)

446391

In [23]:
#spark.conf.set("spark.sql.execution.arrow.enabled", "false")  # use arrow if installed

titanic_df_large = spark.createDataFrame(new_df)

In [25]:
# write to bucket
outputFileName = "gs://kueppers/titanic/train_large.csv"
#titanic_df_large.write.format('com.databricks.spark.csv').save(train_path_large)
titanic_df_large.write.format("com.databricks.spark.csv") \
            .option("header", "true") \
            .mode("overwrite") \
            .save(outputFileName)

# Define ML Pipeline (simplified!)

In [78]:
def pipeline(my_df):

    # (A) preprocessing
    
    # Extract the initial and add as column
    my_df = my_df.withColumn("Initial", regexp_extract(col("Name"), "([A-Za-z]+)\.", 1))
    
    # impute age based on pandas
    avg_per_initial = my_df.groupby('Initial').avg('Age').toPandas()
    avg_per_initial = avg_per_initial.set_index('Initial')
    
    # imputation - if age is null set the average age of respective initial
    # initials (only where missing)
    my_missing_initials = my_df.where(col("age").isNull()).select("Initial").distinct().toPandas()
    
    for initial in my_missing_initials['Initial']:
        my_df = my_df.withColumn("Age",
                              when((my_df["Initial"] == initial) & 
                                   (my_df["Age"].isNull()), 
                                   avg_per_initial.loc[initial, 'avg(Age)']).otherwise(my_df["Age"])
                             )
    
    # set missing values to S (most frwquent port of embarkation)
    my_df = my_df.na.fill({"Embarked" : 'S'})

    # drop cabin
    my_df = my_df.drop("Cabin")
    
    # calculate family size
    my_df = my_df.withColumn("Family_Size",col('SibSp')+col('Parch'))

    # create alone column
    my_df = my_df.withColumn('Alone',lit(0))  # create column with literal value 0
    my_df = my_df.withColumn("Alone",when(my_df["Family_Size"] == 0, 1).otherwise(my_df["Alone"]))

    # convert string columns
    indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(my_df) for column in ["Sex","Embarked","Initial"]]
    pipeline = Pipeline(stages=indexers)
    my_df = pipeline.fit(my_df).transform(my_df)
    
    # drop columns that are not reequired
    my_df = my_df.drop("PassengerId","Name","Ticket","Cabin","Embarked","Sex","Initial")
    
    
    
    # (B) Machine Learning
    
    # create feature matrix
    my_feature = VectorAssembler(inputCols=my_df.columns[1:],outputCol="features")
    my_feature_vector= my_feature.transform(my_df)
    
    # train-test-split
    (my_trainingData, my_testData) = my_feature_vector.randomSplit([0.8, 0.2],seed = 11)
    
    
    # train model
    my_rf = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
    my_rf_model = my_rf.fit(my_trainingData)
    my_rf_prediction = my_rf_model.transform(my_testData)
    my_rf_prediction.select("prediction", "Survived", "features").show()

# Measure horizontal scaling

In [79]:
import time
def measurement(function, parameter):
        start = time.time()
        function(parameter)
        end = time.time()
        print(end - start)

In [80]:
titanic_small_df = spark.read.csv("gs://kueppers/titanic/train_titanic.csv", header = 'True',inferSchema='True')

In [81]:
measurement(pipeline, titanic_small_df)

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|[1.0,19.0,3.0,2.0...|
|       0.0|       0|[1.0,27.0,0.0,2.0...|
|       0.0|       0|(10,[0,1,4,6],[1....|
|       0.0|       0|[1.0,28.0,1.0,0.0...|
|       0.0|       0|(10,[0,1,4,6],[1....|
|       0.0|       0|(10,[0,1,4,6,8],[...|
|       0.0|       0|(10,[0,1,6],[1.0,...|
|       0.0|       0|[1.0,40.0,0.0,0.0...|
|       0.0|       0|[1.0,42.0,0.0,0.0...|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|[1.0,51.0,0.0,1.0...|
|       0.0|       0|(10,[0,1,4,6,8],[...|
|       0.0|       0|(10,[0,1,4,6,8],[...|
|       0.0|       0|(10,[0,1,4,6],[2....|
|       1.0|       0|(10,[0,1,4,6],[2....|
|       0.0|       0|(10,[0,1,4,6],[2....|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|(10,[0,1,2,4,5],[...|
|       0.0|       0|(10,[0,1,4,6],[2....|
+----------

In [82]:
titanic_large_df = spark.read.csv("gs://kueppers/titanic/train_large.csv/part*", header = 'True',inferSchema='True')

In [83]:
titanic_large_df.limit(10).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [84]:
measurement(pipeline, titanic_large_df)

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
|       0.0|       0|[1.0,2.0,1.0,2.0,...|
+----------