In [153]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer,VectorAssembler,OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [154]:
spark = SparkSession.builder.appName("logReg").getOrCreate()

In [155]:
df=spark.read.csv("titanic.csv",inferSchema=True,header=True)

In [156]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [157]:
df.head(2)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C')]

In [158]:
df.describe()[["Summary"]].show()

+-------+
|Summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+



In [159]:
df=df.na.drop()

In [160]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [161]:
col_list=[ 'Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked']

In [162]:
df=df[col_list]

In [163]:
df.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



In [164]:
Sex_indexer=StringIndexer(inputCol="Sex",outputCol="Sex_index")
Sex_encoder=OneHotEncoder(inputCol="Sex_index", outputCol="Sex_vector")
Embarked_indexer=StringIndexer(inputCol="Embarked",outputCol="Embarked_index")
Embarked_encoder=OneHotEncoder(inputCol="Embarked_index", outputCol="Embarked_vector")
assembler=VectorAssembler(inputCols=['Pclass', 'Sex_vector', 'Age','SibSp','Parch','Fare','Embarked_vector'],outputCol="featuresVector")
logReg=LogisticRegression(featuresCol="featuresVector",labelCol="Survived")

In [165]:
# this step was only to check if all the stages of the pipeline have been coded correctly
#df=Sex_indexer.fit(df).transform(df)
#df=Sex_encoder.transform(df)
#df=Embarked_indexer.fit(df).transform(df)
#df=Embarked_encoder.transform(df)
#df=assembler.transform(df)

In [166]:
pipeLine=Pipeline(stages=[Sex_indexer,Sex_encoder,Embarked_indexer,Embarked_encoder,assembler,logReg])

In [167]:
train_data,test_data=df.randomSplit([0.7,0.3])

In [168]:
model=pipeLine.fit(train_data)

In [169]:
results=model.transform(test_data)

In [170]:
my_eval=BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="Survived")

In [171]:
my_eval.evaluate(results)

0.7047619047619047