In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
import os

# Relative path to the directory that holds the input CSV (joined with 'titanic.csv' when the data is loaded).
data_dir = '../../data/Spark_ML/Logistic_Regression/'

In [None]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('logreg_code_along')
    .getOrCreate()
)

# Read the Titanic CSV: column names come from the header row and column
# types are inferred by Spark.
titanic_csv_path = os.path.join(data_dir, 'titanic.csv')
data = spark.read.csv(titanic_csv_path, header=True, inferSchema=True)

# Print the resulting column names and types.
data.printSchema()

## Dealing with missing data

In [None]:
# Keep only the label column ('Survived') and the columns later used as features.
selected_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
my_cols = data.select(selected_columns)

# Drop every row that has a missing value in any of the kept columns.
my_final_data = my_cols.dropna()

## Feature operations

In [None]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

# 'Sex': string category -> numeric index -> one-hot vector.
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

# 'Embarked': same two-step encoding.
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
)

# Combine the numeric columns and the encoded vectors into a single
# 'features' vector column.
feature_columns = ['Pclass', 'SexVec', 'EmbarkVec', 'Age', 'SibSp', 'Parch', 'Fare']
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)


In [None]:
from pyspark.ml import Pipeline

# Classifier that reads the assembled 'features' column and uses 'Survived' as the label.
log_reg_titanic = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

# Stage order matters: the indexers produce the columns the encoders read,
# the encoders produce the vectors the assembler reads, and the assembler
# produces the 'features' column the classifier reads.
pipeline_stages = [
    gender_indexer,
    embark_indexer,
    gender_encoder,
    embark_encoder,
    assembler,
    log_reg_titanic,
]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fixed seed for the random 70/30 split. Without it every run draws a
# different split, so the fitted model and the reported metric cannot be
# reproduced from one execution to the next.
SPLIT_SEED = 42
training_data, test_data = my_final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Fit all pipeline stages on the training split only; the test split is
# held out for evaluation.
fit_model = pipeline.fit(training_data)

In [None]:
# Run the fitted pipeline on the held-out test split to obtain the model's output for each test row.
results = fit_model.transform(test_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate on the classifier's continuous 'rawPrediction' output, not the
# thresholded 0/1 'prediction' column. The ROC curve is built by ranking rows
# by score; hard labels collapse it to a single operating point and give a
# misleading area under the curve.
my_bin_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                            labelCol='Survived',
                                            metricName='areaUnderROC')

# Area under the ROC curve for the test-set predictions.
AUC = my_bin_eval.evaluate(results)

In [None]:
# Show the evaluation metric computed on the test-set results.
print(AUC)