<a href="https://colab.research.google.com/github/mosesyhc/de300-wn2024-notes/blob/main/examples/ex-pipeline-titanic-full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

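Installing the Python packages used below with pip: `findspark`, `seaborn`, and `pyspark` (Spark itself additionally needs a Java runtime on the machine)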

In [None]:
!pip install -q findspark
!pip install -q seaborn

In [None]:
!pip install pyspark

# `titanic` Dataset

In [None]:
import seaborn as sns

# Fetch seaborn's `titanic` example dataset. With cache=True and
# data_home='dataset/', seaborn keeps a local copy under that directory,
# which the Spark CSV reader in the next cell relies on.
titanic = sns.load_dataset(
    name='titanic',
    cache=True,
    data_home='dataset/',
)

In [None]:
# This cell runs before the "PySpark setup" cell, so `spark` would be undefined
# on a fresh kernel (NameError on Restart & Run All). Obtain the session here;
# the later getOrCreate() call with the same app name returns this session.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ML Pipeline").getOrCreate()

# Re-load the dataset as a Spark DataFrame from the CSV file that is presumably
# cached under dataset/ by the seaborn cell; column types are inferred and the
# first row is used as the header.
titanic = spark.read.csv(
    "dataset/titanic.csv", inferSchema=True, header=True
)

# PySpark setup

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session named "ML Pipeline", or reuse the one that is
# already running in this kernel.
spark = SparkSession.builder.appName("ML Pipeline").getOrCreate()


In [None]:
# Show the column names and the types Spark inferred while reading the CSV.
titanic.printSchema()

In [None]:
# Summary statistics for the columns as loaded, before any columns are dropped.
titanic.describe().show()

In [None]:
# Remove columns that are not needed for modelling. Dropping a column that is
# already gone is a no-op in Spark, so this cell is safe to re-run.
titanic = titanic.drop(
    'deck',
    'embark_town',  # presumably a text version of `embarked` -- TODO confirm
    'alive',        # presumably a text version of `survived` -- TODO confirm
    'class',        # presumably a text version of `pclass` -- TODO confirm
)

In [None]:
# Summary statistics again, now that the four redundant columns are removed.
titanic.describe().show()

In [None]:
# List the distinct values of `embarked` to see which categories (and possibly
# missing values) the encoding steps below have to deal with.
titanic.select('embarked').distinct().show()

In [None]:
# Column groups, by the kind of preprocessing each one needs.

# Numeric predictors that go into the feature vector as-is.
CONTINUOUS_COLUMNS = ['pclass', 'age', 'sibsp', 'parch', 'fare']

# Text columns with exactly two levels.
BINARY_COLUMNS = ['sex']

# Text columns with more than two levels.
CATEGORICAL_COLUMNS = ['embarked', 'who']

# True/False columns.
BOOLEAN_COLUMNS = ['adult_male', 'alone']

In [None]:
# Replace every boolean column with its integer cast, keeping the column name.
import pyspark.sql.types as T

for bool_col in BOOLEAN_COLUMNS:
    as_integer = titanic[bool_col].cast(T.IntegerType())
    titanic = titanic.withColumn(bool_col, as_integer)

# Estimator

In [None]:
import pyspark.ml.feature as MF

# `sex`: text label -> numeric index.
sex_indexer = MF.StringIndexer(outputCol='sex_index', inputCol='sex')

# `embarked`: text label -> numeric index -> one-hot vector. The encoder reads
# the *imputed* index column, which is produced by `embarked_imputer` (defined
# in a later cell and placed between these two stages in the pipeline).
# NOTE(review): per the Spark docs, handleInvalid='skip' filters out rows with
# invalid (including NULL) values, so the imputer may have nothing left to
# fill -- confirm this is the intended handling of missing `embarked`.
embarked_indexer = MF.StringIndexer(
    handleInvalid='skip',
    outputCol='embarked_index',
    inputCol='embarked',
)
embarked_encoder = MF.OneHotEncoder(
    outputCol='embarked_vec',
    inputCol='embarked_index_impute',
)

# `who`: text label -> numeric index -> one-hot vector.
who_indexer = MF.StringIndexer(outputCol='who_index', inputCol='who')
who_encoder = MF.OneHotEncoder(outputCol='who_vec', inputCol='who_index')

In [None]:
# Fill missing `age` values with the column median, overwriting `age` in place
# (input and output column are the same).
age_imputer = MF.Imputer(
    inputCol='age',
    outputCol='age',
    strategy='median',
)

# Fill missing `embarked_index` values with the most frequent index and write
# the result to a new column, which the one-hot encoder for `embarked` reads.
embarked_imputer = MF.Imputer(
    inputCol='embarked_index',
    outputCol='embarked_index_impute',
    strategy='mode',
)

# Transformer

In [None]:
# Concatenate every predictor into the single vector column `features`:
# the numeric columns first, then the indexed `sex`, then the two one-hot
# vectors (each of which occupies several slots of the resulting vector).
assembler = MF.VectorAssembler(
    inputCols=[*CONTINUOUS_COLUMNS, 'sex_index', 'embarked_vec', 'who_vec'],
    outputCol='features',
)


# Pre-training pipeline



In [None]:
from pyspark.ml import Pipeline

# Stage order matters: every stage must come after the stage that creates its
# input column (indexer -> imputer -> encoder for `embarked`), and the
# assembler goes last because it consumes the outputs of all the others.
titanic_pipeline = Pipeline(stages=[
    # numeric clean-up
    age_imputer,
    # sex
    sex_indexer,
    # who
    who_indexer, who_encoder,
    # embarked
    embarked_indexer, embarked_imputer, embarked_encoder,
    # final feature vector
    assembler,
])

In [None]:
# Fit all estimator stages on the data, then apply the fitted pipeline to the
# same data to obtain the DataFrame with the `features` column.
titanic_pipeline_model = titanic_pipeline.fit(titanic)
assembled_titanic = titanic_pipeline_model.transform(titanic)

In [None]:
# Peek at the first 10 rows, including the intermediate and `features` columns
# added by the pipeline.
assembled_titanic.show(10)

# Logistic Regression model

In [None]:
from pyspark.ml.classification import LogisticRegression

# Unfitted logistic-regression estimator: predicts `survived` from the
# assembled `features` vector; all other settings are left at their defaults.
lr = LogisticRegression(
    labelCol='survived',
    featuresCol='features',
)

In [None]:
# Fit the model on the full assembled dataset (the cells shown here make no
# train/test split).
lr_titanic = lr.fit(assembled_titanic)

In [None]:
# Score the same rows the model was fitted on, so the predictions (and any
# metric computed from them) are in-sample.
titanic_pred = lr_titanic.transform(assembled_titanic)

In [None]:
# Peek at the first 5 rows with the columns the fitted model appended.
titanic_pred.show(5)

# Evaluation

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator as bcEval

# Evaluate on the model's continuous `rawPrediction` output rather than the
# hard 0/1 `prediction` column: with thresholded labels the ROC curve collapses
# to a single operating point and the reported area under it is not the model's
# ranking quality. `rawPrediction` is the default output column name of
# LogisticRegression, which the model above does not override.
# The metric is spelled out for readability; it is also the evaluator's default.
binary_eval = bcEval(rawPredictionCol='rawPrediction',
                     labelCol='survived',
                     metricName='areaUnderROC')

In [None]:
# Compute the evaluator's metric on the in-sample predictions.
titanic_eval = binary_eval.evaluate(titanic_pred)

In [None]:
# Display the metric value returned by the evaluator.
print(titanic_eval)

# Understanding the model via extraction of coefficients

In [None]:
# Intercept (bias) term of the fitted logistic regression.
lr_titanic.intercept

In [None]:
# Fitted coefficients, one per slot of the `features` vector (so the one-hot
# vector columns contribute several coefficients each).
lr_titanic.coefficients

In [None]:
# The assembler's input-column list cannot be lined up with the coefficients:
# `embarked_vec` and `who_vec` are one-hot vectors that each expand to several
# slots of `features`. Recover one name per slot from the ML attribute metadata
# attached to the `features` column instead.
# NOTE(review): relies on the `ml_attr` metadata layout Spark ML writes for
# assembled vectors (groups of {'idx', 'name'} entries) -- confirm on the
# Spark version in use.
feature_attrs = assembled_titanic.schema['features'].metadata['ml_attr']['attrs']
feature_names = [
    attr['name']
    for attr in sorted(
        (attr for group in feature_attrs.values() for attr in group),
        key=lambda attr: attr['idx'],  # restore the slot order of the vector
    )
]

# Print each feature slot next to its fitted coefficient.
for name, coefficient in zip(feature_names, lr_titanic.coefficients.toArray()):
    print(f'{name}: {coefficient:.4f}')