In [43]:
import numpy as np
import pyspark_helper.util as util
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, coalesce, col, lit, regexp_extract, when

In [2]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the training CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("train.csv")
)

# Normalise every column title to lower case.
df = df.toDF(*(title.lower() for title in df.columns))

## Data Exploration, Cleaning, and Imputation

Examine the count of missing data

In [3]:
# Report which columns contain missing values (the per-column counts and
# percentages are printed below); the helper's return value is kept in `missing`.
missing = util.print_missing(df=df)

Total samples: 891
Column: age, num missing: 177 (% missing: 19.9%)
Column: cabin, num missing: 687 (% missing: 77.1%)
Column: embarked, num missing: 2 (% missing: 0.2%)


Over 3/4 of "Cabin" data is missing (687/891), leaving 204 non-missing samples. The column has 148 distinct values in total, so on average each cabin value is shared by fewer than two passengers. There may be useful information here still (e.g. cabin area, treating A23 and A12 as equivalent). But for now, "Cabin" is removed.

Embarked has a small amount missing. These values are set to "0" for the sake of one-hot encoding, which will have minimal effect on the data. For age, a recursive partitioning (decision tree) model will be used to predict the missing values (alternatively, the simple mean could be used).

In [4]:
# Columns that seem to offer little in terms of predictive value
removed_features = ["passengerid", "name", "ticket", "cabin"]
target = "survived"

# Candidate features: every column except the target and the removed ones.
# list.remove raises ValueError if any of these names is not a column.
initial_features = list(df.columns)
for unwanted in [target, *removed_features]:
    initial_features.remove(unwanted)

Columns are kept where there is some a priori reason to believe they would act as strong predictors. These include:
Pclass, Age, Sex, SibSp, Parch, Fare, Embarked

Name, PassengerId, and Ticket are omitted as well due to being largely unique values without much discernible useful information

Replace missing values with 0. Since these variables are categorical and 0 is not a natural value, missing data will in essence be its own category.

In [5]:
# String-typed columns that must be encoded before modelling, and the
# numeric columns that are used as they are.
features_for_encoding = ['sex', 'embarked']
numeric_features = ['pclass', 'age', 'sibsp', 'parch', 'fare']
# The message is built from adjacent literals: a backslash continuation
# inside the string would embed the next line's indentation in the output.
print(("String features for encoding: {0}\n"
       "Numeric features: {1}").format(features_for_encoding, numeric_features))
# Missing categories become the literal string "0", i.e. their own category.
df = df.na.fill("0", subset=features_for_encoding)
print("\n", "-"*8, " Columns with null values printed below ", "-"*8, sep="")
# Re-run the missing-value report to confirm the string columns are now complete.
missing = util.print_missing(df=df)

String features for encoding: ['sex', 'embarked']
Numeric features: ['pclass', 'age', 'sibsp', 'parch', 'fare']

-------- Columns with null values printed below --------
Total samples: 891
Column: age, num missing: 177 (% missing: 19.9%)
Column: cabin, num missing: 687 (% missing: 77.1%)


### Encoding String/categorical features for modelling

The one-hot transformer for Sex and Embarked is created and kept for use later on the test set. (Note this has been modularised in the helper module)

Pipeline: String -> StringIndex -> OneHotEncoding

In [6]:
# Build the String -> StringIndex -> OneHot transformer for the string columns
# and keep it in `onehot_transformer` so the same object can be reused later.
# NOTE(review): presumably the helper fits the transformer on `df` — confirm
# in pyspark_helper.util.
onehot_transformer = util.string_to_onehot_transformer(df=df, columns_for_encoding=features_for_encoding)
# Apply the encoding to the training data.
df = onehot_transformer.transform(df)

### Imputing age
Age has many missing values, but it is numerical and holds important information, so it is preferable not to drop it. Missing values could be imputed with the mean, but instead they will be predicted using a decision tree regressor. Since this is only to fill missing data, there will not be extensive hyperparameter tuning at this point. A quick cross-validation will indicate whether imputing by prediction performs better than mean imputation.

Below, the predictors for age are vectorised. Then the decision tree regressor is trained.

In [7]:
# Model inputs: the numeric columns plus the "<name>_encoded" column of each
# string feature.
encoded_cols = [f"{name}_encoded" for name in features_for_encoding]
feature_cols = numeric_features + encoded_cols

# Age is predicted from every other feature, so take it out of a copy.
age_predictors = list(feature_cols)
age_predictors.remove("age")

# Pack the age predictors into a single "age_predictors" vector column.
vectoriser = util.create_feature_vectoriser(df, age_predictors, "age_predictors")
df = vectoriser.transform(df)

First the DataFrame is filtered to only train on non-null labels. Then, cross-validation is performed to check the OOF metric (RMSE) against the same metric if mean values were imputed.

In [8]:
# Only rows with a known age can serve as training labels.
df_filtered = df.filter(df["age"].isNotNull())

# Fixed seed so the fold assignment, and therefore the reported out-of-fold
# metric, is the same on every run.
AGE_CV_SEED = 42

dt = DecisionTreeRegressor()
# A single parameter map: no search is performed, the cross-validator is used
# only to obtain an out-of-fold error estimate for this configuration.
dt_params = [{dt.featuresCol: "age_predictors", 
              dt.labelCol: "age", 
              dt.predictionCol: "age_prediction",
              dt.maxDepth: 5}]
# The metric is named explicitly because the result is reported as RMSE.
regression_evaluator = RegressionEvaluator(predictionCol="age_prediction",
                                           labelCol="age",
                                           metricName="rmse")

rpart_cv = CrossValidator(estimator=dt,
                          estimatorParamMaps=dt_params,
                          evaluator=regression_evaluator,
                          numFolds=5,
                          seed=AGE_CV_SEED)

rpart_cv_model = rpart_cv.fit(df_filtered)
# Model used afterwards to fill in the missing ages.
best_age_rpart = rpart_cv_model.bestModel

In [9]:
# Baseline imputation: the mean of the known ages.
mean_age = df_filtered.agg(avg(col("age"))).collect()[0][0]
print("Mean of non-null values in training set: {0:.2f}".format(mean_age))
# Store the mean as a constant "prediction" column so it can be scored with
# the same evaluator type as the tree.
df_filtered = df_filtered.withColumn("mean_age", lit(mean_age))
evaluator = (
    RegressionEvaluator()
    .setMetricName("rmse")
    .setLabelCol("age")
    .setPredictionCol("mean_age")
)
mean_rmse = evaluator.evaluate(df_filtered)
# Only one parameter map was cross-validated, so index 0 is its averaged metric.
predicted_rmse = rpart_cv_model.avgMetrics[0]
# The message is built from adjacent literals: a backslash continuation
# inside the string would embed the next line's indentation in the output.
print(("Out of fold RMSE of predictions against training set: {0:.2f}\n"
       "RMSE of mean value against training set: {1:.2f}").format(predicted_rmse, mean_rmse))

Mean of non-null values in training set: 29.70
Out of fold RMSE of predictions against training set: 12.78        
RMSE of mean value against training set: 14.52


The OOF RMSE is lower than the mean RMSE for labeled ages, indicating that predicted ages are more accurate than the mean. This means that for samples with null age values, the predicted values will (in general) be stronger predictors than the mean. Now the model will be trained on all available age data and applied to the missing data. Then, all features for Survival are vectorised.

In [10]:
# Score every row (including rows whose age is known) with the best model from
# the age cross-validation; its output lands in the "age_prediction" column.
df = best_age_rpart.transform(df)

In [11]:
# Keep the observed age where present; otherwise fall back to the prediction.
imputed_age = coalesce(df["age"], df["age_prediction"])
# The helper columns used for age imputation are no longer needed afterwards.
df = df.withColumn("age", imputed_age).drop("age_predictors", "age_prediction")

# Pack the final model inputs into a single "features" vector column.
feature_vectoriser = util.create_feature_vectoriser(df, feature_cols, "features")
df = feature_vectoriser.transform(df)

In [12]:
# Rename the target to "label", the name used for it from here on.
df = df.withColumnRenamed("survived", "label")

rule = "-" * 8
print(rule, "Columns to be used as features")
print(feature_cols, "\n")
print(rule, "Top 5 rows of label and vectorised features")
# Show the assembled vectors first, then the individual columns behind them.
df.select("label", "features").show(5)
df.select(*feature_cols).show(5)

-------- Columns to be used as features
['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_encoded', 'embarked_encoded'] 

-------- Top 5 rows of label and vectorised features
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[3.0,22.0,1.0,0.0...|
|    1|[1.0,38.0,1.0,0.0...|
|    1|(9,[0,1,4,6],[3.0...|
|    1|[1.0,35.0,1.0,0.0...|
|    0|[3.0,35.0,0.0,0.0...|
+-----+--------------------+
only showing top 5 rows

+------+----+-----+-----+-------+-------------+----------------+
|pclass| age|sibsp|parch|   fare|  sex_encoded|embarked_encoded|
+------+----+-----+-----+-------+-------------+----------------+
|     3|22.0|    1|    0|   7.25|(1,[0],[1.0])|   (3,[0],[1.0])|
|     1|38.0|    1|    0|71.2833|    (1,[],[])|   (3,[1],[1.0])|
|     3|26.0|    0|    0|  7.925|    (1,[],[])|   (3,[0],[1.0])|
|     1|35.0|    1|    0|   53.1|    (1,[],[])|   (3,[0],[1.0])|
|     3|35.0|    0|    0|   8.05|(1,[0],[1.0])|   (3,[0],[1.0])|
+------+----+-----+-

Note AUC will be used as the evaluation metric, so thresholds are not a concern. Model will be first run with the base hyperparameters before performing random (as opposed to grid) search with 5-fold CV.

In [13]:
# Baseline gradient-boosted tree classifier with the library's default settings.
gbt = GBTClassifier()
# Fit on the whole training frame (no hold-out at this stage).
gbt_transformer = gbt.fit(df)
# Score the same rows the model was trained on; the model's output columns
# are appended to df.
df = gbt_transformer.transform(df)

In [14]:
# Peek at the baseline model's output columns.
df.select("label", "prediction", "probability", "rawPrediction").show(5)
# Number of rows where the predicted class differs from the label.
print("Incorrect predictions:", df.filter(df.label != df.prediction).count())

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
# Apply the format spec to the metric; passing the template and the value as
# separate print arguments would print the raw "{0:.2f}" placeholder.
print("AUC: {0:.2f}".format(evaluator.evaluate(df)))

+-----+----------+--------------------+--------------------+
|label|prediction|         probability|       rawPrediction|
+-----+----------+--------------------+--------------------+
|    0|       0.0|[0.91398813051760...|[1.18166614054903...|
|    1|       1.0|[0.05278475052523...|[-1.4436520151715...|
|    1|       1.0|[0.48663789379179...|[-0.0267305771414...|
|    1|       1.0|[0.04441291376077...|[-1.5343978119854...|
|    0|       0.0|[0.88634908728346...|[1.02698964775194...|
+-----+----------+--------------------+--------------------+
only showing top 5 rows

Incorrect predictions: 91
AUC: {0:.2f} 0.9542655972049133


The model has very high AUC. Even with the default threshold of 0.5, it is only inaccurate for 91 rows... But these metrics are computed on the same data the model was trained on, so it is very likely overfitting. Time for cross validation and random parameter search. The final parameters will be those which maximise the OOF metric (AUC).

Note that below, the paramGrid is not actually a grid and only offers one option per hyperparameter. This is because the intention is to perform random search rather than grid search. There may be a more elegant solution

A few temporary functions are defined to generate hyperparameter values from the intended distributions.

In [15]:
def get_random_stepSize():
    """Draw a step size as 10 raised to a uniformly drawn exponent in (-2, 0].

    The returned value therefore lies in (0.01, 1].
    """
    exponent = -2 * np.random.rand()
    return np.power(10, exponent)

def get_random_maxDepth(min_depth, max_depth):
    """Return a random integer depth with min_depth <= depth <= max_depth."""
    # randint's upper bound is exclusive, so shift it by one to include max_depth.
    exclusive_high = max_depth + 1
    return np.random.randint(min_depth, exclusive_high)

def get_random_maxIter(min_iters, max_iters):
    """Return a random integer iteration count with min_iters <= n <= max_iters."""
    # randint's upper bound is exclusive, so shift it by one to include max_iters.
    exclusive_high = max_iters + 1
    return np.random.randint(min_iters, exclusive_high)

# avoids 'already exists' error: the baseline model's output columns would
# collide with the columns written by the models fitted during cross-validation
df = df.drop("prediction", "probability", "rawPrediction")

# Seed both the hyperparameter draws and the fold assignment so the random
# search produces the same candidates and metrics on every run.
search_seed = 42
np.random.seed(search_seed)

num_models = 40
# One randomly drawn configuration per candidate model (random search, not a grid).
param_maps = [{gbt.stepSize: get_random_stepSize(),
              gbt.maxDepth: get_random_maxDepth(3, 9),
              gbt.maxIter: get_random_maxIter(10, 30)}
             for _ in range(num_models)]

crossval = CrossValidator(estimator=gbt,
    estimatorParamMaps=param_maps,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=5,
    seed=search_seed)
cvModel = crossval.fit(df)

In [16]:
bestModel = cvModel.bestModel
# Look up, on the winning model, the value of each hyperparameter that was searched.
fitted_params = bestModel.extractParamMap()
best_params = {param: fitted_params[param] for param in param_maps[0]}
print("The best parameters tested during random search are as follows:")
for param, value in best_params.items():
    # str(param) has the form "<owner>__<name>"; print only the name part.
    print(str(param).split("__")[1], value)
# Averaged cross-validation metric of every candidate, in param_maps order.
oof_aucs = cvModel.avgMetrics
print(f"Best model OOF AUC: {max(oof_aucs):.4f}\n"
      f"Worst model OOF AUC: {min(oof_aucs):.4f}")

The best parameters tested during random search are as follows:
stepSize 0.5271459550452572
maxDepth 3
maxIter 10
Best model OOF AUC: 0.8678
Worst model OOF AUC: 0.8175


In [17]:
# Score the training frame with the best model found by the random search;
# the model's output columns are appended to df.
df = bestModel.transform(df)

In [48]:
# Surname = everything before the first comma, trimmed.
# NOTE(review): assumes names are formatted "Surname, Title. Given names" —
# confirm against train.csv.
# Matching only the leading run of word characters would cut surnames that
# contain spaces, hyphens or apostrophes short (e.g. "O'Brien" -> "O").
# A name without a comma is kept whole.
df = df.withColumn("surname", regexp_extract(col("name"), r"^\s*([^,]+?)\s*(?:,|$)", 1))
df.select("surname").show(5)

+---------+
|  surname|
+---------+
|   Braund|
|  Cumings|
|Heikkinen|
| Futrelle|
|    Allen|
+---------+
only showing top 5 rows



In [51]:
# Number of distinct values in the cabin column. Missing cabins were not
# filled in, and distinct() keeps null as a value of its own, so the count
# includes one entry for the missing cabins.
df.select("cabin").distinct().count()

148