# Introduction to Spark MLlib 

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named 'spark_mllib' with master 'local[4]'.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('spark_mllib')
    .getOrCreate()
)

## Load and transform the data

Next, we load the data.

In [None]:
# Read the gzipped CSV, taking column names from the header row and letting
# Spark infer each column's type.
births = spark.read.csv('births_train.csv.gz', header=True, inferSchema=True)

Our goal is to predict whether `'INFANT_ALIVE_AT_REPORT'` is 1 or 0. Thus, we will drop all of the other features that relate to the infant.

In [None]:
# Columns kept for modelling.  The first entry is the prediction target;
# the remaining entries are the candidate features.
selected_features = [
    # Target
    'INFANT_ALIVE_AT_REPORT', 
    'BIRTH_PLACE', 
    'MOTHER_AGE_YEARS', 
    'FATHER_COMBINED_AGE', 
    # Cigarette counts (corrected later with correct_cig)
    'CIG_BEFORE', 
    'CIG_1_TRI', 
    'CIG_2_TRI', 
    'CIG_3_TRI', 
    'MOTHER_HEIGHT_IN', 
    'MOTHER_PRE_WEIGHT', 
    'MOTHER_DELIVERY_WEIGHT', 
    'MOTHER_WEIGHT_GAIN', 
    'DIABETES_PRE', 
    'DIABETES_GEST', 
    'HYP_TENS_PRE', 
    'HYP_TENS_GEST', 
    'PREV_BIRTH_PRETERM'
]

# Project the raw DataFrame down to the selected columns only.
births_trimmed = births.select(selected_features)

Specify the recoding methods.

In [None]:
import pyspark.sql.functions as fn
import pyspark.sql.types as types

# Lookup tables for recoding categorical codes to integers, keyed by the
# name of the coding scheme.  'YNU' maps 'Y' to 1 and both 'N' and 'U' to 0.
recode_dictionary = {'YNU': {'Y': 1, 'N': 0, 'U': 0}}

def recode(col, key):
    """Translate a single categorical code into its integer value.

    Args:
        col: The raw code to translate (e.g. ``'Y'``), or ``None`` for a
            missing value.
        key: Name of the coding scheme in ``recode_dictionary``
            (e.g. ``'YNU'``).

    Returns:
        The integer mapped to ``col`` under scheme ``key``, or ``None``
        when ``col`` is ``None``.

    Raises:
        KeyError: If ``key`` is not a known scheme, or ``col`` is a
            non-null code that the scheme does not define.
    """
    # This function is wrapped as a Spark UDF, and null cells arrive as
    # None; pass them through as null instead of failing the whole job.
    if col is None:
        return None
    return recode_dictionary[key][col]

def correct_cig(feat):
    """Build a column expression that replaces the value 99 with 0.

    Args:
        feat: Name of the column to correct.

    Returns:
        A Spark column expression equal to column ``feat`` wherever its
        value differs from 99, and 0 otherwise.
    """
    column = fn.col(feat)
    keep_original = column != 99
    return fn.when(keep_original, column).otherwise(0)

# Spark UDF wrapper around recode(); the result column is integer-typed.
rec_integer = fn.udf(recode, returnType=types.IntegerType())

Correct the features related to the number of smoked cigarettes.

In [None]:
# Apply correct_cig to each of the four cigarette-count columns in turn,
# overwriting each column in place.
births_transformed = births_trimmed
for cig_feature in ('CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI'):
    births_transformed = births_transformed.withColumn(
        cig_feature, correct_cig(cig_feature)
    )
births_transformed.show()

Figure out which features are Yes/No/Unknown.

In [None]:
# (name, data type) pair for every column of the trimmed DataFrame.
cols = [(field.name, field.dataType) for field in births_trimmed.schema]

# Names of the string columns whose distinct values include 'Y'.
YNU_cols = []

for name, dtype in cols:
    # Only string-typed columns can hold Y/N/U codes.
    if dtype != types.StringType():
        continue

    distinct_values = [row[0] for row in births.select(name).distinct().collect()]
    if 'Y' in distinct_values:
        YNU_cols.append(name)

DataFrames can transform the features *in bulk* while selecting features.

In [None]:
# Show the raw Y/N/U column next to its integer recoding for the first 5 rows.
nicu_recoded = rec_integer('INFANT_NICU_ADMISSION', fn.lit('YNU')) \
    .alias('INFANT_NICU_ADMISSION_RECODE')
births.select(['INFANT_NICU_ADMISSION', nicu_recoded]).show(5)

Transform all the `YNU_cols` in one go by looping over the columns and recoding each one.

In [None]:
# Recode every Y/N/U column present in the DataFrame to its integer value,
# overwriting the column under the same name.
ynu_present = [name for name in births_transformed.columns if name in YNU_cols]
for name in ynu_present:
    recoded = rec_integer(name, fn.lit('YNU')).alias(name)
    births_transformed = births_transformed.withColumn(name, recoded)

births_transformed.select(YNU_cols).show()

## Get to know your data

### Descriptive statistics

We will use the `colStats(...)` method.

In [None]:
import pyspark.mllib.stat as st
import numpy as np

# Columns treated as numeric for the descriptive statistics.
numeric_cols = [
    'MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE',
    'CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI',
    'MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT',
    'MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN',
]

# colStats works on an RDD of vectors, so turn each Row into a plain list.
numeric_rdd = births_transformed.select(numeric_cols).rdd.map(list)

mllib_stats = st.Statistics.colStats(numeric_rdd)
column_means = mllib_stats.mean()
column_std_devs = np.sqrt(mllib_stats.variance())

# Print each column's name, mean and standard deviation.
for col, mean_value, std_dev in zip(numeric_cols, column_means, column_std_devs):
    print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, mean_value, std_dev))

For the categorical variables we will calculate the frequencies of their values.

In [None]:
# Every column that is not in numeric_cols is treated as categorical.
categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols]

categorical_rdd = births_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row])

# Print, for each categorical column, its (value, count) pairs ordered by
# descending count.
for i, col in enumerate(categorical_cols):
    # Count with map + reduceByKey so partial counts are combined before the
    # shuffle, instead of grouping every row per value just to take len().
    # `i=i` binds the current index into the lambda.
    feq = categorical_rdd \
        .map(lambda row, i=i: (row[i], 1)) \
        .reduceByKey(lambda a, b: a + b)
    print(col, sorted(feq.collect(), key=lambda x: x[1], reverse=True))

### Correlations

Correlations between our features.

In [None]:
# Pairwise correlation matrix of the numeric columns.
corrs = st.Statistics.corr(numeric_rdd)
print(corrs)

# For each column, list the other columns whose correlation with it
# exceeds 0.5 (the diagonal is skipped).
for i, above_threshold in enumerate(corrs > 0.5):
    correlated = [
        (numeric_cols[j], corrs[i][j])
        for j, flagged in enumerate(above_threshold)
        if flagged and j != i
    ]

    for other_name, value in correlated:
        print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], other_name, value))

We can drop most of the highly correlated features.

In [None]:
# Final column set: the target followed by the retained features.
# Compared with selected_features this omits CIG_BEFORE, CIG_2_TRI,
# CIG_3_TRI, MOTHER_DELIVERY_WEIGHT and MOTHER_WEIGHT_GAIN.
features_to_keep = [
    'INFANT_ALIVE_AT_REPORT', 
    'BIRTH_PLACE', 
    'MOTHER_AGE_YEARS', 
    'FATHER_COMBINED_AGE', 
    'CIG_1_TRI', 
    'MOTHER_HEIGHT_IN', 
    'MOTHER_PRE_WEIGHT', 
    'DIABETES_PRE', 
    'DIABETES_GEST', 
    'HYP_TENS_PRE', 
    'HYP_TENS_GEST', 
    'PREV_BIRTH_PRETERM'
]

# Keep only those columns (in this order) and preview the result.
births_transformed = births_transformed.select(features_to_keep)
births_transformed.show()

### Statistical testing

Run a Chi-square test to determine if there are significant differences for categorical variables.

In [None]:
# Display the list of categorical column names as the cell output.
categorical_cols

In [None]:
# Contingency table: one row per INFANT_ALIVE_AT_REPORT value, one count
# column per distinct value of the second categorical column.
feq = births_transformed.groupBy('INFANT_ALIVE_AT_REPORT').pivot(categorical_cols[1]).count()
feq.show()

In [None]:
# Flatten the pivot counts row by row, skipping the leading group-key
# column and treating missing combinations (nulls) as a count of 0.
flat_feq = feq.rdd.flatMap(
    lambda row: [0 if count is None else count for count in row[1:]]
).collect()
flat_feq

In [None]:
import pyspark.mllib.linalg as ln
# flat_feq holds one run of pivot counts per outcome group, and
# Matrices.dense fills column by column, so each outcome group becomes one
# matrix column.  The row count is derived from the data instead of being
# hard-coded, so the cell still works when the pivoted feature does not have
# exactly 8 distinct values.  The 2 is the number of outcome groups.
mat_feq = ln.Matrices.dense(len(flat_feq) // 2, 2, flat_feq)
mat_feq

In [None]:
import pyspark.mllib.linalg as ln

# Chi-square test of independence between the outcome and every categorical
# feature (the first entry of categorical_cols is the outcome itself, so it
# is skipped).  Prints each feature name with its p-value rounded to 4 places.
for cat in categorical_cols[1:]:
    feq = births_transformed.groupBy('INFANT_ALIVE_AT_REPORT').pivot(cat).count()

    # Collect the small pivot table once; the previous version ran the
    # aggregation twice (once to flatten it, once to measure a row).
    feq_rows = feq.collect()

    # Drop the leading group-key column and count missing combinations as 0.
    flat_feq = [0 if e is None else e for row in feq_rows for e in row[1:]]

    # Number of distinct values of `cat` = pivot columns minus the key column.
    row_length = len(feq.columns) - 1
    mat_feq = ln.Matrices.dense(row_length, 2, flat_feq)

    test = st.Statistics.chiSqTest(mat_feq)
    print(cat, round(test.pValue, 4))

## Create the final dataset

### Create an RDD of `LabeledPoint`

We will use a hashing trick to encode the `'BIRTH_PLACE'` feature.

In [None]:
# Preview the data, list the distinct BIRTH_PLACE values, and look at the
# first five rows as raw Row objects.
births_transformed.show()
births_transformed.select('BIRTH_PLACE').distinct().show()
births_transformed.rdd.take(5)

In [None]:
import pyspark.mllib.feature as ft
import pyspark.mllib.regression as reg

# Hash BIRTH_PLACE into a 7-slot term-frequency vector.
hashing = ft.HashingTF(7)

# Replace BIRTH_PLACE with its hashed vector (as a list) and keep every
# other column as is.
# - The value is read with row[i], the position that matches the column
#   name being tested, rather than a hard-coded index.
# - The value is wrapped in a one-element list because HashingTF.transform
#   iterates over its argument; a bare string would be hashed character by
#   character.
births_hashed = births_transformed.rdd.map(
    lambda row: [
        list(hashing.transform([str(row[i])]).toArray())
        if col == 'BIRTH_PLACE' else row[i]
        for i, col in enumerate(features_to_keep)
    ]
)

births_hashed.take(5)

In [None]:
# Wrap every scalar in a one-element list so each row becomes a list of
# lists.  Testing for "is a list" rather than "is exactly int" also wraps
# floats, None and strings, which would otherwise break (or be split into
# characters by) the flattening step below.
births_hashed_lists = births_hashed.map(
    lambda row: [e if isinstance(e, list) else [e] for e in row]
)
print(births_hashed_lists.take(5))

# Flatten each row of lists into a single flat list of values.
births_hashed_all = births_hashed_lists.map(
    lambda row: [item for sublist in row for item in sublist]
)
print(births_hashed_all.take(5))

In [None]:
# The first value of each flat row is the label; the rest form the dense
# feature vector.
births_hashed_final = births_hashed_all.map(
    lambda values: reg.LabeledPoint(
        label=values[0],
        features=ln.Vectors.dense(values[1:]),
    )
)
births_hashed_final.take(10)

### Split into training and testing

Before we move to the modeling stage, we need to split our dataset into two sets: one training set and one testing set.

In [None]:
# Weighted 0.7 / 0.3 random split with a fixed seed.
births_train, births_test = births_hashed_final.randomSplit(
    weights=[0.7, 0.3],
    seed=200,
)

## Predicting infant survival

### Logistic regression in Spark

MLlib used to provide a logistic regression model estimated using a stochastic gradient descent (SGD) algorithm. This model has been deprecated in Spark 2.0 in favor of the `LogisticRegressionWithLBFGS` model. 

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# Fit a logistic regression on the training LabeledPoints, passing
# iterations=10 to the L-BFGS trainer.
LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)

Let's now use the model to predict the classes for our testing set.

In [None]:
# Key each test label by its position in the RDD: (index, label).
LR_test_labels = births_test.map(lambda point: point.label)
LR_truth = LR_test_labels.zipWithIndex().map(lambda pair: (pair[1], pair[0]))
LR_truth.take(5)

In [None]:
# Predict on the test features and key each prediction (as a float) by its
# position in the RDD: (index, prediction).
LR_test_features = births_test.map(lambda point: point.features)
LR_predicted = (
    LR_Model.predict(LR_test_features)
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0] * 1.0))
)
LR_predicted.take(5)

In [None]:
# Join labels and predictions on their shared index.  Each joined record is
# (index, (label, prediction)); emit (prediction, label) because
# BinaryClassificationMetrics expects (score, label) pairs.  The previous
# (label, prediction) order swapped the two roles in the metrics.
LR_results = LR_truth.join(LR_predicted).map(lambda row: (row[1][1], row[1][0]))
LR_results.take(5)

Let's check how well or how badly our model performed.

In [None]:
import pyspark.mllib.evaluation as ev
# Compute and report the area under the precision-recall and ROC curves for
# the logistic regression, then release the evaluator's resources.
LR_evaluation = ev.BinaryClassificationMetrics(LR_results)

LR_area_under_pr = LR_evaluation.areaUnderPR
print('Area under PR: {0:.2f}'.format(LR_area_under_pr))
LR_area_under_roc = LR_evaluation.areaUnderROC
print('Area under ROC: {0:.2f}'.format(LR_area_under_roc))
LR_evaluation.unpersist()

### Random Forest in Spark

We are now ready to build the random forest model. 

In [None]:
from pyspark.mllib.tree import RandomForest

# Train a random forest classifier on the training LabeledPoints.
#   numClasses=2                 -> binary target
#   categoricalFeaturesInfo={}   -> no feature is declared categorical
#   numTrees=6                   -> size of the forest
#   featureSubsetStrategy='all'  -> feature-subset strategy passed to the trainer
#   seed=666                     -> fixed seed for reproducibility
RF_model = RandomForest.trainClassifier(data=births_train, 
                                        numClasses=2, 
                                        categoricalFeaturesInfo={}, 
                                        numTrees=6,  
                                        featureSubsetStrategy='all',
                                        seed=666)

Let's see how well our model did.

In [None]:
# Key each test label by its position in the RDD: (index, label).
RF_test_labels = births_test.map(lambda point: point.label)
RF_truth = RF_test_labels.zipWithIndex().map(lambda pair: (pair[1], pair[0]))
RF_truth.take(5)

In [None]:
# Predict on the test features and key each prediction (as a float) by its
# position in the RDD: (index, prediction).
RF_test_features = births_test.map(lambda point: point.features)
RF_predicted = (
    RF_model.predict(RF_test_features)
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0] * 1.0))
)
RF_predicted.take(5)

In [None]:
# Join labels and predictions on their shared index.  Each joined record is
# (index, (label, prediction)); emit (prediction, label) because
# BinaryClassificationMetrics expects (score, label) pairs.  The previous
# (label, prediction) order swapped the two roles in the metrics.
RF_results = RF_truth.join(RF_predicted).map(lambda row: (row[1][1], row[1][0]))
RF_results.take(5)

In [None]:
# Compute and report the area under the precision-recall and ROC curves for
# the random forest, then release the evaluator's resources.
RF_evaluation = ev.BinaryClassificationMetrics(RF_results)

RF_area_under_pr = RF_evaluation.areaUnderPR
print('Area under PR: {0:.2f}'.format(RF_area_under_pr))
RF_area_under_roc = RF_evaluation.areaUnderROC
print('Area under ROC: {0:.2f}'.format(RF_area_under_roc))
RF_evaluation.unpersist()