In [1]:
from typing import List, Tuple, Dict, Any, Optional, Callable

import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
import os
import random

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, DoubleType
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
spark = SparkSession.builder.appName("diabetes_indicators").getOrCreate()

23/10/12 20:40:26 WARN Utils: Your hostname, darkstar resolves to a loopback address: 127.0.1.1; using 192.168.1.208 instead (on interface wlp5s0)
23/10/12 20:40:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/12 20:40:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv("./data/diabetes/final_diabetes_dataset.csv", header=True, inferSchema=True, sep=",")
df.show(5)

+--------+--------+--------+-------+------+--------+--------+------+--------+-------+-------+--------+-------+-------+--------+--------+---+--------+-------+-----+-------+
|DIABETE3|CHCKIDNY|_RFHYPE5|TOLDHI2| _BMI5|SMOKE100|CVDSTRK3|_MICHD|_TOTINDA|_FRTLT1|_VEGLT1|_RFDRHV5|MEDCOST|GENHLTH|PHYSHLTH|MENTHLTH|SEX|_AGEG5YR|_MRACE1|EDUCA|INCOME2|
+--------+--------+--------+-------+------+--------+--------+------+--------+-------+-------+--------+-------+-------+--------+--------+---+--------+-------+-----+-------+
|       0|       0|       1|      1|2522.0|       1|       0|     0|       1|      1|      1|       0|      0|    2.0|     2.0|    88.0|  0|    10.0|    1.0|  6.0|    8.0|
|       0|       0|       0|      0|2407.0|       0|       0|     0|       0|      0|      1|       0|      0|    2.0|    88.0|    88.0|  0|     8.0|    1.0|  4.0|    3.0|
|       2|       0|       0|      0|2468.0|       1|       0|     0|       1|      1|      1|       0|      0|    3.0|    88.0|    88.0|  0|

# Feature Scaling and Transformation

In [4]:
def transform_data(df: pyspark.sql.DataFrame, map_dict: dict, colName: str) -> pyspark.sql.DataFrame:
    """ Function to transform predictor variable based on map_dict """
    map_col = F.create_map([F.lit(x) for i in map_dict.items() for x in i])
    new_df = df.withColumn(colName, map_col[F.col(colName)])
    
    return new_df

In [5]:
df = df.withColumn("PHYSHLTH", F.when(df["PHYSHLTH"] == 88, 0).otherwise(df["PHYSHLTH"]))
df = df.withColumn("MENTHLTH", F.when(df["MENTHLTH"] == 88, 0).otherwise(df["MENTHLTH"]))

df = transform_data(df, {v:idx for idx, v in enumerate(range(1, 14))}, "_AGEG5YR")
df = transform_data(df, {v:idx for idx, v in enumerate(range(1, 8))}, "_MRACE1")
df = transform_data(df, {v:idx for idx, v in enumerate(range(1, 7))}, "EDUCA")
df = transform_data(df, {v:idx for idx, v in enumerate(range(1, 9))}, "INCOME2")
df = transform_data(df, {v:idx for idx, v in enumerate(range(1, 6))}, "GENHLTH")

In [6]:
_temp = VectorAssembler(inputCols=["_BMI5", "PHYSHLTH", "MENTHLTH"], outputCol="numericfeatures")
_temp2 = VectorAssembler(inputCols=[x for x in df.columns if x not in ["_BMI5", "PHYSHLTH", "MENTHLTH", "DIABETE3"]], outputCol="catfeatures")
_temp3 = VectorAssembler(inputCols=["numericfeatures", "catfeatures"], outputCol="features")
_temp3.transform(_temp2.transform(_temp.transform(df))).select("numericfeatures","features").show()

23/10/12 20:40:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+--------------------+
|     numericfeatures|            features|
+--------------------+--------------------+
|    [2522.0,2.0,0.0]|(20,[0,1,4,5,6,9,...|
|    [2407.0,0.0,0.0]|(20,[0,11,14,16,1...|
|    [2468.0,0.0,0.0]|(20,[0,6,9,10,11,...|
|    [2317.0,0.0,0.0]|(20,[0,6,9,14,16,...|
|   [2800.0,0.0,10.0]|(20,[0,2,5,12,14,...|
|[2839.34065274683...|(20,[0,4,6,9,10,1...|
|    [2824.0,7.0,0.0]|(20,[0,1,6,11,14,...|
|    [3723.0,0.0,0.0]|[3723.0,0.0,0.0,1...|
|    [2789.0,0.0,0.0]|(20,[0,4,5,6,8,11...|
|    [3119.0,0.0,0.0]|(20,[0,5,6,9,10,1...|
|    [3325.0,0.0,0.0]|(20,[0,4,6,9,10,1...|
|    [3085.0,0.0,0.0]|(20,[0,4,9,14,16,...|
|    [2441.0,3.0,5.0]|[2441.0,3.0,5.0,0...|
|    [2577.0,1.0,0.0]|(20,[0,1,6,9,10,1...|
|   [2443.0,30.0,0.0]|(20,[0,1,4,5,6,10...|
|    [2507.0,0.0,1.0]|(20,[0,2,4,5,6,8,...|
|   [3673.0,2.0,30.0]|(20,[0,1,2,4,5,9,...|
|   [2859.0,10.0,0.0]|(20,[0,1,9,11,16,...|
|    [3338.0,0.0,1.0]|(20,[0,2,4,5,6,8,...|
|   [3759.0,5.0,10.0]|[3759.0,5.

In [7]:
def scale_data(df: pyspark.sql.DataFrame, feats: List) -> pyspark.sql.DataFrame: 
    """ Function to scale the data """
    def scale_feat(df: pyspark.sql.DataFrame, feat: str) -> pyspark.sql.DataFrame:
        """ Function to scale numeric columns of dataframe """
        unlist = F.udf(lambda x: round(float(list(x)[0]), 3), DoubleType())

        assembler = VectorAssembler(inputCols=[feat], outputCol=feat+"_vec")
        scaler = StandardScaler(inputCol=feat+"_vec", outputCol=feat+"_scaled")
        pipeline = Pipeline(stages=[
            assembler,
            scaler
        ])

        if not os.path.exists(f"./scalers/{feat}_pipeline"):
            pipeline_model = pipeline.fit(df)
            pipeline_model.save(f"./scalers/{feat}_pipeline")

        else:
            print(f"Loading pipeline from : ./scalers/{feat}_pipeline")
            pipeline_model = PipelineModel.load(f"./scalers/{feat}_pipeline")

        df = pipeline_model.transform(df).withColumn(feat+"_scaled", unlist(feat+"_scaled")).drop(feat+"_vec")

        return df, feat+"_scaled"
    
    new_feats = list()
    for feat in feats:
        df, _name = scale_feat(df, feat)
        new_feats.append(_name)
        
    return df, new_feats

In [8]:
if not os.path.exists("./scalers/"):
    os.mkdir("./scalers")


num_cols = ["_BMI5", "PHYSHLTH", "MENTHLTH"]
cat_cols = [x for x in df.columns if x not in ["DIABETE3"]+num_cols]

df_train, df_test = df.randomSplit([.8, .2], seed=42)

In [9]:
df_train_scaled, train_new_feats = scale_data(df_train, num_cols)
df_test_scaled, test_new_feats = scale_data(df_test, num_cols)

df_train_scaled = df_train_scaled.withColumnRenamed("DIABETE3", "label")
df_test_scaled = df_test_scaled.withColumnRenamed("DIABETE3", "label")

Loading pipeline from : ./scalers/_BMI5_pipeline
Loading pipeline from : ./scalers/PHYSHLTH_pipeline
Loading pipeline from : ./scalers/MENTHLTH_pipeline
Loading pipeline from : ./scalers/_BMI5_pipeline
Loading pipeline from : ./scalers/PHYSHLTH_pipeline
Loading pipeline from : ./scalers/MENTHLTH_pipeline


In [10]:
dtree = DecisionTreeClassifier(featuresCol="features", labelCol="label", seed=32)
rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=32)
gbtree = GBTClassifier(featuresCol="features", labelCol="label", seed=32)

In [11]:
paramGrid_dtree = ParamGridBuilder() \
.addGrid(dtree.maxDepth, [5, 10, 15, 20])\
.build()

paramGrid_rf = ParamGridBuilder()\
.addGrid(rf.maxDepth, [5, 10, 15, 20])\
.addGrid(rf.numTrees, [100, 200, 300, 400])\
.build()

paramGrid_gb = ParamGridBuilder()\
.addGrid(gbtree.maxDepth, [5, 10, 15, 20])\
.addGrid(gbtree.maxIter, [20, 50, 100, 300])\
.build()

In [12]:
pipeline_dtree = Pipeline(
    stages=[
        VectorAssembler(inputCols=cat_cols + train_new_feats, outputCol="features"),
        dtree
    ]
)

pipeline_rf = Pipeline(
    stages=[
        VectorAssembler(inputCols=cat_cols + train_new_feats, outputCol="features"),
        rf
    ]
)

pipeline_gbtree = Pipeline(
    stages=[
        VectorAssembler(inputCols=cat_cols + train_new_feats, outputCol="features"),
        gbtree
    ]
)

# Model Cross-Validation

## - Decision Tree

In [13]:
crossval_dtree = CrossValidator(estimator=pipeline_dtree, 
                               estimatorParamMaps=paramGrid_dtree, 
                               evaluator=MulticlassClassificationEvaluator(),
                               numFolds=3)
cvmodel_dtree = crossval_dtree.fit(df_train_scaled)

23/10/12 20:40:58 WARN DAGScheduler: Broadcasting large task binary with size 1223.1 KiB
23/10/12 20:40:58 WARN DAGScheduler: Broadcasting large task binary with size 1656.5 KiB
23/10/12 20:40:58 WARN DAGScheduler: Broadcasting large task binary with size 1131.5 KiB
23/10/12 20:41:00 WARN DAGScheduler: Broadcasting large task binary with size 1223.1 KiB
23/10/12 20:41:00 WARN DAGScheduler: Broadcasting large task binary with size 1656.5 KiB
23/10/12 20:41:00 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/10/12 20:41:00 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/10/12 20:41:00 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/10/12 20:41:01 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/10/12 20:41:01 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/10/12 20:41:01 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
23/10/12 20:41:06 WARN DAGScheduler: Br

In [14]:
cvmodel_dtree.avgMetrics

[0.8025365038195532,
 0.8070890667363112,
 0.7998966711183496,
 0.7839792735138795]

## - Random Forest

In [15]:
crossval_rf = CrossValidator(estimator=pipeline_rf, 
                               estimatorParamMaps=paramGrid_rf, 
                               evaluator=MulticlassClassificationEvaluator(),
                               numFolds=3)
cvmodel_rf = crossval_rf.fit(df_train_scaled)

23/10/12 20:42:01 WARN DAGScheduler: Broadcasting large task binary with size 1136.4 KiB
23/10/12 20:42:08 WARN DAGScheduler: Broadcasting large task binary with size 1136.8 KiB
23/10/12 20:42:15 WARN DAGScheduler: Broadcasting large task binary with size 1138.1 KiB
23/10/12 20:42:24 WARN DAGScheduler: Broadcasting large task binary with size 1602.2 KiB
23/10/12 20:42:27 WARN DAGScheduler: Broadcasting large task binary with size 1160.4 KiB
23/10/12 20:42:35 WARN DAGScheduler: Broadcasting large task binary with size 1602.9 KiB
23/10/12 20:42:37 WARN DAGScheduler: Broadcasting large task binary with size 1157.3 KiB
23/10/12 20:42:45 WARN DAGScheduler: Broadcasting large task binary with size 1604.8 KiB
23/10/12 20:42:48 WARN DAGScheduler: Broadcasting large task binary with size 1159.7 KiB
23/10/12 20:42:55 WARN DAGScheduler: Broadcasting large task binary with size 1100.6 KiB
23/10/12 20:42:58 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
23/10/12 20:43:02 WARN D

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
cvmodel_rf.avgMetrics

In [None]:
crossval_gbtree = CrossValidator(estimator=pipeline_gbtree, 
                               estimatorParamMaps=paramGrid_gbtree, 
                               evaluator=MulticlassClassificationEvaluator(),
                               numFolds=3)
cvmodel_gbtree = crossval_gbtree.fit(df_train_scaled)

In [None]:
cvmodel_gbtree.avgMetrics

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 37200)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/home/darthvader/miniconda3/envs/expenv/l