In [1]:
import pyspark
from pyspark.sql import SparkSession
# May take awhile locally
spark = SparkSession.builder.appName("Classification").getOrCreate()

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler, OneHotEncoder, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [3]:
# mlflow.set_experiment(experiment_name = "sd_ml")

# #set up client
# from mlflow.tracking import MlflowClient
# client = MlflowClient()

In [4]:
# #create a run

# # Create a run and attach it to the experiment you just created
# experiments = client.list_experiments() # returns a list of mlflow.entities.Experiment

# experiment_name = "sd-ml"
# def create_run(experiment_name):
#     mlflow.set_experiment(experiment_name = experiment_name)
#     for x in experiments:
#         if experiment_name in x.name:
# #             print(experiment_name)
# #             print(x)
#             experiment_index = experiments.index(x)
#             run = client.create_run(experiments[experiment_index].experiment_id) # returns mlflow.entities.Run
# #             print(run)
#             return run

# # Example run command
# # run = create_run('Experiment-3')
# # run = create_run(experiment_name)
# # add tags to run
# # add params and metrics to a run
# # #terminate client

In [5]:
# # test the functionality here
# run = create_run('sd-ml')

# # Add tag to a run
# client.set_tag(run.info.run_id, "Algorithm", "Gradient Boosted Tree")
# client.set_tag(run.info.run_id,"Random Seed",999)
# client.set_tag(run.info.run_id,"Train Perct",999)

# # Add params and metrics to a run
# client.log_param(run.info.run_id, "Max Depth", 999)
# client.log_param(run.info.run_id, "Max Bins", 999)
# client.log_metric(run.info.run_id, "Accuracy", 999)

# # Terminate the client
# client.set_terminated(run.info.run_id)

In [6]:
schema = StructType([
    StructField("age", FloatType(), True),
    StructField("sex", FloatType(), True),
    StructField("chest_pain_type", FloatType(), True),
    StructField("resting_bps", FloatType(), True),
    StructField("chol", FloatType(), True),
    StructField("fast_blood_sugar", FloatType(), True),
    StructField("rest_ecg_type", FloatType(), True),
    StructField("max_hr", FloatType(), True),
    StructField("exercise_angina", FloatType(), True),
    StructField("oldpeak", FloatType(), True),
    StructField("slope_type", FloatType(), True),
    StructField("colored_arteries", FloatType(), True),
    StructField("thal_type", FloatType(), True),
    StructField("heart_disease", IntegerType(), True),
])

df = spark.read.csv("data/processed.cleveland.data", schema = schema, header = False, nullValue='?')
df.limit(6).toPandas()

Unnamed: 0,age,sex,chest_pain_type,resting_bps,chol,fast_blood_sugar,rest_ecg_type,max_hr,exercise_angina,oldpeak,slope_type,colored_arteries,thal_type,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


In [7]:
print(df.describe().toPandas())

  summary                age                  sex     chest_pain_type  \
0   count                303                  303                 303   
1    mean  54.43894389438944   0.6798679867986799  3.1584158415841586   
2  stddev   9.03866244244675  0.46729882777012977  0.9601256119600138   
3     min               29.0                  0.0                 1.0   
4     max               77.0                  1.0                 4.0   

          resting_bps                chol    fast_blood_sugar  \
0                 303                 303                 303   
1  131.68976897689768  246.69306930693068  0.1485148514851485   
2   17.59974772958769  51.776917542637065  0.3561978749279763   
3                94.0               126.0                 0.0   
4               200.0               564.0                 1.0   

        rest_ecg_type              max_hr      exercise_angina  \
0                 303                 303                  303   
1  0.9900990099009901   149.6072607260

In [8]:
df.groupBy('heart_disease').count().show()

+-------------+-----+
|heart_disease|count|
+-------------+-----+
|            1|   55|
|            3|   35|
|            4|   13|
|            2|   36|
|            0|  164|
+-------------+-----+



In [9]:
#check how much missing data there is
data_agg = df.agg(*[count(when(isnull(c), c)).alias(c) for c in df.columns])
print(data_agg.limit(8).toPandas())

df_clean = df.na.drop()
features = df.columns[:-1]
label = 'heart_disease'

   age  sex  chest_pain_type  resting_bps  chol  fast_blood_sugar  \
0    0    0                0            0     0                 0   

   rest_ecg_type  max_hr  exercise_angina  oldpeak  slope_type  \
0              0       0                0        0           0   

   colored_arteries  thal_type  heart_disease  
0                 4          2              0  


In [10]:
# change from multiclass to binary prediction
df2 = df_clean.withColumn('label', when(df.heart_disease == 0, 0).otherwise(1))
print(df2.limit(10).show())

categorical_cols = [col for col in df.columns if "type" in col]
print(categorical_cols)

# encoder = OneHotEncoder(
#     inputCols=[categorical_cols], 
#  outputCols=[col + "_classVec" for col in categorical_cols]
# )

continuous_cols = [f for f in features if "type" not in f]
continuous_cols.remove('sex')

+----+---+---------------+-----------+-----+----------------+-------------+------+---------------+-------+----------+----------------+---------+-------------+-----+
| age|sex|chest_pain_type|resting_bps| chol|fast_blood_sugar|rest_ecg_type|max_hr|exercise_angina|oldpeak|slope_type|colored_arteries|thal_type|heart_disease|label|
+----+---+---------------+-----------+-----+----------------+-------------+------+---------------+-------+----------+----------------+---------+-------------+-----+
|63.0|1.0|            1.0|      145.0|233.0|             1.0|          2.0| 150.0|            0.0|    2.3|       3.0|             0.0|      6.0|            0|    0|
|67.0|1.0|            4.0|      160.0|286.0|             0.0|          2.0| 108.0|            1.0|    1.5|       2.0|             3.0|      3.0|            2|    1|
|67.0|1.0|            4.0|      120.0|229.0|             0.0|          2.0| 129.0|            1.0|    2.6|       2.0|             2.0|      7.0|            1|    1|
|37.0|1.0|

In [11]:
#create dense feature vector and scale
features_list = continuous_cols + categorical_cols

assembler = VectorAssembler(inputCols = features_list, outputCol = 'features')
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=5)
scaler = MinMaxScaler(inputCol="indexed", outputCol="features_scaled")
pipeline = Pipeline(stages=[assembler, indexer, scaler])
scalerModel = pipeline.fit(df2)
scaledData = scalerModel.transform(df2).select('features_scaled', 'label')
scaledData = scaledData.withColumnRenamed("features_scaled", "features")
scaledData.limit(10).toPandas()

Unnamed: 0,features,label
0,"[0.7083333333333333, 0.4811320754716981, 0.244...",0
1,"[0.7916666666666666, 0.6226415094339622, 0.365...",1
2,"[0.7916666666666666, 0.24528301886792453, 0.23...",1
3,"[0.16666666666666666, 0.33962264150943394, 0.2...",0
4,"[0.25, 0.33962264150943394, 0.1780821917808219...",0
5,"(0.5625, 0.24528301886792453, 0.25114155251141...",0
6,"[0.6875, 0.43396226415094336, 0.32420091324200...",1
7,"[0.5833333333333333, 0.24528301886792453, 0.52...",0
8,"[0.7083333333333333, 0.33962264150943394, 0.29...",1
9,"[0.5, 0.43396226415094336, 0.17579908675799086...",1


In [12]:
train, test = scaledData.randomSplit([0.7, 0.3])
print(f"train len: {train.count()}, test len: {test.count()}")

train len: 202, test len: 95


In [13]:
classifiers = [LogisticRegression(), GBTClassifier()]

for classifier in classifiers:
    print(classifier)
    BC_evaluator = BinaryClassificationEvaluator() 

    paramGrid = (ParamGridBuilder().addGrid(classifier.maxIter, [10,15,20]).build())

    crossval = CrossValidator(
        estimator = classifier,
        estimatorParamMaps = paramGrid,
        evaluator=BC_evaluator,
        numFolds= 2)

    fitModel = crossval.fit(train)

    best_model = fitModel.bestModel
    predictions = fitModel.transform(test) #fitModel automatically uses best model
    area_under_roc = BC_evaluator.evaluate(predictions)
    print(f"Area under ROC: {area_under_roc}")


LogisticRegression_3a97edd5501f
Area under ROC: 0.8872727272727272
GBTClassifier_033882483110
Area under ROC: 0.8497727272727273


In [14]:
def ClassTrainEval(classifier,features,classes,train,test):

    def FindMtype(classifier):
        # Intstantiate Model
        M = classifier
        # Learn what it is
        Mtype = type(M).__name__
        
        return Mtype
    
    Mtype = FindMtype(classifier)
    

    def IntanceFitModel(Mtype,classifier,classes,features,train):
  
            # Add parameters of your choice here:
            if Mtype in("LogisticRegression"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .addGrid(classifier.maxIter, [10, 15,20])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("RandomForestClassifier"):
                paramGrid = (ParamGridBuilder() \
                               .addGrid(classifier.maxDepth, [2, 5, 10])
                               .addGrid(classifier.maxBins, [5, 10, 20])
                               .addGrid(classifier.numTrees, [5, 20, 50])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("GBTClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
#                              .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .addGrid(classifier.maxIter, [10, 15,])
                             .build())
                
            
            #Cross Validator requires all of the following parameters:
            crossval = CrossValidator(estimator=classifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=BinaryClassificationEvaluator(),
                                      numFolds=2) 
            
            # Fit Model: Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
    
    fitModel = IntanceFitModel(Mtype,classifier,classes,features,train)
    
    # Print feature selection metrics
    if fitModel is not None:
        
        if Mtype in("DecisionTreeClassifier", "GBTClassifier","RandomForestClassifier"):
            # FEATURE IMPORTANCES
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Feature Importances"+ '\033[0m')
            print("(Scores add up to 1)")
            print("Lowest score is the least important")
            print(" ")
            featureImportances = BestModel.featureImportances.toArray()
            print(featureImportances)
            
            if Mtype in("DecisionTreeClassifier"):
                global DT_featureImportances
                DT_featureImportances = BestModel.featureImportances.toArray()
                global DT_BestModel
                DT_BestModel = BestModel
            if Mtype in("GBTClassifier"):
                global GBT_featureImportances
                GBT_featureImportances = BestModel.featureImportances.toArray()
                global GBT_BestModel
                GBT_BestModel = BestModel
            if Mtype in("RandomForestClassifier"):
                global RF_featureImportances
                RF_featureImportances = BestModel.featureImportances.toArray()
                global RF_BestModel
                RF_BestModel = BestModel

        if Mtype in("LogisticRegression"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficient Matrix"+ '\033[0m')
            print("Coefficients: \n" + str(BestModel.coefficientMatrix))
            print("Intercept: " + str(BestModel.interceptVector))
            global LR_coefficients
            LR_coefficients = BestModel.coefficientMatrix.toArray()
            global LR_BestModel
            LR_BestModel = BestModel
        
   
    # Set the column names to match the external results dataframe that we will join with later:
    columns = ['Classifier', 'Result']
    

    predictions = fitModel.transform(test)
    BC_evaluator = BinaryClassificationEvaluator()
    area_under_roc = (BC_evaluator.evaluate(predictions))*100
    Mtype = [Mtype] # make this a string
    score = [str(area_under_roc)] #make this a string and convert to a list
    result = spark.createDataFrame(zip(Mtype,score), schema=columns)
    result = result.withColumn('Result',result.Result.substr(0, 5))
        
    return result


In [15]:
classes = 2

#set up your results table
columns = ['Classifier', 'Result']
vals = [("Place Holder","N/A")]
results = spark.createDataFrame(vals, columns)

for classifier in classifiers:
    new_result = ClassTrainEval(classifier,features,classes,train,test)
    results = results.union(new_result)
results = results.where("Classifier!='Place Holder'")
results.show(20)

 
[1mLogisticRegression  Coefficient Matrix[0m
Coefficients: 
DenseMatrix([[-2.34905609,  2.68676671,  0.469275  , -1.19255681, -5.01377328,
               0.5127661 ,  0.3803265 ,  3.24886996,  2.16767596,  0.44014217,
               1.18179712,  2.1399278 ]])
Intercept: [-1.0639434478486376]
 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
[0.15647879 0.05654415 0.13557847 0.00040801 0.14525112 0.02227026
 0.09860547 0.08490765 0.06775833 0.00876506 0.01410718 0.20932552]
+------------------+------+
|        Classifier|Result|
+------------------+------+
|LogisticRegression| 88.72|
|     GBTClassifier| 84.97|
+------------------+------+



In [20]:
#Classification diagnostics for logistic regression

lr = LogisticRegression()

crossval = CrossValidator(
        estimator = lr,
        estimatorParamMaps = paramGrid,
        evaluator=BC_evaluator,
        numFolds= 2)
    
lr_model = crossval.fit(train)
best_model = lr_model.bestModel
    
trainingSummary = best_model.summary

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()

print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
classifier.setThreshold(bestThreshold)

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|0.024390243902439025|
|                 0.0| 0.04878048780487805|
|                 0.0| 0.07317073170731707|
|                 0.0|  0.0975609756097561|
|                 0.0| 0.12195121951219512|
|                 0.0| 0.14634146341463414|
|                 0.0| 0.17073170731707318|
|                 0.0|  0.1951219512195122|
|                 0.0| 0.21951219512195122|
|                 0.0| 0.24390243902439024|
|0.008333333333333333| 0.25609756097560976|
|0.016666666666666666|  0.2682926829268293|
|0.016666666666666666|  0.2926829268292683|
|0.016666666666666666|  0.3170731707317073|
|0.016666666666666666| 0.34146341463414637|
|0.016666666666666666| 0.36585365853658536|
|0.016666666666666666|  0.3902439024390244|
|0.016666666666666666|  0.4146341463414634|
|0.016666666666666666| 0.4390243

LogisticRegression_074ab416fe8c