# Diabetes Readmission Modelling Using Decision Trees

In [0]:
# Data processing
from pyspark.sql.functions import log, col, exp

# Modeling
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [0]:
# Copy the registered source table to a Delta location so that the next
# cell can load the data from that path.
write_path = 'dbfs:/tmp/reproducible_ml_uofl/diab_readmit_csv.delta'
diabetes_readmit = spark.read.table("diab_readmit_csv")
(
    diabetes_readmit.write
    .format('delta')
    .mode('overwrite')  # replace whatever an earlier run left at write_path
    .save(write_path)
)

In [0]:
# Reload the data from the Delta copy written to write_path.
delta_reader = spark.read.format('delta')
diabetes_readmit = delta_reader.load(write_path)

# Basic summary statistics (count, mean, stddev, min, quartiles, max) per column
summary_stats = diabetes_readmit.summary()
display(summary_stats)

summary,patient_nbr,time_in_hospital,num_procedures,num_lab_procedures,num_medications,number_outpatient,number_inpatient,number_emergency,number_diagnoses,gender_cd,DiabetesMedication,readmit_flag,race_cd
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766
mean,54330400.69494724,4.395986871843248,1.339730361810428,43.09564098028811,16.021844230882614,0.3693571526836074,0.635565906098304,0.1978362124874712,7.422606764538254,0.5375862272271682,0.7700312481575379,0.1115991588546272,
stddev,38696359.34653421,2.985107767471267,1.705806979121172,19.674362249142096,8.127566209167309,1.2672650965326815,1.26286329009732,0.9304722684224632,1.9336001449974247,0.4985877237567153,0.420814525814695,0.3148741984505526,
min,135.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,AfrAmr
25%,23412645.0,2.0,0.0,31.0,10.0,0.0,0.0,0.0,6.0,0.0,1.0,0.0,
50%,45500490.0,4.0,1.0,44.0,15.0,0.0,0.0,0.0,8.0,1.0,1.0,0.0,
75%,87532902.0,6.0,2.0,57.0,20.0,0.0,1.0,0.0,9.0,1.0,1.0,0.0,
max,189502619.0,14.0,6.0,132.0,81.0,42.0,21.0,76.0,16.0,1.0,1.0,1.0,White


In [0]:
# Split the data 65% / 35% into training and testing sets; the fixed seed
# keeps the split repeatable between runs.
trainDF, testDF = diabetes_readmit.randomSplit([0.65, 0.35], seed=42)

# Cache both splits (they are reused by several later cells) and count them.
n_train_records = trainDF.cache().count()
n_test_records = testDF.cache().count()

# Report the size of each split
print(f'There are {n_train_records} records in the training dataset.')
print(f'There are {n_test_records} records in the testing dataset.')

In [0]:
# Show how many training records fall into each category of the string
# feature race_cd (the column that is one-hot encoded in the next step).
trainDF.groupBy('race_cd').count().show()

## Now we need to convert the categorical variable race_cd into a one-hot-encoded version
For this, we will use a pipeline that performs the conversion in one step.

In [0]:
# Build one pipeline that indexes and one-hot encodes every categorical
# column, so the whole conversion is a single fit and a single transform.
categoricalColumns = ["race_cd"]
stages = []  # pipeline stages, in execution order

for categoricalCol in categoricalColumns:
    # Category indexing: map each distinct string value to a numeric index.
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # One-hot encoding: turn the numeric index into a binary SparseVector.
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Register both stages for this column. Creating the encoder inside the
    # loop ensures every listed column is encoded, not only the last one.
    stages += [stringIndexer, encoder]

# Define the pipeline from the stages collected above.
pipeline = Pipeline(stages=stages)

# Fit the pipeline on the training data only; the fitted model is reused
# unchanged to transform the test data in a later cell.
transform_mdl = pipeline.fit(trainDF)
trainDF21 = transform_mdl.transform(trainDF)
trainDF21.show()

In [0]:
# Print the schema of the transformed training data to inspect the columns
# added by the encoding pipeline.
trainDF21.printSchema()

In [0]:
# The classifier reads its predictors from a single vector column, so
# combine the numeric input columns into one "features" vector.
# NOTE(review): the one-hot encoded column race_cdclassVec produced by the
# pipeline is not part of this list - confirm whether that is intentional.
feature_columns = [
    'time_in_hospital',
    'num_procedures',
    'num_medications',
    'number_inpatient',
    'number_emergency',
    'number_diagnoses',
    'DiabetesMedication',
]
vecAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF21)

In [0]:
# Apply the encoding pipeline fitted on the training data to the test data,
# reusing the parameters learned during training.
testDF21 = transform_mdl.transform(testDF)

# Assemble the test feature vector with the same vector assembler.
vecTestDF = vecAssembler.transform(testDF21)

In [0]:
# Create the decision tree classifier (trees may grow up to 20 levels deep)
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="readmit_flag",
    maxDepth=20,
)

# Fit on the training features and score the same data to inspect the output.
dtModel = dt.fit(vecTrainDF)
predict_train = dtModel.transform(vecTrainDF)

# Preview the label next to the model's output columns for the first 10 rows.
preview_columns = ['readmit_flag', 'rawPrediction', 'prediction', 'probability']
predict_train.select(*preview_columns).show(10)

In [0]:
# Score the testing dataset with the trained model
predict_test = dtModel.transform(vecTestDF)

# Count how many test records received each predicted class
predicted_class_counts = predict_test.groupBy('prediction').count()
predicted_class_counts.show()

In [0]:
# Evaluate the model with the area under the ROC curve on both splits.
#
# The evaluator is scored on the "probability" column rather than the hard
# 0/1 "prediction" column: a ROC curve needs a continuous score to rank
# records, and a 0/1 prediction collapses the curve to a single operating
# point, which distorts the reported AUC.
#
# The evaluator is named auc_evaluator (not `eval`) so that it does not
# shadow Python's builtin eval().
auc_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="probability",
    labelCol="readmit_flag",
    metricName="areaUnderROC",
)

# AUC on the training data (the data the tree was fitted on)
auc_train = auc_evaluator.evaluate(predict_train)
print(auc_train)

# AUC on the held-out testing data
auc_test = auc_evaluator.evaluate(predict_test)
print(auc_test)