In [1]:
#To import all required modules
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import os
import pandas as pd
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans



In [3]:
spark = SparkSession.builder.appName("Clustering").getOrCreate()

In [6]:
# Loading the dataset
diabetes_dataFrame = spark.read.load("dataset/Diabetes.csv", format="csv", header=True, inferSchema=True, delimiter=",")
# Check the type of diabetes_dataFrame
print("The type of diabetes_dataFrame is", type(diabetes_dataFrame))
#To show the first 10 rows
pd.DataFrame(diabetes_dataFrame.take(10), columns=diabetes_dataFrame.columns).transpose()

The type of diabetes_dataFrame is <class 'pyspark.sql.dataframe.DataFrame'>


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
encounter_id,2278392,149190,64410,500364,16680,35754,55842,63768,12522,15738
patient_nbr,8222157,55629189,86047875,82442376,42519267,82637451,84259809,114882984,48330783,63555939
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian
gender,Female,Female,Female,Male,Male,Male,Male,Male,Female,Female
age,[0-10),[10-20),[20-30),[30-40),[40-50),[50-60),[60-70),[70-80),[80-90),[90-100)
weight,?,?,?,?,?,?,?,?,?,?
admission_type_id,6,1,1,1,1,2,3,1,2,3
discharge_disposition_id,25,1,1,1,1,1,1,1,1,3
admission_source_id,1,7,7,7,7,2,2,7,4,4
time_in_hospital,1,3,2,2,1,3,4,5,13,12


In [15]:
#To remove null values 
diabetes_dataFrame = diabetes_dataFrame.filter(diabetes_dataFrame.weight.isNotNull())
diabetes_dataFrame = diabetes_dataFrame.filter(diabetes_dataFrame.weight != "?")
diabetes_dataFrame = diabetes_dataFrame.filter(diabetes_dataFrame.race != "?")
diabetes_dataFrame = diabetes_dataFrame.filter(diabetes_dataFrame.race.isNotNull())
diabetes_dataFrame = diabetes_dataFrame.filter(diabetes_dataFrame.payer_code != "?")
diabetes_dataFrame = diabetes_dataFrame.filter(diabetes_dataFrame.payer_code != "None")
pd.DataFrame(diabetes_dataFrame.take(10), columns=diabetes_dataFrame.columns).transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
encounter_id,88754226,88792836,88816788,88986678,89032962,89035518,89125038,89191392,89277516,89307582
patient_nbr,78163290,100654011,85400073,58682736,69250302,69053373,59606946,62022042,30950811,58763808
race,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian
gender,Male,Female,Female,Male,Male,Male,Male,Male,Male,Female
age,[40-50),[70-80),[60-70),[80-90),[60-70),[80-90),[80-90),[40-50),[50-60),[70-80)
weight,[75-100),[75-100),[75-100),[50-75),[100-125),[75-100),[75-100),[75-100),[100-125),[75-100)
admission_type_id,3,3,5,1,1,1,1,1,3,1
discharge_disposition_id,1,3,6,11,1,1,3,3,1,3
admission_source_id,1,1,4,5,7,6,5,7,1,7
time_in_hospital,1,10,10,6,2,7,6,3,2,10


In [20]:
#To get numeric features that we need only
diabetes_numeric_dataFrame = diabetes_dataFrame.select("admission_type_id", "discharge_disposition_id", "admission_source_id", "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses")


In [27]:
#To Change numeric features data type into integer
diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("admission_type_id", diabetes_numeric_dataFrame["admission_type_id"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("discharge_disposition_id", diabetes_numeric_dataFrame["discharge_disposition_id"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("admission_source_id", diabetes_numeric_dataFrame["admission_source_id"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("time_in_hospital", diabetes_numeric_dataFrame["time_in_hospital"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("num_lab_procedures", diabetes_numeric_dataFrame["num_lab_procedures"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("num_procedures", diabetes_numeric_dataFrame["num_procedures"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("num_medications", diabetes_numeric_dataFrame["num_medications"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("number_outpatient", diabetes_numeric_dataFrame["number_outpatient"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("number_emergency", diabetes_numeric_dataFrame["number_emergency"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("number_inpatient", diabetes_numeric_dataFrame["number_inpatient"].cast(IntegerType()))

diabetes_numeric_dataFrame = diabetes_numeric_dataFrame.withColumn("number_diagnoses", diabetes_numeric_dataFrame["number_diagnoses"].cast(IntegerType()))

#T generate the vector assembler for numeric features, I choose the first five columns
diabetes_dataFrame_vector = VectorAssembler(inputCols=diabetes_numeric_dataFrame.columns[:10], outputCol="features")
diabetes_numeric_dataFrame = diabetes_dataFrame_vector.transform(diabetes_numeric_dataFrame)
diabetes_numeric_dataFrame.show(10)

+-----------------+------------------------+-------------------+----------------+------------------+--------------+---------------+-----------------+----------------+----------------+----------------+--------------------+
|admission_type_id|discharge_disposition_id|admission_source_id|time_in_hospital|num_lab_procedures|num_procedures|num_medications|number_outpatient|number_emergency|number_inpatient|number_diagnoses|            features|
+-----------------+------------------------+-------------------+----------------+------------------+--------------+---------------+-----------------+----------------+----------------+----------------+--------------------+
|                3|                       1|                  1|               1|                28|             2|             15|                0|               0|               0|               6|[3.0,1.0,1.0,1.0,...|
|                3|                       3|                  1|              10|                65|            

In [29]:
# To get the label and features vector
model_data = diabetes_numeric_dataFrame.select("features")
#To split the model dataset into training and testing datasets, 80% for taining and 20% for testing
training, test = model_data.randomSplit([0.8, 0.2])
training.show(5)

+--------------------+
|            features|
+--------------------+
|[1.0,1.0,1.0,2.0,...|
|[1.0,1.0,4.0,7.0,...|
|[1.0,1.0,5.0,1.0,...|
|[1.0,1.0,5.0,2.0,...|
|[1.0,1.0,5.0,3.0,...|
+--------------------+
only showing top 5 rows



In [42]:
#To train the model on the training dataset
kmeans = KMeans().setK(10).setSeed(1)
model = kmeans.fit(training)

In [45]:
#To perform testing on the test dataset
predictions = model.transform(test)
predictions.show(5)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[1.0,1.0,5.0,3.0,...|         6|
|[1.0,1.0,6.0,2.0,...|         0|
|[1.0,1.0,6.0,3.0,...|         6|
|[1.0,1.0,6.0,6.0,...|         6|
|[1.0,1.0,6.0,9.0,...|         6|
+--------------------+----------+
only showing top 5 rows



In [44]:
# To evaluate model and compute accuracy on the test set
# Evaluate clustering by computing Silhouette score
evaluate_kmeans_model = ClusteringEvaluator()

silhouette = evaluate_kmeans_model.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.3615958833867857
Cluster Centers: 
[ 2.65919283  2.49327354  1.60089686  3.52914798 33.23766816  1.9058296
 15.74887892  1.42152466  0.16591928  0.68609865]
[ 1.41176471  3.29411765  5.33823529  9.63235294 79.79411765  3.64705882
 37.57352941  0.95588235  0.25        0.77941176]
[ 1.59283388  2.43648208  4.75895765  4.02931596 54.99674267  1.39739414
 13.33224756  1.27687296  0.30618893  0.51791531]
[ 1.12565445  2.9895288   6.19371728  5.60209424 77.28795812  1.12041885
 16.05759162  1.07853403  0.27225131  0.4973822 ]
[ 2.85454545  2.45454545  1.47272727  2.70909091  3.92727273  1.56363636
 14.54545455  1.27272727  0.07272727  0.54545455]
[ 1.36818182  2.70909091  5.25454545  6.10909091 67.88181818  1.87272727
 21.59545455  1.50454545  0.35909091  0.87272727]
[ 1.2388664   2.58704453  5.63562753  3.63967611 65.87854251  0.67611336
 10.17004049  1.05668016  0.32388664  0.61133603]
[ 2.80263158  2.94736842  1.76315789  3.25       17.921052