In [31]:
import findspark
findspark.init()
import time as t

In [32]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('patients').getOrCreate()

In [33]:
from pyspark.ml.clustering import KMeans
dataset = spark.read.csv("D:\COURSE PDFs\College Notes\SEMESTER VI\Big Data Analytics\Healthcare Stroke Dataset\data.csv",header=True,inferSchema=True)

In [34]:
dataset.head(1)


[Row(id=9046, gender=1, age=67.0, hypertension=0, heart_disease=1, ever_married=1, work_type=1, Residence_type=1, avg_glucose_level=228.69, smoking_status=1, stroke=1)]

In [35]:
dataset.printSchema()


root
 |-- id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: integer (nullable = true)
 |-- work_type: integer (nullable = true)
 |-- Residence_type: integer (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- smoking_status: integer (nullable = true)
 |-- stroke: integer (nullable = true)



In [36]:
columns_to_drop = ['avg_glucose_level']
dataset = dataset.drop(*columns_to_drop)
dataset.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: integer (nullable = true)
 |-- work_type: integer (nullable = true)
 |-- Residence_type: integer (nullable = true)
 |-- smoking_status: integer (nullable = true)
 |-- stroke: integer (nullable = true)



In [37]:
dataset.columns


['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'smoking_status',
 'stroke']

In [38]:
start_time=t.time()
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [39]:
from pyspark.sql.types import IntegerType
dataset = dataset.withColumn("smoking_status", dataset["smoking_status"].cast(IntegerType()))

In [40]:
dataset.printSchema()


root
 |-- id: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: integer (nullable = true)
 |-- work_type: integer (nullable = true)
 |-- Residence_type: integer (nullable = true)
 |-- smoking_status: integer (nullable = true)
 |-- stroke: integer (nullable = true)



In [41]:
feat_cols = [
 'gender','age','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']

In [42]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')


In [43]:
final_data = vec_assembler.transform(dataset)


In [44]:
from pyspark.ml.feature import StandardScaler

In [45]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)


In [46]:
final_data



DataFrame[id: int, gender: int, age: double, hypertension: int, heart_disease: int, ever_married: int, work_type: int, Residence_type: int, smoking_status: int, stroke: int, features: vector]

In [47]:
scalerModel = scaler.fit(final_data)


In [48]:
cluster_final_data = scalerModel.transform(final_data)


In [49]:
kmeans3 = KMeans(featuresCol='scaledFeatures',k=3)
kmeans2 = KMeans(featuresCol='scaledFeatures',k=2)

In [50]:
model3 = kmeans3.fit(cluster_final_data)
model2 = kmeans2.fit(cluster_final_data)

In [51]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [52]:
predictions3 = model3.transform(cluster_final_data)
predictions2 = model2.transform(cluster_final_data)

In [53]:
evaluator = ClusteringEvaluator()

In [54]:
silhouette = evaluator.evaluate(predictions3)
print("With k=3 Silhouette with squared euclidean distance = " + str(silhouette))
silhouette = evaluator.evaluate(predictions2)
print("With k=2 Silhouette with squared euclidean distance = " + str(silhouette))

With k=3 Silhouette with squared euclidean distance = -0.051780492409706495
With k=2 Silhouette with squared euclidean distance = 0.08301564925690019


In [55]:
centers=model2.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

Cluster Centers:
[8.30328948e-01 1.77326310e+00 0.00000000e+00 1.77700557e-01
 1.30316618e+00 1.68107417e+00 2.98497153e+00 1.14720772e+00
 2.09625525e-03]
[0.90508111 2.81444811 2.47274111 0.63844992 1.8911928  1.39416261
 2.97800074 0.81946711 1.68944126]


In [56]:
for k in range(2,5):
    kmeans = KMeans(featuresCol='scaledFeatures',k=k)
    model = kmeans.fit(cluster_final_data)
    predictions = model.transform(cluster_final_data)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("With K={}".format(k))
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    print('--'*30)

With K=2
Silhouette with squared euclidean distance = 0.08301564925690019
------------------------------------------------------------
With K=3
Silhouette with squared euclidean distance = -0.051780492409706495
------------------------------------------------------------
With K=4
Silhouette with squared euclidean distance = -0.08035770895238105
------------------------------------------------------------


In [57]:
model3.transform(cluster_final_data).groupBy('prediction').count().show()


+----------+-----+
|prediction|count|
+----------+-----+
|         1|  432|
|         2|  249|
|         0| 4429|
+----------+-----+



In [58]:
model2.transform(cluster_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  679|
|         0| 4431|
+----------+-----+



In [59]:
end_time = t.time()

In [60]:
end_time - start_time

8.335537672042847