In [7]:
import os

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

In [8]:
# Obtain the notebook's Spark session: getOrCreate() returns the session that
# is already active, or builds one with default settings if none exists.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [9]:
# Directory holding the exercise CSV files. The default is the original
# author's machine-specific location; set UTILIZATION_DATA_DIR to point the
# notebook at the data on any other machine without editing this cell.
data_path = os.environ.get(
    "UTILIZATION_DATA_DIR",
    '/Users/natha/Desktop/bootcamp_repo-1/LinkedIn_Spark_SQL_DataFrames/Exercise Files/Data',
)
file_path_no_header = data_path + "/utilization.csv"

# The file is read without a header row, so Spark auto-names the columns
# _c0.._c4. These are the names to assign instead, in file order.
utilization_columns = [
    "event_datetime",
    "server_id",
    "cpu_utilization",
    "free_memory",
    "session_count",
]

# Read the CSV with inferred column types and rename every column in a single
# toDF() call. toDF() raises if the file does not contain exactly
# len(utilization_columns) columns, so an unexpected file layout fails here
# rather than leaving stray _cN columns for later cells to trip over.
df_util = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferSchema", "true")
    .load(file_path_no_header)
    .toDF(*utilization_columns)
)

In [10]:
# Preview the loaded utilization data: first 20 rows, long cell values
# truncated (these are show()'s defaults, spelled out for the reader).
df_util.show(n=20, truncate=True)

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
|03/05/2019 08:31:14|      100|           0.41|       0.58|           48|
|03/05/2019 08:36:14|      100|           0.57|       0.35|           58|
|03/05/2019 08:41:14|      100|           0.41|        0.4|           58|
|03/05/2019 08:46:14|      100|           0.53|       0.35|           62|
|03/05/2019 08:51:14|      100|           0.51|        0.6|           45|
|03/05/2019 08:56:14|      100|       

In [11]:
# Numeric columns that are packed, in this order, into one vector column
# named "features" for the clustering step.
feature_columns = ["cpu_utilization", "free_memory", "session_count"]
vectorAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [12]:
# Append the assembled "features" vector column to the utilization rows;
# df_util itself is left unchanged.
vcluster_df = vectorAssembler.transform(dataset=df_util)

In [13]:
# Preview the assembled frame (first 20 rows, long values truncated) to
# confirm the new "features" column sits alongside the original columns.
vcluster_df.show(n=20, truncate=True)

+-------------------+---------+---------------+-----------+-------------+----------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|        features|
+-------------------+---------+---------------+-----------+-------------+----------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|[0.57,0.51,47.0]|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|[0.47,0.62,43.0]|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|[0.56,0.57,62.0]|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|[0.57,0.56,50.0]|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|[0.35,0.46,43.0]|
|03/05/2019 08:31:14|      100|           0.41|       0.58|           48|[0.41,0.58,48.0]|
|03/05/2019 08:36:14|      100|           0.57|       0.35|           58|[0.57,0.35,58.0]|
|03/05/2019 08:41:14|      100|           0.41|        0.4|           58| [0.41,0.4,58.0]|

In [14]:
# K-means estimator configured for 3 clusters with a fixed seed so the
# centroid initialisation is repeatable between runs.
# NOTE(review): the feature vector mixes cpu_utilization/free_memory (values
# below 1 in the preview) with session_count (tens), so distances are likely
# dominated by session_count -- consider scaling the features first; confirm
# whether that is intended.
kmeans = KMeans(k=3, seed=1)

In [15]:
# Fit the estimator on the assembled frame; it reads the vector column named
# by its featuresCol parameter and returns the trained model.
kmodel = kmeans.fit(dataset=vcluster_df)

In [16]:
# Display the fitted centroids: one array per cluster, with coordinates in
# the same order as the assembled feature vector.
cluster_centers = kmodel.clusterCenters()
cluster_centers

[array([ 0.71174897,  0.28808911, 86.87510507]),
 array([ 0.61918113,  0.38080285, 68.75004716]),
 array([ 0.51439668,  0.48445202, 50.49452021])]