In [16]:
import os.path

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Attach to the already-running Spark session if there is one; otherwise start
# a new session with default settings. Every later cell goes through `spark`.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()


In [17]:
# Directory that contains the exercise data sets.
# The location can be overridden without editing the notebook by setting the
# SPARK_EXERCISE_DATA_DIR environment variable before starting the kernel;
# when it is unset, the original hard-coded location is used as the default.
folder_path = os.environ.get(
    'SPARK_EXERCISE_DATA_DIR',
    '/Users/sg0218817/Downloads/Ex_Files_Spark_SQL_DataFrames/Exercise Files/Data',
)

# Read the JSON records stored under "<folder_path>/utilization".
utilization_path = os.path.join(folder_path, 'utilization')
df = spark.read.json(utilization_path)

# Register the DataFrame as a temp view so it can also be queried with
# spark.sql(...) under the name "utilization".
df.createOrReplaceTempView('utilization')


### KMeans


In [18]:
# Pack the three numeric utilization columns into a single vector column
# named "features", which is the column KMeans reads by default.
vectorAssembler = VectorAssembler(
    inputCols=['cpu_utilization', 'free_memory', 'session_count'],
    outputCol='features',
)
vcluster_df = vectorAssembler.transform(df)
vcluster_df.show(n=5)

# Three clusters; the fixed seed makes the random initialisation repeatable.
# NOTE(review): the features are not rescaled before clustering. In the sample
# rows session_count is in the tens while the other two columns are below 1,
# so it likely dominates the distance computation - consider scaling first.
kmeans = KMeans(k=3, seed=1)

# Fit on the assembled "features" column, then display the cluster centers
# (one array per cluster, in inputCols order) as the cell's result.
kmodel = kmeans.fit(vcluster_df)
kmodel.clusterCenters()


+---------------+-------------------+-----------+---------+-------------+----------------+
|cpu_utilization|         event_date|free_memory|server_id|session_count|        features|
+---------------+-------------------+-----------+---------+-------------+----------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|[0.57,0.51,47.0]|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|[0.47,0.62,43.0]|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|[0.56,0.57,62.0]|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|[0.57,0.56,50.0]|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|[0.35,0.46,43.0]|
+---------------+-------------------+-----------+---------+-------------+----------------+
only showing top 5 rows



[array([ 0.61918113,  0.38080285, 68.75004716]),
 array([ 0.71174897,  0.28808911, 86.87510507]),
 array([ 0.51439668,  0.48445202, 50.49452021])]

### Linear Regression


In [19]:
# Single-predictor setup: cpu_utilization is the only input feature and
# session_count is the value being predicted.
# Note: this rebinds `vectorAssembler`, replacing the three-column assembler
# created in the KMeans cell.
vectorAssembler = VectorAssembler(
    inputCols=['cpu_utilization'],
    outputCol='features',
)
df_vutil = vectorAssembler.transform(df)
df_vutil.show(n=5)

lr = LinearRegression(
    featuresCol='features',
    labelCol='session_count',
)
lr_model = lr.fit(df_vutil)

# The summary describes the fit on the same data the model was trained on,
# so the RMSE below is a training error, not a held-out estimate.
training_summary = lr_model.summary

print('coefficients=', lr_model.coefficients)
print('intercept=', lr_model.intercept)
print('rootMeanSquaredError=', training_summary.rootMeanSquaredError)


+---------------+-------------------+-----------+---------+-------------+--------+
|cpu_utilization|         event_date|free_memory|server_id|session_count|features|
+---------------+-------------------+-----------+---------+-------------+--------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|  [0.57]|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|  [0.47]|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|  [0.56]|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|  [0.57]|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|  [0.35]|
+---------------+-------------------+-----------+---------+-------------+--------+
only showing top 5 rows

coefficients= [47.023972214607845]
intercept= 40.41695103556818
rootMeanSquaredError= 12.837990225931815
