In [1]:
import org.apache.spark.sql.SparkSession

import org.apache.spark.ml.clustering.{KMeans, KMeansSummary}
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


import org.apache.log4j._
// Only let ERROR-level messages through for loggers under the "org" namespace,
// so the notebook output is not buried under framework logging.
Logger.getLogger("org").setLevel(Level.ERROR)

// Obtain the active SparkSession, creating one if none exists yet.
val spark: SparkSession = SparkSession.builder().getOrCreate()

Initializing Scala interpreter ...

Spark Web UI available at http://MSI:4040
SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1605306413700)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.clustering.{KMeans, KMeansSummary}
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@20b26d7a


In [2]:
// Load the wholesale-customers CSV into a DataFrame.
//   header      -> first line supplies the column names
//   inferSchema -> column types are inferred from the data instead of defaulting to string
//   multiline   -> a single record may span more than one physical line
val df = (spark.read
          .options(Map("inferSchema" -> "true", "multiline" -> "true", "header" -> "true"))
          .csv("../../data/ml_scala/Wholesale customers data.csv"))

df: org.apache.spark.sql.DataFrame = [Channel: int, Region: int ... 6 more fields]


In [3]:
// Quick inspection of the loaded data:
// 1) column names and inferred types
df.printSchema()
// 2) the first five rows
df.show(numRows = 5)
// 3) summary statistics over the columns
df.describe().show()

root
 |-- Channel: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- Fresh: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Grocery: integer (nullable = true)
 |-- Frozen: integer (nullable = true)
 |-- Detergents_Paper: integer (nullable = true)
 |-- Delicassen: integer (nullable = true)

+-------+------+-----+----+-------+------+----------------+----------+
|Channel|Region|Fresh|Milk|Grocery|Frozen|Detergents_Paper|Delicassen|
+-------+------+-----+----+-------+------+----------------+----------+
|      2|     3|12669|9656|   7561|   214|            2674|      1338|
|      2|     3| 7057|9810|   9568|  1762|            3293|      1776|
|      2|     3| 6353|8808|   7684|  2405|            3516|      7844|
|      1|     3|13265|1196|   4221|  6404|             507|      1788|
|      2|     3|22615|5410|   7198|  3915|            1777|      5185|
+-------+------+-----+----+-------+------+----------------+----------+
only showing top 5 rows

+----

In [4]:
// Keep only the six product-category columns as model features;
// Channel and Region are left out.
val df_features = df.select(
  "Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")

// Preview the first five rows of the reduced DataFrame.
df_features.show(5)

+-----+----+-------+------+----------------+----------+
|Fresh|Milk|Grocery|Frozen|Detergents_Paper|Delicassen|
+-----+----+-------+------+----------------+----------+
|12669|9656|   7561|   214|            2674|      1338|
| 7057|9810|   9568|  1762|            3293|      1776|
| 6353|8808|   7684|  2405|            3516|      7844|
|13265|1196|   4221|  6404|             507|      1788|
|22615|5410|   7198|  3915|            1777|      5185|
+-----+----+-------+------+----------------+----------+
only showing top 5 rows



df_features: org.apache.spark.sql.DataFrame = [Fresh: int, Milk: int ... 4 more fields]


In [5]:
// Spark ML estimators consume a single vector column, so pack the six numeric
// feature columns into one column named "features".
val assembler = (new VectorAssembler()
                 .setOutputCol("features")
                 .setInputCols(Array("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")))

// Assemble the vectors from the full DataFrame and keep only the assembled column.
val training_data = assembler.transform(df).select(assembler.getOutputCol)

// Preview the first five feature vectors.
training_data.show(5)

+--------------------+
|            features|
+--------------------+
|[12669.0,9656.0,7...|
|[7057.0,9810.0,95...|
|[6353.0,8808.0,76...|
|[13265.0,1196.0,4...|
|[22615.0,5410.0,7...|
+--------------------+
only showing top 5 rows



assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_88cf2229d57e, handleInvalid=error, numInputCols=6
training_data: org.apache.spark.sql.DataFrame = [features: vector]


In [6]:
// Configure K-Means for three clusters; the fixed seed makes the random
// initialisation repeatable between runs.
val kmeans = (new KMeans()
              .setSeed(1L)
              .setK(3))

// Fit the model on the assembled feature vectors.
val model = kmeans.fit(training_data)

// Assign every training row to a cluster (adds an integer "prediction" column).
val predictions = model.transform(training_data)

kmeans: org.apache.spark.ml.clustering.KMeans = kmeans_fa299a69dc5c
model: org.apache.spark.ml.clustering.KMeansModel = KMeansModel: uid=kmeans_fa299a69dc5c, k=3, distanceMeasure=euclidean, numFeatures=6
predictions: org.apache.spark.sql.DataFrame = [features: vector, prediction: int]


In [7]:
// Evaluator left at its defaults, which the notebook output reports as
// metricName=silhouette with distanceMeasure=squaredEuclidean.
val evaluator = new ClusteringEvaluator()

// Score the cluster assignments and report the result.
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = ${silhouette}")

Silhouette with squared euclidean distance = 0.6482181662567144


evaluator: org.apache.spark.ml.evaluation.ClusteringEvaluator = ClusteringEvaluator: uid=cluEval_0a6d9e1fa408, metricName=silhouette, distanceMeasure=squaredEuclidean
silhouette: Double = 0.6482181662567144


In [8]:
// Print each learned centroid on its own line, one vector per cluster.
println("Cluster Centers: ")
for (center <- model.clusterCenters) println(center)

Cluster Centers: 
[7390.958456973294,4439.768545994066,6292.19584569733,2495.53412462908,2238.6528189910982,1158.4480712166173]
[32768.013333333336,4827.68,5723.146666666667,5535.92,1074.1200000000001,2066.6400000000003]
[11849.17857142857,24717.10714285714,33887.71428571428,3409.3214285714284,15459.714285714284,4483.857142857142]


In [9]:
// Shut down the SparkSession now that the analysis is finished.
spark.stop()