### Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

### Start a Spark sessions

In [2]:
spark = SparkSession.builder.appName('app').getOrCreate()

In [4]:
sales = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load("./../../../../data/retail-data/by-day/*.csv") \
        .limit(50) \
        .coalesce(1) \
        .where("Description IS NOT NULL")

In [5]:
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [6]:
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

va = va.transform(sales)

In [7]:
va.show(1)

+---------+---------+------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|       Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|   features|
+---------+---------+------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|[48.0,1.79]|
+---------+---------+------------------+--------+-------------------+---------+----------+--------------+-----------+
only showing top 1 row



In [8]:
km = KMeans().setK(5)
print(km.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 5)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: -4805716734417628608)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)


In [9]:
kmModel = km.fit(va)

In [10]:
kmModel

KMeans_0e794554310d

In [11]:
summary = kmModel.summary
print(summary.clusterSizes) # number of points in each cluster

[12, 3, 8, 10, 17]


In [12]:
kmModel.computeCost(va)

257.16051377450964

In [13]:
centers = kmModel.clusterCenters()

print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[11.33333333  1.1       ]
[44.          1.16333333]
[ 2.5     11.24375]
[23.2    0.956]
[4.88235294 3.95176471]


In [16]:
from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)