In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('app').getOrCreate()

In [8]:
from pyspark.ml.feature import VectorAssembler

va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("./../../../../../data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string, features: vector]

In [9]:
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|   features|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|[48.0,1.79]|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[20.0,1.25]|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|[24.0,1.65]|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[24.0,1.25]|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom| [6.0,2.55]|
+---------+---------+-------------------

In [11]:
from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
print(km.explainParams())
kmModel = km.fit(sales)

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 5)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: -3274967362699018702)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)


In [12]:
kmModel

KMeans_d9847b6a6389

In [14]:
summary = kmModel.summary
print(summary.clusterSizes) # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

[10, 19, 2, 11, 8]
Cluster Centers: 
[12.    0.93]
[5.21052632 3.74105263]
[48.    1.32]
[24.36363636  0.94636364]
[ 2.5     11.24375]


In [16]:
from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)

In [18]:
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print (gmm.explainParams())
model = gmm.fit(sales)

featuresCol: features column name. (default: features)
k: Number of independent Gaussians in the mixture model. Must be > 1. (default: 2, current: 5)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
seed: random seed. (default: 8948137684748870449)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.01)


In [19]:
summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

[0.17364777675042192, 0.40868479342178, 0.03999996150661165, 0.1630798262595909, 0.21458764206159553]
+--------------------+--------------------+
|                mean|                 cov|
+--------------------+--------------------+
|[17.7721409197281...|41.05070949914583...|
|[6.80376078272665...|10.89519357280599...|
|[6.00000192468753...|4.00000000096711 ...|
|[2.52439808609302...|0.770110579375161...|
|[25.8245785761804...|168.5033578708051...|
+--------------------+--------------------+

+----------+
|prediction|
+----------+
|         4|
|         4|
|         4|
|         4|
|         1|
|         4|
|         2|
|         4|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         0|
|         4|
|         4|
|         1|
|         1|
+----------+
only showing top 20 rows

+--------------------+
|         probability|
+--------------------+
|[8.36858831656477...|
|[3.12247049269465...|
|[3.46826975456394...|
|[4.43116833348206...|
|[

In [20]:
from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)

In [22]:
from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
print (lda.explainParams())
model = lda.fit(prepped)

checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
docConcentration: Concentration parameter (commonly named "alpha") for the prior placed on documents' distributions over topics ("theta"). (undefined)
featuresCol: features column name. (default: features)
k: The number of topics (clusters) to infer. Must be > 1. (default: 10, current: 10)
keepLastCheckpoint: (For EM optimizer) If using checkpointing, this indicates whether to keep the last checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can cause failures if a data partition is lost, so set this bit with care. (default: True)
learningDecay: Learning rate, set as anexponential decay rate. This should be between (0.5, 1.0] to guarantee asymptotic convergence. (default: 0.51)
learningOffset: A (pos

In [23]:
model.describeTopics(3).show()
cvFitted.vocabulary

+-----+--------------+--------------------+
|topic|   termIndices|         termWeights|
+-----+--------------+--------------------+
|    0|[114, 63, 118]|[0.00943511328587...|
|    1|  [49, 28, 56]|[0.00895179944471...|
|    2| [12, 20, 111]|[0.01853200671111...|
|    3|   [60, 8, 10]|[0.01187679234416...|
|    4| [50, 110, 38]|[0.00994906941679...|
|    5|   [30, 9, 66]|[0.01549615627022...|
|    6| [79, 55, 136]|[0.01232558009687...|
|    7|  [11, 5, 115]|[0.01794935152781...|
|    8|    [2, 94, 4]|[0.01416778393951...|
|    9|  [55, 35, 74]|[0.00877663282520...|
+-----+--------------+--------------------+



['water',
 'hot',
 'vintage',
 'bottle',
 'paperweight',
 '6',
 'home',
 'doormat',
 'landmark',
 'bicycle',
 'frame',
 'ribbons',
 '',
 'classic',
 'rose',
 'kit',
 'leaf',
 'sweet',
 'bag',
 'airline',
 'doorstop',
 'light',
 'in',
 'christmas',
 'heart',
 'calm',
 'set',
 'keep',
 'balloons',
 'night',
 'lights',
 '12',
 'tin',
 'english',
 'caravan',
 'stuff',
 'tidy',
 'oxford',
 'full',
 'cottage',
 'notting',
 'drawer',
 'mushrooms',
 'chrome',
 'champion',
 'amelie',
 'mini',
 'the',
 'giant',
 'design',
 'elegant',
 'tins',
 'jet',
 'fairy',
 "50's",
 'holder',
 'message',
 'blue',
 'storage',
 'tier',
 'covent',
 'world',
 'skulls',
 'font',
 'hearts',
 'skull',
 'clips',
 'bell',
 'red',
 'party',
 'chalkboard',
 'save',
 '4',
 'coloured',
 'poppies',
 'garden',
 'nine',
 'girl',
 'shimmering',
 'doughnut',
 'dog',
 '3',
 'tattoos',
 'chilli',
 'coat',
 'torch',
 'sunflower',
 'tale',
 'cards',
 'puncture',
 'woodland',
 'bomb',
 'knack',
 'lip',
 'collage',
 'rabbit',
 'sex