In [0]:
# KMeans in Spark 
# Reference: https://spark.apache.org/docs/latest/ml-clustering.html#k-means

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Read the sample k-means input, which is stored in LIBSVM format.
dataset = spark.read.load("/FileStore/sample_kmeans_data.txt", format="libsvm")

# Configure k-means for two clusters with a fixed seed, then fit it.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

# Run the fitted model over the same data to obtain cluster assignments.
predictions = model.transform(dataset)

# Score the clustering with the evaluator's default settings (Silhouette score).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Report the centre of every fitted cluster, one per line.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.9997530305375207
Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [0]:
# LDA in Spark
# https://spark.apache.org/docs/latest/ml-clustering.html#latent-dirichlet-allocation-lda

from pyspark.ml.clustering import LDA

# Load the sample LDA corpus, which is stored in LIBSVM format.
dataset = spark.read.format("libsvm").load("/FileStore/sample_lda_libsvm_data.txt")

# Train a 5-topic LDA model for 10 iterations.
# The seed is pinned explicitly so that re-running the notebook on a fresh
# kernel reproduces the same topics and scores; without it the estimator
# falls back to a default seed that is not guaranteed to be stable across
# sessions.
lda = LDA(k=5, maxIter=10, seed=1)
model = lda.fit(dataset)

# Report the model-fit statistics computed on the training corpus.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe each topic by its 4 top-weighted terms.
topics = model.describeTopics(4)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Apply the model to the corpus and show the transformed rows untruncated.
transformed = model.transform(dataset)
transformed.show(truncate=False)

The lower bound on the log likelihood of the entire corpus: -698.4256993812817
The upper bound on perplexity: 2.6862526899280064
The topics described by their top-weighted terms:
+-----+-------------+------------------------------------------------------------------------------------+
|topic|termIndices  |termWeights                                                                         |
+-----+-------------+------------------------------------------------------------------------------------+
|0    |[3, 10, 2, 6]|[0.24027798582266366, 0.1366561117199267, 0.11635495689219202, 0.08790743796625304] |
|1    |[5, 4, 0, 9] |[0.16366268169619855, 0.12922630595808127, 0.12341411895523932, 0.11609322033741368]|
|2    |[5, 10, 9, 2]|[0.09805650594465096, 0.09798136746252867, 0.0955546657411339, 0.09246716208033515] |
|3    |[5, 10, 2, 3]|[0.1040609363322473, 0.10180390490152064, 0.09772882547642432, 0.09689663134955645] |
|4    |[5, 8, 2, 3] |[0.10585866808506506, 0.10198073649317539, 0.096871