In [1]:
# findspark is initialised here, before the first pyspark import in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [4]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [5]:
import os
from pathlib import Path

# Loads data.
# The sample file lives under the Spark installation (data/mllib/), so it is
# located through SPARK_HOME rather than by pinning one machine's install
# directory. The original hard-coded location is kept as a fallback for when
# SPARK_HOME is unset or the file is not present under it.
FALLBACK_SAMPLE_PATH = "D:/spark-2.3.1-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt"
spark_home = os.environ.get("SPARK_HOME")
sample_path = FALLBACK_SAMPLE_PATH
if spark_home:
    candidate = Path(spark_home) / "data" / "mllib" / "sample_kmeans_data.txt"
    if candidate.is_file():
        # as_posix() keeps forward slashes, matching the original path style.
        sample_path = candidate.as_posix()

# The file is in libsvm format: one "label index:value ..." record per line.
dataset = spark.read.format("libsvm").load(sample_path)

In [6]:
# Print the loaded rows (label + sparse feature vector) as a text table.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [15]:
# Fit k-means with two clusters; the fixed seed keeps the initialisation
# repeatable between runs.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

In [16]:
# Make predictions: transform() returns the input rows with an added
# "prediction" column holding each row's assigned cluster index.
predictions = model.transform(dataset)

In [17]:
# Print every row together with its assigned cluster.
predictions.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         0|
|  1.0|(3,[0,1,2],[0.1,0...|         0|
|  2.0|(3,[0,1,2],[0.2,0...|         0|
|  3.0|(3,[0,1,2],[9.0,9...|         1|
|  4.0|(3,[0,1,2],[9.1,9...|         1|
|  5.0|(3,[0,1,2],[9.2,9...|         1|
+-----+--------------------+----------+



In [18]:
# Score the clustering with the Silhouette measure, using the evaluator's
# default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

# sep="" because the label text already ends with a space.
print("Silhouette with squared euclidean distance = ", silhouette, sep="")

Silhouette with squared euclidean distance = 0.9997530305375207


In [19]:
# Shows the result: print the coordinates of each fitted cluster center,
# one center per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[0.1 0.1 0.1]
[9.1 9.1 9.1]


In [25]:
# Use a different dataset.
# Loads data from "data.txt" (libsvm format), resolved relative to the
# current working directory.
data2 = spark.read.format("libsvm").load("data.txt")

In [26]:
# Print the second dataset's rows as a text table.
data2.show()

+-----+-------------------+
|label|           features|
+-----+-------------------+
|  0.0|(2,[0,1],[1.0,1.0])|
|  1.0|(2,[0,1],[1.5,2.0])|
|  2.0|(2,[0,1],[3.0,4.0])|
|  3.0|(2,[0,1],[5.0,7.0])|
|  4.0|(2,[0,1],[3.5,5.0])|
|  5.0|(2,[0,1],[4.5,5.0])|
|  6.0|(2,[0,1],[3.5,4.5])|
+-----+-------------------+



In [32]:
# Fit k-means on the second dataset, this time asking for three clusters.
# NOTE: this rebinds `kmeans` and `model`, replacing the two-cluster fit.
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(data2)

In [33]:
# Make predictions: assign every row of the second dataset to a cluster.
# NOTE: this rebinds `predictions`, replacing the first dataset's results.
predictions = model.transform(data2)

In [34]:
# Print every row of the second dataset together with its assigned cluster.
predictions.show()

+-----+-------------------+----------+
|label|           features|prediction|
+-----+-------------------+----------+
|  0.0|(2,[0,1],[1.0,1.0])|         1|
|  1.0|(2,[0,1],[1.5,2.0])|         1|
|  2.0|(2,[0,1],[3.0,4.0])|         0|
|  3.0|(2,[0,1],[5.0,7.0])|         2|
|  4.0|(2,[0,1],[3.5,5.0])|         0|
|  5.0|(2,[0,1],[4.5,5.0])|         0|
|  6.0|(2,[0,1],[3.5,4.5])|         0|
+-----+-------------------+----------+



In [35]:
# Score the second clustering with the Silhouette measure, using the
# evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

# sep="" because the label text already ends with a space.
print("Silhouette with squared euclidean distance = ", silhouette, sep="")

Silhouette with squared euclidean distance = 0.8596376863267189


In [31]:
# Shows the result: print the coordinates of each fitted cluster center,
# one center per line.
# NOTE(review): the recorded output of this cell lists only two centers and
# its execution count (31) is lower than that of the K=3 training cell (32),
# so the saved output was presumably produced by an earlier fit.
# TODO: re-run after "Restart & Run All" to refresh it.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[3.9 5.1]
[1.25 1.5 ]
