In [1]:
# Initialise findspark before pyspark is imported below
import findspark
findspark.init()

# SparkSession is the entry point used by every later cell
from pyspark.sql import SparkSession

# Build the session, or reuse one that already exists (getOrCreate)
spark = (
    SparkSession.builder
    .appName("Python Spark Clustering Sample")
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x00000245FF668908>


In [21]:
# Load the crimes CSV: the first line is the header and column types are inferred.
# Rows that contain missing values are removed with .dropna()
# (the same operation as .na.drop()).
df = (
    spark.read
    .csv("D:/Documents/dataset/Crimes_-_2001_to_present.csv", header=True, inferSchema=True)
    .dropna()
)

In [22]:
# Preview the loaded DataFrame to check the columns and a few rows
df.show()

+-------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|     ID|Case Number|                Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+-------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|4080779|   HL424670|06/16/2005 02:30:...| 076XX S SANGAMON ST|1320|     CRIMINAL DAMAGE|          TO VEHICLE|              STR

In [23]:
# Register df as the temporary view "crimes" so that spark.sql() queries
# can refer to it by that name
df.createOrReplaceTempView("crimes")

In [25]:
# Keep only the 2018 records and the columns needed for clustering.
# Adjacent string literals are concatenated into a single query string.
df2018 = spark.sql(
    "SELECT ID, `Case Number`, BLOCK, `Primary Type`, `Location Description`, Latitude, Longitude "
    "            FROM crimes "
    "            WHERE year = 2018"
)

In [26]:
# Preview the result of the 2018 query
df2018.show()

+--------+-----------+--------------------+--------------------+--------------------+------------+-------------+
|      ID|Case Number|               BLOCK|        Primary Type|Location Description|    Latitude|    Longitude|
+--------+-----------+--------------------+--------------------+--------------------+------------+-------------+
|11196948|   JB106757| 006XX N FRANKLIN ST|  DECEPTIVE PRACTICE|VEHICLE NON-COMME...|41.893676531|-87.635628328|
|11213009|   JB128278|   062XX S DAMEN AVE|             BATTERY|RESIDENCE PORCH/H...|41.780361586|-87.673928946|
|11220257|   JB137570| 051XX S ASHLAND AVE|       OTHER OFFENSE|PARKING LOT/GARAG...|41.800844112|-87.664762808|
|11220367|   JB135517|  042XX S TALMAN AVE|OFFENSE INVOLVING...|           RESIDENCE|41.816857323|-87.690789203|
|11224979|   JB144109| 029XX W BELMONT AVE|               THEFT|              STREET| 41.93937377|-87.700583524|
|11224997|   JB144124|     006XX E 51ST ST|  DECEPTIVE PRACTICE|           RESIDENCE|41.80217490

## Proses Clustering

Referensi dapat dilihat pada [Clustering](https://spark.apache.org/docs/latest/ml-clustering.html)

Tahap-tahap yang perlu dilakukan adalah sebagai berikut:

### 1. Assembling Vector

Library Spark ML hanya menerima data input dalam bentuk vektor. Oleh karena itu, kita perlu menggabungkan kolom-kolom ke dalam sebuah vektor.

Misalnya, apabila kita memiliki kolom `latitude` dan `longitude` yang ingin dijadikan input, maka keduanya harus digabungkan menjadi satu vektor [`latitude`, `longitude`].

### 2. Train model
Proses training dilakukan dengan memasukkan data latih ke dalam instance algoritma yang parameternya telah diatur. K-Means hanya memerlukan parameter jumlah cluster (`k`) dan seed (untuk mengendalikan randomness).

### 3. Prediksi
Proses prediksi dilakukan dengan memasukkan data uji ke dalam model yang telah dibuat.

### 4. Evaluasi
Proses evaluasi dilakukan sesuai dengan algoritma yang dipakai. Untuk K-Means, evaluasi yang dapat digunakan adalah Silhouette score.

In [27]:
# Assemble the Latitude and Longitude columns into a single vector column
# named 'features'.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["Latitude", "Longitude"],
    outputCol='features')

# This cell rebinds df2018 to its own output, so on a second run df2018 would
# already carry a 'features' column and the assembler would reject it as an
# existing output column. Dropping it first keeps the cell re-runnable;
# DataFrame.drop() is a no-op when the column is absent, so the first run is
# unaffected.
df2018 = assembler.transform(df2018.drop('features'))
df2018.show()

+--------+-----------+--------------------+--------------------+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               BLOCK|        Primary Type|Location Description|    Latitude|    Longitude|            features|
+--------+-----------+--------------------+--------------------+--------------------+------------+-------------+--------------------+
|11196948|   JB106757| 006XX N FRANKLIN ST|  DECEPTIVE PRACTICE|VEHICLE NON-COMME...|41.893676531|-87.635628328|[41.893676531,-87...|
|11213009|   JB128278|   062XX S DAMEN AVE|             BATTERY|RESIDENCE PORCH/H...|41.780361586|-87.673928946|[41.780361586,-87...|
|11220257|   JB137570| 051XX S ASHLAND AVE|       OTHER OFFENSE|PARKING LOT/GARAG...|41.800844112|-87.664762808|[41.800844112,-87...|
|11220367|   JB135517|  042XX S TALMAN AVE|OFFENSE INVOLVING...|           RESIDENCE|41.816857323|-87.690789203|[41.816857323,-87...|
|11224979|   JB144109| 029XX W BELMONT AVE|               THEF

In [35]:
# Fit a K-Means model on the assembled data
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator  # used by the evaluation cell

# 10 clusters; seed fixed to 1 so the run is repeatable
kmeans = KMeans(k=10, seed=1)
model = kmeans.fit(df2018)

In [36]:
# Make a prediction: apply the fitted model to df2018 (the same data it was
# fitted on). The output below shows the result carries an extra
# 'prediction' column.
predictions = model.transform(df2018)
# Show only the first 5 rows
predictions.show(5)

+--------+-----------+-------------------+--------------------+--------------------+------------+-------------+--------------------+----------+
|      ID|Case Number|              BLOCK|        Primary Type|Location Description|    Latitude|    Longitude|            features|prediction|
+--------+-----------+-------------------+--------------------+--------------------+------------+-------------+--------------------+----------+
|11196948|   JB106757|006XX N FRANKLIN ST|  DECEPTIVE PRACTICE|VEHICLE NON-COMME...|41.893676531|-87.635628328|[41.893676531,-87...|         7|
|11213009|   JB128278|  062XX S DAMEN AVE|             BATTERY|RESIDENCE PORCH/H...|41.780361586|-87.673928946|[41.780361586,-87...|         4|
|11220257|   JB137570|051XX S ASHLAND AVE|       OTHER OFFENSE|PARKING LOT/GARAG...|41.800844112|-87.664762808|[41.800844112,-87...|         4|
|11220367|   JB135517| 042XX S TALMAN AVE|OFFENSE INVOLVING...|           RESIDENCE|41.816857323|-87.690789203|[41.816857323,-87...|    

In [37]:
# Evaluate clustering by computing Silhouette score
# ClusteringEvaluator is imported in the training cell; it is created here
# with its default settings.
evaluator = ClusteringEvaluator()

# Score the predictions DataFrame produced by model.transform()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.5416128347143511


In [38]:
# Print the center of every cluster found by the fitted model, one per line
print("Cluster Centers: ")
centers = model.clusterCenters()
for centroid in centers:
    print(centroid)

Cluster Centers: 
[ 41.87242393 -87.75383832]
[ 41.76195811 -87.59260117]
[ 41.69929956 -87.63325356]
[ 41.92954572 -87.73687791]
[ 41.77706117 -87.67720321]
[ 41.92273509 -87.66162967]
[ 41.98676391 -87.67958479]
[ 41.87329242 -87.63289616]
[ 41.9621505 -87.8105806]
[ 41.86897478 -87.70824169]


In [32]:
# Visualization using pixiedust
# NOTE(review): presumably the display() call in the following cell relies on
# this import - confirm.
import pixiedust

Pixiedust database opened successfully


In [39]:
# Render the predictions DataFrame for visual exploration
# NOTE(review): presumably this is pixiedust's display(), so the pixiedust
# import cell must have been run first - confirm.
display(predictions)

In [34]:
# Number of rows in df2018
df2018.count()

262951

Gambar hasil clustering

![Clustering](img/test.png)