In [1]:
import findspark
# Must run before the pyspark imports in the following cell.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make the `pyspark` package importable — confirm against
# the findspark documentation for this environment.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, DoubleType, StructType, StructField
from pyspark.sql.functions import col, to_timestamp

In [3]:
# Explicit schema for the CSV: four nullable columns. `dt` is read as a plain
# string here, `lat`/`lon` as doubles, and `base` as a string.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("dt", StringType()),
        ("lat", DoubleType()),
        ("lon", DoubleType()),
        ("base", StringType()),
    )
])

In [4]:
# Start (or reuse) the Spark session and read the CSV using the explicit
# schema defined above. No `header` option is passed to the reader.
spark = SparkSession.builder.appName('Uber').getOrCreate()
path = "uber.csv"
df = spark.read.csv(path=path, schema=schema)

# Convert the string `dt` column to a timestamp. to_timestamp() already yields
# a timestamp column, so the extra .cast("timestamp") was redundant and has
# been dropped; the resulting schema is unchanged.
df_uber = df.withColumn("dt", to_timestamp("dt"))

# Quick sanity check: preview a few rows and confirm the column types.
df_uber.show(5)
df_uber.printSchema()

+-------------------+-------+--------+------+
|                 dt|    lat|     lon|  base|
+-------------------+-------+--------+------+
|2014-08-01 00:00:00| 40.729|-73.9422|B02598|
|2014-08-01 00:00:00|40.7476|-73.9871|B02598|
|2014-08-01 00:00:00|40.7424|-74.0044|B02598|
|2014-08-01 00:00:00| 40.751|-73.9869|B02598|
|2014-08-01 00:00:00|40.7406|-73.9902|B02598|
+-------------------+-------+--------+------+
only showing top 5 rows

root
 |-- dt: timestamp (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- base: string (nullable = true)



In [5]:
from pyspark.ml.feature import VectorAssembler
# Define the feature vector used by the k-means algorithm: the `lat` and `lon`
# columns are packed into a single vector column named `features`.
featureCols = ['lat', 'lon']
assembler = VectorAssembler(outputCol='features', inputCols=featureCols)

# Append the `features` column and mark the result as cached, since the same
# DataFrame is passed to the k-means fit in a later cell.
df_uber2 = assembler.transform(df_uber).cache()
df_uber2.show(5)

+-------------------+-------+--------+------+------------------+
|                 dt|    lat|     lon|  base|          features|
+-------------------+-------+--------+------+------------------+
|2014-08-01 00:00:00| 40.729|-73.9422|B02598| [40.729,-73.9422]|
|2014-08-01 00:00:00|40.7476|-73.9871|B02598|[40.7476,-73.9871]|
|2014-08-01 00:00:00|40.7424|-74.0044|B02598|[40.7424,-74.0044]|
|2014-08-01 00:00:00| 40.751|-73.9869|B02598| [40.751,-73.9869]|
|2014-08-01 00:00:00|40.7406|-73.9902|B02598|[40.7406,-73.9902]|
+-------------------+-------+--------+------+------------------+
only showing top 5 rows



In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# KMeans configuration:
#   k=20                     -> partition the data into 20 clusters
#   featuresCol="features"   -> vector column used for training
#   predictionCol="cid"      -> column that receives the predicted cluster id
#   seed=1                   -> fixed seed so the fit is repeatable
kmeans = KMeans(k=20, featuresCol="features", predictionCol="cid", seed=1)
model = kmeans.fit(df_uber2)

# Show the resulting cluster centers, one per line, prefixed by cluster index.
centers = model.clusterCenters()
print("Cluster Centers: ")
for cluster_idx, center in enumerate(centers):
    print(cluster_idx, center)

Cluster Centers: 
0 [ 40.73909578 -73.99337027]
1 [ 40.65009343 -73.78464196]
2 [ 40.71824324 -73.9529467 ]
3 [ 40.76011724 -73.86306385]
4 [ 40.76949209 -73.95020413]
5 [ 40.770255   -73.47119922]
6 [ 40.73938927 -74.04208329]
7 [ 40.68221995 -73.98402042]
8 [ 40.62416966 -73.98202639]
9 [ 40.20325862 -74.05279351]
10 [ 40.71850996 -74.00190164]
11 [ 40.69600569 -74.20238311]
12 [ 41.00103971 -73.77328319]
13 [ 40.75982169 -73.98040416]
14 [ 40.68420017 -73.93498654]
15 [ 40.80073204 -73.09776343]
16 [ 40.79924197 -73.96137349]
17 [ 40.92930041 -74.11616198]
18 [ 40.74288204 -73.66693211]
19 [ 40.85732825 -73.90904588]


In [30]:
# Evaluate clustering by computing the Silhouette score.
#
# This cell was previously fully commented out (while still displaying an
# output) and referenced `predictions`, which no visible cell defines. The
# predictions are derived here from the fitted model so the cell runs on a
# fresh kernel; the model writes its cluster assignment to the `cid` column.
predictions = model.transform(df_uber2)

evaluator = ClusteringEvaluator(predictionCol='cid', featuresCol='features',
                                metricName='silhouette', distanceMeasure='squaredEuclidean')

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.5444916986154139


In [21]:
# Save the fitted KMeans model to disk (best effort: a failure is reported,
# not raised, so the rest of the notebook keeps running).
# NOTE(review): model.save() is expected to fail when the target path already
# exists; use the overwrite variant below to replace a previously saved model.
try:
    model.save("E:/PySpark/Uber_Locations/model/uber_location")
    # model.write().overwrite().save("E:/PySpark/Uber_Locations/model/uber_location") # overwrite
    print("Save succes")
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and include the cause so
    # the failure can actually be diagnosed.
    print("Fail!", exc)