In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, DoubleType, StructType, StructField, TimestampType
from pyspark.sql.functions import col, to_timestamp

In [4]:
# Explicit schema applied when reading the raw CSV files:
# dt (timestamp), lat / lon (doubles) and base (string). Every column is nullable.
schema = (
    StructType()
    .add("dt", TimestampType(), True)
    .add("lat", DoubleType(), True)
    .add("lon", DoubleType(), True)
    .add("base", StringType(), True)
)

In [6]:
# Reuse the active Spark session if there is one; otherwise create one named 'Uber'.
spark = (
    SparkSession.builder
    .appName('Uber')
    .getOrCreate()
)

In [8]:
# Load the raw files from HDFS using the explicit schema.
# No separate to_timestamp() step is required: the schema already types `dt`
# as a timestamp at read time.
path = "hdfs://localhost:9000/Uber_Warehouse/raw/"
df_uber = spark.read.schema(schema).csv(path)

# Preview a few rows and confirm the resulting column types.
df_uber.show(5)
df_uber.printSchema()

+-------------------+-------+--------+------+
|                 dt|    lat|     lon|  base|
+-------------------+-------+--------+------+
|2014-08-05 09:43:00| 40.726|-74.0013|B02682|
|2014-08-31 07:29:00|40.7546|-73.9895|B02617|
|2014-08-09 00:16:00|40.6768|-73.9801|B02598|
|2014-08-17 19:30:00|40.7689|-73.9604|B02617|
|2014-08-01 00:00:00| 40.729|-73.9422|B02598|
+-------------------+-------+--------+------+
only showing top 5 rows

root
 |-- dt: timestamp (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- base: string (nullable = true)



In [9]:
from pyspark.ml.feature import VectorAssembler

# Define the feature vector used by the k-means algorithm: the two coordinate columns.
featureCols = ['lat', 'lon']
assembler = (
    VectorAssembler()
    .setInputCols(featureCols)
    .setOutputCol('features')
)

# Append the assembled "features" vector column and mark the result for caching;
# cache() returns the same DataFrame, so it can be chained onto the transform.
df_uber2 = assembler.transform(df_uber).cache()
df_uber2.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+-------------------+-------+--------+------+------------------+
|                 dt|    lat|     lon|  base|          features|
+-------------------+-------+--------+------+------------------+
|2014-08-05 09:43:00| 40.726|-74.0013|B02682| [40.726,-74.0013]|
|2014-08-31 07:29:00|40.7546|-73.9895|B02617|[40.7546,-73.9895]|
|2014-08-09 00:16:00|40.6768|-73.9801|B02598|[40.6768,-73.9801]|
|2014-08-17 19:30:00|40.7689|-73.9604|B02617|[40.7689,-73.9604]|
|2014-08-01 00:00:00| 40.729|-73.9422|B02598| [40.729,-73.9422]|
+-------------------+-------+--------+------+------------------+
only showing top 5 rows



                                                                                

In [10]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Number of clusters to fit.
NUM_CLUSTERS = 20

# Configure k-means:
#   setK(NUM_CLUSTERS)       - split the data into NUM_CLUSTERS clusters
#   setFeaturesCol("features") - vector column used for training
#   setPredictionCol("cid")    - column that receives the predicted cluster id
#   setSeed(1)                 - fixed random seed
kmeans = (
    KMeans()
    .setK(NUM_CLUSTERS)
    .setFeaturesCol("features")
    .setPredictionCol("cid")
    .setSeed(1)
)
model = kmeans.fit(df_uber2)

# Print the fitted cluster centers, one per line, prefixed with the cluster index.
centers = model.clusterCenters()
print("Cluster Centers: ")
for i, center in enumerate(centers):
    print(i, center)

22/02/01 13:52:11 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/02/01 13:52:11 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS

Cluster Centers: 
0 [ 40.73423646 -73.99364081]
1 [ 40.64987137 -73.78502466]
2 [ 40.82184678 -73.94300184]
3 [ 40.69717486 -74.18066285]
4 [ 40.770255   -73.47119922]
5 [ 40.75881196 -73.99109361]
6 [ 40.66656304 -73.97881524]
7 [ 40.7068365  -73.94677405]
8 [ 40.67601567 -74.40629101]
9 [ 40.75702524 -73.9217683 ]
10 [ 40.77983533 -73.96266167]
11 [ 40.71524847 -74.0061783 ]
12 [ 40.20184004 -74.04641643]
13 [ 40.75565536 -73.97356015]
14 [ 41.02007523 -73.76031087]
15 [ 40.74274103 -73.66676635]
16 [ 40.75989546 -73.86056185]
17 [ 40.91695269 -74.11372694]
18 [ 40.80073204 -73.09776343]
19 [ 40.87546667 -73.87859288]


                                                                                

In [30]:
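# Evaluate clustering by computing the Silhouette score (disabled; uncomment to run).
# NOTE: `predictions` is not defined by any visible cell, so the model output is
# computed explicitly here before it is evaluated.
# predictions = model.transform(df_uber2)
# evaluator = ClusteringEvaluator(predictionCol='cid', featuresCol='features',
#                                 metricName='silhouette', distanceMeasure='squaredEuclidean')

# silhouette = evaluator.evaluate(predictions)
# print("Silhouette with squared euclidean distance = " + str(silhouette))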

Silhouette with squared euclidean distance = 0.5444916986154139


In [21]:
#save model
# model.save("E:/PySpark/Uber_Locations/model/uber_location")
# model.write().overwrite().save("E:/PySpark/Uber_Locations/model/uber_location")


In [12]:
# Apply the fitted model: each row gains a "cid" column holding its cluster index.
df_predicted = model.transform(df_uber2)
df_predicted.show(n=5)

+-------------------+-------+--------+------+------------------+---+
|                 dt|    lat|     lon|  base|          features|cid|
+-------------------+-------+--------+------+------------------+---+
|2014-08-05 09:43:00| 40.726|-74.0013|B02682| [40.726,-74.0013]|  0|
|2014-08-31 07:29:00|40.7546|-73.9895|B02617|[40.7546,-73.9895]|  5|
|2014-08-09 00:16:00|40.6768|-73.9801|B02598|[40.6768,-73.9801]|  6|
|2014-08-17 19:30:00|40.7689|-73.9604|B02617|[40.7689,-73.9604]| 10|
|2014-08-01 00:00:00| 40.729|-73.9422|B02598| [40.729,-73.9422]|  7|
+-------------------+-------+--------+------+------------------+---+
only showing top 5 rows



In [13]:
from pyspark.sql.functions import split, concat_ws, concat

# Add an "id" column of the form "<cid>_<lat decimals><lon decimals>".
# The split pattern is a raw string: "\." in a normal string literal is an
# invalid escape sequence that Python only passes through with a warning.
split_lon = split(df_predicted.lon, r"\.").getItem(1)
split_lat = split(df_predicted.lat, r"\.").getItem(1)

# Concatenate the two fractional parts (latitude first). The result is kept in
# `coord_suffix` rather than `id` so the builtin id() is not shadowed.
# NOTE(review): no separator is placed between the two parts, so different
# coordinates in the same cluster can produce the same id - confirm this is acceptable.
coord_suffix = concat(split_lat, split_lon)
df_uber_id = df_predicted.withColumn("id", concat_ws("_", col("cid"), coord_suffix))

# Drop the assembled feature vector column before the final preview.
df_uber_locates = df_uber_id.drop("features")
df_uber_locates.show(5)

+-------------------+-------+--------+------+---+-----------+
|                 dt|    lat|     lon|  base|cid|         id|
+-------------------+-------+--------+------+---+-----------+
|2014-08-05 09:43:00| 40.726|-74.0013|B02682|  0|  0_7260013|
|2014-08-31 07:29:00|40.7546|-73.9895|B02617|  5| 5_75469895|
|2014-08-09 00:16:00|40.6768|-73.9801|B02598|  6| 6_67689801|
|2014-08-17 19:30:00|40.7689|-73.9604|B02617| 10|10_76899604|
|2014-08-01 00:00:00| 40.729|-73.9422|B02598|  7|  7_7299422|
+-------------------+-------+--------+------+---+-----------+
only showing top 5 rows



In [15]:
# Write the labelled rows to HDFS as CSV, replacing any existing output.
# No "checkpointLocation" is set: that option belongs to Structured Streaming
# queries and has no effect on a batch DataFrameWriter.
df_uber_locates.write.mode("overwrite").format("csv") \
                    .option("path", "hdfs://localhost:9000/Uber_Warehouse/transformed/") \
                    .save()

                                                                                