In [None]:
import os
import boto3
import awswrangler as wr
from datetime import datetime
from geopy.distance import geodesic
from pyspark.sql.types import FloatType, StructType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if one exists; otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("Loka Application")
    .getOrCreate()
)

In [59]:
# Load every file under /tmp/loka-data as JSON into a single DataFrame.
# NOTE(review): "mergeSchema" is documented for Parquet/ORC sources; confirm it
# has any effect on the JSON reader before relying on it.
df = (
    spark.read
    .option("mergeSchema", "true")
    .json("/tmp/loka-data/*")
)

                                                                                

In [60]:
# Print the schema Spark inferred for the loaded JSON events.
df.printSchema()

root
 |-- at: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- finish: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- location: struct (nullable = true)
 |    |    |-- at: string (nullable = true)
 |    |    |-- lat: double (nullable = true)
 |    |    |-- lng: double (nullable = true)
 |    |-- start: string (nullable = true)
 |-- event: string (nullable = true)
 |-- on: string (nullable = true)
 |-- organization_id: string (nullable = true)



In [61]:
@F.udf(returnType=FloatType())
def geodesic_udf(a, b):
    """Return the geodesic distance in kilometres between two points.

    Args:
        a: First point, in any form ``geopy.distance.geodesic`` accepts
            (presumably a ``(lat, lng)`` pair -- confirm against callers).
        b: Second point, in the same form as ``a``.

    Returns:
        The distance in kilometres, or ``None`` (SQL NULL) when either point,
        or any coordinate of a list/tuple point, is missing.
    """
    for point in (a, b):
        # Spark hands SQL NULL to Python UDFs as None. A missing point has no
        # meaningful distance, so propagate NULL instead of passing it to geopy.
        if point is None:
            return None
        # Arrays arrive as lists and structs as Row (a tuple subclass); a NULL
        # coordinate inside them is just as unusable as a NULL point.
        if isinstance(point, (list, tuple)) and any(c is None for c in point):
            return None
    return geodesic(a, b).km

In [62]:
# Datetime pattern for the raw strings: literal 'T' separator, millisecond
# fraction, and a trailing zone offset (X).
timestamp_format = "yyyy-MM-dd'T'HH:mm:ss.SSSX"

# Overwrite the string column `at` with its parsed timestamp value.
df = df.withColumn(
    "at",
    F.to_timestamp(F.col("at"), format=timestamp_format),
)

In [63]:
# Parse `data.start` into a timestamp and append it to the `data` struct as
# the new field `date_start`.
start_ts = F.to_timestamp(F.col("data.start"), timestamp_format)
df = (
    df.withColumn("date_start", start_ts)  # temporary top-level column
    .withColumn("data", F.struct(F.col("data.*"), F.col("date_start")))
    .drop("date_start")  # keep only the nested copy
)


In [64]:
# Parse `data.finish` into a timestamp and append it to the `data` struct as
# the new field `date_finish`.
finish_ts = F.to_timestamp(F.col("data.finish"), timestamp_format)
df = (
    df.withColumn("date_finish", finish_ts)  # temporary top-level column
    .withColumn("data", F.struct(F.col("data.*"), F.col("date_finish")))
    .drop("date_finish")  # keep only the nested copy
)

In [65]:
# Parse `data.location.at` into a timestamp and append it to the `data` struct
# as the new field `date_location_at`.
location_ts = F.to_timestamp(F.col("data.location.at"), timestamp_format)
df = (
    df.withColumn("date_location_at", location_ts)  # temporary top-level column
    .withColumn("data", F.struct(F.col("data.*"), F.col("date_location_at")))
    .drop("date_location_at")  # keep only the nested copy
)

In [None]:
# Preview the flattened `data` fields for rows whose location timestamp parsed.
has_location_time = F.col("data.date_location_at").isNotNull()
df.filter(has_location_time).select("data.*").show()

In [66]:
# Display the first rows of the transformed DataFrame (timestamps parsed, `data` extended).
df.show()

+--------------------+--------------------+------+-------+---------------+
|                  at|                data| event|     on|organization_id|
+--------------------+--------------------+------+-------+---------------+
|2019-06-01 19:17:...|{null, bac5188f-6...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, 3a3eb23a-f...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, f06eb89c-a...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, f0b87796-b...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, e641b45f-f...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, 9152c5d8-7...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, 949798fc-5...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, 9d6a8840-d...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, 3b0640d6-5...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{null, 98c8b8cb-7...|update|vehicle|         org-id|
|2019-06-01 19:17:...|{nu

In [None]:
# Count rows per (organization_id, on, data.id).
# No pre-sort: row order cannot change a grouped count, and the `data` struct
# has no `at` field to sort on (only the top-level `at` and `data.location.at`
# exist), so sorting by `df.data.at` fails analysis.
df2 = df.groupBy(df.organization_id, df.on, df.data.id).count()
df2.show()