In [None]:
import os
import boto3
import awswrangler as wr
from datetime import datetime
from geopy.distance import geodesic
from pyspark.sql.types import FloatType, StructType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [None]:
@F.udf(returnType=FloatType())
def geodesic_udf(a, b):
    """Return the geodesic distance in kilometres between two points.

    Args:
        a: First point, in a form accepted by ``geopy.distance.geodesic``
            (for example a ``(latitude, longitude)`` sequence).
        b: Second point, in the same form as ``a``.

    Returns:
        The distance in kilometres, or ``None`` (SQL NULL) when either point
        or any of its coordinates is missing.
    """
    # Spark passes NULL column values to a Python UDF as None. A missing
    # coordinate cannot yield a meaningful distance, so return NULL for that
    # row instead of handing None through to geopy.
    if a is None or b is None:
        return None
    if any(coord is None for coord in a) or any(coord is None for coord in b):
        return None
    return geodesic(a, b).km

In [None]:
# Start (or reuse) the Spark session used by the cells below.
spark = (
    SparkSession.builder
    .appName("Loka Application")
    .getOrCreate()
)

In [None]:
# Pattern passed to `to_timestamp` for every date string below. It must be
# bound in this cell: on a fresh kernel no earlier cell defines it, and the
# transformations would otherwise fail with a NameError.
timestamp_format = "yyyy-MM-dd'T'HH:mm:ss.SSSX"

# Load every JSON event file from the local data directory.
df = spark.read.option("mergeSchema", "true").json("/tmp/loka-data/*")

# Parse the string timestamps into real timestamp columns, pack the parsed
# values into the `data` struct next to its original fields, then drop the
# temporary top-level copies.
df = (
    df.withColumn("at", F.to_timestamp(df.at, timestamp_format))
    .withColumn("date_start", F.to_timestamp(df.data.start, timestamp_format))
    .withColumn("date_finish", F.to_timestamp(df.data.finish, timestamp_format))
    .withColumn(
        "date_location_at", F.to_timestamp(df.data.location.at, timestamp_format)
    )
    .withColumn(
        "data",
        F.struct(
            "data.*",
            "date_start",
            "date_finish",
            "date_location_at",
        ),
    )
    .drop("date_start", "date_finish", "date_location_at")
)
df.show()

In [None]:
# Show the payload of the events whose parsed location timestamp is present.
(
    df.filter(F.col("data.date_location_at").isNotNull())
    .select("data.*")
    .show()
)

In [None]:
# Print a sample of the current `df`.
df.show()

In [None]:
# `show()` only prints and returns None, so bind the filtered DataFrame to
# `df_vehicle` first and display it as a separate step.
df_vehicle = df.where(df.on == "vehicle")
df_vehicle.show()

In [None]:
# Pattern handed to `to_timestamp` for each date string parsed below.
timestamp_format = "yyyy-MM-dd'T'HH:mm:ss.SSSX"

# Load the events, parse their timestamps, lift the location fields to the top
# level, pack all derived values into the `data` struct, and finally remove
# the temporary top-level copies.
df = (
    spark.read.option("mergeSchema", "true")
    .json("/tmp/loka-data/*")
    .withColumn("at", F.to_timestamp(F.col("at"), timestamp_format))
    .withColumn("date_start", F.to_timestamp(F.col("data.start"), timestamp_format))
    .withColumn("date_finish", F.to_timestamp(F.col("data.finish"), timestamp_format))
    .withColumn("location_at", F.to_timestamp(F.col("data.location.at"), timestamp_format))
    .withColumn("location_lat", F.col("data.location.lat"))
    .withColumn("location_lng", F.col("data.location.lng"))
    .withColumn(
        "data",
        F.struct(
            "data.*",
            "date_start",
            "date_finish",
            "location_at",
            "location_lat",
            "location_lng",
        ),
    )
    .drop("date_start", "date_finish", "location_at", "location_lat", "location_lng")
)

In [None]:
# Show only the payload fields of the operating-period events.
(
    df.filter(F.col("on") == "operating_period")
    .select("data.*")
    .show()
)

In [None]:
# Flatten the operating-period events: copy the id and the parsed start/finish
# timestamps out of `data`, drop the struct, and display the result.
(
    df.filter(df["on"] == "operating_period")
    .withColumn("data_id", df["data"]["id"])
    .withColumn("date_start", df["data"]["date_start"])
    .withColumn("date_finish", df["data"]["date_finish"])
    .drop(df["data"])
    .show()
)

In [None]:
import os
import awswrangler as wr
import pandas as pd

from geopy.distance import geodesic
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, udf, struct
from pyspark.sql.types import FloatType
from sqlalchemy import create_engine
# Presumably the S3 bucket and key prefix of the raw event files; neither name
# is referenced in the visible cells. TODO confirm they are still needed.
raw_data_bucket = "de-tech-assessment-2022"
raw_data_prefix = "data"
# Pattern passed to `to_timestamp` when parsing the event date strings.
timestamp_format = "yyyy-MM-dd'T'HH:mm:ss.SSSX"


In [None]:
# Obtain the Spark session for this notebook; an already running session with
# the same builder settings is reused rather than recreated.
spark = (
    SparkSession.builder
    .appName("Loka Application")
    .getOrCreate()
)


In [98]:
# Load every JSON event file from the local data directory.
df = spark.read.option("mergeSchema", "true").json("/tmp/loka-data/*")

# Parse the string timestamps, pack the parsed values into the `data` struct
# alongside its original fields, then drop the temporary top-level columns.
df = (
    df.withColumn("at", to_timestamp(df["at"], timestamp_format))
    .withColumn("date_start", to_timestamp(df["data"]["start"], timestamp_format))
    .withColumn("date_finish", to_timestamp(df["data"]["finish"], timestamp_format))
    .withColumn(
        "location_at",
        to_timestamp(df["data"]["location"]["at"], timestamp_format),
    )
    .withColumn(
        "data",
        struct("data.*", "date_start", "date_finish", "location_at"),
    )
    .drop("date_start", "date_finish", "location_at")
)
# Vehicle events, flattened: the id and the location fields are copied out of
# `data` into top-level columns and the struct itself is dropped.
df_vehicle = (
    df.filter(df["on"] == "vehicle")
    .withColumn("data_id", df["data"]["id"])
    .withColumn("location_at", df["data"]["location_at"])
    .withColumn("location_lat", df["data"]["location"]["lat"])
    .withColumn("location_lng", df["data"]["location"]["lng"])
    .drop(df["data"])
)
# Operating-period events, flattened: the id and the parsed start/finish
# timestamps are copied out of `data` and the struct itself is dropped.
df_operating_period = (
    df.filter(df["on"] == "operating_period")
    .withColumn("data_id", df["data"]["id"])
    .withColumn("date_start", df["data"]["date_start"])
    .withColumn("date_finish", df["data"]["date_finish"])
    .drop(df["data"])
)


                                                                                

In [110]:
# Check how normpath renders the data directory path (the trailing slash is dropped).
os.path.normpath("/tmp/loka-data/")

'/tmp/loka-data'

In [109]:
# Directory that holds the local event files.
local_dir = "/tmp/loka-data/"
all_files = os.listdir(local_dir)

# Build the destination prefix once, from the ISO date only. Formatting
# `datetime.today()` directly puts spaces, colons and microseconds into the key
# and, evaluated inside the loop, gives every file a different prefix.
base_s3_path = (
    f"s3://de-tech-assessment-2022-nilson/data/{datetime.today().date().isoformat()}"
)

for event_file in all_files:
    # os.listdir returns bare file names, so re-attach the directory to get a
    # path the upload call can actually open.
    local_path = os.path.join(local_dir, event_file)
    filename = os.path.basename(event_file)
    final_s3_path = f"{base_s3_path}/{filename}"
    print(final_s3_path)
    # wr.s3.upload(local_file=local_path, path=final_s3_path)

['2019-06-01-15-23-29-events.json', '2019-06-01-15-26-11-events.json', '2019-06-01-15-17-52-events.json', '2019-06-01-15-23-23-events.json', '2019-06-01-15-17-46-events.json', '2019-06-01-15-22-55-events.json', '2019-06-01-15-27-48-events.json', '2019-06-01-15-27-1-events.json', '2019-06-01-15-27-8-events.json', '2019-06-01-15-23-39-events.json', '2019-06-01-15-24-17-events.json', '2019-06-01-15-25-47-events.json', '2019-06-01-15-17-22-events.json', '2019-06-01-15-19-45-events.json', '2019-06-01-15-18-26-events.json', '2019-06-01-15-28-14-events.json', '2019-06-01-15-22-20-events.json', '2019-06-01-15-23-32-events.json', '2019-06-01-15-28-0-events.json', '2019-06-01-15-21-42-events.json', '2019-06-01-15-26-1-events.json', '2019-06-01-15-22-0-events.json', '2019-06-01-15-17-7-events.json', '2019-06-01-15-23-53-events.json', '2019-06-01-15-18-30-events.json', '2019-06-01-15-23-41-events.json', '2019-06-01-15-25-59-events.json', '2019-06-01-15-17-8-events.json', '2019-06-01-15-25-7-events

2019-06-01 19:23:04.079000
2019-06-01 19:28:04.079000
2019-06-01 19:17:04.079000
2019-06-01 19:22:04.079000




31866


                                                                                

In [None]:

# Collect the vehicle events into a pandas DataFrame.
pdf_vehicle = df_vehicle.toPandas()

# Create the SQLAlchemy engine. The connection URL can be supplied through the
# DATAWAREHOUSE_URL environment variable, so credentials for other environments
# never have to be written into the notebook. When the variable is unset the
# original localhost URL is used.
engine = create_engine(
    os.environ.get(
        "DATAWAREHOUSE_URL",
        "postgresql+psycopg2://datawarehouse:datawarehouse@localhost/datawarehouse?client_encoding=utf8",
    )
)
# Append the rows to the `vehicle` table through the engine.
pdf_vehicle.to_sql("vehicle", engine, index=False, if_exists="append")

In [None]:
# Collect the operating-period events into pandas and append them to the
# `operating_period` table using the existing engine.
pdf_operating_period = df_operating_period.toPandas()
pdf_operating_period.to_sql(
    name="operating_period",
    con=engine,
    if_exists="append",
    index=False,
)

In [None]:
from geopy.distance import geodesic


@udf(returnType=FloatType())
def geodesic_udf(a, b):
    """Return the geodesic distance in kilometres between two points.

    Example:
        df = df.withColumn("Distance", geodesic_udf(F.array("B", "A"), F.array("D", "C")))

    Args:
        a: First point, in a form accepted by ``geopy.distance.geodesic``
            (for example a ``(latitude, longitude)`` sequence).
        b: Second point, in the same form as ``a``.

    Returns:
        The distance in kilometres, or ``None`` (SQL NULL) when either point
        or any of its coordinates is missing.
    """
    # Spark passes NULL column values to a Python UDF as None. A missing
    # coordinate cannot yield a meaningful distance, so return NULL for that
    # row instead of handing None through to geopy.
    if a is None or b is None:
        return None
    if any(coord is None for coord in a) or any(coord is None for coord in b):
        return None
    return geodesic(a, b).km

In [111]:
from datetime import date
# Today's date as an ISO-formatted (YYYY-MM-DD) string.
date.today().isoformat()

'2023-02-01'

In [112]:
import shutil

# Remove the local copy of the event files. A missing directory only means the
# cleanup has already happened, so re-running this cell must not raise.
try:
    shutil.rmtree("/tmp/loka-data/")
except FileNotFoundError:
    pass
