In [8]:
import requests
from pyspark.sql import SparkSession, Row, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import json
from delta import DeltaTable

# Session options, applied in this order on top of the builder.
_session_options = {
    # Point Spark at the external Hive Metastore service.
    "hive.metastore.uris": "thrift://metastore:9083",
    # Skip metastore schema verification (prevents some startup errors).
    "hive.metastore.schema.verification": "false",
    # Render DataFrames eagerly when they are the last expression of a cell.
    "spark.sql.repl.eagerEval.enabled": True,
    # Register the Delta Lake SQL extension and catalog implementation.
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

_builder = SparkSession.builder.appName('lab')
for _option_key, _option_value in _session_options.items():
    _builder = _builder.config(_option_key, _option_value)

# Reuse an already-running session if there is one, otherwise start a new one
# with Hive support enabled.
spark = _builder.enableHiveSupport().getOrCreate()

In [2]:
# List the databases (namespaces) visible to this Spark session.
databases_df = spark.sql("show databases")
databases_df.show()

+---------+
|namespace|
+---------+
| business|
|  default|
|    stage|
+---------+



In [3]:
# List the tables currently registered in the `stage` database.
stage_tables_df = spark.sql("show tables from stage")
stage_tables_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|    stage| corredor|      false|
|    stage|  posicao|      false|
|    stage|   linhas|      false|
|    stage|  paradas|      false|
|    stage|    trips|      false|
|    stage|  empresa|      false|
+---------+---------+-----------+



In [4]:
# Read the raw `previsao` JSON files of the dt=2024-09-23 partition.
# (Plain string literal: the original `f` prefix had no placeholders.)
df_previsao = spark.read.json("s3a://raw/olhovivo/previsao/dt=2024-09-23/")

In [6]:
# Preview the first 20 rows; long cell values (the nested `p` struct) are truncated.
df_previsao.show(n=20, truncate=True)

+-----+--------------------+-------+
|   hr|                   p|stop_id|
+-----+--------------------+-------+
|19:08|                null|  19073|
|19:08|                null| 104763|
|19:08|{105268, [{3134-1...| 105268|
|19:08|{105368, [{407Y-1...| 105368|
|19:08|{105388, [{4729-1...| 105388|
|19:08|{105424, [{4025-1...| 105424|
|19:08|{105431, [{4726-1...| 105431|
|19:08|{105432, [{4726-1...| 105432|
|19:08|{109574, [{4726-1...| 109574|
|19:08|{109576, [{5144-1...| 109576|
|19:08|{109594, [{4025-1...| 109594|
|19:08|{109595, [{4726-1...| 109595|
|19:08|{109731, [{4025-1...| 109731|
|19:08|{109733, [{2100-1...| 109733|
|19:08|{109738, [{4726-1...| 109738|
|19:08|{109739, [{4726-1...| 109739|
|19:08|                null| 109740|
|19:08|{109741, [{4726-1...| 109741|
|19:08|{109751, [{4025-1...| 109751|
|19:08|                null| 109754|
+-----+--------------------+-------+
only showing top 20 rows



In [None]:
# Load one day of raw `previsao` JSON, keep a single row per stop and upsert
# the result into the Delta table `stage.previsao`.

# Date partition (dt=YYYY-MM-DD) of the raw layer to process. Change this one
# value to run the cell for another day.
DT_PARTITION = "2024-09-23"
# Target Delta table, referenced by both the create and the merge step.
PREVISAO_TABLE = "stage.previsao"

df_previsao = spark.read.json(f"s3a://raw/olhovivo/previsao/dt={DT_PARTITION}/")

# Keep only one row per stop_id: rank the rows of each stop by `hr` descending
# and keep the first one. The merge below needs at most one source row per key.
# NOTE(review): rows of the same stop that share the same `hr` value are
# tie-broken arbitrarily by row_number() -- confirm this is acceptable.
window_spec = Window.partitionBy("stop_id").orderBy(col("hr").desc())
df_previsao_tratado = (
    df_previsao
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Create the target table with the schema of the deduplicated frame; this is a
# no-op when the table already exists.
(
    DeltaTable.createIfNotExists(spark)
    .tableName(PREVISAO_TABLE)
    .addColumns(df_previsao_tratado.schema)
    .execute()
)

deltaTable = DeltaTable.forName(spark, PREVISAO_TABLE)

# Upsert keyed on stop_id: stops already present get all columns overwritten,
# stops not yet present are inserted.
(
    deltaTable.alias('destiny')
    .merge(
        df_previsao_tratado.alias('source'),
        'source.stop_id = destiny.stop_id'
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)