## 1.0 Configuração do catálogo e do schema da sessão



In [0]:
# Point the Spark session at the catalog and schema this notebook works in.
# Order matters: the catalog is selected before the schema inside it.
for _context_stmt in ("USE CATALOG workspace", "USE SCHEMA weather"):
    spark.sql(_context_stmt)

from pyspark.sql import functions as F, types as T


### 1.1 Processamento do último lote

In [0]:
# Narrow the bronze table down to its most recent ingestion batch: first the
# latest ingestion_date, then the latest ingestion_ts_utc within that date.
df_bronze = spark.table("workspace.weather.bronze_openweather_raw")

# Each aggregate below runs a Spark job and brings a single scalar back to
# the driver. On an empty table both scalars are None, and the comparisons
# against a null literal then match no rows.
ult_dia = df_bronze.select(F.max("ingestion_date")).first()[0]
df_bronze_dia = df_bronze.where(F.col("ingestion_date") == F.lit(ult_dia))

ultimo_lote = df_bronze_dia.select(F.max("ingestion_ts_utc")).first()[0]

# Rebinds df_bronze: from here on it holds only the rows of that last batch.
df_bronze = df_bronze_dia.where(F.col("ingestion_ts_utc") == F.lit(ultimo_lote))

## 2.0 Definição do schema do payload JSON

In [0]:
# Explicit schema used to parse the raw weather JSON payload.
# Every field is added with the default nullable=True, so a key that is
# missing from a given payload parses as null instead of failing.
weather_schema = (
    T.StructType()
    .add(
        "coord",
        T.StructType()
        .add("lon", T.DoubleType())
        .add("lat", T.DoubleType()),
    )
    # "weather" is an array of condition entries.
    .add(
        "weather",
        T.ArrayType(
            T.StructType()
            .add("id", T.LongType())
            .add("main", T.StringType())
            .add("description", T.StringType())
            .add("icon", T.StringType())
        ),
    )
    .add("base", T.StringType())
    .add(
        "main",
        T.StructType()
        .add("temp", T.DoubleType())
        .add("feels_like", T.DoubleType())
        .add("temp_min", T.DoubleType())
        .add("temp_max", T.DoubleType())
        .add("pressure", T.DoubleType())
        .add("humidity", T.DoubleType())
        .add("sea_level", T.DoubleType())
        .add("grnd_level", T.DoubleType()),
    )
    .add("visibility", T.LongType())
    .add(
        "wind",
        T.StructType()
        .add("speed", T.DoubleType())
        .add("deg", T.DoubleType())
        .add("gust", T.DoubleType()),
    )
    .add("clouds", T.StructType().add("all", T.LongType()))
    .add("dt", T.LongType())
    .add(
        "sys",
        T.StructType()
        .add("country", T.StringType())
        .add("sunrise", T.LongType())
        .add("sunset", T.LongType()),
    )
    .add("timezone", T.LongType())
    .add("id", T.LongType())
    .add("name", T.StringType())
    .add("cod", T.LongType())
)


## 3.0 Parse do payload




In [0]:
# Step 1: parse the raw JSON text into a typed struct ("j") and carry the two
# bronze metadata columns along under the names used in the projection below.
payload_struct = F.from_json(F.col("json_line"), weather_schema).alias("j")
observed_at = F.col("obs_ts_utc").alias("obs_utc")
ingested_on = F.col("ingestion_date").alias("data_ingestao")

parsed = df_bronze.select(payload_struct, observed_at, ingested_on)

# Step 2: flatten the struct into the silver layout.
# Each entry is (path inside the parsed payload, silver column name, optional
# cast target); list order is the output column order.
silver_fields = [
    ("j.id", "id_cidade", "bigint"),
    ("j.name", "nome_cidade", None),
    ("j.coord.lat", "latitude", None),
    ("j.coord.lon", "longitude", None),
    ("j.main.temp", "temperatura_c", None),
    ("j.main.feels_like", "sensacao_c", None),
    ("j.main.temp_min", "temperatura_min_c", None),
    ("j.main.temp_max", "temperatura_max_c", None),
    ("j.main.humidity", "umidade_pct", None),
    ("j.main.pressure", "pressao_hpa", None),
    ("j.main.sea_level", "pressao_nivel_mar_hpa", None),
    ("j.main.grnd_level", "pressao_nivel_solo_hpa", None),
    ("j.visibility", "visibilidade_m", "bigint"),
    ("j.wind.speed", "velocidade_vento_ms", None),
    ("j.wind.deg", "vento_graus", None),
    ("j.wind.gust", "rajada_vento_ms", None),
    ("j.clouds.all", "nuvens_pct", "bigint"),
]
flattened_columns = [
    (F.col(path).cast(cast_to) if cast_to is not None else F.col(path)).alias(name)
    for path, name, cast_to in silver_fields
]

# Only the first entry of the "weather" array is kept.
first_condition = F.col("j.weather").getItem(0)

silver = parsed.select(
    *flattened_columns,
    first_condition.getField("main").alias("condicao"),
    first_condition.getField("description").alias("condicao_desc"),
    # Shifts the observation time from UTC to America/Sao_Paulo local time.
    # NOTE(review): presumably obs_ts_utc is a UTC timestamp - confirm in bronze.
    F.from_utc_timestamp(F.col("obs_utc"), "America/Sao_Paulo").alias("data_observacao"),
    F.col("data_ingestao"),
)

## 4.0 Escrita na silver


In [0]:
# Append the flattened batch to the silver Delta table.
# mergeSchema lets the write add columns the table does not have yet.
# NOTE(review): append mode means re-running this cell for the same batch
# will presumably write the same rows again - confirm whether deduplication
# is handled elsewhere.
silver_writer = (
    silver.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable("workspace.weather.silver_openweather")

# Quick visual check: ten rows, newest ingestion date first, then by city name.
silver_preview = (
    spark.table("workspace.weather.silver_openweather")
    .orderBy(F.desc("data_ingestao"), F.col("nome_cidade"))
    .limit(10)
)
display(silver_preview)