<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/misc/etl_program.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
import requests

class ETLFlow:
    """Generic extract/load helpers bound to a single SparkSession."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the session used by every extract/load call."""
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read ``path`` with the given Spark data source ``format``.

        Any extra keyword arguments are forwarded to the reader as data
        source options (e.g. ``header="true"`` for CSV).
        """
        return self.spark.read.format(format).options(**kwargs).load(path)

    def extract_from_api(self, url: str, schema: StructType = None, timeout: float = 30) -> DataFrame:
        """Fetch a JSON payload from ``url`` and return it as a DataFrame.

        Args:
            url: Endpoint returning a JSON array of records (a single JSON
                object is treated as one record).
            schema: Optional schema applied while parsing; when omitted the
                schema is inferred by Spark.
            timeout: Seconds to wait for the HTTP response before giving up.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        import json  # local import: keeps this block self-contained

        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error body as data.
        response.raise_for_status()

        payload = response.json()
        records = payload if isinstance(payload, list) else [payload]

        # spark.read.json expects JSON *strings*; passing dicts makes PySpark
        # fall back to str(dict), which emits None/True/False and is not JSON.
        rdd = self.spark.sparkContext.parallelize([json.dumps(record) for record in records])

        reader = self.spark.read
        if schema is not None:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None, **kwargs) -> None:
        """Write ``df`` to ``path`` as a single file per partition, overwriting.

        Args:
            df: DataFrame to persist.
            format: Spark data source format (e.g. ``"parquet"``).
            path: Destination directory.
            partition_column: Optional column to partition the output by.
            **kwargs: Forwarded to the writer as data source options.
        """
        writer = df.coalesce(1).write.mode("overwrite").format(format).options(**kwargs)
        if partition_column:
            writer = writer.partitionBy(partition_column)
        writer.save(path)

class ETLTask(ETLFlow):
    """Concrete pipeline tasks (bronze -> silver -> gold) for the Carris Metropolitana API."""

    def __init__(self, spark: SparkSession) -> None:
        """Delegate session setup to the base class."""
        super().__init__(spark)

    def ingestion_lines(self):
        """Pull the ``/lines`` endpoint and store it in the bronze layer."""
        # Explicit schema for the lines payload.
        lines_schema = StructType([
            StructField('color', StringType(), True),
            StructField('facilities', ArrayType(StringType(), True), True),
            StructField('id', StringType(), True),
            StructField('localities', ArrayType(StringType(), True), True),
            StructField('long_name', StringType(), True),
            StructField('municipalities', ArrayType(StringType(), True), True),
            StructField('patterns', ArrayType(StringType(), True), True),
            StructField('routes', ArrayType(StringType(), True), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True),
        ])
        # Ingestion
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)
        # Load
        self.load(df=df, format="parquet", path="/content/lake/bronze/lines")

    def ingestion_vehicles(self):
        """Pull the ``/vehicles`` endpoint and store it in bronze, partitioned by date."""
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)

        # Derive the partition column from the event timestamp.
        df = df.withColumn("date", expr("date(timestamp)"))

        self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles", partition_column="date")

    def cleansing_vehicles(self):
        """Read bronze vehicles, apply the cleansing steps and write the silver layer."""
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")

        # Transformations: add a constant column and drop exact duplicate rows.
        df = df.withColumn("new_column", lit("test"))
        df = df.dropDuplicates()

        self.load(df=df, format="parquet", path="/content/lake/silver/vehicles", partition_column="date")

    def enrich_vehicles_with_lines(self):
        """Build the gold ``lines_with_vehicles`` table by joining silver vehicles to silver lines."""
        # 1. Create the SILVER layer for lines (simple cleansing: remove duplicates).
        df_lines_bronze = self.extract_from_file(format="parquet", path="/content/lake/bronze/lines")
        df_lines_silver = df_lines_bronze.dropDuplicates()
        self.load(df=df_lines_silver, format="parquet", path="/content/lake/silver/lines")

        # 2. Read SILVER vehicles and SILVER lines back from the lake.
        df_vehicles_silver = self.extract_from_file(format="parquet", path="/content/lake/silver/vehicles")
        df_lines_silver = self.extract_from_file(format="parquet", path="/content/lake/silver/lines")

        # 3. Enrich (join): vehicles.line_id == lines.id
        df_enriched = df_vehicles_silver.join(
            df_lines_silver,
            df_vehicles_silver.line_id == df_lines_silver.id,
            how="inner",
        )

        # 4. Keep the vehicle columns plus the line attributes brought in by the
        #    join; without the line columns the join would only act as a filter.
        #    Line columns are prefixed to avoid clashing with vehicle columns.
        df_enriched = df_enriched.select(
            df_vehicles_silver.bearing,
            df_vehicles_silver.block_id,
            df_vehicles_silver.current_status,
            df_vehicles_silver.id,
            df_vehicles_silver.lat,
            df_vehicles_silver.date,
            df_vehicles_silver.line_id,
            df_lines_silver.short_name.alias("line_short_name"),
            df_lines_silver.long_name.alias("line_long_name"),
            df_lines_silver.color.alias("line_color"),
        )

        # 5. Save the GOLD layer, partitioned by date.
        self.load(df=df_enriched, format="parquet", path="/content/lake/gold/lines_with_vehicles", partition_column="date")


if __name__ == '__main__':

    # Create (or reuse) the local Spark session; it stays a module-level
    # name because the rest of the notebook reads `spark` directly.
    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Tasks in execution order: (banner printed before the task, task callable).
    pipeline = (
        ("Running Task - Ingestion Vehicles", etl.ingestion_vehicles),
        ("Running Task - Ingestion Lines", etl.ingestion_lines),
        ("Running Task - Cleansing Vehicles", etl.cleansing_vehicles),
        ("Running Task - Lines with Vehicles", etl.enrich_vehicles_with_lines),
    )
    for banner, run_task in pipeline:
        print(banner)
        run_task()

    print("ETL program completed")

Starting ETL program
Running Task - Ingestion Vehicles
Running Task - Ingestion Lines
Running Task - Cleansing Vehicles
Running Task - Lines with Vehicles
ETL program completed


In [10]:
# Preview the bronze lines table written by the ingestion task.
spark.read.format("parquet").load("/content/lake/bronze/lines").show()

+-------+----------+----+--------------------+--------------------+--------------+--------------------+--------------------+----------+----------+
|  color|facilities|  id|          localities|           long_name|municipalities|            patterns|              routes|short_name|text_color|
+-------+----------+----+--------------------+--------------------+--------------+--------------------+--------------------+----------+----------+
|#C61D23|        []|1001|[Alfragide, Amado...|Alfragide (Estr S...|        [1115]|[1001_0_1, 1001_0_2]|            [1001_0]|      1001|   #FFFFFF|
|#C61D23|        []|1002|[Reboleira, Amado...|Reboleira (Estaçã...|        [1115]|          [1002_0_3]|            [1002_0]|      1002|   #FFFFFF|
|#C61D23|        []|1003|[Amadora, Amadora...|Amadora (Estação ...|        [1115]|[1003_0_1, 1003_0_2]|            [1003_0]|      1003|   #FFFFFF|
|#C61D23|        []|1004|[Amadora, Moinhos...|Amadora (Estação ...|        [1115]|          [1004_0_3]|            [10

In [2]:
# Check results: preview the bronze and then the silver vehicles tables.
for lake_path in ("/content/lake/bronze/vehicles", "/content/lake/silver/vehicles"):
    spark.read.parquet(lake_path).show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|      date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|    252|             4098-21| IN_TRANSIT_TO| 42|2364| 38.94853|   2309|-9.040474|  2309_0_3|  2309_0|            SCHEDULED|        4166|10.277778| 180016|2025-06-21 15:01:42|2309_0_3|2|1|1530...|2025-06-21|
|    340|20250621-64020075...| IN_TRANSIT_TO|44|12078|38.654644|   4600| -8.99465|  4600_0_2|  4600_0|            SCHEDULED|121290000007|5.5555553| 090060|2025-06-21 15