<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark/challenges/challenge_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 2
##  Implement CLEANSING process
- Set up path in the "lake"
  - !mkdir -p /content/lake/silver

- Read data from BRONZE layer as PARQUET:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities

- Transformations
  - vehicles
    - rename "lat" and "lon" to "latitude" and "longitude" respectively
    - remove possible duplicates
    - remove rows where the column CURRENT_STATUS is null
    - remove any corrupted record
  - lines
    - remove duplicates
    - remove rows where the column LONG_NAME is null
    - remove any corrupted record
  - municipalities
    - remove duplicates
    - remove rows where either NAME or DISTRICT_NAME is null
    - remove any corrupted record

- Write data as PARQUET into the SILVER layer (/content/lake/silver)
  - Partition "vehicles" by "date" (the column created during ingestion)
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities

# Setting up PySpark

In [None]:
%pip install pyspark



In [11]:
# Create the bronze and silver layer directories of the "lake".
# `-p` also creates missing parent directories and does not fail if they already exist.
!mkdir -p /content/lake/bronze
!mkdir -p /content/lake/silver

import json

import pyspark.sql.functions as F
import requests
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *

class ETLFlow:
    """Reusable extract/load primitives that run on a single Spark session."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session used by every extract/load method."""
        self.spark = spark

    def extract_from_api(self, url: str, schema: StructType = None, timeout: float = 30.0) -> DataFrame:
        """Fetch JSON from ``url`` with an HTTP GET and return it as a DataFrame.

        Args:
            url: Endpoint to query.
            schema: Optional schema applied while parsing; when omitted the
                reader is called without one.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A DataFrame with one row per JSON record in the response.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly on an error response instead of parsing its payload.
        response.raise_for_status()

        payload = response.json()
        # A single JSON object is treated as a one-record dataset.
        records = payload if isinstance(payload, list) else [payload]

        # DataFrameReader.json is documented to take an RDD of JSON strings,
        # so serialise each record explicitly rather than passing Python dicts.
        # Use the session held by this instance, not a module-level global.
        rdd = self.spark.sparkContext.parallelize([json.dumps(record) for record in records])

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read ``path`` using the given ``format``.

        Args:
            format: Data source format name passed to the reader.
            path: Location to read from.
            **kwargs: Forwarded to the reader as options.

        Returns:
            The DataFrame produced by the reader.
        """
        return self.spark.read.format(format).options(**kwargs).load(path)

    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None, **kwargs) -> None:
        """Write ``df`` to ``path``, overwriting whatever is already there.

        Args:
            df: DataFrame to persist; it is coalesced to one partition first.
            format: Data source format name passed to the writer.
            path: Destination location.
            partition_column: When given, the output is partitioned by it.
            **kwargs: Forwarded to the writer as options.
        """
        writer = df.coalesce(1).write.mode("overwrite").format(format).options(**kwargs)
        if partition_column:
            writer = writer.partitionBy(partition_column)
        writer.save(path)

class ETLTask(ETLFlow):
    """Ingestion (API -> bronze) and cleansing (bronze -> silver) tasks."""

    # Roots shared by every task; dataset names are appended to them.
    BRONZE_PATH = "/content/lake/bronze"
    SILVER_PATH = "/content/lake/silver"
    API_URL = "https://api.carrismetropolitana.pt"

    def __init__(self, spark: SparkSession) -> None:
        """Initialise the task runner with the Spark session to use."""
        super().__init__(spark)

    def ingestion_lines(self) -> None:
        """Fetch the "lines" endpoint and store it as parquet in bronze."""
        # Schema definition for lines
        lines_schema = StructType([
            StructField('color', StringType(), True),
            StructField('facilities', ArrayType(StringType(), True), True),
            StructField('id', StringType(), True),
            StructField('localities', ArrayType(StringType(), True), True),
            StructField('long_name', StringType(), True),
            StructField('municipalities', ArrayType(StringType(), True), True),
            StructField('patterns', ArrayType(StringType(), True), True),
            StructField('routes', ArrayType(StringType(), True), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True),
        ])

        # Ingestion
        df = self.extract_from_api(url=f"{self.API_URL}/lines", schema=lines_schema)

        # Load
        self.load(df=df, format="parquet", path=f"{self.BRONZE_PATH}/lines")

    def ingestion_vehicles(self) -> None:
        """Fetch the "vehicles" endpoint and store it in bronze, partitioned by date."""
        # Schema definition for vehicles
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        # Ingestion
        df = self.extract_from_api(url=f"{self.API_URL}/vehicles", schema=vehicle_schema)

        # Create "date" column from "timestamp"; it is the partition key below.
        df = df.withColumn("date", F.expr("date(timestamp)"))

        # Load
        self.load(df=df, format="parquet", path=f"{self.BRONZE_PATH}/vehicles", partition_column="date")

    def ingestion_municipalities(self) -> None:
        """Fetch the "municipalities" endpoint and store it as parquet in bronze."""
        # Schema definition for municipalities
        municipalities_schema = StructType([
            StructField('district_id', StringType(), True),
            StructField('district_name', StringType(), True),
            StructField('id', StringType(), True),
            StructField('name', StringType(), True),
            StructField('prefix', StringType(), True),
            StructField('region_id', StringType(), True),
            StructField('region_name', StringType(), True),
        ])

        # Ingestion
        df = self.extract_from_api(url=f"{self.API_URL}/municipalities", schema=municipalities_schema)

        # Load
        self.load(df=df, format="parquet", path=f"{self.BRONZE_PATH}/municipalities")

    @staticmethod
    def _drop_corrupt_records(df: DataFrame) -> DataFrame:
        """Keep only rows whose `_corrupt_record` is null, then drop that column."""
        # The schemas declared in this class have no `_corrupt_record` field,
        # so this is a no-op unless that column is added upstream.
        if "_corrupt_record" not in df.columns:
            return df
        return df.filter(F.col("_corrupt_record").isNull()).drop("_corrupt_record")

    def _cleanse(self, name: str, required_columns: list, renames: dict = None,
                 partition_column: str = None) -> None:
        """Read bronze/<name>, apply the shared cleansing steps, write silver/<name>."""
        df = self.extract_from_file(format="parquet", path=f"{self.BRONZE_PATH}/{name}")

        # Renaming columns
        for old_name, new_name in (renames or {}).items():
            df = df.withColumnRenamed(old_name, new_name)

        # Removing duplicates
        df = df.dropDuplicates()

        # A row is kept only if every required column is non-null.
        for column in required_columns:
            df = df.filter(F.col(column).isNotNull())

        # Remove corrupted records
        df = self._drop_corrupt_records(df)

        # Load
        self.load(df=df, format="parquet", path=f"{self.SILVER_PATH}/{name}",
                  partition_column=partition_column)

    def cleansing_vehicles(self) -> None:
        """Cleanse bronze vehicles into silver, partitioned by "date".

        Renames "lat"/"lon" to "latitude"/"longitude", removes duplicates,
        rows with a null "current_status" and corrupted records.
        """
        self._cleanse(
            name="vehicles",
            required_columns=["current_status"],
            renames={"lat": "latitude", "lon": "longitude"},
            partition_column="date",
        )

    def cleansing_lines(self) -> None:
        """Cleanse bronze lines into silver.

        Removes duplicates, rows with a null "long_name" and corrupted records.
        """
        self._cleanse(name="lines", required_columns=["long_name"])

    def cleansing_municipalities(self) -> None:
        """Cleanse bronze municipalities into silver.

        Removes duplicates, rows where "name" or "district_name" is null,
        and corrupted records.
        """
        self._cleanse(name="municipalities", required_columns=["name", "district_name"])


if __name__ == '__main__':

    # Init Spark
    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Run Tasks: each entry pairs the progress message with the task it
    # announces; tasks run in list order (all ingestions, then all cleansings).
    pipeline = [
        ("Running Task - Ingestion Vehicles", etl.ingestion_vehicles),
        ("Running Task - Ingestion Lines", etl.ingestion_lines),
        ("Running Task - Ingestion Municipalities", etl.ingestion_municipalities),
        ("Running Task - Cleansing Vehicles", etl.cleansing_vehicles),
        ("Running Task - Cleansing Lines", etl.cleansing_lines),
        ("Running Task - Cleansing Municipalities", etl.cleansing_municipalities),
    ]
    for message, task in pipeline:
        print(message)
        task()

    print("ETL program completed")

    # Check results: show a sample of each silver dataset.
    previews = [
        ("Preview - vehicles", "/content/lake/silver/vehicles"),
        ("Preview - lines", "/content/lake/silver/lines"),
        ("Preview - municipalities", "/content/lake/silver/municipalities"),
    ]
    for title, silver_path in previews:
        print(title)
        spark.read.parquet(silver_path).show()

Starting ETL program
Running Task - Ingestion Vehicles
Running Task - Ingestion Lines
Running Task - Ingestion Municipalities
Running Task - Cleansing Vehicles
Running Task - Cleansing Lines
Running Task - Cleansing Municipalities
ETL program completed
Preview - vehicles
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|      date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|    284|20250628-64020242...| IN_TRANSIT_TO|44|12632|38.484882|   4477|-8.961852|  4477_0_3|  4477_0|  