<a href="https://colab.research.google.com/github/mdias23i/DE-DataProcessing/blob/main/spark/challenges/challenges.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 1
##  Implement INGESTION process
- Set up path in the "lake"
  - !mkdir -p /content/lake/bronze

- Read data from API https://api.carrismetropolitana.pt/
  - Endpoints:
    - vehicles
    - lines
    - municipalities
  - Use StructFields to enforce schema

- Transformations
  - vehicles
    - create a "date" column extracted from the "timestamp" column (format: hh24miss)

- Write data as PARQUET into the BRONZE layer (/content/lake/bronze)
  - Partition "vehicles" by "date" column
  - Paths:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities
  - Make sure only a single parquet file is created per output
  - Use overwrite as write mode

# Setting up PySpark

In [1]:
%pip install pyspark



In [30]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

class ETLFlow:
    """Base class holding the extract and load steps shared by the ETL tasks."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session used by the extract and load helpers."""
        self.spark = spark

    def extract_from_api(self, url: str, schema: StructType = None, timeout: float = 30.0):
        """Download a JSON payload from ``url`` and return it as a DataFrame.

        Args:
            url: HTTP endpoint returning a JSON array of objects (a single
                JSON object is treated as a one-record array).
            schema: Optional schema to enforce; when omitted (or empty) Spark
                infers the schema from the data.
            timeout: Seconds to wait for the HTTP response before giving up.

        Returns:
            A DataFrame with one row per record of the payload.

        Raises:
            requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
            requests.Timeout: If the endpoint does not answer within ``timeout``.
        """
        import json  # local import so this block does not depend on other cells

        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error body as data.
        response.raise_for_status()

        payload = response.json()
        if isinstance(payload, dict):
            # Parallelizing a dict would iterate over its keys only.
            payload = [payload]

        # spark.read.json expects an RDD of JSON *strings*. Passing Python
        # dicts makes PySpark fall back to str(dict), which emits None/True/
        # False and turns every record containing them into a corrupt row.
        json_lines = [json.dumps(record) for record in payload]
        rdd = self.spark.sparkContext.parallelize(json_lines)

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, **kwargs) -> None:
        """Write ``df`` to ``path`` in the given ``format``, overwriting existing data.

        ``**kwargs`` is accepted for call-site compatibility but is not used.
        """
        df.write.mode("overwrite").format(format).save(path)



class ETLTask(ETLFlow):
    """Ingestion tasks: pull Carris Metropolitana endpoints into the bronze layer."""

    API_BASE_URL = "https://api.carrismetropolitana.pt"
    BRONZE_ROOT = "/content/lake/bronze"

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session via the base class."""
        super().__init__(spark)

    def _write_single_parquet(self, df, path, partition_by=None):
        """Overwrite ``path`` with ``df`` as parquet, one file per output directory.

        Args:
            df: DataFrame to persist.
            path: Target directory.
            partition_by: Optional column name used to partition the output.
        """
        # A single in-memory partition yields a single parquet file
        # (one per partition directory when ``partition_by`` is given).
        writer = df.repartition(1).write.mode("overwrite").format("parquet")
        if partition_by:
            writer = writer.partitionBy(partition_by)
        writer.save(path=path)

    def ingestion_vehicles(self):
        """Ingest the ``vehicles`` endpoint, partitioned by the derived ``date`` column."""
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        df = self.extract_from_api(url=f"{self.API_BASE_URL}/vehicles", schema=vehicle_schema)
        # Calendar date of the reading, used as the partition key.
        df = df.withColumn('date', to_date(col('timestamp')))

        self._write_single_parquet(df, f"{self.BRONZE_ROOT}/vehicles", partition_by="date")

    def ingestion_lines(self):
        """Ingest the ``lines`` endpoint into the bronze layer as a single parquet file."""
        # NOTE(review): "patterns" is declared as array<array<int>>; confirm
        # this matches the shape the API actually returns.
        lines_schema = StructType([
            StructField("color", StringType(), True),
            StructField("facilities", ArrayType(StringType()), True),
            StructField("id", StringType(), True),
            StructField("localities", ArrayType(StringType()), True),
            StructField("long_name", StringType(), True),
            StructField("municipalities", ArrayType(StringType()), True),
            StructField("patterns", ArrayType(ArrayType(IntegerType())), True),
            StructField("routes", ArrayType(StringType()), True),
            StructField("short_name", StringType(), True),
            StructField("text_color", StringType(), True),
        ])

        df = self.extract_from_api(url=f"{self.API_BASE_URL}/lines", schema=lines_schema)

        self._write_single_parquet(df, f"{self.BRONZE_ROOT}/lines")

    def ingestion_municipalities(self):
        """Ingest the ``municipalities`` endpoint into the bronze layer as a single parquet file."""
        municipalities_schema = StructType([
            StructField("district_id", StringType(), True),
            StructField("district_name", StringType(), True),
            StructField("id", StringType(), True),
            StructField("name", StringType(), True),
            StructField("prefix", StringType(), True),
            StructField("region_id", StringType(), True),
            StructField("region_name", StringType(), True),
        ])

        df = self.extract_from_api(
            url=f"{self.API_BASE_URL}/municipalities", schema=municipalities_schema
        )

        self._write_single_parquet(df, f"{self.BRONZE_ROOT}/municipalities")


if __name__ == '__main__':

    # Build (or reuse) the local Spark session shared by every task below.
    from pyspark.sql import SparkSession
    builder = SparkSession.builder.master('local').appName('Challenge 1')
    spark = builder.getOrCreate()

    print("Starting Challenging 1")
    etl = ETLTask(spark)

    # Announce and run each ingestion step in order.
    for label, task in (
        ("Running Task - Ingestion Vehicles", etl.ingestion_vehicles),
        ("Running Task - Ingestion Lines", etl.ingestion_lines),
        ("Running Task - Ingestion municipalities", etl.ingestion_municipalities),
    ):
        print(label)
        task()

Starting Challenging 1
Running Task - Ingestion Vehicles
Running Task - Ingestion Lines
Running Task - Ingestion municipalities


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [31]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

# Bronze-layer parquet directories read by the cleansing tasks below.
parquet_vehicles_file = "/content/lake/bronze/vehicles"
parquet_lines_file = "/content/lake/bronze/lines"
parquet_municipalities_file = "/content/lake/bronze/municipalities"



class ETLFlow:
    """Minimal ETL base class: keeps the Spark session and offers a parquet-style writer."""

    def __init__(self, spark: SparkSession) -> None:
        """Remember the Spark session for later use."""
        self.spark = spark

    def load(self, df: DataFrame, format: str, path: str, **kwargs) -> None:
        """Persist ``df`` at ``path`` using ``format``, replacing any existing output.

        Extra keyword arguments are accepted but not used.
        """
        writer = df.write
        writer = writer.mode("overwrite")
        writer = writer.format(format)
        writer.save(path)

class ETLTask(ETLFlow):
    """Cleansing tasks: read bronze parquet data, clean it and write the silver layer."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session via the base class."""
        super().__init__(spark)

    def cleansing_vehicles(self):
        """Clean the bronze ``vehicles`` data and write it to the silver layer.

        Renames ``lat``/``lon`` to ``latitude``/``longitude``, drops duplicate
        rows and rows without a ``current_status``, then writes parquet
        partitioned by ``date``.
        """
        # Use the session held by the instance rather than a notebook global.
        df_vehicles = (
            self.spark.read.parquet(parquet_vehicles_file)
            .withColumnRenamed("lat", "latitude")
            .withColumnRenamed("lon", "longitude")
            .drop_duplicates()
            .filter(col("current_status").isNotNull())
        )

        (df_vehicles
            .repartition(1)
            .write
            .mode("overwrite")
            .partitionBy("date")
            .format("parquet")
            .save(path="/content/lake/silver/vehicles")
        )

    def cleansing_lines(self):
        """Clean the bronze ``lines`` data and write it to the silver layer.

        Drops duplicate rows and rows whose ``long_name`` is null.
        """
        df_lines = (
            self.spark.read.parquet(parquet_lines_file)
            .drop_duplicates()
            .filter(col("long_name").isNotNull())
        )

        self.load(df=df_lines, format="parquet", path="/content/lake/silver/lines")

    def cleansing_municipalities(self):
        """Clean the bronze ``municipalities`` data and write it to the silver layer.

        Drops duplicate rows and rows where ``name`` or ``district_name`` is null.
        """
        df_municipalities = (
            self.spark.read.parquet(parquet_municipalities_file)
            .drop_duplicates()
            # Keep a row only when both columns are populated.
            .filter(col("name").isNotNull() & col("district_name").isNotNull())
        )

        self.load(df=df_municipalities, format="parquet", path="/content/lake/silver/municipalities")


if __name__ == '__main__':

    # Build (or reuse) the local Spark session shared by every task below.
    from pyspark.sql import SparkSession
    builder = SparkSession.builder.master('local').appName('Challenge 2')
    spark = builder.getOrCreate()

    print("Starting Challenging 2")
    etl = ETLTask(spark)

    # Announce and run each cleansing step in order.
    for label, task in (
        ("Running Task - Cleansing Vehicles", etl.cleansing_vehicles),
        ("Running Task - Cleansing Lines", etl.cleansing_lines),
        ("Running Task - Cleansing Municipalities", etl.cleansing_municipalities),
    ):
        print(label)
        task()

Starting Challenging 2
Running Task - Cleansing Vehicles
Running Task - Cleansing Lines
Running Task - Cleansing Municipalities


In [33]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

# Create (or reuse) a Spark session bound to the global name `spark`,
# which enrich_vehicles below reads directly.
spark = SparkSession.builder.appName("ReadParquetExample").getOrCreate()

# Silver-layer parquet directories read by enrich_vehicles.
parquet_vehicles_file = "/content/lake/silver/vehicles"
parquet_lines_file = "/content/lake/silver/lines"
parquet_municipalities_file = "/content/lake/silver/municipalities"

class ETLTask(ETLFlow):
    """Enrichment task: join silver vehicles with line and municipality names."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session via the base class."""
        super().__init__(spark)

    def enrich_vehicles(self):
        """Add ``line_name`` and ``municipality_name`` to vehicles and write the gold layer.

        Each line's ``municipalities`` array is exploded before joining, so a
        vehicle whose line lists several municipalities produces one output
        row per municipality. The result is written as parquet partitioned by
        ``date`` and then displayed.
        """
        # Use the session held by the instance rather than a notebook global.
        spark = self.spark

        df_vehicles = spark.read.parquet(parquet_vehicles_file)
        df_lines = spark.read.parquet(parquet_lines_file)
        df_municipalities = spark.read.parquet(parquet_municipalities_file)

        # One row per (line, municipality id) so lines can be joined to municipalities.
        df_lines = df_lines.withColumn('municipality_id', explode(col('municipalities')))

        df_vehicles.createOrReplaceTempView("vehicles")
        df_lines.createOrReplaceTempView("lines")
        df_municipalities.createOrReplaceTempView("municipalities")

        query = """
              SELECT
                  vehicles.*,  -- all columns vehicles
                  lines.long_name AS line_name,  --  lines.long_name to line_name
                  municipalities.name AS municipality_name  --  municipalities.name to municipality_name
              FROM
                  vehicles
              JOIN
                  lines ON vehicles.line_id = lines.id  -- Join vehicles and lines
              JOIN
                  municipalities ON lines.municipality_id = municipalities.id  -- Join municipalities
            """

        result_df = spark.sql(query)

        (result_df
            .repartition(1)
            .write
            .mode("overwrite")
            .partitionBy("date")
            .format("parquet")
            .save(path="/content/lake/gold/vehicles_enriched")
        )

        result_df.show()



if __name__ == '__main__':

    # Build (or reuse) the local Spark session used by the enrichment task.
    from pyspark.sql import SparkSession
    builder = SparkSession.builder.master('local').appName('Challenge 3')
    spark = builder.getOrCreate()

    print("Starting Challenging 3")
    etl = ETLTask(spark)

    # Announce and run the single enrichment step.
    print("Running Task - Enrich Vehicles")
    etl.enrich_vehicles()


Starting Challenging 3
Running Task - Enrich Vehicles
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+--------------------+-------------------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|      date|           line_name|  municipality_name|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+--------------------+-------------------+
|     64|             1185-11| IN_TRANSIT_TO| 42|2369|38.907845|   2328|-9.029838|  2328_0_1|  2328_0|            SCHEDULED|        1279|11.111111| 180095|2024-11-22 21:30:28|2328_0_1|1|1|2100...|202

In [42]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

# Create (or reuse) a Spark session bound to the global name `spark`,
# which gold_vehicles below reads directly.
spark = SparkSession.builder.appName("ReadParquetExample").getOrCreate()

# Gold-layer parquet directory read by gold_vehicles.
parquet_vehicles_enriched = "/content/lake/gold/vehicles_enriched"


class ETLTask():
    """Gold-layer analysis: rank municipalities by vehicle speed metrics."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session used to read the enriched vehicles data."""
        self.spark = spark

    def gold_vehicles(self):
        """Show the top three municipalities by total speed and by average speed.

        The first table lists ``vehicle_count`` and ``total_speed`` per
        municipality ordered by ``total_speed``; the second lists
        ``avg_speed`` ordered by ``avg_speed``. Both are limited to three rows.
        """
        # Explicit alias: the file's star import makes a bare `sum` shadow the builtin.
        from pyspark.sql import functions as F

        # Use the session held by the instance rather than a notebook global.
        df_vehicles_enriched = self.spark.read.parquet(parquet_vehicles_enriched)
        by_municipality = df_vehicles_enriched.groupBy("municipality_name")

        top_by_total_speed = (
            by_municipality
            .agg(
                F.count("line_id").alias("vehicle_count"),
                F.sum("speed").alias("total_speed"),
            )
            .orderBy(F.col("total_speed").desc())
            .limit(3)
        )

        top_by_avg_speed = (
            by_municipality
            .agg(F.avg("speed").alias("avg_speed"))
            .orderBy(F.col("avg_speed").desc())
            .limit(3)
        )

        top_by_total_speed.show()

        top_by_avg_speed.show()



if __name__ == '__main__':

    # Build (or reuse) the local Spark session used by the gold-layer task.
    from pyspark.sql import SparkSession
    builder = SparkSession.builder.master('local').appName('Challenge 4')
    spark = builder.getOrCreate()

    print("Starting Challenging 4")
    etl = ETLTask(spark)

    # Announce and run the single aggregation step.
    print("Running Task")
    etl.gold_vehicles()


Starting Challenging 4
Running Task
+-----------------+-------------+-----------------+
|municipality_name|vehicle_count|      total_speed|
+-----------------+-------------+-----------------+
|           Lisboa|          157|1092.499995470047|
|           Loures|           87|521.6666631698608|
|           Sintra|           83|466.1111090183258|
+-----------------+-------------+-----------------+

+-----------------+-----------------+
|municipality_name|        avg_speed|
+-----------------+-----------------+
|         Alenquer| 9.44444465637207|
|            Mafra|9.398148112826878|
|          Montijo|9.355158720697675|
+-----------------+-----------------+

