<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark/misc/read_from_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read from API
- Read data from API using requests and RDDs

# Setting up PySpark

In [None]:
%pip install pyspark

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').config('spark.ui.port', '4050').getOrCreate()
sc = spark.sparkContext

# Get data from API - Vehicles

In [5]:
import requests
from pyspark.sql.types import *

def readFromAPI(url: str, schema: StructType = None):
  response = requests.get(url)
  rdd = sc.parallelize(response.json())

  if schema:
    df = spark.read.schema(schema).json(rdd)
  else:
    df = spark.read.json(rdd)
  return df

In [3]:
readFromAPI("https://api.carrismetropolitana.pt/vehicles").show()

+-------+--------------------+--------------+--------+------------------+-------+------------------+----------+--------+---------------------+------------+------------------+-------+----------+--------------------+
|bearing|            block_id|current_status|      id|               lat|line_id|               lon|pattern_id|route_id|schedule_relationship|    shift_id|             speed|stop_id| timestamp|             trip_id|
+-------+--------------------+--------------+--------+------------------+-------+------------------+----------+--------+---------------------+------------+------------------+-------+----------+--------------------+
|    133|20250625-64010006...| IN_TRANSIT_TO|44|12668|   38.776611328125|   4702| -9.06252670288086|  4702_0_1|  4702_0|            SCHEDULED|123500234560| 27.77777777777778| 010159|1750887160|4702_0_1|300|2225...|
|     18|20250625-64010056...| IN_TRANSIT_TO|44|12504|  38.6970100402832|   4512|-8.945204734802246|  4512_0_2|  4512_0|            SCHEDULE

In [6]:
readFromAPI("https://api.carrismetropolitana.pt/municipalities").show()

+-----------+-------------+----+--------------------+------+---------+----------------+
|district_id|district_name|  id|                name|prefix|region_id|     region_name|
+-----------+-------------+----+--------------------+------+---------+----------------+
|         07|        Évora|0712|        Vendas Novas|    19|    PT187|Alentejo Central|
|         11|       Lisboa|1101|            Alenquer|    20|    PT16B|           Oeste|
|         11|       Lisboa|1102|   Arruda dos Vinhos|    20|    PT16B|           Oeste|
|         11|       Lisboa|1105|             Cascais|    05|    PT170|             AML|
|         11|       Lisboa|1106|              Lisboa|    06|    PT170|             AML|
|         11|       Lisboa|1107|              Loures|    07|    PT170|             AML|
|         11|       Lisboa|1109|               Mafra|    08|    PT170|             AML|
|         11|       Lisboa|1110|              Oeiras|    12|    PT170|             AML|
|         11|       Lisboa|1111|

In [8]:
municipalities_schema = StructType([
    StructField('district_id', StringType(), True),
    StructField('district_name', StringType(), True),
    StructField('id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('prefix', StringType(), True),
    StructField('region_id', StringType(), True),
    StructField('region_name', StringType(), True)])

municipalities = readFromAPI("https://api.carrismetropolitana.pt/municipalities", municipalities_schema)
print(municipalities.count())
municipalities.show()

23
+-----------+-------------+----+--------------------+------+---------+----------------+
|district_id|district_name|  id|                name|prefix|region_id|     region_name|
+-----------+-------------+----+--------------------+------+---------+----------------+
|         07|        Évora|0712|        Vendas Novas|    19|    PT187|Alentejo Central|
|         11|       Lisboa|1101|            Alenquer|    20|    PT16B|           Oeste|
|         11|       Lisboa|1102|   Arruda dos Vinhos|    20|    PT16B|           Oeste|
|         11|       Lisboa|1105|             Cascais|    05|    PT170|             AML|
|         11|       Lisboa|1106|              Lisboa|    06|    PT170|             AML|
|         11|       Lisboa|1107|              Loures|    07|    PT170|             AML|
|         11|       Lisboa|1109|               Mafra|    08|    PT170|             AML|
|         11|       Lisboa|1110|              Oeiras|    12|    PT170|             AML|
|         11|       Lisboa|11

# Vehicles

In [None]:
vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                             StructField('block_id', StringType(), True),
                             StructField('current_status', StringType(), True),
                             StructField('id', StringType(), True),
                             StructField('lat', FloatType(), True),
                             StructField('line_id', StringType(), True),
                             StructField('lon', FloatType(), True),
                             StructField('pattern_id', StringType(), True),
                             StructField('route_id', StringType(), True),
                             StructField('schedule_relationship', StringType(), True),
                             StructField('shift_id', StringType(), True),
                             StructField('speed', FloatType(), True),
                             StructField('stop_id', StringType(), True),
                             StructField('timestamp', TimestampType(), True),
                             StructField('trip_id', StringType(), True)])

vehicles = readFromAPI("https://api.carrismetropolitana.pt/vehicles", vehicle_schema)
print(vehicles.count())
vehicles.show()

410
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    315|           2_2006-21| IN_TRANSIT_TO| 41|1342|  38.7697|   1710|-9.206302|  1710_3_2|  1710_3|            SCHEDULED|        2036|16.944445| 030817|2025-06-21 14:34:21|1710_3_2_1530_155...|
|    329|20250621-64020099...| IN_TRANSIT_TO|44|12510| 38.70925|   4504|-8.961373|  4504_0_2|  4504_0|            SCHEDULED|121380000007| 9.722222| 100446|2025-06-21 14:34:30|4504_0_2|600|1500...|
|    306|20

# Routes

In [None]:
from pyspark.sql.functions import *
routes = readFromAPI("https://api.carrismetropolitana.pt/routes")
print(routes.count())

913


# Questions

# Q1
- adjust ROUTE dataframe
  - use correct schema to get the data (StructField)
  - make sure _corrupt_record is removed from schema
- find the routes that pass through the localities "Brandoa" and "Colégio Militar"

In [None]:
routes

DataFrame[_corrupt_record: string, color: string, facilities: array<string>, id: string, line_id: string, localities: array<string>, long_name: string, municipalities: array<string>, patterns: array<string>, short_name: string, text_color: string]

In [None]:
routes.schema

StructType([StructField('_corrupt_record', StringType(), True), StructField('color', StringType(), True), StructField('facilities', ArrayType(StringType(), True), True), StructField('id', StringType(), True), StructField('line_id', StringType(), True), StructField('localities', ArrayType(StringType(), True), True), StructField('long_name', StringType(), True), StructField('municipalities', ArrayType(StringType(), True), True), StructField('patterns', ArrayType(StringType(), True), True), StructField('short_name', StringType(), True), StructField('text_color', StringType(), True)])

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

routes_schema = StructType([StructField('_corrupt_record', StringType(), True), StructField('color', StringType(), True), StructField('facilities', ArrayType(StringType(), True), True), StructField('id', StringType(), True), StructField('line_id', StringType(), True), StructField('localities', ArrayType(StringType(), True), True), StructField('long_name', StringType(), True), StructField('municipalities', ArrayType(StringType(), True), True), StructField('patterns', ArrayType(StringType(), True), True), StructField('short_name', StringType(), True), StructField('text_color', StringType(), True)])

routes = readFromAPI("https://api.carrismetropolitana.pt/routes", routes_schema)
routes = routes.filter(col("_corrupt_record").isNull()).drop("_corrupt_record")

#routes.filter(array_contains(col("localities"), "Brandoa") & array_contains(col("localities"), "Colégio Militar"))
routes.filter(F.array_contains(routes.localities, "Brandoa") & F.array_contains(routes.localities, "Colégio Militar")).select("localities").show()

+--------------------+
|          localities|
+--------------------+
|[Casal da Mira, A...|
|[Brandoa, Amadora...|
|[Casal da Mira, A...|
|[Brandoa, Amadora...|
|[Casal da Mira, A...|
+--------------------+



In [None]:
routes.select("_corrupt_record").show()

+--------------------+
|     _corrupt_record|
+--------------------+
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|{'color': '#3D85C...|
|{'color': '#3D85C...|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|{'color': '#C61D2...|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
+--------------------+
only showing top 20 rows



# Q2
- What is the line with more localities?


In [None]:
import pyspark.sql.functions as F

lines = readFromAPI("https://api.carrismetropolitana.pt/lines")

lines.withColumn("localities_qty", F.array_size(F.col("localities"))).orderBy(F.desc("localities_qty")).select("long_name", "localities_qty").show(1, False)


+-----------------------------------------------+--------------+
|long_name                                      |localities_qty|
+-----------------------------------------------+--------------+
|Cacilhas (Terminal) - Setúbal (ITS) via Azeitão|24            |
+-----------------------------------------------+--------------+
only showing top 1 row



### API - https://github.com/carrismetropolitana/api