In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=ef352baeb64d7391d4d18a7b5da190eb2b81cfa75c29a2c9ee0653cf6b65b418
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
!pip install FlightRadarAPI

Collecting FlightRadarAPI
  Downloading flightradarapi-1.3.26-py3-none-any.whl (16 kB)
Collecting brotli (from FlightRadarAPI)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: brotli, FlightRadarAPI
Successfully installed FlightRadarAPI-1.3.26 brotli-1.1.0


In [3]:
from FlightRadar24 import FlightRadar24API
import json
import re
fr_api = FlightRadar24API()

In [4]:
def save_json(dicc,file):
    with open(file,'w') as f:
        f.write(json.dumps(dicc))

In [5]:
def read_json(file):
    with open(file,'r') as f:
        data = json.load(f)
    return data

## Obtain airports from cities we are interested in\

In [6]:
airports = fr_api.get_airports()

In [7]:
# List of cities
cities = ['Amsterdam','Athens','Barcelona','Berlin','Budapest','Lisbon','London','Paris','Rome','Vienna']

In [8]:
import re
regex = r'\((.*?)\)'
airports_in_city = []

for ap in airports:
    for city in cities:
        if city in str(ap):
            acronym = re.findall(regex,str(ap))[0]
            airports_in_city.append(ap)
            break

In [9]:
def parseig_aeroports(lista_aeropuertos):
    airports_info = {}
    regex = r'\((.*?)\) .*? Latitude: ([-+]?\d+\.\d+) - Longitude: ([-+]?\d+\.\d+)'

    for aeropuerto in lista_aeropuertos:
        matches = re.findall(regex, str(aeropuerto))
        if matches:
            siglas = matches[0][0]
            latitude = float(matches[0][1])
            longitude = float(matches[0][2])
            for city in cities:
                if city in aeropuerto:
                    airp_city = city
            airports_info[siglas] = {'Latitude': latitude, 'Longitude': longitude,'City':airp_city}

    return airports_info

In [18]:
airports_info = parseig_aeroports(airports_in_city)

We store information we have by the moment because we had some issues with API requests.

In [None]:
save_json(airports_info,'airports_info.json')

## Get arrivals from airports

In [None]:
airports_info = read_json('airports_info.json')

In [None]:
def get_100_arrivals(acronym):
    aeroport = fr_api.get_airport_details(acronym)
    vols = aeroport['airport']['pluginData']['schedule']['arrivals']['data']
    return vols

In [None]:
for airp in airports_info.keys():
    airports_info[airp]['arrivals'] = get_100_arrivals(airp)

In [None]:
save_json(airports_info,'airport_info_arrivals.json')

## Save into DataLake

In [None]:
import json
import re

In [13]:
airports_info = read_json('airport_info_arrivals.json')

In [15]:
from pyspark.sql import SparkSession

# Init session in Spark
spark = SparkSession.builder \
    .appName("JSON a Parquet con Spark SQL") \
    .getOrCreate()

# Read json and create a temporal view
df = spark.read.json("airport_info_arrivals.json")
df.createOrReplaceTempView("airport_info_arrivals")

# Use Spark SQL to select the data
selected_data = spark.sql("SELECT * FROM airport_info_arrivals")

# Save data as a parquet file
selected_data.write.parquet("datalake/airport_info_arrivals.parquet")

# Stop Spark Session
spark.stop()

We realized that out json didn't have a great format to manage the data as we want. So we need to change it to have the following structure to begin:

| Airport acronym | Longitude | Latitude | City  | arrivals |
| --------------- | --------- | -------- | ----  | -------- |
| EHAM | 52.308609 | 4.763889 | Amsterdam  | [flight 3473cc55] |

To get this new structure we'll use map reduce, as we have seen in class. Then we'll built another parquet file and it will be the definitive of our datalake.

In [16]:
# Map function: transform input into intermediate key-value output
def map_function(input_data):
    intermediate = []
    for key, value in input_data.items():
        # The key is the airport acronym, and the value is the corresponding information
        intermediate.append((key, value))
    return intermediate

# Reduce function: process intermediate values for each key and emit final result
# In this case, each key is unique, so we just pass the data through
def reduce_function(intermediate_data):
    result = []
    for key, values in intermediate_data:
        # Transform the value to the desired structure (in this case, it's already in the desired structure)
        result.append((key, values['Longitude'], values['Latitude'], values['City'], values['arrivals']))
    return result

# Simulate the MapReduce process
intermediate = map_function(airports_info)
final_result = reduce_function(intermediate)

# Display the result
for item in final_result:
    print(item)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType
# Define the DataFrame schema according to the desired structure
schema = StructType([
    StructField("airport_acronym", StringType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("city", StringType(), True),
    StructField("arrivals", ArrayType(StringType()), True)
])

# Start a Spark session
spark = SparkSession.builder \
    .appName("Load and Save Transformed Data") \
    .getOrCreate()

# Create a DataFrame from the transformed data and the defined schema
df = spark.createDataFrame(data=final_result, schema=schema)

# Show the DataFrame to verify its content
df.show()

# Save the DataFrame as a Parquet file
df.write.parquet("datalake/airport_info_transformed.parquet")

# Stop the Spark session
spark.stop()


+---------------+----------+---------+---------+--------------------+
|airport_acronym| longitude| latitude|     city|            arrivals|
+---------------+----------+---------+---------+--------------------+
|           EHAM|  4.763889|52.308609|Amsterdam|[{flight={owner={...|
|           KAHN|-83.324722| 33.95195|   Athens|[{flight={owner=n...|
|           LGAV|  23.94446|37.936352|   Athens|[{flight={owner={...|
|           KUNI|  -82.2314|39.210999|   Athens|                  []|
|           KMMI|  -84.5625|35.397221|   Athens|                  []|
|           LEBL|  2.078463| 41.29707|Barcelona|[{flight={owner={...|
|           SVBC|-64.689102| 10.10713|Barcelona|[{flight={owner={...|
|           EDDB| 13.503722|52.362877|   Berlin|[{flight={owner={...|
|           LHBS| 18.980589|47.451065| Budapest|                  []|
|           LHBP|  19.25559|47.436932| Budapest|[{flight={owner={...|
|           FAEL|  27.82593| -33.0355|   London|[{flight={owner={...|
|           LPPT|  -