In [None]:
from FlightRadar24 import FlightRadar24API
import json
import re
# Single FlightRadar24 client instance shared by the API calls in this notebook
fr_api = FlightRadar24API()

In [None]:
def read_json(file):
    """Return the parsed JSON content of the file located at `file`."""
    with open(file, 'r') as handle:
        return json.load(handle)

In [None]:
def save_json(dicc,file):
    """Serialize `dicc` as JSON and write it to `file`, replacing its content.

    The payload is encoded *before* the file is opened. Opening with mode
    'w' truncates the target immediately, so encoding first keeps an existing
    file intact when `dicc` contains something `json` cannot serialize.

    Parameters
    ----------
    dicc : object
        Any JSON-serializable value (the notebook passes dicts).
    file : str or path-like
        Destination path.

    Raises
    ------
    TypeError, ValueError
        Propagated from `json.dumps` when `dicc` is not serializable; the
        destination file is left untouched in that case.
    """
    payload = json.dumps(dicc)
    with open(file,'w') as f:
        f.write(payload)

## Obtain airports from cities we are interested in

In [None]:
# Ask the FlightRadar24 client for its airport list
airports = fr_api.get_airports()

In [None]:
# Show the fetched airport list as the cell output
airports

In [None]:
# Cities of interest: an airport is kept when one of these names appears in its text form
cities = ['Amsterdam','Athens','Barcelona','Berlin','Budapest','Lisbon','London','Paris','Rome','Vienna']

In [None]:
import re
# Keep every airport whose string representation mentions one of the target
# cities. The text is built once per airport rather than once per city, and
# no airport code is extracted here: only the match decides whether the
# airport is kept.
airports_in_city = []

for ap in airports:
    ap_text = str(ap)
    if any(city in ap_text for city in cities):
        airports_in_city.append(ap)

In [None]:
# Show the airports kept by the city filter
airports_in_city

In [None]:
def parseig_aeroports(lista_aeropuertos, city_names=None):
    """Build a lookup of airport code -> coordinates and city.

    Each airport is converted with ``str()`` and parsed with a regular
    expression that expects text shaped like
    ``(CODE) <anything> Latitude: <number> - Longitude: <number>``.
    Airports whose text does not match that shape are skipped.

    Parameters
    ----------
    lista_aeropuertos : iterable
        Airports to parse; only their ``str()`` form is used.
    city_names : iterable of str, optional
        City names to look for inside each airport's text. Defaults to the
        notebook-level ``cities`` list.

    Returns
    -------
    dict
        ``{code: {'Latitude': float, 'Longitude': float, 'City': str or None}}``.
        ``City`` is ``None`` when none of the city names occurs in the text.
    """
    if city_names is None:
        city_names = cities

    airports_info = {}
    # The fractional part is optional so whole-number coordinates also match.
    regex = r'\((.*?)\) .*? Latitude: ([-+]?\d+(?:\.\d+)?) - Longitude: ([-+]?\d+(?:\.\d+)?)'

    for aeropuerto in lista_aeropuertos:
        # Work on the text form throughout: the membership test below needs a
        # string, not the airport object itself.
        texto = str(aeropuerto)
        match = re.search(regex, texto)
        if match is None:
            continue

        siglas, latitude, longitude = match.groups()

        # Reset for every airport so the city found for a previous airport
        # can never be reused by mistake.
        airp_city = None
        for city in city_names:
            if city in texto:
                airp_city = city  # when several names match, the last one wins

        airports_info[siglas] = {
            'Latitude': float(latitude),
            'Longitude': float(longitude),
            'City': airp_city,
        }

    return airports_info

In [None]:
# Parse the filtered airports into a dict keyed by airport code
airports_info = parseig_aeroports(airports_in_city)

We store the information gathered so far, because we had some issues with API requests.

In [None]:
# Persist the parsed airport data so it can be reloaded without repeating the API calls
save_json(airports_info,'airports_info.json')

## Get arrivals from airports

In [None]:
# Reload the airport data previously saved to airports_info.json
airports_info = read_json('airports_info.json')

In [None]:
def get_100_arrivals(acronym):
    """Return the arrivals listed in the airport details for `acronym`.

    Calls ``fr_api.get_airport_details`` and returns the value stored under
    airport -> pluginData -> schedule -> arrivals -> data of its response.
    """
    details = fr_api.get_airport_details(acronym)
    schedule = details['airport']['pluginData']['schedule']
    return schedule['arrivals']['data']

In [None]:
# Attach the arrivals returned by get_100_arrivals to each airport's record
for airp, info in airports_info.items():
    info['arrivals'] = get_100_arrivals(airp)

In [None]:
# Persist the airport data, which now also carries each airport's arrivals
save_json(airports_info,'airport_info_arrivals.json')

## Save into DataLake

In [None]:
import json
import re

In [None]:
# Reload the arrivals snapshot from the same relative path it was saved to
# (and that the Spark reader uses), so the notebook runs top to bottom.
airports_info = read_json('airport_info_arrivals.json')

In [None]:
from pyspark.sql import SparkSession

# Init session in Spark
spark = SparkSession.builder \
    .appName("JSON a Parquet con Spark SQL") \
    .getOrCreate()

try:
    # Read json and create a temporal view
    df = spark.read.json("airport_info_arrivals.json")
    df.createOrReplaceTempView("airport_info_arrivals")

    # Use Spark SQL to select the data
    selected_data = spark.sql("SELECT * FROM airport_info_arrivals")

    # Save data as a parquet file. Overwrite mode keeps this cell re-runnable:
    # without it the write fails once the target path already exists.
    selected_data.write.mode("overwrite").parquet("../datalake/airport_info_arrivals.parquet")
finally:
    # Stop Spark Session even when one of the steps above fails
    spark.stop()

We realized that our JSON didn't have a convenient format for managing the data the way we want, so we first need to restructure it as follows:

| Airport acronym | Longitude | Latitude | City  | arrivals | 
| --------------- | --------- | -------- | ----  | -------- | 
| EHAM | 4.763889 | 52.308609 | Amsterdam  | [flight 3473cc55] |

To get this new structure we'll use MapReduce, as we have seen in class. Then we'll build another Parquet file.

In [None]:
# Map step: turn the input mapping into intermediate (key, value) pairs
def map_function(input_data):
    """Return a list of ``(key, value)`` tuples, one per entry of `input_data`.

    In this notebook the key is the airport acronym and the value is the
    information stored for that airport.
    """
    return [(acronym, info) for acronym, info in input_data.items()]

# Reduce step: every key is unique here, so each pair is simply flattened
# into one output row
def reduce_function(intermediate_data):
    """Flatten ``(key, info)`` pairs into row tuples.

    Each row is ``(key, Longitude, Latitude, City, arrivals)``, with the last
    four values read from the corresponding keys of `info`.
    """
    return [
        (acronym, info['Longitude'], info['Latitude'], info['City'], info['arrivals'])
        for acronym, info in intermediate_data
    ]

# Run the two stages back to back on the airport data
intermediate = map_function(airports_info)
final_result = reduce_function(intermediate)

# Print the transformed rows, one per line
for row in final_result:
    print(row)


In [None]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType
# Define the DataFrame schema according to the desired structure
schema = StructType([
    StructField("Airport acronym", StringType(), True),
    StructField("Longitude", DoubleType(), True),
    StructField("Latitude", DoubleType(), True),
    StructField("City", StringType(), True),
    StructField("arrivals", ArrayType(StringType()), True)
])

# The schema stores every arrival as a string, so encode each arrival record
# as JSON text explicitly instead of relying on an implicit object-to-string
# conversion. Entries that are already strings are kept unchanged, and a
# missing arrivals list stays null.
rows = [
    (
        code,
        lon,
        lat,
        city,
        None if arrivals is None else [
            arrival if isinstance(arrival, str) else json.dumps(arrival)
            for arrival in arrivals
        ],
    )
    for code, lon, lat, city, arrivals in final_result
]

# Start a Spark session
spark = SparkSession.builder \
    .appName("Load and Save Transformed Data") \
    .getOrCreate()

try:
    # Create a DataFrame from the transformed data and the defined schema
    df = spark.createDataFrame(data=rows, schema=schema)

    # Show the DataFrame to verify its content
    df.show()

    # Save the DataFrame as a Parquet file. Overwrite mode keeps this cell
    # re-runnable: without it the write fails once the path already exists.
    df.write.mode("overwrite").parquet("../datalake/airport_info_transformed.parquet")
finally:
    # Stop the Spark session even when one of the steps above fails
    spark.stop()