In [1]:
# Standard library
import json
import logging
import os
import time
import warnings
from datetime import datetime, timedelta

# Third-party
import pandas as pd
import requests
from dotenv import load_dotenv

# Route log records to a file; the timestamp in each record carries the date only
logging.basicConfig(
    filename="skyscanner_logs.log",
    level="INFO",
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d",
)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

# Load environment variables (API_KEY is read from the environment by the API helpers)
load_dotenv()

True

### Define some global inputs

In [2]:
# Workbooks that the crawl appends its results (and failures) to
airport_data_excel_file = "airport_data.xlsx"
failed_airports_excel_file = "failed_airports.xlsx"
flight_data_excel_file = "flight_data.xlsx"
failed_routes_excel_file = "failed_routes.xlsx"

# Anchor every crawling date to one "today" so the offsets below stay
# consistent with each other even if this cell runs across midnight.
crawling_base_date = datetime.now().date()
crawling_date_format = "%Y-%m-%d"

# Outbound date: one week from today
initial_crawling_date = crawling_base_date + timedelta(days=7)
initial_crawling_date_formatted = initial_crawling_date.strftime(crawling_date_format)

# Return date: two weeks from today
return_crawling_date = crawling_base_date + timedelta(days=14)
return_crawling_date_formatted = return_crawling_date.strftime(crawling_date_format)

# Outbound date shifted by 180 days
t_plus_6_months_crawling_date = crawling_base_date + timedelta(days=7) + timedelta(days=180)
t_plus_6_months_crawling_date_formatted = t_plus_6_months_crawling_date.strftime(crawling_date_format)

# Outbound date shifted by 365 days
t_plus_12_months_crawling_date = crawling_base_date + timedelta(days=7) + timedelta(days=365)
t_plus_12_months_crawling_date_formatted = t_plus_12_months_crawling_date.strftime(crawling_date_format)

avg_aircraft_speed = 900 # in km/hr

### Load the flight routes from the Excel file

In [3]:
# Read the route pairs from the workbook
df_routes = pd.read_excel("flight_routes.xlsx")

# Normalise the column headers: lower case, spaces replaced by underscores
snake_case_columns = df_routes.columns.str.lower().str.replace(" ", "_")
df_routes.columns = snake_case_columns

# Display the data frame
df_routes

Unnamed: 0,departure_city,arrival_city,route_competition_status
0,Aalborg,Kaunas,Monopoly always
1,Aalborg,London (GB),Monopoly not always
2,Aalborg,Stockholm,Monopoly always
3,Aarhus,Faro (PT),Monopoly always
4,Aarhus,Gdansk,Monopoly always
...,...,...,...
5115,Zadar,Paris (FR),1 comp No Wizz
5116,Zadar,Vienna,1 comp No Wizz
5117,Zagreb,Dublin (IE),1 comp No Wizz
5118,Zagreb,Duesseldorf,1 comp No Wizz


In [4]:
# Replacement search terms for city names that cannot be used as-is
failed_airports_mapping_dict = {
    # Failed airports
    "Duesseldorf": "Düsseldorf",
    "Basel/Mulhouse": "Basel",
    "Cologne/Bonn": "Cologne",
    "Karlsruhe/Baden-Baden": "Karlsruhe",
    "Klaipeda/Palanga": "Palanga",
    "Leipzig/Halle": "Leipzig",
    "Lourdes/Tarbes": "Tarbes",
    "Maastricht/Aachen (NL)": "Maastricht",
    "Muenster/Osnabrueck (DE) 00": "Münster",
    "Paderborn/Lippstadt": "Paderborn Lippstadt",
    "Preveza/Lefkada": "Preveza",

    # Cities that don't have a City ID in the API response
    "Palma de Mallorca": "Mallorca",
    "Kerkyra": "Corfu",
    "Irakleion": "Heraklion",
    "Eilat (IL)": "Eilat",
    "Tel Aviv-yafo": "Tel Aviv",
}

# Every distinct city that appears on either end of a route, sorted
all_route_cities = pd.concat([df_routes["departure_city"], df_routes["arrival_city"]], axis=0)
airport_list = all_route_cities.drop_duplicates().reset_index(drop=True).sort_values().tolist()

# Swap in the replacement search term wherever the mapping defines one
airport_list = [failed_airports_mapping_dict.get(airport, airport) for airport in airport_list]

### Define a function to extract the airport based on the city name

In [5]:
def extract_airport(city_name, timeout=30):
    """Query the searchAirport endpoint for a city and return the decoded JSON body.

    Args:
        city_name: Search term sent as the ``query`` parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The parsed JSON response, exactly as returned by ``response.json()``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = "https://skyscanner50.p.rapidapi.com/api/v1/searchAirport"

    querystring = {"query": city_name}

    # The API key is read from the environment on every call
    headers = {
        "X-RapidAPI-Key": os.getenv("API_KEY"),
        "X-RapidAPI-Host": "skyscanner50.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    return response.json()

### Call the airport data endpoint to retrieve information about the airports found in the cities

In [None]:
# ExcelWriter's append mode needs an existing workbook, so create empty ones on a first run
for excel_file in (airport_data_excel_file, failed_airports_excel_file):
    if not os.path.exists(excel_file):
        pd.DataFrame().to_excel(excel_file, sheet_name='Sheet1', index=False, engine='openpyxl')

list_airport_data = []
list_failed_airports = []
for idx, airport in enumerate(airport_list):
    # Log a status message
    logging.info(f"Extracting the airport data of airport {airport}, which is airport number {idx + 1} out of {len(airport_list)}")

    # Get the airport data and create a data frame out of the first search result
    try:
        output_extract_airport = pd.json_normalize(extract_airport(airport)["data"][0])
    except (KeyError, IndexError):
        # KeyError: the response carries no "data" object
        # IndexError: "data" is present but holds no result at index 0
        list_failed_airports.append(airport)

        # Record the failed search term below the rows already in the workbook
        last_row = pd.read_excel(failed_airports_excel_file).shape[0]
        with pd.ExcelWriter(failed_airports_excel_file, mode='a', engine='openpyxl', if_sheet_exists="overlay") as writer:
            pd.DataFrame({"failed_airport": [airport]}).to_excel(writer, sheet_name='Sheet1', index=False, header=False, startrow=last_row + 1)
    else:
        output_extract_airport["search_term"] = airport

        # Keep the result in memory for the combined data frame built after the loop
        list_airport_data.append(output_extract_airport)

        # Write the result to Excel. The header is written only while the sheet holds
        # no data rows, so a re-run or a failed first airport cannot misplace it.
        last_row = pd.read_excel(airport_data_excel_file).shape[0]
        write_header = last_row == 0
        with pd.ExcelWriter(airport_data_excel_file, mode='a', engine='openpyxl', if_sheet_exists="overlay") as writer:
            output_extract_airport.to_excel(
                writer,
                sheet_name='Sheet1',
                index=False,
                header=write_header,
                startrow=0 if write_header else last_row + 1,
            )

    # Introduce a sleep of 0.5 second between each call and the next
    time.sleep(0.5)

if list_airport_data:
    # Change list_airport_data to a dataframe
    df_airport_data = pd.concat(list_airport_data).reset_index(drop=True)

    # Drop the unnecessary columns after "search_term"
    df_airport_data = df_airport_data.loc[:, :"search_term"]
else:
    # pd.concat raises on an empty list; fall back to an empty frame when every lookup failed
    df_airport_data = pd.DataFrame()

### Define a function to get the route data from the searchFlights API endpoint

In [6]:
def get_flight_data(origin_airport, destination_airport, departure_date, timeout=60):
    """Query the searchFlights endpoint for one route and date and return the decoded JSON body.

    The search is fixed to one adult, economy cabin, the "best" filter and
    prices in EUR.

    Args:
        origin_airport: Value sent as the ``origin`` parameter.
        destination_airport: Value sent as the ``destination`` parameter.
        departure_date: Value sent as the ``date`` parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The parsed JSON response, exactly as returned by ``response.json()``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = "https://skyscanner50.p.rapidapi.com/api/v1/searchFlights"

    querystring = {
        "origin": origin_airport,
        "destination": destination_airport,
        "date": departure_date,
        "adults": "1",
        "cabinClass": "economy",
        "filter": "best",
        "currency": "EUR",
    }

    # The API key is read from the environment on every call
    headers = {
        "X-RapidAPI-Key": os.getenv("API_KEY"),
        "X-RapidAPI-Host": "skyscanner50.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)

    return response.json()

### Get the route data from the searchFlights API endpoint

In [7]:
# Pull the airport data from the workbook configured in airport_data_excel_file,
# so the path cannot drift from the one the airport extraction step writes to
df_airport_data_excel = pd.read_excel(airport_data_excel_file)

def find_dict_key_from_value(val, dictionary):
    """Return the first key of `dictionary` whose value equals `val`, or None if no value matches."""
    for candidate_key, candidate_value in dictionary.items():
        if candidate_value == val:
            return candidate_key
    return None

def modify_search_term(x):
    """Translate a replacement search term back to the original city name.

    If `x` is one of the values of failed_airports_mapping_dict, the matching
    dictionary key is returned; any other value is returned unchanged.
    """
    # Guard clause: terms that were never replaced pass straight through
    if x not in failed_airports_mapping_dict.values():
        return x

    # Reverse lookup: recover the key that was mapped to this replacement term
    return find_dict_key_from_value(val=x, dictionary=failed_airports_mapping_dict)

# Add a column holding the city name in its original (pre-replacement) spelling
original_search_terms = df_airport_data_excel["search_term"].apply(modify_search_term)
df_airport_data_excel["search_term_original"] = original_search_terms

# Display the data frame
df_airport_data_excel

Unnamed: 0,PlaceId,PlaceName,LocalizedPlaceName,CountryId,CityId,IataCode,CountryName,PlaceNameEn,CityName,CityNameEn,GeoId,GeoContainerId,Location,ResultingPhrase,UntransliteratedResultingPhrase,search_term,search_term_original
0,AAL,Aalborg,,DK,AALB,AAL,Denmark,,Aalborg,,95673990,27536413,"57.093056,9.85","Aalborg (AAL), Aalborg|Ålborg|North Denmark Re...","Aalborg (AAL), Aalborg|Ålborg|North Denmark Re...",Aalborg,Aalborg
1,AAR,Aarhus,,DK,AARH,AAR,Denmark,,Aarhus,,95673987,27547464,"56.310278,10.618056","Aarhus (AAR), Aarhus|Århus Kommune|Aarhus Limo...","Aarhus (AAR), Aarhus|Århus Kommune|Aarhus Limo...",Aarhus,Aarhus
2,ABZ,Aberdeen,,UK,ABER,ABZ,United Kingdom,,Aberdeen,,95674056,27536433,"57.204167,-2.198056","Aberdeen (ABZ), Aberdeen|Aberdeen City|Scotlan...","Aberdeen (ABZ), Aberdeen|Aberdeen City|Scotlan...",Aberdeen (GB),Aberdeen (GB)
3,AGA,Agadir,,MA,AGAD,AGA,Morocco,,Agadir,,95673640,27536479,"30.325619,-9.412708","Agadir (AGA), Agadir|Morocco","Agadir (AGA), Agadir|Morocco",Agadir,Agadir
4,AHO,Alghero Sardinia,,IT,ALGS,AHO,Italy,,Alghero Sardinia,,95674177,27536503,"40.630556,8.288889","Alghero Sardinia (AHO), Alghero Sardinia|Provi...","Alghero Sardinia (AHO), Alghero Sardinia|Provi...",Alghero,Alghero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,WRO,Wroclaw,,PL,WROC,WRO,Poland,,Wroclaw,,95674155,27536248,"51.1,16.883333","Wroclaw (WRO), Wroclaw|Lower Silesian Voivodes...","Wroclaw (WRO), Wroclaw|Lower Silesian Voivodes...",Wroclaw,Wroclaw
225,ZAD,Zadar,,HR,ZADA,ZAD,Croatia,,Zadar,,95674072,27537473,"44.097778,15.356667","Zadar (ZAD), Zadar|Zadar County|Croatia","Zadar (ZAD), Zadar|Zadar County|Croatia",Zadar,Zadar
226,ZAG,Zagreb,,HR,ZAGR,ZAG,Croatia,,Zagreb,,95673639,27537474,"45.7408627,16.067501","Zagreb (ZAG), Zagreb|Zagreb Općina|Zagreb Coun...","Zagreb (ZAG), Zagreb|Zagreb Općina|Zagreb Coun...",Zagreb,Zagreb
227,INI,Nis,,RS,NISI,ZTH,Serbia,,Nis,,128668650,27542860,"43.333889,21.851667","Nis (INI), Nis|Niavski|Southern and Eastern S...","Nis (INI), Nis|Niavski|Southern and Eastern S...",Zakinthos Island,Zakinthos Island


In [298]:
def _lookup_iata_code(city):
    """Return the IataCode of the first airport row whose original search term is `city`, or None if there is none."""
    matches = df_airport_data_excel.loc[df_airport_data_excel["search_term_original"] == city, "IataCode"]
    if matches.empty or pd.isna(matches.iloc[0]):
        return None
    return matches.iloc[0]


def _append_rows_to_excel(rows, excel_file, header_if_empty):
    """Write `rows` below the existing content of Sheet1 in `excel_file`.

    The header row is written only when `header_if_empty` is true and the
    workbook holds no data rows yet.
    """
    if not os.path.exists(excel_file):
        # ExcelWriter's append mode needs an existing workbook
        pd.DataFrame().to_excel(excel_file, sheet_name="Sheet1", index=False, engine="openpyxl")

    last_row = pd.read_excel(excel_file).shape[0]
    write_header = header_if_empty and last_row == 0
    with pd.ExcelWriter(excel_file, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        rows.to_excel(
            writer,
            sheet_name="Sheet1",
            index=False,
            header=write_header,
            startrow=last_row if write_header else last_row + 1,
        )


def _flatten_flight_result(res, departure_city, arrival_city, departure_city_id, arrival_city_id, crawling_date):
    """Pick the fields of interest out of one searchFlights result (first leg, first carrier)."""
    first_leg = res["legs"][0]
    return {
        "price": res["price"]["amount"],
        "origin_airport_name": first_leg["origin"]["name"],
        "origin_airport_display_code": first_leg["origin"]["display_code"],
        # NOTE(review): the two "departure_airport_*" columns hold the leg's *destination*.
        # The column names are kept as they are so existing workbooks stay compatible.
        "departure_airport_name": first_leg["destination"]["name"],
        "departure_airport_display_code": first_leg["destination"]["display_code"],
        "flight_departure_time": first_leg["departure"],
        "flight_arrival_time": first_leg["arrival"],
        "competitor": first_leg["carriers"][0]["name"],
        "flight_duration": res["totalDuration"],
        "departure_city_search_term": departure_city,
        "arrival_city_search_term": arrival_city,
        "departure_city_id": departure_city_id,
        "arrival_city_id": arrival_city_id,
        "crawling_date": crawling_date
    }


def extract_flight_data_by_date(crawling_date, crawling_range):
    """Call the searchFlights API for the first `crawling_range` routes of df_routes.

    Every route's results are appended to flight_data_excel_file as soon as
    they arrive. Routes that cannot be processed (no IATA code for one of the
    cities, or a response without the expected fields) are appended to
    failed_routes_excel_file and skipped, so one bad route does not abort the
    crawl.

    Args:
        crawling_date: Departure date passed to get_flight_data as ``departure_date``.
        crawling_range: Number of rows of df_routes, counted from the top, to process.

    Returns:
        A data frame with one row per flight result across all processed
        routes; an empty data frame when no route produced a result.
    """
    list_flight_data = []
    list_failed_routes = []
    for idx in range(0, crawling_range):
        # Get the departure and arrival cities from df_routes
        departure_city = df_routes.loc[idx, 'departure_city']
        arrival_city = df_routes.loc[idx, 'arrival_city']

        # Log a status message
        logging.info(f"Extracting the flight route data of departure airport {departure_city} and arrival airport {arrival_city}. This is route number {idx + 1} out of {crawling_range}")

        # Get the IATA codes that will be inserted as parameters in the get_flight_data function
        departure_city_id = _lookup_iata_code(departure_city)
        arrival_city_id = _lookup_iata_code(arrival_city)

        failure_reason = None
        list_output_dict = []
        if departure_city_id is None or arrival_city_id is None:
            failure_reason = "no IATA code found for the departure or arrival city"
        else:
            try:
                output_flight_data = get_flight_data(origin_airport=departure_city_id, destination_airport=arrival_city_id, departure_date=crawling_date)["data"]

                # Loop over all the results in "data"
                for idx_2, res in enumerate(output_flight_data):
                    logging.info(f"Extracting flight data from result number {idx_2 + 1} out of {len(output_flight_data)}. This is for departure airport {departure_city} and arrival airport {arrival_city}, which is route number {idx + 1} out of {crawling_range}")
                    list_output_dict.append(
                        _flatten_flight_result(res, departure_city, arrival_city, departure_city_id, arrival_city_id, crawling_date)
                    )
            except (KeyError, IndexError, TypeError) as exc:
                # KeyError: the response lacks an expected field, e.g. no "data" because there are
                # no flights on that date or the IDs are incorrect.
                # IndexError: a result has no legs or no carriers.
                # TypeError: "data" or a nested object is null.
                failure_reason = repr(exc)

        if failure_reason is not None:
            logging.info(f"Failed to extract the data for departure airport {departure_city} and arrival airport {arrival_city}: {failure_reason}")
            failed_route = {
                "departure_city": departure_city,
                "arrival_city": arrival_city,
                "departure_city_id": departure_city_id,
                "arrival_city_id": arrival_city_id,
                "crawling_date": crawling_date
            }
            list_failed_routes.append(failed_route)

            # Write the failed route to Excel
            _append_rows_to_excel(pd.DataFrame([failed_route]), failed_routes_excel_file, header_if_empty=False)
            continue

        # Change list_output_dict to a pandas dataframe
        df_output_dict = pd.DataFrame(list_output_dict)

        # Keep the route's results for the combined data frame returned at the end
        list_flight_data.append(df_output_dict)

        # Write the result to Excel (nothing to write when the API returned no results)
        if not df_output_dict.empty:
            _append_rows_to_excel(df_output_dict, flight_data_excel_file, header_if_empty=True)

        # Log a new line to mark the start of a new route
        logging.info("\n")

    # pd.concat raises on an empty list, so return an empty frame when nothing was collected
    if not list_flight_data:
        return pd.DataFrame()

    # Change list_flight_data to a data frame
    df_flight_data = pd.concat(list_flight_data)

    return df_flight_data

In [294]:
# Crawl every route in df_routes for the initial crawling date
extract_flight_data_by_date(
    crawling_date=initial_crawling_date_formatted,
    crawling_range=len(df_routes),
)

KeyboardInterrupt: 