In [1]:
import requests
import json
import pandas as pd
import os
import time
from datetime import datetime, timedelta
import logging
# Log to a file. The timestamp includes the time of day so that the retry waits and the
# per-route progress of a crawling run can be followed in the log.
logging.basicConfig(
    level="INFO",
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="skyscanner_logs.log"
)
import warnings
warnings.filterwarnings(action="ignore")
from dotenv import load_dotenv
# Load environment variables
load_dotenv()



True

### Define some global inputs

In [2]:
# Excel files used to persist the crawling results
airport_data_excel_file = "airport_data.xlsx"
failed_airports_excel_file = "failed_airports.xlsx"
flight_data_excel_file = "flight_data_current.xlsx"
failed_routes_excel_file = "failed_routes_current.xlsx"

# Take "today" once so that every crawling date below is derived from the same calendar day,
# even if this cell happens to run across midnight
crawling_reference_date = datetime.now().date()

# Departure date: one week from today
initial_crawling_date = crawling_reference_date + timedelta(days=7)
initial_crawling_date_formatted = initial_crawling_date.strftime("%Y-%m-%d")

# Return date: two weeks from today
return_crawling_date = crawling_reference_date + timedelta(days=14)
return_crawling_date_formatted = return_crawling_date.strftime("%Y-%m-%d")

# Departure date + 180 days
t_plus_6_months_crawling_date = crawling_reference_date + timedelta(days=7) + timedelta(days=180)
t_plus_6_months_crawling_date_formatted = t_plus_6_months_crawling_date.strftime("%Y-%m-%d")

# Departure date + 365 days
t_plus_12_months_crawling_date = crawling_reference_date + timedelta(days=7) + timedelta(days=365)
t_plus_12_months_crawling_date_formatted = t_plus_12_months_crawling_date.strftime("%Y-%m-%d")

api_request_wait = 20 # in seconds, waited before each retry of a failed API request

num_api_attemps = 3 # number of retries after a failed API request

### Load the flight routes from the Excel file

In [35]:
# Read the route list from the Excel file
df_routes = pd.read_excel("flight_routes.xlsx")

# Normalise the column names: lower case, with spaces replaced by underscores
snake_case_columns = df_routes.columns.str.lower().str.replace(" ", "_")
df_routes.columns = snake_case_columns

# Display the data frame
df_routes

Unnamed: 0,departure_city,arrival_city,route_competition_status
0,Aalborg,Kaunas,Monopoly always
1,Aalborg,London (GB),Monopoly not always
2,Aalborg,Stockholm,Monopoly always
3,Aarhus,Faro (PT),Monopoly always
4,Aarhus,Gdansk,Monopoly always
...,...,...,...
5115,Zadar,Paris (FR),1 comp No Wizz
5116,Zadar,Vienna,1 comp No Wizz
5117,Zagreb,Dublin (IE),1 comp No Wizz
5118,Zagreb,Duesseldorf,1 comp No Wizz


In [4]:
# Create a sorted list of all unique cities that appear as a departure or an arrival in df_routes
airport_list = (
    pd.concat([df_routes["departure_city"], df_routes["arrival_city"]], axis=0)
    .drop_duplicates()
    .sort_values()
    .tolist()
)

# Clean the list so only valid airports are shown.
# Keys are the city names as they appear in df_routes, values are the search terms sent to the API.
failed_airports_mapping_dict = {
    # Failed airports
    "Duesseldorf": "Düsseldorf",
    "Basel/Mulhouse": "Basel",
    "Cologne/Bonn": "Cologne",
    "Karlsruhe/Baden-Baden": "Karlsruhe",
    "Klaipeda/Palanga": "Palanga",
    "Leipzig/Halle": "Leipzig",
    "Lourdes/Tarbes": "Tarbes",
    "Maastricht/Aachen (NL)": "Maastricht",
    "Muenster/Osnabrueck (DE) 00": "Münster",
    "Paderborn/Lippstadt": "Paderborn Lippstadt",
    "Preveza/Lefkada": "Preveza",

    # Cities that don't have a City ID in the API response
    "Palma de Mallorca": "Mallorca",
    "Kerkyra": "Corfu",
    "Irakleion": "Heraklion",
    "Eilat (IL)": "Eilat",
    "Tel Aviv-yafo": "Tel Aviv",
}

# Replace every mapped city by its search term and keep all other cities unchanged.
# dict.get with a default does this in one lookup instead of rebuilding the key list per city.
airport_list = [failed_airports_mapping_dict.get(airport, airport) for airport in airport_list]

### Define a function to extract the airport based on the city name

In [3]:
def extract_airport(city_name):
    """Query the searchAirport endpoint for ``city_name`` and return the decoded JSON body.

    The RapidAPI key is read from the ``API_KEY`` environment variable.
    """
    response = requests.get(
        "https://skyscanner50.p.rapidapi.com/api/v1/searchAirport",
        headers={
            "X-RapidAPI-Key": os.getenv("API_KEY"),
            "X-RapidAPI-Host": "skyscanner50.p.rapidapi.com",
        },
        params={"query": city_name},
    )
    return response.json()

### Call the airport data endpoint to retrieve information about the airports found in the cities

In [None]:
# NOTE: both Excel files are opened in append mode, so they must already exist before this cell runs
list_airport_data = []
list_failed_airports = []
for idx, i in enumerate(airport_list):
    # Print a status message
    logging.info(f"Extracting the airport data of airport {i}, which is airport number {idx + 1} out of {len(airport_list)}")

    # Get the airport data and create a data frame out of it.
    # Only the API lookup is guarded, so errors raised while writing to Excel are not mistaken for a failed airport:
    # - a KeyError occurs because the airport cannot be found in the data object of the API
    # - an IndexError occurs because the airport cannot be found in the 0th index of the API response
    try:
        output_extract_airport = pd.json_normalize(extract_airport(i)["data"][0])
    except (KeyError, IndexError):
        list_failed_airports.append(i)

        # Write the failed airport below the rows that are already stored in the Excel file
        last_row = pd.read_excel(failed_airports_excel_file).shape[0]
        with pd.ExcelWriter(failed_airports_excel_file, mode='a', engine='openpyxl', if_sheet_exists="overlay") as writer:
            pd.DataFrame({"failed_airport": [i]}).to_excel(writer, sheet_name='Sheet1', index=False, header=False, startrow=last_row + 1)
    else:
        output_extract_airport["search_term"] = i

        # Append the result to list_airport_data
        list_airport_data.append(output_extract_airport)

        # Write the result to Excel. The header is written only while the sheet holds no data rows,
        # regardless of the loop index: this keeps the header when the first airport fails and
        # avoids overwriting the last stored row when the cell is re-run on a filled file.
        last_row = pd.read_excel(airport_data_excel_file).shape[0]
        sheet_is_empty = last_row == 0
        with pd.ExcelWriter(airport_data_excel_file, mode='a', engine='openpyxl', if_sheet_exists="overlay") as writer:
            output_extract_airport.to_excel(
                writer,
                sheet_name='Sheet1',
                index=False,
                header=sheet_is_empty,
                startrow=0 if sheet_is_empty else last_row + 1,
            )

    # Introduce a sleep of 0.5 second between each call and the next
    time.sleep(0.5)

# Change list_airport_data to a dataframe (pd.concat raises on an empty list, so guard that case)
if list_airport_data:
    df_airport_data = pd.concat(list_airport_data).reset_index(drop=True)
else:
    df_airport_data = pd.DataFrame(columns=["search_term"])

# Drop the unnecessary columns after "search_term"
df_airport_data = df_airport_data.loc[:, :"search_term"]

### Define a function to get the route data from the searchFlights API endpoint

In [4]:
def get_flight_data(origin_airport, destination_airport, departure_date):
    """Query the searchFlights endpoint for one route and date and return the decoded JSON body.

    The search is fixed to one adult, economy class, the "best" filter and prices in EUR.
    The RapidAPI key is read from the ``API_KEY`` environment variable.
    """
    search_parameters = {
        "origin": origin_airport,
        "destination": destination_airport,
        "date": departure_date,
        "adults": "1",
        "cabinClass": "economy",
        "filter": "best",
        "currency": "EUR",
    }
    request_headers = {
        "X-RapidAPI-Key": os.getenv("API_KEY"),
        "X-RapidAPI-Host": "skyscanner50.p.rapidapi.com",
    }

    response = requests.get(
        "https://skyscanner50.p.rapidapi.com/api/v1/searchFlights",
        headers=request_headers,
        params=search_parameters,
    )
    return response.json()

### Get the route data from the searchFlights API endpoint

In [38]:
# Pull the airport data from the airport data Excel file (airport_data.xlsx, see airport_data_excel_file)
df_airport_data_excel = pd.read_excel(airport_data_excel_file)

def find_dict_key_from_value(val, dictionary):
    """Return the first key of ``dictionary`` whose value equals ``val``, or None when no value matches."""
    for key, value in dictionary.items():
        if value == val:
            return key
    return None

def modify_search_term(x):
    """Translate a search term back to the city name it replaced in failed_airports_mapping_dict.

    When ``x`` is one of the values of failed_airports_mapping_dict, the matching key is returned.
    Any other ``x`` is returned unchanged.
    """
    if x not in failed_airports_mapping_dict.values():
        # Not a replaced search term: keep it as it is
        return x
    # A replaced search term: return the key of the dictionary that maps to it
    return find_dict_key_from_value(val=x, dictionary=failed_airports_mapping_dict)

# Store, next to each search term, the name produced by modify_search_term
df_airport_data_excel["search_term_original"] = df_airport_data_excel["search_term"].map(modify_search_term)

# Display the data frame
df_airport_data_excel

Unnamed: 0,PlaceId,PlaceName,LocalizedPlaceName,CountryId,CityId,IataCode,CountryName,PlaceNameEn,CityName,CityNameEn,GeoId,GeoContainerId,Location,ResultingPhrase,UntransliteratedResultingPhrase,search_term,search_term_original
0,AAL,Aalborg,,DK,AALB,AAL,Denmark,,Aalborg,,95673990,27536413,"57.093056,9.85","Aalborg (AAL), Aalborg|Ålborg|North Denmark Re...","Aalborg (AAL), Aalborg|Ålborg|North Denmark Re...",Aalborg,Aalborg
1,AAR,Aarhus,,DK,AARH,AAR,Denmark,,Aarhus,,95673987,27547464,"56.310278,10.618056","Aarhus (AAR), Aarhus|Århus Kommune|Aarhus Limo...","Aarhus (AAR), Aarhus|Århus Kommune|Aarhus Limo...",Aarhus,Aarhus
2,ABZ,Aberdeen,,UK,ABER,ABZ,United Kingdom,,Aberdeen,,95674056,27536433,"57.204167,-2.198056","Aberdeen (ABZ), Aberdeen|Aberdeen City|Scotlan...","Aberdeen (ABZ), Aberdeen|Aberdeen City|Scotlan...",Aberdeen (GB),Aberdeen (GB)
3,AGA,Agadir,,MA,AGAD,AGA,Morocco,,Agadir,,95673640,27536479,"30.325619,-9.412708","Agadir (AGA), Agadir|Morocco","Agadir (AGA), Agadir|Morocco",Agadir,Agadir
4,AHO,Alghero Sardinia,,IT,ALGS,AHO,Italy,,Alghero Sardinia,,95674177,27536503,"40.630556,8.288889","Alghero Sardinia (AHO), Alghero Sardinia|Provi...","Alghero Sardinia (AHO), Alghero Sardinia|Provi...",Alghero,Alghero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,WRO,Wroclaw,,PL,WROC,WRO,Poland,,Wroclaw,,95674155,27536248,"51.1,16.883333","Wroclaw (WRO), Wroclaw|Lower Silesian Voivodes...","Wroclaw (WRO), Wroclaw|Lower Silesian Voivodes...",Wroclaw,Wroclaw
225,ZAD,Zadar,,HR,ZADA,ZAD,Croatia,,Zadar,,95674072,27537473,"44.097778,15.356667","Zadar (ZAD), Zadar|Zadar County|Croatia","Zadar (ZAD), Zadar|Zadar County|Croatia",Zadar,Zadar
226,ZAG,Zagreb,,HR,ZAGR,ZAG,Croatia,,Zagreb,,95673639,27537474,"45.7408627,16.067501","Zagreb (ZAG), Zagreb|Zagreb Općina|Zagreb Coun...","Zagreb (ZAG), Zagreb|Zagreb Općina|Zagreb Coun...",Zagreb,Zagreb
227,INI,Nis,,RS,NISI,ZTH,Serbia,,Nis,,128668650,27542860,"43.333889,21.851667","Nis (INI), Nis|Niavski|Southern and Eastern S...","Nis (INI), Nis|Niavski|Southern and Eastern S...",Zakinthos Island,Zakinthos Island


In [5]:
# Helpers and entry point that call the API for each route pair
def _lookup_iata_code(city_name):
    """Return the IataCode of the first row of df_airport_data_excel whose search_term_original equals ``city_name``, or None when no row matches."""
    matches = df_airport_data_excel.loc[df_airport_data_excel["search_term_original"] == city_name, "IataCode"]
    if matches.empty:
        return None
    return matches.iloc[0]


def _dump_json_checkpoint(records, json_file_name):
    """Overwrite ``<json_file_name>.json`` with ``records``."""
    with open(f"{json_file_name}.json", mode="w", encoding="utf-8") as f:
        json.dump(obj=records, fp=f, indent=4, ensure_ascii=False)


def _request_flight_data_with_retries(departure_city, arrival_city, departure_city_id, arrival_city_id, crawling_date):
    """Request the flights of one route, retrying when the response has no "data" key.

    Returns a tuple ``(got_response, data)``. ``got_response`` is False only when the first request and
    all ``num_api_attemps`` retries raised a KeyError. ``data`` is the content of "data" in the last
    response that had one.
    """
    try:
        return True, get_flight_data(origin_airport=departure_city_id, destination_airport=arrival_city_id, departure_date=crawling_date)["data"]
    except KeyError:
        logging.info(f"A KeyError has occurred while attempting to extract the data for departure airport {departure_city} and arrival airport {arrival_city}")

    got_response = False
    output_flight_data = []
    for i in range(num_api_attemps):
        # Print a status message informing the user of the number of retry attempts
        logging.info(f"Retry #{i + 1}. Waiting for {api_request_wait} seconds")

        # Wait for a certain number of seconds
        time.sleep(api_request_wait)

        # Repeat the API request
        try:
            logging.info(f"Sending an API request for departure airport {departure_city} and arrival airport {arrival_city}")
            output_flight_data = get_flight_data(origin_airport=departure_city_id, destination_airport=arrival_city_id, departure_date=crawling_date)["data"]
        except KeyError:
            # Only announce another iteration when there is one left
            if i < num_api_attemps - 1:
                logging.info(f"Total number of attempts is {i + 1}, which is still less than {num_api_attemps}. Continuing to the next iteration")
            continue

        got_response = True
        # If the API request returns results, stop retrying. An empty answer is retried.
        if output_flight_data:
            break

    return got_response, output_flight_data


def _build_flight_record(res, departure_city, arrival_city, departure_city_id, arrival_city_id, crawling_date):
    """Flatten one entry of the API "data" list into the dictionary stored in the flight data JSON file."""
    first_leg = res["legs"][0]
    return {
        "price_eur": res["price"]["amount"],
        "origin_airport_name": first_leg["origin"]["name"],
        "origin_airport_display_code": first_leg["origin"]["display_code"],
        "arrival_airport_name": first_leg["destination"]["name"],
        "arrival_airport_display_code": first_leg["destination"]["display_code"],
        "flight_departure_time": first_leg["departure"],
        "flight_arrival_time": first_leg["arrival"],
        "competitor": first_leg["carriers"][0]["name"],
        "flight_duration": res["totalDuration"],
        "origin_city_search_term": departure_city,
        "arrival_city_search_term": arrival_city,
        "origin_city_id": departure_city_id,
        "arrival_city_id": arrival_city_id,
        "crawling_date": crawling_date
    }


def extract_flight_data_by_date(crawling_date, crawling_range, flight_data_json_file_name, failed_routes_json_file_name, no_data_routes_json_file_name, routes_dataframe):
    """Crawl the flights of every route in ``crawling_range`` for one departure date.

    Each route ends up in exactly one of three JSON files, which are rewritten after every route:
    - ``<flight_data_json_file_name>.json``: one list of flight records per route that returned results
    - ``<no_data_routes_json_file_name>.json``: routes for which the API answered without results
    - ``<failed_routes_json_file_name>.json``: routes without a known IATA code, or for which every
      request raised a KeyError

    Parameters:
        crawling_date: departure date passed to get_flight_data
        crawling_range: indexable sequence of row labels of ``routes_dataframe`` to crawl (e.g. a range)
        flight_data_json_file_name: output file name (without ".json") for the flight records
        failed_routes_json_file_name: output file name (without ".json") for the failed routes
        no_data_routes_json_file_name: output file name (without ".json") for the routes without results
        routes_dataframe: data frame with the columns "departure_city" and "arrival_city"

    Reads the notebook-level names df_airport_data_excel, num_api_attemps and api_request_wait.
    Returns None.
    """
    list_flight_data = []
    list_no_data_routes = []
    list_failed_routes = []
    # An empty crawling range has no last element, so guard the lookup
    length_of_crawling_list = crawling_range[-1] + 1 if len(crawling_range) > 0 else 0
    for idx in crawling_range:
        # Get the departure and arrival cities from the routes data frame
        departure_city = routes_dataframe.loc[idx, 'departure_city'] # This could be df_routes or df_failed_routes
        arrival_city = routes_dataframe.loc[idx, 'arrival_city'] # This could be df_routes or df_failed_routes

        # Print a status message
        logging.info(f"Extracting the flight route data of departure airport {departure_city} and arrival airport {arrival_city}. This is route number {idx + 1} out of {length_of_crawling_list}")

        # Get the IATA codes that will be inserted as parameters in the get_flight_data function
        departure_city_id = _lookup_iata_code(departure_city)
        arrival_city_id = _lookup_iata_code(arrival_city)

        # Description of the route, stored when the route fails or has no data
        route_record = {
            "departure_city": departure_city,
            "arrival_city": arrival_city,
            "origin_city_id": departure_city_id,
            "arrival_city_id": arrival_city_id,
            "crawling_date": crawling_date
        }

        # A city without airport data cannot be requested. Record the route instead of aborting the crawl.
        if departure_city_id is None or arrival_city_id is None:
            logging.info(f"No IATA code was found for departure airport {departure_city} or arrival airport {arrival_city}. Appending the failed_routes list")
            list_failed_routes.append(route_record)
            _dump_json_checkpoint(list_failed_routes, failed_routes_json_file_name)
            logging.info("\n")
            continue

        # A KeyError can happen when the API fails to return a response with the given parameters.
        # This could be because there are no flights on that date or the city IDs are incorrect or simply because the requests are too fast.
        # The request is retried and, if there is still no response, the route is recorded as failed.
        got_response, output_flight_data = _request_flight_data_with_retries(
            departure_city=departure_city,
            arrival_city=arrival_city,
            departure_city_id=departure_city_id,
            arrival_city_id=arrival_city_id,
            crawling_date=crawling_date,
        )

        if not got_response:
            logging.info(f"A PERMANENT KeyError has occurred while attempting to extract the data for departure airport {departure_city} and arrival airport {arrival_city}. Appending the failed_routes list")
            list_failed_routes.append(route_record)

            # Write the result to a JSON file
            _dump_json_checkpoint(list_failed_routes, failed_routes_json_file_name)
        elif output_flight_data:
            # Loop over all the results in "data"
            list_output_dict = []
            for idx_2, res in enumerate(output_flight_data):
                logging.info(f"Extracting flight data from result number {idx_2 + 1} out of {len(output_flight_data)}. This is for departure airport {departure_city} and arrival airport {arrival_city}, which is route number {idx + 1} out of {length_of_crawling_list}")
                list_output_dict.append(
                    _build_flight_record(res, departure_city, arrival_city, departure_city_id, arrival_city_id, crawling_date)
                )

            # Append the result to list_flight_data
            list_flight_data.append(list_output_dict)

            # Write the result to a JSON file
            _dump_json_checkpoint(list_flight_data, flight_data_json_file_name)
        else:
            logging.info(f"There is no data for departure airport {departure_city} and arrival airport {arrival_city}. Appending to the list of routes with no data...")
            list_no_data_routes.append(route_record)

            # Write the result to a JSON file
            _dump_json_checkpoint(list_no_data_routes, no_data_routes_json_file_name)

        # Print a new line to mark the start of a new route
        logging.info("\n")

        # Wait one second between each request and the next
        time.sleep(1)

In [101]:
# Read the crawled routes from flight_data.xlsx and keep one row per crawled (origin, arrival) pair
crawled_routes = pd.read_excel("flight_data.xlsx")
crawled_route_pairs = (
    crawled_routes[["origin_city_search_term", "arrival_city_search_term"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Left-join df_routes with the crawled pairs. The routes without a match are the ones that we should still crawl.
df_failed_routes_to_repeat = (
    df_routes
    .merge(
        crawled_route_pairs,
        how="left",
        left_on=["departure_city", "arrival_city"],
        right_on=["origin_city_search_term", "arrival_city_search_term"],
    )
    .loc[lambda merged: merged["origin_city_search_term"].isnull()]
    .reset_index(drop=True)
)

In [111]:
# Crawl every route of df_failed_routes_to_repeat for the initial crawling date.
# The results are written to the three *_current.json files.
extract_flight_data_by_date(
    routes_dataframe=df_failed_routes_to_repeat,
    crawling_range=range(len(df_failed_routes_to_repeat)),
    crawling_date=initial_crawling_date_formatted,
    flight_data_json_file_name="flight_data_current",
    failed_routes_json_file_name="failed_routes_current",
    no_data_routes_json_file_name="no_data_routes_current",
)

In [93]:
# Load flight_data_current.json, which holds one list of flight records per element
with open("flight_data_current.json", mode="r", encoding="utf-8") as f:
    data = json.load(f)

# Turn every element into its own data frame
flight_data_list = [pd.DataFrame(route_results) for route_results in data]

# Stack the data frames and dump the result into an Excel sheet
df_flight_data = pd.concat(flight_data_list)
df_flight_data.to_excel("flight_data_iter.xlsx", index=False)

In [96]:
# Load no_data_routes_current.json into a data frame (named df_failed_routes)
with open("no_data_routes_current.json", mode="r", encoding="utf-8") as f:
    data = json.load(f)

df_failed_routes = pd.DataFrame(data)
# Export to Excel is currently disabled:
# df_failed_routes.to_excel("failed_routes_iter.xlsx", index=False)

In [92]:
# Manual crawling of a single route taken from df_failed_routes
i = 15
departure_city = df_failed_routes.loc[i, "departure_city"]
arrival_city = df_failed_routes.loc[i, "arrival_city"]
# The route records written by extract_flight_data_by_date store the codes under
# "origin_city_id" and "arrival_city_id" (there is no "departure_city_id" key)
departure_city_id = df_failed_routes.loc[i, "origin_city_id"]
arrival_city_id = df_failed_routes.loc[i, "arrival_city_id"]
crawling_date = "2023-06-05"

x = get_flight_data(origin_airport=departure_city_id, destination_airport=arrival_city_id, departure_date=crawling_date)["data"]
print(x)

# Flatten every result of the API response into one flight record
list_output_dict = []
for idx_2, res in enumerate(x):
    output_dict = {
        "price_eur": res["price"]["amount"],
        "origin_airport_name": res["legs"][0]["origin"]["name"],
        "origin_airport_display_code": res["legs"][0]["origin"]["display_code"],
        "arrival_airport_name": res["legs"][0]["destination"]["name"],
        "arrival_airport_display_code": res["legs"][0]["destination"]["display_code"],
        "flight_departure_time": res["legs"][0]["departure"],
        "flight_arrival_time": res["legs"][0]["arrival"],
        "competitor": res["legs"][0]["carriers"][0]["name"],
        "flight_duration": res["totalDuration"],
        "origin_city_search_term": departure_city,
        "arrival_city_search_term": arrival_city,
        "origin_city_id": departure_city_id,
        "arrival_city_id": arrival_city_id,
        "crawling_date": crawling_date
    }
    list_output_dict.append(output_dict)

if list_output_dict != []:
    # list_flight_data is local to extract_flight_data_by_date and does not exist at notebook level.
    # Reload the results stored so far and extend them, instead of overwriting the file.
    if os.path.exists("flight_data_current.json"):
        with open("flight_data_current.json", mode="r", encoding="utf-8") as f:
            list_flight_data = json.load(f)
    else:
        list_flight_data = []
    list_flight_data.append(list_output_dict)

    # Write the result to a JSON file
    with open("flight_data_current.json", mode="w", encoding="utf-8") as f:
        json.dump(obj=list_flight_data, fp=f, indent=4, ensure_ascii=False)

[{'id': '15270-2306052015--31915-0-9409-2306052315', 'price': {'amount': 96.11, 'update_status': 'pending', 'last_updated': '2023-05-29T11:27:17', 'quote_age': 574, 'score': 10, 'transfer_type': 'MANAGED'}, 'legs': [{'id': '15270-2306052015--31915-0-9409-2306052315', 'origin': {'id': 15270, 'entity_id': 128667204, 'alt_id': 'PED', 'parent_id': 6152, 'parent_entity_id': 27545927, 'name': 'Pardubice', 'type': 'Airport', 'display_code': 'PED'}, 'destination': {'id': 9409, 'entity_id': 95565083, 'alt_id': 'ALC', 'parent_id': 470, 'parent_entity_id': 27536538, 'name': 'Alicante', 'type': 'Airport', 'display_code': 'ALC'}, 'departure': '2023-06-05T20:15:00', 'arrival': '2023-06-05T23:15:00', 'duration': 180, 'carriers': [{'id': -31915, 'name': 'Ryanair', 'alt_id': 'FR', 'display_code': 'FR', 'display_code_type': 'IATA'}], 'stop_count': 0, 'stops': []}], 'is_eco_contender': True, 'eco_contender_delta': 25.345474, 'score': 10, 'totalDuration': 180}]
