In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
# Suppress just SettingWithCopyWarning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Part 1: Getting current routes

Now we get the current routes for each airport from its Wikipedia article, using a series of scripts developed in the first notebook, and save them to a `current_routes.csv` file.


Defining key functions

In [None]:
def extract_between(text, start, end):
    """Return the text found between the first ``start`` and the next ``end``.

    The search for ``end`` begins immediately after the first occurrence of
    ``start``. An empty string is returned when either delimiter is missing.
    """
    try:
        begin = text.index(start) + len(start)
        stop = text.index(end, begin)
    except ValueError:
        # str.index raises where str.find would return -1: delimiter absent
        return ""
    return text[begin:stop]
def get_iata_code(iata, ref_data=None):
    """Look up the IATA code of an airport from its Wikipedia article name.

    Parameters
    ----------
    iata : str
        Despite the parameter name, this is the value compared against the
        ``wiki_name`` column of the reference table.
    ref_data : pandas.DataFrame, optional
        Table with ``wiki_name`` and ``iata`` columns. When omitted, the table
        is read from ``./data/top_airports_basic_data.csv`` on first use and
        cached on the function, instead of being read when the function is
        defined (which made the defining cell fail if the file was missing).

    Returns
    -------
    The ``iata`` value of the first row whose ``wiki_name`` equals ``iata``.

    Raises
    ------
    IndexError
        If no row matches.
    """
    if ref_data is None:
        ref_data = getattr(get_iata_code, "_ref_data", None)
        if ref_data is None:
            ref_data = pd.read_csv("./data/top_airports_basic_data.csv")
            get_iata_code._ref_data = ref_data  # load once per kernel session
    matches = ref_data.loc[ref_data["wiki_name"] == iata, "iata"]
    if matches.empty:
        # Same exception type the original .iloc[0] raised, with a usable message
        raise IndexError(f"no airport with wiki_name {iata!r} in the reference data")
    return matches.iloc[0]
def extract_passenger_destinations(text, start, end):
    """Return the section of ``text`` that follows the ``start`` heading.

    The slice begins 3 characters after the end of the first ``start`` match
    (skipping the characters that close the heading, e.g. `` ==``) and stops
    just before the next occurrence of ``end``. An empty string is returned
    when either marker cannot be found.
    """
    heading_tail = 3  # characters skipped after the heading text to account for ==
    heading_at = text.find(start)
    if heading_at < 0:
        return ""
    section_from = heading_at + len(start) + heading_tail
    section_to = text.find(end, section_from)
    return "" if section_to < 0 else text[section_from:section_to]

def _wikilink_target(fragment):
    """Return the underscore-form target of the first ``[[...]]`` link in ``fragment``, or None if there is none."""
    link = extract_between(fragment, "[[", "]]")
    if link == "":
        return None
    # "[[Target|label]]" -> "Target"; spaces become underscores as in article names
    return link.split("|")[0].strip().replace(" ", "_")


def _iter_route_rows(full_text, iata_source, wiki_name):
    """Yield (iata_source, wiki_name, dest_wiki_name, airline, isSeasonal) tuples parsed from raw article wikitext."""
    section = extract_passenger_destinations(full_text, "Airlines and destinations", "==")
    # Fewer than 5 characters means the "==" terminator cut the section short;
    # retry using the "Cargo" heading as the end marker instead.
    if len(section) < 5:
        section = extract_passenger_destinations(full_text, "Airlines and destinations", "Cargo")

    # Airline entries are separated by "<!-- -->"; the text before the first
    # separator is a disclaimer and is skipped.
    for airline_entry in section.split("<!-- -->")[1:]:
        parts = airline_entry.split("'''Seasonal:'''")
        regular_text = parts[0]
        # Only the text between the first and second marker is treated as seasonal
        seasonal_text = parts[1] if len(parts) > 1 else None

        regular_items = regular_text.split(",")
        # The first comma-separated item holds both the airline and, after
        # " | [[", the first destination link.
        airline_and_first = regular_items[0].split(" | [[")
        airline = extract_between(str(airline_and_first[0]), "[[", "]]")
        if len(airline_and_first) == 2:
            # Cut at "]]" first so an unpiped link ("[[Target]]") does not keep
            # its closing brackets in the destination name.
            first_dest = airline_and_first[1].split("]]")[0].split("|")[0].strip()
            yield (iata_source, wiki_name, first_dest.replace(" ", "_"), airline, 0)

        for item in regular_items[1:]:
            target = _wikilink_target(item)
            if target is not None:
                yield (iata_source, wiki_name, target, airline, 0)

        if seasonal_text:
            for item in seasonal_text.split(","):
                target = _wikilink_target(item)
                if target is not None:
                    yield (iata_source, wiki_name, target, airline, 1)


def get_destinations(iata_source, wiki_name, file_append_path, timeout=30):
    """Fetch an airport's Wikipedia wikitext and append its routes to a CSV file.

    Parameters
    ----------
    iata_source : str
        IATA code of the origin airport; written as the first column.
    wiki_name : str
        Article title inserted into the request URL as-is and written as the
        second column.
    file_append_path : str
        CSV file opened in append mode. One row is written per destination in
        the column order iata_source, starting_wiki_name, dest_wikipedia_name,
        airline, isSeasonal (0 or 1).
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    str
        Always ``"done"``.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or times out.

    Notes
    -----
    The HTTP status code is not checked: an error page simply yields no rows.
    Rows go through ``csv.writer`` so fields containing commas or quotes are
    quoted rather than corrupting the file.
    """
    search_url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(search_url, timeout=timeout)

    # Parse everything before touching the file, so a failure part-way through
    # cannot leave a half-written airport or an unclosed handle behind.
    rows = list(_iter_route_rows(response.text, iata_source, wiki_name))

    with open(file_append_path, "a", encoding="utf-8") as f:
        csv.writer(f, lineterminator="\n").writerows(rows)
    return "done"

Start the file to store modern route data

In [3]:
# Create (or overwrite) the routes file and write the CSV column names.
# The header must end with a newline: rows are appended to this file later,
# and without it the first data row is glued onto the header line.
with open("./data/current_routes.csv", "w", encoding="utf-8") as f:
    f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n")

Get the lists of IATA codes and Wikipedia article names

In [16]:
# Load the airport reference table and report how many airports it holds.
data = pd.read_csv("./data/top_airports_basic_data.csv")
print(len(data))
data.head(n=1)  # not the last expression in the cell, so nothing is displayed
# Parallel columns: IATA code and Wikipedia article name of each airport
codes_list, names_list = data["iata"], data["wiki_name"]
# Output CSV that the scraping cells append to
file_append_path = "./data/current_routes.csv"

989


Iterate through airports 0 to 50 (end index exclusive)

In [12]:
# Scrape routes for airports at indices 0-49 and append them to the routes CSV.
for i in range(50):
    print("airport index:", i)
    code, name = codes_list[i], names_list[i]
    get_destinations(code, name, file_append_path)
    

airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49


Iterate through airports 50 to 100 (end index exclusive)

In [13]:
# Scrape routes for airports at indices 50-99 and append them to the routes CSV.
for i in range(50, 100):
    print("airport index:", i)
    code, name = codes_list[i], names_list[i]
    get_destinations(code, name, file_append_path)

airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
airport index: 56
airport index: 57
airport index: 58
airport index: 59
airport index: 60
airport index: 61
airport index: 62
airport index: 63
airport index: 64
airport index: 65
airport index: 66
airport index: 67
airport index: 68
airport index: 69
airport index: 70
airport index: 71
airport index: 72
airport index: 73
airport index: 74
airport index: 75
airport index: 76
airport index: 77
airport index: 78
airport index: 79
airport index: 80
airport index: 81
airport index: 82
airport index: 83
airport index: 84
airport index: 85
airport index: 86
airport index: 87
airport index: 88
airport index: 89
airport index: 90
airport index: 91
airport index: 92
airport index: 93
airport index: 94
airport index: 95
airport index: 96
airport index: 97
airport index: 98
airport index: 99


Iterate through airports 100 to 150 (end index exclusive)

In [18]:
# Scrape routes for airports at indices 100-149 and append them to the routes CSV.
for i in range(100, 150):
    print("airport index:", i)
    code, name = codes_list[i], names_list[i]
    get_destinations(code, name, file_append_path)

airport index: 100
airport index: 101
airport index: 102
airport index: 103
airport index: 104
airport index: 105
airport index: 106
airport index: 107
airport index: 108
airport index: 109
airport index: 110
airport index: 111
airport index: 112
airport index: 113
airport index: 114
airport index: 115
airport index: 116
airport index: 117
airport index: 118
airport index: 119
airport index: 120
airport index: 121
airport index: 122
airport index: 123
airport index: 124
airport index: 125
airport index: 126
airport index: 127
airport index: 128
airport index: 129
airport index: 130
airport index: 131
airport index: 132
airport index: 133
airport index: 134
airport index: 135
airport index: 136
airport index: 137
airport index: 138
airport index: 139
airport index: 140
airport index: 141
airport index: 142
airport index: 143
airport index: 144
airport index: 145
airport index: 146
airport index: 147
airport index: 148
airport index: 149
