In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re 
import pandas as pd
import numpy as np
import math
# Silence pandas chained-assignment diagnostics for the whole notebook.
# NOTE(review): the filters and the option below are global, so they also hide
# genuine chained-assignment mistakes in every later cell.
# NOTE(review): these pd.errors names may not exist in every pandas release —
# confirm against the pandas version this notebook is pinned to.
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.ChainedAssignmentError)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # Disable the warning


# Part 1: Get the most recent Wikipedia revision IDs for each airport before 2020 and before 2022

This will help us analyze COVID recovery route trends. Additionally, in the process we create a detailed dataset of current routes. We first check whether any of the airport page names are redirects.

In [2]:
def redirectCheck(wiki_name):
    """Resolve a Wikipedia page title through at most one redirect.

    Downloads the raw wikitext of ``wiki_name`` from en.wikipedia.org. When the
    page is a redirect stub (its wikitext starts with ``#REDIRECT [[Target]]``)
    the target title is returned; otherwise ``wiki_name`` is returned as-is.

    Args:
        wiki_name: Page title as used in the URL (underscores for spaces).

    Returns:
        The redirect target with spaces replaced by underscores and any
        ``#section`` or ``|label`` suffix removed, or ``wiki_name`` itself when
        the page is not a redirect or the response status is not 200.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(
        "https://en.wikipedia.org/w/index.php",
        # Passing the title through params lets requests URL-encode it.
        params={"title": wiki_name, "action": "raw"},
        timeout=30,
    )
    if response.status_code != 200:
        return wiki_name

    # A redirect page begins with "#REDIRECT [[Target]]"; anchoring the match
    # at the start of the text avoids mistaking a short article that merely
    # mentions "#redirect" before its first link for a redirect.
    found = re.match(r"\s*#redirect\s*:?\s*\[\[([^\[\]|#]+)", response.text, re.IGNORECASE)
    if found is None:
        return wiki_name

    redirect = found.group(1).strip().replace(" ", "_")  # replace spaces
    if not redirect:
        return wiki_name
    print("redirect found:", redirect)
    return redirect

In [3]:
# Sanity check on a title that is not a redirect: it is returned unchanged.
redirectCheck("Beijing_Capital_International_Airport")

'Beijing_Capital_International_Airport'

In [4]:
# Two titles that do redirect: each call prints the target it found, and the
# cell displays the return value of the last call only.
redirectCheck("Malacca_International_Airport")
redirectCheck("Sultan_Muhammad_Salahuddin_Airport")

redirect found: Malacca_Airport
redirect found: Sultan_Muhammad_Salahudin_Airport


'Sultan_Muhammad_Salahudin_Airport'

Running redirects code

In [5]:
# Resolve every airport's Wikipedia title once and record the redirect target
# (left blank when the stored title is already the page's own title).
ref_data = pd.read_csv("./data/current_source_airports.csv", encoding='utf-8')
redirects = []
for index, wikiname in ref_data["wiki_name"].items():
    print("reparing index:", index)
    val = redirectCheck(wikiname)
    # Store the target only when it differs from the original title.
    redirects.append(val if val != wikiname else "")
ref_data["redirects"] = redirects
# Write the enriched airport table for the following steps.
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)
        

reparing index: 0
reparing index: 1
reparing index: 2
reparing index: 3
reparing index: 4
reparing index: 5
reparing index: 6
reparing index: 7
reparing index: 8
reparing index: 9
reparing index: 10
reparing index: 11
reparing index: 12
reparing index: 13
reparing index: 14
reparing index: 15
reparing index: 16
reparing index: 17
reparing index: 18
reparing index: 19
reparing index: 20
reparing index: 21
reparing index: 22
reparing index: 23
redirect found: Madrid–Barajas_Airport
reparing index: 24
reparing index: 25
reparing index: 26
reparing index: 27
reparing index: 28
reparing index: 29
reparing index: 30
reparing index: 31
reparing index: 32
reparing index: 33
reparing index: 34
reparing index: 35
reparing index: 36
reparing index: 37
reparing index: 38
reparing index: 39
reparing index: 40
reparing index: 41
reparing index: 42
reparing index: 43
reparing index: 44
reparing index: 45
reparing index: 46
reparing index: 47
reparing index: 48
reparing index: 49
reparing index: 50
re

In [6]:
#function to find version code before a certain formated date 

def get_oldid_before(title, date):
    """Get the revision ID (oldid) of the latest version before a given date.

    Queries the English Wikipedia API for a single revision of ``title``,
    starting at ``date`` and walking towards older revisions.

    Args:
        title: Wikipedia page title.
        date: Cut-off timestamp sent as ``rvstart``; the notebook passes
            strings such as ``"2020-01-01T00:00:00Z"``.

    Returns:
        The revision id as a string, or ``""`` when the reply contains no
        revision for the page or is not valid JSON.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "rvlimit": 1,
        "rvstart": date,
        "rvdir": "older",
        "rvprop": "ids",
        "formatversion": 2
    }

    # Bound the wait so one stalled request cannot hang a long scraping loop.
    response = requests.get(api_url, params=params, timeout=30)

    try:
        data = response.json()
        return str(data['query']['pages'][0]['revisions'][0]['revid'])
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing page / no revision / error payload / non-JSON body.
        return ""

# Example usage (the result is kept in its own name so the builtin id() is not shadowed)
example_oldid = get_oldid_before("John_F._Kennedy_International_Airport", "2020-01-01T00:00:00Z")
print(example_oldid)

933044375


In [7]:
# Reload the airport details file written by the redirect pass above.
ref_data = pd.read_csv("./data/current_source_airports_details.csv", encoding='utf-8')
ref_data.head(n=1)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,redirects
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427,


checking to see if it is possible to check for nulls

In [8]:
# Row 0 has no redirect recorded; pd.isna() reports True for it, so missing
# values can be used to tell "no redirect" apart from a stored redirect title.
test1 = ref_data.iloc[0]["redirects"]
print(pd.isna(test1))

True


iterate to find old ids.


In [9]:
# Placeholder columns; both are overwritten once the loop has finished.
ref_data["pre2020_ids"] = None
ref_data["pre2022_ids"] = None
pre2020_ids, pre2022_ids = [], []
for index, row in ref_data.iterrows():
    print(index)
    redirect = row["redirects"]
    wiki_name = row["wiki_name"]
    if not pd.isna(redirect):
        # A recorded redirect takes precedence over the stored title.
        wiki_name = redirect
        print("redirect used:",wiki_name)
    # Latest revision before each cut-off date.
    pre2020_ids.append(get_oldid_before(wiki_name, "2020-01-01T00:00:00Z"))
    pre2022_ids.append(get_oldid_before(wiki_name, "2022-01-01T00:00:00Z"))
ref_data["pre2020_ids"] = pre2020_ids
ref_data["pre2022_ids"] = pre2022_ids

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
redirect used: Madrid–Barajas_Airport
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
redirect used: Lisbon_Airport
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
redirect used: Ürümqi_Tianshan_International_Airport
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
redirect used: Mashhad_Shahid_Hasheminejad_International_Airport
189
190
191
192
193
194
195
196
197
198
199
200
redirect used: Milan_Linate_Airport
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
redirect used: F

In [10]:
# Spot-check that the two revision-id columns are now present.
ref_data.head(n=1)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,redirects,pre2020_ids,pre2022_ids
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427,,932935279,1063034925


In [11]:
# Overwrite the details file so that it also carries the two revision-id columns.
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)

Checking that both new columns contain no missing (None/NaN) or empty values.

In [12]:
ref_data = pd.read_csv("./data/current_source_airports_details.csv")
id_columns = ("pre2020_ids", "pre2022_ids")

# Count rows with a missing id, one line of output per column ...
for column in id_columns:
    print(len(ref_data[ref_data[column].isnull()]))

# ... then rows whose id is an empty string.
for column in id_columns:
    print(len(ref_data[ref_data[column] == ""]))


0
0
0
0


Checks passed; done with Part 1. Keep in mind that during route generation, if an entry has a redirect, we use the redirect target instead of the original name.

# Part 2: Route generation for 2020

Additionally, improvements to current_source_airports data, new modified data for exceptions

In [13]:
def get_destinations(iata_source, article_id, wiki_name, path_write):
    """Scrape one airport's destination table from a pinned Wikipedia revision.

    Revision ``article_id`` of page ``wiki_name`` is downloaded, the first
    ``wikitable`` after the first ``<h2>`` whose text contains "destination"
    is located, and one fully quoted CSV line per linked destination is
    appended to ``path_write``:
    ``"iata_source","wiki_name","dest_title","airline","isSeasonal"``.

    Args:
        iata_source: IATA code written in the first column of every row.
        article_id: Revision id (``oldid``) of the page version to scrape.
        wiki_name: Wikipedia page title (underscores for spaces).
        path_write: CSV file to append to.

    Returns:
        None. Nothing is written when the heading text contains "former".

    Raises:
        ValueError: if no matching heading or no ``wikitable`` is found.
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(
        "https://en.wikipedia.org/w/index.php",
        # params lets requests URL-encode the title safely.
        params={"title": wiki_name, "oldid": article_id},
        timeout=30,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # Case-insensitive match on the section heading.
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no destinations heading found for {wiki_name} (oldid={article_id})")
    # Per the original note: a "former" heading indicates the airport is no
    # longer in service, so the page is skipped.
    if "former" in heading.get_text().lower():
        return

    # First following table that carries the "wikitable" class.
    table = heading.find_next("table", class_="wikitable")
    if table is None:
        raise ValueError(f"no destinations table found for {wiki_name} (oldid={article_id})")

    # The context manager closes the file even if a row fails to parse; the
    # csv writer escapes quotes embedded in airline or page names.
    with open(path_write, "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        for row in table.find_all("tr")[1:]:  # exclude the header row
            cols = row.find_all(["td", "th"])
            if len(cols) < 2:
                continue  # no separate airline/destination cells in this row
            airline = cols[0].get_text(strip=True)
            # Links after a bold "Seasonal:" marker in the same cell are seasonal.
            isSeasonal = 0
            for child in cols[1].children:
                if child.name == "a":
                    # The title attribute is the destination's Wikipedia page name.
                    dest_name = child.get('title')
                    if not dest_name:
                        continue  # anchor without a page title
                    writer.writerow(
                        [iata_source, wiki_name, dest_name.replace(" ", "_"), airline, isSeasonal]
                    )
                elif child.name == "b" and child.get_text(strip=True) == "Seasonal:":
                    isSeasonal = 1
    return

Load the reference airport list used for both years (the detailed airport list created earlier).

In [14]:
# Load the airport details (titles, redirects and revision ids) that drive the scrape.
data = pd.read_csv("./data/current_source_airports_details.csv")
print(len(data))
data.head(n=1)

980


Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,redirects,pre2020_ids,pre2022_ids
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427,,932935279,1063034925


checking if it is possible to check for nulls (to know when to use the redirect version)

In [15]:
# Row 0 has no redirect recorded; pd.isna() reports True for it, so the loop
# below can use missing values to decide when to fall back to wiki_name.
test1 = data.iloc[0]["redirects"]
print(pd.isna(test1))

True


Create the output file that will hold the route data, starting with the header row.

In [16]:
# Start a fresh 2020 routes file containing only the header row; the scraping
# loop appends to this path afterwards.
file_append_path = "./data/pre2020_routes.csv"
with open(file_append_path, "w", encoding="utf-8") as f:
    f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n")  # column names

Iteration

In [17]:

for index, row in data.iterrows():
    print("airport index:", index)
    wikiname = row["wiki_name"]
    # A recorded redirect takes precedence over the stored title.
    redirect = row["redirects"]
    if pd.notna(redirect):
        wikiname = redirect
    # Revision id captured for the 2020 cut-off.
    revision_id = row["pre2020_ids"]
    code = row["IATA"]
    try:
        get_destinations(code, revision_id, wikiname, file_append_path)
    except Exception as exc:
        # Keep going on a per-airport failure, but show why it failed;
        # KeyboardInterrupt is no longer swallowed.
        print(f"failed airport:{wikiname} ({type(exc).__name__}: {exc})")


airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49
airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
ai

In [18]:
# Re-save the scraped routes through pandas as UTF-8. A dedicated name is used
# so the airport table in `data` is not replaced, which would break a re-run
# of the scraping loop.
routes_2020 = pd.read_csv(file_append_path, encoding='utf-8') #for windows
routes_2020.to_csv(file_append_path, encoding='utf-8', index=False)

### locating the missing airports

In [19]:
# IATA codes of every airport in the reference list, for comparison with the
# codes that actually appear in the routes file.
ref_data = pd.read_csv("./data/current_source_airports_details.csv") #get current airports 
all_airports = ref_data["IATA"]
print("number of total airports:", len(all_airports))

number of total airports: 980


In [20]:
# IATA codes that made it into the scraped routes file.
curr_airports = pd.read_csv(file_append_path, encoding='utf-8')["iata_source"].unique() #for windows
print("number of airports currently in routes data:", len(curr_airports))

# Anything in the reference list but absent from the routes file is missing.
missing_airports = set(all_airports).difference(curr_airports)
print("number of missing airports:", len(missing_airports))

number of airports currently in routes data: 974
number of missing airports: 6


In [21]:
# Display the IATA codes that produced no routes.
missing_airports

{'FRS', 'HIN', 'HSR', 'ITM', 'ROT', 'TAO'}

Most of these airports either did not exist back then or have no good Wikipedia revision from back then, with the exceptions of ITM and TAO.

These are all relatively small airports. We define a function that adds entries manually when a valid Wikipedia article is found, by passing the exact link to the old article revision.

In [None]:
def add_routes_exact(iata_source, link, path_write, match="destination"):
    """Append routes scraped from an exact Wikipedia revision URL.

    The page at ``link`` is downloaded, the first text node matching ``match``
    is located, and the first table after it is read. One fully quoted CSV
    line per linked destination is appended to ``path_write``:
    ``"iata_source","wiki_name","dest_title","airline","isSeasonal"``.

    Args:
        iata_source: IATA code written in the first column of every row.
        link: Full URL of the revision; it must contain a ``title=`` query
            parameter, from which the page name is taken.
        path_write: CSV file to append to.
        match: Text to look for (case-insensitive). It is inserted into the
            pattern unescaped, so it is interpreted as a regular expression.

    Returns:
        None. Nothing is written when the matched text contains "former".

    Raises:
        ValueError: if no text matches or no table follows the match.
        IndexError: if ``link`` has no ``title=`` parameter.
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(link, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Case-insensitive search over all text nodes of the page.
    heading = soup.find(string=re.compile(r"({})".format(match), re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no text matching {match!r} found at {link}")
    # Per the original note: "former" indicates the airport is no longer in
    # service, so the page is skipped.
    if "former" in heading.get_text().lower():
        return

    table = heading.find_next("table")
    if table is None:
        raise ValueError(f"no table found after {match!r} at {link}")

    # Page name = text between "title=" and the next "&".
    wiki_name = link.split("title=")[1].split("&")[0]

    # The context manager closes the file even if a row fails to parse; the
    # csv writer escapes quotes embedded in airline or page names.
    with open(path_write, "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        for row in table.find_all("tr")[1:]:  # exclude the header row
            cols = row.find_all(["td", "th"])
            if len(cols) < 2:
                continue  # no separate airline/destination cells in this row
            airline = cols[0].get_text(strip=True)
            # Links after a bold "Seasonal:" marker in the same cell are seasonal.
            isSeasonal = 0
            for child in cols[1].children:
                if child.name == "a":
                    # The title attribute is the destination's Wikipedia page name.
                    dest_name = child.get('title')
                    if not dest_name:
                        continue  # anchor without a page title
                    writer.writerow(
                        [iata_source, wiki_name, dest_name.replace(" ", "_"), airline, isSeasonal]
                    )
                elif child.name == "b" and child.get_text(strip=True) == "Seasonal:":
                    isSeasonal = 1
    return

In [41]:
# Repair for FRS: scrape a hand-picked revision, anchoring on the first text
# that matches "airlines" instead of the default "destination".
# NOTE(review): oldid 933223161 is higher than the pre-2020 ids recorded
# earlier in this notebook (e.g. 933044375 for JFK) — confirm that this
# revision really predates 2020-01-01.
found_link = "https://en.wikipedia.org/w/index.php?title=Mundo_Maya_International_Airport&oldid=933223161"
add_routes_exact("FRS",link=found_link,path_write=file_append_path, match="airlines")

In [28]:
# Repair for HIN: scrape a hand-picked revision, anchoring on "airlines".
# NOTE(review): oldid 1265903527 is higher than the pre-2022 id recorded for
# ATL earlier in this notebook (1063034925), so this revision appears to be
# newer than the cut-off this routes file is built for — confirm this is intended.
found_link = "https://en.wikipedia.org/w/index.php?title=Sacheon_Airport&oldid=1265903527"
add_routes_exact("HIN",link=found_link,path_write=file_append_path, match="airlines")

In [None]:
#HSR did not exist back then

In [58]:
# Repair for ITM. The match text is a sentence from the article body rather
# than a heading: add_routes_exact() reads the first table that follows the
# first text matching it.
found_link = "https://en.wikipedia.org/w/index.php?title=Itami_Airport&oldid=929306656"
add_routes_exact("ITM",link=found_link,path_write=file_append_path, match=" terminal is planned to be extensively renovated by August 2020 to include a new pier for additional aircraft,")

[<td> <a href="/wiki/All_Nippon_Airways" title="All Nippon Airways">All Nippon Airways</a> </td>, <td> <a href="/wiki/Fukuoka_Airport" title="Fukuoka Airport">Fukuoka</a>, <a href="/wiki/Hakodate_Airport" title="Hakodate Airport">Hakodate</a>, <a href="/wiki/Kagoshima_Airport" title="Kagoshima Airport">Kagoshima</a>, <a href="/wiki/Kumamoto_Airport" title="Kumamoto Airport">Kumamoto</a>, <a href="/wiki/Miyazaki_Airport" title="Miyazaki Airport">Miyazaki</a>, <a href="/wiki/Nagasaki_Airport" title="Nagasaki Airport">Nagasaki</a>, <a href="/wiki/Naha_Airport" title="Naha Airport">Naha</a>, <a href="/wiki/Niigata_Airport" title="Niigata Airport">Niigata</a>, <a href="/wiki/New_Chitose_Airport" title="New Chitose Airport">Sapporo–Chitose</a>, <a href="/wiki/Sendai_Airport" title="Sendai Airport">Sendai</a>, <a href="/wiki/Haneda_Airport" title="Haneda Airport">Tokyo–Haneda</a>, <a href="/wiki/Narita_International_Airport" title="Narita International Airport">Tokyo–Narita</a><br/><b>Seasona

In [33]:
# Repair for ROT: scrape a hand-picked revision, anchoring on "destinations".
# NOTE(review): oldid 996409149 is higher than the pre-2020 ids recorded
# earlier in this notebook (e.g. 933044375 for JFK), so this revision appears
# to post-date 2020-01-01 — confirm this is intended.
found_link = "https://en.wikipedia.org/w/index.php?title=Rotorua_Airport&oldid=996409149#Airlines_and_destinations"
add_routes_exact("ROT",link=found_link,path_write=file_append_path, match="destinations")

In [35]:
# Repair for TAO: the routes are taken from the article of the old airport
# (Qingdao Liuting), which was also in a different location.
# NOTE(review): oldid 992291152 is higher than the pre-2020 ids recorded
# earlier in this notebook (e.g. 933044375 for JFK), so this revision appears
# to post-date 2020-01-01 — confirm this is intended.
found_link =  "https://en.wikipedia.org/w/index.php?title=Qingdao_Liuting_International_Airport&oldid=992291152"
add_routes_exact("TAO",link=found_link,path_write=file_append_path, match="destinations")

# Part 3: Route generation for 2022

In [59]:
# NOTE(review): a function with this name is also defined earlier in the
# notebook; this later definition replaces it when the cell runs. Consider
# keeping a single definition.
def get_destinations(iata_source, article_id, wiki_name, path_write):
    """Scrape one airport's destination table from a pinned Wikipedia revision.

    Revision ``article_id`` of page ``wiki_name`` is downloaded, the first
    ``wikitable`` after the first ``<h2>`` whose text contains "destination"
    is located, and one fully quoted CSV line per linked destination is
    appended to ``path_write``:
    ``"iata_source","wiki_name","dest_title","airline","isSeasonal"``.

    Args:
        iata_source: IATA code written in the first column of every row.
        article_id: Revision id (``oldid``) of the page version to scrape.
        wiki_name: Wikipedia page title (underscores for spaces).
        path_write: CSV file to append to.

    Returns:
        None. Nothing is written when the heading text contains "former".

    Raises:
        ValueError: if no matching heading or no ``wikitable`` is found.
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(
        "https://en.wikipedia.org/w/index.php",
        # params lets requests URL-encode the title safely.
        params={"title": wiki_name, "oldid": article_id},
        timeout=30,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # Case-insensitive match on the section heading.
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no destinations heading found for {wiki_name} (oldid={article_id})")
    # Per the original note: a "former" heading indicates the airport is no
    # longer in service, so the page is skipped.
    if "former" in heading.get_text().lower():
        return

    # First following table that carries the "wikitable" class.
    table = heading.find_next("table", class_="wikitable")
    if table is None:
        raise ValueError(f"no destinations table found for {wiki_name} (oldid={article_id})")

    # The context manager closes the file even if a row fails to parse; the
    # csv writer escapes quotes embedded in airline or page names.
    with open(path_write, "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        for row in table.find_all("tr")[1:]:  # exclude the header row
            cols = row.find_all(["td", "th"])
            if len(cols) < 2:
                continue  # no separate airline/destination cells in this row
            airline = cols[0].get_text(strip=True)
            # Links after a bold "Seasonal:" marker in the same cell are seasonal.
            isSeasonal = 0
            for child in cols[1].children:
                if child.name == "a":
                    # The title attribute is the destination's Wikipedia page name.
                    dest_name = child.get('title')
                    if not dest_name:
                        continue  # anchor without a page title
                    writer.writerow(
                        [iata_source, wiki_name, dest_name.replace(" ", "_"), airline, isSeasonal]
                    )
                elif child.name == "b" and child.get_text(strip=True) == "Seasonal:":
                    isSeasonal = 1
    return

In [60]:
# Start a fresh 2022 routes file containing only the header row; the scraping
# loop appends to this path afterwards.
file_append_path = "./data/pre2022_routes.csv"
with open(file_append_path, "w", encoding="utf-8") as f:
    f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n")  # column names

Getting reference data

In [61]:
# Load the airport details (titles, redirects and revision ids) that drive the 2022 scrape.
data = pd.read_csv("./data/current_source_airports_details.csv")
print(len(data))
data.head(n=1)

980


Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,redirects,pre2020_ids,pre2022_ids
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427,,932935279,1063034925


Looping through to try to add data

In [62]:
for index, row in data.iterrows():
    print("airport index:", index)
    wikiname = row["wiki_name"]
    # A recorded redirect takes precedence over the stored title.
    redirect = row["redirects"]
    if pd.notna(redirect):
        wikiname = redirect
    # Revision id captured for the 2022 cut-off.
    revision_id = row["pre2022_ids"]
    code = row["IATA"]
    try:
        get_destinations(code, revision_id, wikiname, file_append_path)
    except Exception as exc:
        # Keep going on a per-airport failure, but show why it failed;
        # KeyboardInterrupt is no longer swallowed.
        print(f"failed airport:{wikiname} ({type(exc).__name__}: {exc})")

airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49
airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
ai