In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
# Suppress just SettingWithCopyWarning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Part 1: Getting current routes

Now we scrape the current routes for each airport from its Wikipedia article, using the scripts developed in the first notebook, and save them to a current_routes.csv file.


defining key functions

In [None]:
def _collect_route_rows(soup, iata_source, wiki_name):
    """Turn the destinations table of a parsed airport article into CSV rows.

    Args:
        soup: BeautifulSoup document of the airport's Wikipedia article.
        iata_source: IATA code recorded as the route origin.
        wiki_name: Wikipedia article name recorded alongside the origin.

    Returns:
        A list of ``[iata_source, wiki_name, dest_name, airline, isSeasonal]``
        rows. The list is empty when the first matching section heading
        mentions "former" (treated as: the airport is no longer in service).

    Raises:
        ValueError: if the article has no "destination" heading, or no
            ``wikitable`` follows that heading.
    """
    # First <h2> whose text mentions "destination" (case-insensitive).
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no 'destination' section heading found in article {wiki_name!r}")

    # A "Former ... destinations" heading means there are no current routes to record.
    if "former" in heading.get_text().lower():
        return []

    # Next table after the heading carrying the 'wikitable' class; tables without
    # that class (or without any class attribute at all) are skipped.
    table = heading.find_next("table", class_="wikitable")
    if table is None:
        raise ValueError(f"no destinations wikitable found in article {wiki_name!r}")

    route_rows = []
    for row in table.find_all("tr")[1:]:  # the first row is the table header
        cols = row.find_all(["td", "th"])
        if len(cols) < 2:
            # Not an "airline | destinations" row; nothing to extract.
            continue
        airline = cols[0].get_text(strip=True)
        # Destinations listed after a bold "Seasonal:" marker are seasonal; everything
        # before it in the same cell is year-round.
        isSeasonal = 0
        for child in cols[1].children:
            if child.name == "a":
                # The link title is the destination's Wikipedia article name (with spaces).
                dest_name = child.get("title")
                if not dest_name:
                    continue  # anchor without a title cannot be mapped to an article
                route_rows.append(
                    [iata_source, wiki_name, dest_name.replace(" ", "_"), airline, isSeasonal]
                )
            elif child.name == "b" and child.text == "Seasonal:":
                isSeasonal = 1
    return route_rows


def get_destinations(iata_source, wiki_name, path_write):
    """Scrape an airport's current destinations from Wikipedia and append them to a CSV.

    The article ``https://en.wikipedia.org/wiki/<wiki_name>`` is downloaded and
    parsed completely before the output file is touched, so a failure never
    leaves a partially written airport in the file.

    Args:
        iata_source: IATA code of the departure airport.
        wiki_name: Wikipedia article name of the departure airport.
        path_write: path of the CSV file to append to. Each appended row is
            ``iata_source, wiki_name, dest_name, airline, isSeasonal`` with
            every field quoted.

    Returns:
        None.

    Raises:
        requests.RequestException: on network errors, timeouts or HTTP error statuses.
        ValueError: if the article has no usable destinations section or table.
    """
    url = f"https://en.wikipedia.org/wiki/{wiki_name}"
    # A timeout keeps one unresponsive request from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    route_rows = _collect_route_rows(soup, iata_source, wiki_name)

    # The context manager closes the file on every path. csv.writer escapes quotes
    # embedded in article or airline names, and UTF-8 keeps non-ASCII article names
    # readable by pandas.read_csv, which defaults to UTF-8.
    with open(path_write, "a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(route_rows)
    return

Start the file to store modern route data

In [3]:
# Start a fresh routes file that contains only the column names.
# Mode "w" truncates any previous scrape; the context manager closes the
# handle even if the write fails.
with open("./data/current_routes.csv", "w") as f:
    f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n")

get iata code, wikiname lists

In [4]:
# Airport reference table; the scrape uses its IATA code and Wikipedia article name columns.
data = pd.read_csv("./data/top_airports_basic_data.csv")
codes_list = data["iata"]
names_list = data["wiki_name"]
# Every get_destinations call appends its rows to this file.
file_append_path = "./data/current_routes.csv"
# Number of airports to scrape. (The former mid-cell `data.head(n=1)` was dropped:
# only the last expression of a cell is displayed, so it never showed anything.)
print(len(data))

989


iterate through airports 0 to 50

In [5]:

# Scrape airports 0-49; a failing airport is reported and the loop moves on.
for i in range(0, 50):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")


airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
failed airport:IST
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49


iterate through airports 50 to 100

In [6]:
# Scrape airports 50-99; a failing airport is reported and the loop moves on.
for i in range(50, 100):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
airport index: 56
airport index: 57
airport index: 58
airport index: 59
airport index: 60
airport index: 61
airport index: 62
failed airport:ORL
airport index: 63
airport index: 64
airport index: 65
airport index: 66
airport index: 67
airport index: 68
airport index: 69
airport index: 70
airport index: 71
airport index: 72
airport index: 73
airport index: 74
airport index: 75
airport index: 76
airport index: 77
airport index: 78
airport index: 79
airport index: 80
airport index: 81
airport index: 82
airport index: 83
airport index: 84
airport index: 85
airport index: 86
airport index: 87
airport index: 88
airport index: 89
airport index: 90
airport index: 91
airport index: 92
airport index: 93
airport index: 94
airport index: 95
airport index: 96
airport index: 97
failed airport:TAO
airport index: 98
airport index: 99


iterate through airports 100 to 150

In [7]:
# Scrape airports 100-149; a failing airport is reported and the loop moves on.
for i in range(100, 150):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 100
airport index: 101
airport index: 102
airport index: 103
airport index: 104
airport index: 105
airport index: 106
airport index: 107
airport index: 108
airport index: 109
airport index: 110
airport index: 111
airport index: 112
airport index: 113
airport index: 114
airport index: 115
airport index: 116
airport index: 117
airport index: 118
airport index: 119
airport index: 120
airport index: 121
airport index: 122
airport index: 123
airport index: 124
airport index: 125
airport index: 126
airport index: 127
failed airport:HRB
airport index: 128
airport index: 129
airport index: 130
airport index: 131
airport index: 132
airport index: 133
airport index: 134
airport index: 135
airport index: 136
airport index: 137
airport index: 138
airport index: 139
airport index: 140
airport index: 141
airport index: 142
airport index: 143
airport index: 144
airport index: 145
airport index: 146
airport index: 147
airport index: 148
airport index: 149


iterate through airports 150 to 200

In [8]:
# Scrape airports 150-199; a failing airport is reported and the loop moves on.
for i in range(150, 200):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 150
airport index: 151
airport index: 152
airport index: 153
airport index: 154
airport index: 155
airport index: 156
airport index: 157
airport index: 158
airport index: 159
airport index: 160
airport index: 161
airport index: 162
airport index: 163
airport index: 164
airport index: 165
airport index: 166
airport index: 167
airport index: 168
airport index: 169
airport index: 170
airport index: 171
airport index: 172
airport index: 173
airport index: 174
airport index: 175
airport index: 176
airport index: 177
airport index: 178
airport index: 179
airport index: 180
airport index: 181
airport index: 182
airport index: 183
airport index: 184
airport index: 185
airport index: 186
airport index: 187
airport index: 188
airport index: 189
airport index: 190
airport index: 191
airport index: 192
airport index: 193
airport index: 194
airport index: 195
airport index: 196
airport index: 197
airport index: 198
airport index: 199


iterate through airports 200 to 250

In [9]:
# Scrape airports 200-249; a failing airport is reported and the loop moves on.
for i in range(200, 250):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 200
airport index: 201
airport index: 202
airport index: 203
airport index: 204
airport index: 205
airport index: 206
failed airport:HFE
airport index: 207
airport index: 208
airport index: 209
airport index: 210
airport index: 211
airport index: 212
airport index: 213
airport index: 214
airport index: 215
airport index: 216
airport index: 217
airport index: 218
airport index: 219
airport index: 220
airport index: 221
airport index: 222
airport index: 223
airport index: 224
airport index: 225
failed airport:JOG
airport index: 226
failed airport:INC
airport index: 227
airport index: 228
airport index: 229
airport index: 230
airport index: 231
airport index: 232
airport index: 233
airport index: 234
airport index: 235
airport index: 236
airport index: 237
airport index: 238
airport index: 239
airport index: 240
airport index: 241
airport index: 242
airport index: 243
airport index: 244
airport index: 245
airport index: 246
airport index: 247
airport index: 248
airport inde

iterate through airports 250 to 300

In [10]:
# Scrape airports 250-299; a failing airport is reported and the loop moves on.
for i in range(250, 300):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 250
airport index: 251
airport index: 252
airport index: 253
airport index: 254
airport index: 255
airport index: 256
airport index: 257
airport index: 258
airport index: 259
airport index: 260
failed airport:YNT
airport index: 261
airport index: 262
airport index: 263
airport index: 264
airport index: 265
airport index: 266
airport index: 267
airport index: 268
airport index: 269
airport index: 270
airport index: 271
airport index: 272
airport index: 273
airport index: 274
airport index: 275
airport index: 276
airport index: 277
airport index: 278
failed airport:NAY
airport index: 279
airport index: 280
airport index: 281
airport index: 282
airport index: 283
airport index: 284
airport index: 285
airport index: 286
airport index: 287
airport index: 288
airport index: 289
airport index: 290
airport index: 291
airport index: 292
airport index: 293
airport index: 294
airport index: 295
airport index: 296
airport index: 297
airport index: 298
airport index: 299


iterate through airports 300 to 350

In [11]:
# Scrape airports 300-349; a failing airport is reported and the loop moves on.
for i in range(300, 350):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 300
airport index: 301
airport index: 302
airport index: 303
airport index: 304
airport index: 305
airport index: 306
airport index: 307
airport index: 308
airport index: 309
airport index: 310
airport index: 311
airport index: 312
airport index: 313
airport index: 314
airport index: 315
airport index: 316
airport index: 317
airport index: 318
airport index: 319
airport index: 320
airport index: 321
airport index: 322
airport index: 323
airport index: 324
airport index: 325
airport index: 326
airport index: 327
airport index: 328
airport index: 329
airport index: 330
airport index: 331
airport index: 332
airport index: 333
airport index: 334
airport index: 335
airport index: 336
airport index: 337
airport index: 338
airport index: 339
airport index: 340
airport index: 341
airport index: 342
airport index: 343
airport index: 344
airport index: 345
airport index: 346
airport index: 347
airport index: 348
airport index: 349


iterate through the rest of the airports

In [12]:
# Scrape every remaining airport (index 350 onwards); a failing airport is
# reported and the loop moves on.
for i in range(350, len(codes_list)):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code, name, file_append_path)
    except Exception as exc:
        # Not a bare `except:` so a kernel interrupt (KeyboardInterrupt) still stops
        # the scrape; the reason is printed so the failure can be diagnosed later.
        print(f"failed airport:{code} ({type(exc).__name__}: {exc})")

airport index: 350
airport index: 351
airport index: 352
airport index: 353
airport index: 354
airport index: 355
airport index: 356
airport index: 357
airport index: 358
airport index: 359
failed airport:SYZ
airport index: 360
airport index: 361
airport index: 362
airport index: 363
airport index: 364
airport index: 365
airport index: 366
airport index: 367
airport index: 368
airport index: 369
airport index: 370
airport index: 371
airport index: 372
airport index: 373
airport index: 374
airport index: 375
airport index: 376
airport index: 377
airport index: 378
airport index: 379
airport index: 380
airport index: 381
airport index: 382
airport index: 383
airport index: 384
airport index: 385
airport index: 386
airport index: 387
airport index: 388
airport index: 389
airport index: 390
airport index: 391
airport index: 392
airport index: 393
airport index: 394
airport index: 395
airport index: 396
airport index: 397
airport index: 398
airport index: 399
airport index: 400
airport inde

### fixing failed airports and updating airport data

A few airports failed because of outdated data: some airports in cities like Qingdao have closed, and the original query returned faulty links for some smaller airports. For cities where the main airport has changed (Berlin Tegel and Istanbul Ataturk, for example), the updated Wikipedia names were looked up, and the function calls below were run to add their data.

A new table of airports based on the routes table here will be created to be used as a reference, with the same information obtained from the Wikipedia API.

We find the missing airports like below:

In [None]:
# Reload everything scraped so far and collect the source airports that produced rows.
routes = pd.read_csv("./data/current_routes.csv")
unique_airports_in_routes = {*routes["iata_source"].unique()}
# Airports from the reference list that have no scraped routes yet.
missing = set(codes_list).difference(unique_airports_in_routes)
for m in missing:
    print("missing airports", m)

Adding some missing entries (with the exception of executive/closed airports or those without destinations on Wikipedia)

Some new airports have destinations added in cases where the old IATA code was replaced.

In [16]:
# (IATA code, Wikipedia article name) pairs for airports that the bulk scrape
# missed or that replace an airport from the original list. Kept as data so the
# whole batch runs from a single loop instead of one copy-pasted cell per airport.
replacement_airports = [
    ("LDU", "Lahad_Datu_Airport"),
    ("CGY", "Laguindingan_Airport"),
    # new Saratov airport - replacing RTW, removed RTW entries
    ("GSV", "Saratov_Gagarin_Airport"),
    # new Berlin airport - replacing TXL, SXF
    ("BER", "Berlin_Brandenburg_Airport"),
    ("UTH", "Udon_Thani_International_Airport"),
    ("VAS", "Sivas_Airport"),
    ("TER", "Lajes_Airport"),
    ("SYZ", "Shiraz_Shahid_Dastgheib_International_Airport"),
    ("SNO", "Sakon_Nakhon_Airport"),
    ("TTE", "Sultan_Babullah_Airport"),
    ("NST", "Nakhon_Si_Thammarat_Airport"),
    ("SOC", "Adisoemarmo_Airport"),
    # new Rajkot airport, replacing RAJ
    ("HSR", "Rajkot_International_Airport"),
    # replacing TAG, old TAG entries in routes removed
    ("TAG", "Bohol–Panglao_International_Airport"),
    # new Samarinda, Indonesia airport, replacing SRI, old entries removed
    ("AAP", "Aji_Pangeran_Tumenggung_Pranoto_International_Airport"),
    ("DIN", "Dien_Bien_Airport"),
    ("UBJ", "Yamaguchi_Ube_Airport"),
    ("KUV", "Gunsan_Airport"),
    # Article name normalised to underscores (was written with a space) so the
    # starting_wiki_name column matches the format of every other entry.
    ("HMA", "Khanty-Mansiysk_Airport"),
    ("WGA", "Wagga_Wagga_Airport"),
    ("GRV", "Kadyrov_Grozny_International_Airport"),
    ("TAO", "Qingdao_Jiaodong_International_Airport"),
    ("HRB", "Harbin_Taiping_International_Airport"),
    ("YNT", "Yantai_Penglai_International_Airport"),
    ("ZAZ", "Zaragoza_Airport"),
    ("THS", "Sukhothai_Airport"),
    # new Murcia airport, replacing MJV
    ("RMU", "Región_de_Murcia_International_Airport"),
    ("MSJ", "Misawa_Airport"),
    ("ISG", "New_Ishigaki_Airport"),
    ("TIM", "Mozes_Kilangin_Airport"),
    ("UBP", "Ubon_Ratchathani_Airport"),
    ("HFE", "Hefei_Xinqiao_International_Airport"),
    ("MLX", "Malatya_Erhaç_Airport"),
    ("REU", "Reus_Airport"),
    ("YKS", "Platon_Oyunsky_Yakutsk_International_Airport"),
    # new Istanbul airport
    ("IST", "Istanbul_Airport"),
    ("BMV", "Buon_Ma_Thuot_Airport"),
    ("ROV", "Platov_International_Airport"),
    ("TRZ", "Tiruchirappalli_International_Airport"),
    ("RBR", "Rio_Branco_International_Airport"),
    ("KOP", "Nakhon_Phanom_Airport"),
    ("JOG", "Adisutjipto_Airport"),
    ("UUS", "Yuzhno-Sakhalinsk_Airport"),
    ("NSN", "Nelson_Airport_(New_Zealand)"),
    ("NUX", "Novy_Urengoy_Airport"),
    # NOTE(review): INC looks like the IATA code of Yinchuan Hedong International
    # Airport, while Incheon's code is ICN - confirm this pairing is intended.
    ("INC", "Incheon_International_Airport"),
    ("PMW", "Palmas_Airport"),
]

# `obt` stays the working name for the current (code, article) pair, as in the
# original one-airport-per-cell version.
for obt in replacement_airports:
    try:
        get_destinations(obt[0], obt[1], file_append_path)
    except Exception as exc:
        # Report and continue, in the same format as the bulk scrape loops.
        print(f"failed airport:{obt[0]} ({type(exc).__name__}: {exc})")

### checking missing airports again, verifying none have further passengers

In [64]:
# Re-read the routes file after the manual additions and list the airports that
# still have no scraped routes.
routes = pd.read_csv("./data/current_routes.csv")
unique_airports_in_routes = {*routes["iata_source"].unique()}
# Airports from the reference list that are still absent from the routes file.
missing = set(codes_list).difference(unique_airports_in_routes)
for m in missing:
    print("missing airports", m)

missing airports SXF
missing airports PRH
missing airports SRI
missing airports MJV
missing airports TXL
missing airports COT
missing airports MRQ
missing airports ORL
missing airports RTW
missing airports NAY
missing airports ULY
missing airports GET
missing airports PLU
missing airports RAJ
