In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re 
import pandas as pd

# Suppress just SettingWithCopyWarning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.ChainedAssignmentError)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # Disable the warning


# Part 1: Getting current routes

Now, we get the current routes for each airport based on their wikipedia article based on a series of scripts developed in the first notebook. Save to a current_routes.csv file. 6


defining key functions

In [None]:
def get_destinations(iata_source, wiki_name, path_write):
    file = open(path_write, "a") #file to append to
    
    url = f"https://en.wikipedia.org/wiki/{wiki_name}"
    response = requests.get(url)
    
    soup = BeautifulSoup(response.text, 'html.parser')
    #find the related destination table
    # Case-insensitive string match
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    #check text in heading
    heading_text =  heading.get_text()
    if  "former" in heading_text or "Former" in heading_text: #if either text is found, abort the function. This indicate the airport is no longer in service
        file.close() #close
        return
        
    
    table = heading.find_next("table") 
    while ('wikitable' not in table.get("class")): #find the next table matching a predictable class, if one has not been found
        table = table.find_next("table") 
    rows = table.find_all("tr")

    
    for i in range(1,len(rows)): #exclude the first row
        row = rows[i]
        # Extract all cells (td or th)
        cols = row.find_all(["td", "th"])
        # Write the row text content to CSV
        #first column is the airline
        airline = cols[0].get_text(strip=True)
        #get the list of destinations in the 2nd  
        destinations = cols[1]
        isSeasonal = 0 #iterate over subcomponents (seasonal always comes last, so set is seasonal to be false for now)
        for child in destinations.children: 
            #anchor components are the only destinations
            if (child.name == "a"):
                dest_name = child.get('title') #the title is the official wikipedia airport name (without _ in place of spaces)
                dest_name = dest_name.replace(" ", "_") 
                output = f"\"{iata_source}\",\"{wiki_name}\",\"{dest_name}\",\"{airline}\",\"{isSeasonal}\"\n" #final output to append to the file
                file.write(output)#write file
            elif ((child.name == "b") and (child.text == "Seasonal:")):
                isSeasonal = 1 #get seasonal to be 1 for future destinations
    file.close() #close
    return

## start

Start the file to store modern route data

In [3]:
f = open("./data/current_routes.csv", "w")
f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n") 
f.close() #add column names

get iata code, wikiname lists

In [4]:
data = pd.read_csv("./data/top_airports_basic_data.csv")
print(len(data))
data.head(n=1)
codes_list = data["iata"]
names_list = data["wiki_name"]
file_append_path = "./data/current_routes.csv"

989


iterrate through airports 0 to 50

In [5]:

for i in range(0,50):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")


airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
failed airport:IST
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49


iterrate through airports 50 to 100

In [6]:
for i in range(50,100):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
airport index: 56
airport index: 57
airport index: 58
airport index: 59
airport index: 60
airport index: 61
airport index: 62
failed airport:ORL
airport index: 63
airport index: 64
airport index: 65
airport index: 66
airport index: 67
airport index: 68
airport index: 69
airport index: 70
airport index: 71
airport index: 72
airport index: 73
airport index: 74
airport index: 75
airport index: 76
airport index: 77
airport index: 78
airport index: 79
airport index: 80
airport index: 81
airport index: 82
airport index: 83
airport index: 84
airport index: 85
airport index: 86
airport index: 87
airport index: 88
airport index: 89
airport index: 90
airport index: 91
airport index: 92
airport index: 93
airport index: 94
airport index: 95
airport index: 96
airport index: 97
failed airport:TAO
airport index: 98
airport index: 99


iterate through airports 100 to 150

In [7]:
for i in range(100, 150):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 100
airport index: 101
airport index: 102
airport index: 103
airport index: 104
airport index: 105
airport index: 106
airport index: 107
airport index: 108
airport index: 109
airport index: 110
airport index: 111
airport index: 112
airport index: 113
airport index: 114
airport index: 115
airport index: 116
airport index: 117
airport index: 118
airport index: 119
airport index: 120
airport index: 121
airport index: 122
airport index: 123
airport index: 124
airport index: 125
airport index: 126
airport index: 127
failed airport:HRB
airport index: 128
airport index: 129
airport index: 130
airport index: 131
airport index: 132
airport index: 133
airport index: 134
airport index: 135
airport index: 136
airport index: 137
airport index: 138
airport index: 139
airport index: 140
airport index: 141
airport index: 142
airport index: 143
airport index: 144
airport index: 145
airport index: 146
airport index: 147
airport index: 148
airport index: 149


iterate through airports 150 to 200

In [8]:
for i in range(150,200):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 150
airport index: 151
airport index: 152
airport index: 153
airport index: 154
airport index: 155
airport index: 156
airport index: 157
airport index: 158
airport index: 159
airport index: 160
airport index: 161
airport index: 162
airport index: 163
airport index: 164
airport index: 165
airport index: 166
airport index: 167
airport index: 168
airport index: 169
airport index: 170
airport index: 171
airport index: 172
airport index: 173
airport index: 174
airport index: 175
airport index: 176
airport index: 177
airport index: 178
airport index: 179
airport index: 180
airport index: 181
airport index: 182
airport index: 183
airport index: 184
airport index: 185
airport index: 186
airport index: 187
airport index: 188
airport index: 189
airport index: 190
airport index: 191
airport index: 192
airport index: 193
airport index: 194
airport index: 195
airport index: 196
airport index: 197
airport index: 198
airport index: 199


iterate through airports 200 to 250

In [9]:
for i in range(200,250):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 200
airport index: 201
airport index: 202
airport index: 203
airport index: 204
airport index: 205
airport index: 206
failed airport:HFE
airport index: 207
airport index: 208
airport index: 209
airport index: 210
airport index: 211
airport index: 212
airport index: 213
airport index: 214
airport index: 215
airport index: 216
airport index: 217
airport index: 218
airport index: 219
airport index: 220
airport index: 221
airport index: 222
airport index: 223
airport index: 224
airport index: 225
failed airport:JOG
airport index: 226
failed airport:INC
airport index: 227
airport index: 228
airport index: 229
airport index: 230
airport index: 231
airport index: 232
airport index: 233
airport index: 234
airport index: 235
airport index: 236
airport index: 237
airport index: 238
airport index: 239
airport index: 240
airport index: 241
airport index: 242
airport index: 243
airport index: 244
airport index: 245
airport index: 246
airport index: 247
airport index: 248
airport inde

iterate through airports 250 to 300

In [10]:
for i in range(250,300):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 250
airport index: 251
airport index: 252
airport index: 253
airport index: 254
airport index: 255
airport index: 256
airport index: 257
airport index: 258
airport index: 259
airport index: 260
failed airport:YNT
airport index: 261
airport index: 262
airport index: 263
airport index: 264
airport index: 265
airport index: 266
airport index: 267
airport index: 268
airport index: 269
airport index: 270
airport index: 271
airport index: 272
airport index: 273
airport index: 274
airport index: 275
airport index: 276
airport index: 277
airport index: 278
failed airport:NAY
airport index: 279
airport index: 280
airport index: 281
airport index: 282
airport index: 283
airport index: 284
airport index: 285
airport index: 286
airport index: 287
airport index: 288
airport index: 289
airport index: 290
airport index: 291
airport index: 292
airport index: 293
airport index: 294
airport index: 295
airport index: 296
airport index: 297
airport index: 298
airport index: 299


iterate through airports 300 to 350

In [11]:
for i in range(300,350):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 300
airport index: 301
airport index: 302
airport index: 303
airport index: 304
airport index: 305
airport index: 306
airport index: 307
airport index: 308
airport index: 309
airport index: 310
airport index: 311
airport index: 312
airport index: 313
airport index: 314
airport index: 315
airport index: 316
airport index: 317
airport index: 318
airport index: 319
airport index: 320
airport index: 321
airport index: 322
airport index: 323
airport index: 324
airport index: 325
airport index: 326
airport index: 327
airport index: 328
airport index: 329
airport index: 330
airport index: 331
airport index: 332
airport index: 333
airport index: 334
airport index: 335
airport index: 336
airport index: 337
airport index: 338
airport index: 339
airport index: 340
airport index: 341
airport index: 342
airport index: 343
airport index: 344
airport index: 345
airport index: 346
airport index: 347
airport index: 348
airport index: 349


iterate through the rest of the airports

In [12]:
for i in range(350, len(codes_list)):
    print("airport index:", i)
    code = codes_list[i]
    name = names_list[i]
    try:
        get_destinations(code,name,file_append_path)
    except:
        print(f"failed airport:{code}")

airport index: 350
airport index: 351
airport index: 352
airport index: 353
airport index: 354
airport index: 355
airport index: 356
airport index: 357
airport index: 358
airport index: 359
failed airport:SYZ
airport index: 360
airport index: 361
airport index: 362
airport index: 363
airport index: 364
airport index: 365
airport index: 366
airport index: 367
airport index: 368
airport index: 369
airport index: 370
airport index: 371
airport index: 372
airport index: 373
airport index: 374
airport index: 375
airport index: 376
airport index: 377
airport index: 378
airport index: 379
airport index: 380
airport index: 381
airport index: 382
airport index: 383
airport index: 384
airport index: 385
airport index: 386
airport index: 387
airport index: 388
airport index: 389
airport index: 390
airport index: 391
airport index: 392
airport index: 393
airport index: 394
airport index: 395
airport index: 396
airport index: 397
airport index: 398
airport index: 399
airport index: 400
airport inde

### fixing failed airports and updating airport data

A few airports were failing, due to outdated data, mainly due to some airports in cities like Qingdao closing, along with faulty links from the original query for some smaller airports. The updated wikipedia names were found for airports in cities where the main airport had changed (Berlin Tegel, Istanbul Ataturk, for example), with the following functions ran to add data like below.

A new table of airports based on the routes table here will be created to used be as a reference, with the same information obtained by wikipedia api

We find the missing airports like below:

In [None]:
routes = pd.read_csv("./data/current_routes.csv")
unique_airports_in_routes = set(routes["iata_source"].unique())
#find missing airports
missing = set(codes_list) - unique_airports_in_routes
for m in missing:
    print("missing airports", m)

adding some  missing entries (with the exception of executive/closed airports or those without destinations on wikipedia)

Some new airports have destinations added in the case that the old iata code was replace

In [16]:
get_destinations("LDU","Lahad_Datu_Airport",file_append_path)

In [17]:
get_destinations("CGY","Laguindingan_Airport",file_append_path)

In [18]:
#new Saratov airport -replacing RTW, removed RTW entries
get_destinations("GSV","Saratov_Gagarin_Airport",file_append_path)

In [19]:
#new Berlin airport- replacing TXL, SXF 
get_destinations("BER", "Berlin_Brandenburg_Airport",file_append_path)

In [20]:
get_destinations("UTH", "Udon_Thani_International_Airport",file_append_path)

In [21]:

get_destinations("VAS", "Sivas_Airport",file_append_path)

In [22]:
get_destinations("TER", "Lajes_Airport",file_append_path)

In [23]:

get_destinations("SYZ", "Shiraz_Shahid_Dastgheib_International_Airport",file_append_path)

In [24]:

get_destinations("SNO", "Sakon_Nakhon_Airport",file_append_path)

In [25]:
get_destinations("TTE", "Sultan_Babullah_Airport",file_append_path)

In [26]:

get_destinations("NST","Nakhon_Si_Thammarat_Airport",file_append_path)

In [27]:
obt = ("SOC", "Adisoemarmo_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [28]:
#new Rajkot airport, replacing RAJ
obt = ("HSR", "Rajkot_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [29]:
#replacing TAG, old TAG entries in routes removed
obt = ("TAG", "Bohol–Panglao_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [None]:
# New samarinda,indonesia airport, replacing SRI, old entries revmoed
obt = ("AAP", "Aji_Pangeran_Tumenggung_Pranoto_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [31]:
obt = ("DIN", "Dien_Bien_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [32]:
obt = ("UBJ", "Yamaguchi_Ube_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [33]:
obt = ("KUV", "Gunsan_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [34]:
obt = ("HMA", "Khanty-Mansiysk Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [35]:
obt = ("WGA", "Wagga_Wagga_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [36]:
obt = ("GRV", "Kadyrov_Grozny_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [37]:
obt = ("TAO", "Qingdao_Jiaodong_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [38]:
obt = ("HRB", "Harbin_Taiping_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [39]:
obt = ("YNT", "Yantai_Penglai_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [40]:
obt = ("ZAZ", "Zaragoza_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [41]:
obt = ("THS", "Sukhothai_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [42]:
#new Murcia airport, replacing MJV
obt = ("RMU", "Región_de_Murcia_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [43]:
obt = ("MSJ", "Misawa_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [44]:
obt = ("ISG", "New_Ishigaki_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [45]:
obt = ("TIM", "Mozes_Kilangin_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [46]:
obt = ("UBP", "Ubon_Ratchathani_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [47]:
obt = ("HFE", "Hefei_Xinqiao_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [48]:
obt = ("MLX", "Malatya_Erhaç_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [49]:
obt = ("REU", "Reus_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [50]:
obt = ("YKS", "Platon_Oyunsky_Yakutsk_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [51]:
#new istanbul airport
obt = ("IST", "Istanbul_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [52]:
obt = ("BMV", "Buon_Ma_Thuot_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [53]:
obt = ("ROV", "Platov_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [54]:
obt = ("TRZ", "Tiruchirappalli_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [55]:
obt = ("RBR", "Rio_Branco_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [56]:
obt = ("KOP", "Nakhon_Phanom_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [57]:
obt = ("JOG", "Adisutjipto_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [58]:
obt = ("UUS", "Yuzhno-Sakhalinsk_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [59]:
obt = ("NSN", "Nelson_Airport_(New_Zealand)")
get_destinations(obt[0],obt[1],file_append_path)

In [60]:
obt = ("NUX", "Novy_Urengoy_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [61]:
obt = ("INC", "Incheon_International_Airport")
get_destinations(obt[0],obt[1],file_append_path)

In [62]:
obt = ("PMW", "Palmas_Airport")
get_destinations(obt[0],obt[1],file_append_path)

### checking missing airports again, verifying none have further passengers

In [64]:
routes = pd.read_csv("./data/current_routes.csv")
unique_airports_in_routes = set(routes["iata_source"].unique())
#find missing airports
missing = set(codes_list) - unique_airports_in_routes
for m in missing:
    print("missing airports", m)

missing airports SXF
missing airports PRH
missing airports SRI
missing airports MJV
missing airports TXL
missing airports COT
missing airports MRQ
missing airports ORL
missing airports RTW
missing airports NAY
missing airports ULY
missing airports GET
missing airports PLU
missing airports RAJ


dropping duplicate data

In [66]:
routes= routes.drop_duplicates() #drop rows where all values are the same (should not be any)
print(len(routes))
routes.to_csv("./data/current_routes.csv")

76501


### fixing airline names with [] quotations, removing them

In [None]:
new_airlines_list = []
old_airlines_list = routes["airline"]
for airline in old_airlines_list:
    airline = airline.split("[")[0] #remove quotation
    new_airlines_list.append(airline)
routes["airline"] = new_airlines_list
routes.head()

Unnamed: 0,iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Bajío_International_Airport,Aeroméxico Connect,0
1,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Playa_de_Oro_International_Airport,Aeroméxico Connect,0
2,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Monterrey_International_Airport,Aeroméxico Connect,0
3,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Querétaro_Intercontinental_Airport,Aeroméxico Connect,0
4,ATL,Hartsfield–Jackson_Atlanta_International_Airport,San_Luis_Potosí_International_Airport,Aeroméxico Connect,0


In [70]:
routes.to_csv("./data/current_routes.csv")

### Some basic analysis on our preliminary current routes data

In [71]:
print("number of unique airlines:", routes["airline"].nunique())
print("number of unique destinations:", routes["dest_wikipedia_name"].nunique())

number of unique airlines: 900
number of unique destinations: 4704


# Part 2: Creating a database of current airports served

We now use wikipedia to create a table of detailed information for all the destination airport names in the current routes.csv, which can be done easily since we know the wikipedia names, using the GlobalAirportDatabase data downloaded (some detailed information may be outdated), but we only want a general location and accurate coordinates to within 30mi (to estimate flight paths), which is not too restrictive.

In [2]:
routes = pd.read_csv("./data/current_routes.csv")
destinations_wiki_names = routes["dest_wikipedia_name"].unique()

create the csv file storing this information

In [3]:
f = open("./data/current_served_airports.csv", "w")
f.write("IATA,wiki_name,city,country,latitude,longitude\n") 
f.close() #add column names

link to global airport database, at https://www.partow.net/miscellaneous/airportdatabase/  and guide. Loading in the database so the city 

In [4]:
db_names=["ICAO","IATA","Airport_name", "City", "Country", 
          "lat_deg","lat_min","lat_sec","lat_dir",
          "long_deg","long_min","long_sec","long_dir",
          "Altitude", "lat_dec", "long_dec"]#names given on the website in order
airport_db = pd.read_csv("./data/GlobalAirportDatabase.txt", delimiter=":",names=db_names)
airport_db.head()

Unnamed: 0,ICAO,IATA,Airport_name,City,Country,lat_deg,lat_min,lat_sec,lat_dir,long_deg,long_min,long_sec,long_dir,Altitude,lat_dec,long_dec
0,AYGA,GKA,GOROKA,GOROKA,PAPUA NEW GUINEA,6,4,54,S,145,23,30,E,1610,-6.082,145.392
1,AYLA,LAE,,LAE,PAPUA NEW GUINEA,0,0,0,U,0,0,0,U,0,0.0,0.0
2,AYMD,MAG,MADANG,MADANG,PAPUA NEW GUINEA,5,12,25,S,145,47,19,E,7,-5.207,145.789
3,AYMH,HGU,MOUNT HAGEN,MOUNT HAGEN,PAPUA NEW GUINEA,5,49,34,S,144,17,46,E,1643,-5.826,144.296
4,AYNZ,LAE,NADZAB,NADZAB,PAPUA NEW GUINEA,6,34,11,S,146,43,34,E,73,-6.57,146.726


create a function to find the coordinates, Country, City of a airport given an iata code

In [5]:
def getdetails(iata, airport_db):
    try:
        matching = airport_db[airport_db["IATA"]==iata].iloc[0]
        return {"city":matching["City"], "country":matching["Country"],"latitude":matching["lat_dec"], "longitude":matching["long_dec"]}
    except:
        return {"city":"", "country":"","latitude":"", "longitude":""}
getdetails("JFK", airport_db)

{'city': 'NEW YORK', 'country': 'USA', 'latitude': 40.64, 'longitude': -73.779}

creating getting raw text from wikipedia to get the iata code of any airport


In [6]:
def getIataFromWikiName(wiki_name):
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(url)
    #find the text between
    text = response.text
    
    try:
        match = re.findall(r'IATA.*?=', text)[0] #find the first indstance
        start = text.find(match) #find the starting index, by matching the re pattern iata*=
        start += len(match) #do not include iata
        
        end = text.find("\n", start) #starting from the end, find the starting index
        iata_code = text[start:end]
        iata_code = iata_code.split("<")[0]#get rid of ref tags
        iata_code = iata_code.strip()
        return iata_code
    except:
        #check for redirect
        check_text = text.split("[[")[0] #get section between [[
        check_text = check_text.lower()
        if ("redirect" in check_text):
            #get the text in between [[]]
            redirect = text.split("]]")[0]
            redirect = redirect.split("[[")[1]
            redirect = redirect.replace(" ", "_") #replace spaces
            print("redirect found:", redirect)
            return getIataFromWikiName(redirect)
        return "x"

Testing for redirects and modifying functions to conduct directs (for sample Dallas/Fort Worth International Airport) should map to the same iata code as  Dallas_Fort_Worth_International_Airport (wiki names for the same airport are not necessarily unique)

In [7]:
getIataFromWikiName("Dallas/Fort Worth International Airport")

redirect found: Dallas_Fort_Worth_International_Airport


'DFW'

In [8]:
getIataFromWikiName("Genoa_Airport")

redirect found: Genoa_Cristoforo_Colombo_Airport


'GOA'

In [9]:
getIataFromWikiName("Victoria_Falls_International_Airport")

redirect found: Victoria_Falls_Airport_


'VFA'

attempting to get details for all ~ 4000 airports

In [None]:

i = 0
for dest_wiki in destinations_wiki_names:
    try:
        f = open("./data/current_served_airports.csv", "a")
        print("index is:",i)
        iata_code = getIataFromWikiName(dest_wiki)
        matching = getdetails(iata_code, airport_db) #details object 
        city =  matching["city"]
        country = matching["country"]
        lat = matching["latitude"]
        long = matching["longitude"]
        output = f"\"{iata_code}\",\"{dest_wiki}\",\"{city}\",\"{country}\",\"{lat}\",\"{long}\"\n" #write IATA,wiki_name,country,city,latitude,longitude
        f.write(output)
        f.close()    
    except:
        output = f"\"{iata_code}\",\"{dest_wiki}\",\"\",\"\",\"\",\"\"\n" #write IATA,wiki_name,country,city,latitude,longitude
        f.write(output)
        f.close()
        continue
    i += 1

index is: 0
index is: 1
index is: 2
index is: 3
index is: 4
index is: 5
index is: 6
index is: 7
index is: 8
index is: 9
index is: 10
index is: 11
index is: 12
index is: 13
index is: 14
index is: 15
index is: 16
index is: 17
index is: 18
index is: 19
index is: 20
index is: 21
index is: 22
index is: 23
index is: 24
index is: 25
index is: 26
index is: 27
index is: 28
index is: 29
index is: 30
index is: 31
index is: 32
index is: 33
index is: 34


### checking invalid iata code, iterating through csv

In [None]:
airports = pd.read_csv("./data/current_served_airports.csv", on_bad_lines="skip")
print(len(airports))
airports.head(n=1)

In [None]:
def checkiata(airports):
    for index, row in airports.iterrows():
        iata = row["IATA"]
        name = row["wiki_name"]
        try:
            if len(iata) != 3:
                print(f"faulty iata:{iata}| for name: {name}") 
        except:
            print(f"faulty iata:{iata}| for name: {name}") 
    return

In [None]:
checkiata(airports)

notice that Basel airport has 3 iata codes. This is valid. However, we dropping "nan" value

In [None]:
drop = []
for index, row in airports.iterrows():
    iata = row["IATA"]
    iata = str(iata)
    if iata == "nan":
        drop.append(index)

In [None]:
print(drop)

In [None]:
airports = airports.drop(index=drop)
airports = airports.reset_index(drop=True) #reindex (dropping old index)

try to rerun the updated get Iata program on invalid iata 

In [None]:
for index, row in airports.iterrows():
    iata = str(row["IATA"])
    name = row["wiki_name"]
    try:
        if len(iata) != 3:
            iata_code = getIataFromWikiName(name)
            matching = getdetails(iata_code, airport_db) #details object 
            airports["IATA"][index] = iata_code
            airports["city"][index] = matching["city"]
            airports["country"][index] = matching["country"]
            airports["latitude"][index] = matching["latitude"]
            airports["longitude"][index] = matching["longitude"]
    except:
        continue
    

check iata code:

In [None]:
checkiata(airports)

do a final drop, exclude Basel case, and 2 other special cases found

In [None]:
drop = []
for index, row in airports.iterrows():
    iata = str(row["IATA"])
    name = row["wiki_name"]
    if len(iata) != 3:
        if "BSL, MLH, EAP" != iata and "OIBH" != iata:
            print("dropped iata code:", iata)
            drop.append(index)
        if "IKU" in iata :
            iata_code = "IKU"
            matching = getdetails(iata_code, airport_db) #details object 
            airports["IATA"][index] = iata_code
            airports["city"][index] = matching["city"]
            airports["country"][index] = matching["country"]
            airports["latitude"][index] = matching["latitude"]
            airports["longitude"][index] = matching["longitude"]
        if "''TRT''" in iata :
            iata_code = "TRT"
            matching = getdetails(iata_code, airport_db) #details object 
            airports["IATA"][index] = iata_code
            airports["city"][index] = matching["city"]
            airports["country"][index] = matching["country"]
            airports["latitude"][index] = matching["latitude"]
            airports["longitude"][index] = matching["longitude"]
    


In [None]:
drop

In [None]:
airports = airports.drop(index=drop)
airports = airports.reset_index(drop=True) #reindex (dropping old index)

save data

In [None]:
len(airports)
#
airports.to_csv("./data/current_served_airports.csv")

### now, add iata codes when possible to the current routes database for the destination airports

In [None]:
routes_data = pd.read_csv("./data/current_routes.csv")
airports = pd.read_csv("./data/current_served_airports.csv")

In [None]:
routes_data.head(n=1)

In [None]:
airports.head(n=1)

In [None]:
#add iata_dest_source
routes_data["iata_dest"] = None 
for index, rows in routes_data.iterrows():
    print("current row:", index)
    dest_wiki = rows["dest_wikipedia_name"]
    try:  #try to match to a iata code
        match = airports[airports["wiki_name"]==dest_wiki].iloc[0]
        match = match["IATA"]
        routes_data["iata_dest"][index] = match
    except:
        routes_data["iata_dest"][index] = None

save new data

In [None]:
routes_data.to_csv("./data/current_routes.csv", index=False)

In [None]:
routes_data.head(n=1)

find routes without any valid iata (null)

In [None]:
none_dest_Data = routes_data[routes_data["iata_dest"].isnull()]
print(len(none_dest_Data))

In [None]:
none_dest_Data.head()

Therefore, it turns out the percentage of routes, without a destination IATA is low (282/76500) ~ 0.36 %

# Part 3: Narrowing down to create database of airports in the database listed as origins

### populating data 

We now narrow down to airports where there is an origin flights. In this case, we require all rows to have latitude, longitude for the purpose of distance calculations later.

In [2]:
routes = pd.read_csv("./data/current_routes.csv", encoding='utf-8')
iata_sources = routes["iata_source"].unique()
print(len(iata_sources))
routes.head(n=1)

980


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal,iata_dest
0,0,0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Bajío_International_Airport,Aeroméxico Connect,0,BJX


In [3]:
source_airport_link = "./data/current_source_airports.csv"
data = [
    ["IATA","wiki_name","city","country","latitude","longitude"]
]


load reference data of the current served airports. It must be the case that the served airports is a superset of those origin ones

In [4]:
ref_data =pd.read_csv("./data/current_served_airports.csv", encoding='utf-8')
ref_data.head(n=1)

Unnamed: 0.1,Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude
0,0,BJX,Bajío_International_Airport,DEL BAJIO,MEXICO,20.993,-101.481


Get all details of these ~ 980 airports

In [5]:
i = 0 
for iata_source in iata_sources:
    try:
        print("index is:",i)
        #match based on reference data 
        matching = ref_data[ref_data["IATA"]==iata_source].iloc[0]
        wiki_name =  str(matching["wiki_name"])
        city =  str(matching["city"])
        country = str(matching["country"])
        lat = str(matching["latitude"])
        long = str(matching["longitude"])
        data.append([iata_source, wiki_name, city,country,lat,long])

    except:
        data.append([iata_source, wiki_name, "","","",""])
        continue
    i += 1
with open(source_airport_link, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

index is: 0
index is: 1
index is: 2
index is: 3
index is: 4
index is: 5
index is: 6
index is: 7
index is: 8
index is: 9
index is: 10
index is: 11
index is: 12
index is: 13
index is: 14
index is: 15
index is: 16
index is: 17
index is: 18
index is: 19
index is: 20
index is: 21
index is: 22
index is: 23
index is: 24
index is: 25
index is: 26
index is: 27
index is: 28
index is: 29
index is: 30
index is: 31
index is: 32
index is: 33
index is: 34
index is: 35
index is: 36
index is: 37
index is: 38
index is: 39
index is: 40
index is: 41
index is: 42
index is: 43
index is: 44
index is: 45
index is: 46
index is: 47
index is: 48
index is: 49
index is: 50
index is: 51
index is: 52
index is: 53
index is: 54
index is: 55
index is: 56
index is: 57
index is: 58
index is: 59
index is: 60
index is: 61
index is: 62
index is: 63
index is: 64
index is: 65
index is: 66
index is: 67
index is: 68
index is: 69
index is: 70
index is: 71
index is: 72
index is: 73
index is: 74
index is: 75
index is: 76
index is:

### Repairing data 

Getting rows that are empty for the iata code

In [6]:
source_airport_link = "./data/current_source_airports.csv"
current_airports =pd.read_csv(source_airport_link, encoding='utf-8')
print(len(current_airports))
current_airports.head(n=1)

980


Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,ATLANTA,USA,33.64,-84.427


In [7]:
len(current_airports[current_airports["IATA"].isnull()])

0

Checking rows that are empty for details like latitude

In [8]:
empty_rows = current_airports[current_airports["latitude"].isnull()]
print(len(empty_rows))

196


In [9]:
empty_rows.head(n=196)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude
8,PVG,Shanghai_Pudong_International_Airport,,,,
17,ICN,Incheon_International_Airport,,,,
50,DMK,Don_Mueang_International_Airport,,,,
61,SAW,Sabiha_Gökçen_International_Airport,,,,
63,DME,Moscow_Domodedovo_Airport,,,,
...,...,...,...,...,...,...
954,GRV,Grozny_Airport,,,,
959,RMU,Región_de_Murcia_International_Airport,,,,
969,BMV,Buon_Ma_Thuot_Airport,,,,
977,NUX,Novy_Urengoy_Airport,,,,


Get list of wikipedia names that are empty, convert to set for quick checking

In [10]:
wiki_names = set(empty_rows["wiki_name"])
print(len(wiki_names))

190


Loading function to get city, country, latitude, and longitude from wikipedia

In [11]:
def getRow(text, key): #help function to text a key from = of the first rpws
    try:
        regex = f"{key}"+'.*?='
        match = re.findall(rf'{regex}', text)[0] #find the first indstance
        
        start = text.find(match) #find the starting index, by matching the re pattern iata*=
        start += len(match) #do not include iata
        
        end = text.find("\n", start) #starting from the end, find the starting index
        code = text[start:end]
        code = code.split("<")[0]#get rid of ref tags
        code = code.strip()
        return code
    except:
        #check for redirect
        check_text = text.split("[[")[0] #get section between [[
        check_text = check_text.lower()
        if ("redirect" in check_text):
            #get the text in between [[]]
            redirect = text.split("]]")[0]
            redirect = redirect.split("[[")[1]
            redirect = redirect.replace(" ", "_") #replace spaces
            #get text from redirect
            url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
            response = requests.get(url)
            #find the text between
            text = response.text
            return getRow(text, key)
        return "" #return empty string if nothing is found

#function to convert DMS coordinates on wiki to decimal ones
def dms_to_decimal(degrees, minutes, seconds, direction):
    dd = float(degrees) + float(minutes) / 60 + float(seconds) / 3600
    if direction.upper() in ['S', 'W']:
        dd *= -1
    return str(dd)



def getDetailsFromWikiName(wiki_name):
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(url)
    #find the text between
    text = response.text
    #intialize as empty strings
    city = ""
    country = ""
    lat = ""
    long = ""
    try:
        #process city data
        city = getRow(text,"city-served")#city
        if "[[" in city:
            city = city.split("[[")[1]
        city = city.split("]]")[0]
        # process latitude, longtitude data
        coor = getRow(text,"coordinates")
        coor = coor.split("}}")[0]
        coor = coor.split("{{")[1]
        coor = coor.split("|")
        lat = dms_to_decimal(coor[1],coor[2],coor[3],coor[4]) #convert using function
        long = dms_to_decimal(coor[5],coor[6],coor[7],coor[8])
        #get country data, using wikipedia api using city
        url2 = f"https://en.wikipedia.org/w/index.php?title={city}&action=raw"
        response2 = requests.get(url2)
        #find the text between
        text2 = response2.text
        country = getRow(text2,"subdivision_name")
        #depending on the enclosing symbol
        if "[" in country:
            country = country.split("[[")[1]#get between [[]]
            country = country.split("]]")[0]
        elif "{" in country:
            #get between (())
            country = country.split("{{")[1]#get between [[]]
            country = country.split("}}")[0]
        if "|" in country: #now, check for |
            tlist = country.split("|")
            country = tlist[len(tlist)-1]
        return {"city":city, "country":country, "latitude":lat, "longitude":long}
    except:
        #check for redirect
        check_text = text.split("[[")[0] #get section between [[
        check_text = check_text.lower()
        if ("redirect" in check_text):
            #get the text in between [[]]
            redirect = text.split("]]")[0]
            redirect = redirect.split("[[")[1]
            redirect = redirect.replace(" ", "_") #replace spaces
            return getDetailsFromWikiName(redirect)
        return {"city":city, "country":country, "latitude":lat, "longitude":long}
        
        

Testing

In [12]:
getDetailsFromWikiName("Shanghai_Pudong_International_Airport")

{'city': 'Shanghai',
 'country': 'China',
 'latitude': '31.143333333333334',
 'longitude': '121.80527777777777'}

In [13]:
getDetailsFromWikiName("Incheon_International_Airport")

{'city': 'Seoul Metropolitan Area',
 'country': 'South Korea',
 'latitude': '37.46333333333334',
 'longitude': '126.44'}

In [14]:

data = getDetailsFromWikiName("Mashhad_International_Airport")
print(data)
print(data["city"])

{'city': 'Mashhad', 'country': 'Iran', 'latitude': '36.235', 'longitude': '59.64083333333333'}
Mashhad


Try to fill out missing cells. do this with different index ranges at a time to avoid overloading the wikiapi

In [15]:
def repair(start, end, current_airports):
    f = open("temp.txt", "w", encoding='utf-8')#open a temp progress txt
    for index, row in current_airports.iterrows():
        if index in range(start, end):
            print(f"current index: {index}")
            wiki_name = row["wiki_name"]
            if wiki_name in wiki_names: #if this is in the empty list
                try:
                    #write 
                    f = open("temp.txt", "a", encoding='utf-8')#open a temp progress txt for appending
                    f.write(f"current wiki name: {wiki_name}\n")
                    data = getDetailsFromWikiName(wiki_name)
                    f.write(f"data found for missing rows at index:{index}\n")
                    f.close()
                    current_airports.at[index, "city"] = data["city"]
                    current_airports.at[index, "country"] = data["country"]
                    current_airports.at[index, "latitude"] = float(data["latitude"])
                    current_airports.at[index, "longitude"] = float(data["longitude"])
                except:
                    current_airports.at[index, "city"] = ""
                    current_airports.at[index, "country"] = ""
                    current_airports.at[index, "latitude"] = ""
                    current_airports.at[index, "longitude"] = ""
            else:
                continue
        else:
            continue
    return current_airports #return the modified data
        

In [16]:
#conducting for indices 0-187
current_airports = repair(0, 187, current_airports)

current index: 0
current index: 1
current index: 2
current index: 3
current index: 4
current index: 5
current index: 6
current index: 7
current index: 8
current index: 9
current index: 10
current index: 11
current index: 12
current index: 13
current index: 14
current index: 15
current index: 16
current index: 17
current index: 18
current index: 19
current index: 20
current index: 21
current index: 22
current index: 23
current index: 24
current index: 25
current index: 26
current index: 27
current index: 28
current index: 29
current index: 30
current index: 31
current index: 32
current index: 33
current index: 34
current index: 35
current index: 36
current index: 37
current index: 38
current index: 39
current index: 40
current index: 41
current index: 42
current index: 43
current index: 44
current index: 45
current index: 46
current index: 47
current index: 48
current index: 49
current index: 50
current index: 51
current index: 52
current index: 53
current index: 54
current index: 55
cu

  current_airports.at[index, "latitude"] = ""
  current_airports.at[index, "longitude"] = ""


current index: 71
current index: 72
current index: 73
current index: 74
current index: 75
current index: 76
current index: 77
current index: 78
current index: 79
current index: 80
current index: 81
current index: 82
current index: 83
current index: 84
current index: 85
current index: 86
current index: 87
current index: 88
current index: 89
current index: 90
current index: 91
current index: 92
current index: 93
current index: 94
current index: 95
current index: 96
current index: 97
current index: 98
current index: 99
current index: 100
current index: 101
current index: 102
current index: 103
current index: 104
current index: 105
current index: 106
current index: 107
current index: 108
current index: 109
current index: 110
current index: 111
current index: 112
current index: 113
current index: 114
current index: 115
current index: 116
current index: 117
current index: 118
current index: 119
current index: 120
current index: 121
current index: 122
current index: 123
current index: 124
cur

In [17]:
#conducting for indices 187-300
current_airports = repair(187,300, current_airports)

current index: 187
current index: 188
current index: 189
current index: 190
current index: 191
current index: 192
current index: 193
current index: 194
current index: 195
current index: 196
current index: 197
current index: 198
current index: 199
current index: 200
current index: 201
current index: 202
current index: 203
current index: 204
current index: 205
current index: 206
current index: 207
current index: 208
current index: 209
current index: 210
current index: 211
current index: 212
current index: 213
current index: 214
current index: 215
current index: 216
current index: 217
current index: 218
current index: 219
current index: 220
current index: 221
current index: 222
current index: 223
current index: 224
current index: 225
current index: 226
current index: 227
current index: 228
current index: 229
current index: 230
current index: 231
current index: 232
current index: 233
current index: 234
current index: 235
current index: 236
current index: 237
current index: 238
current inde

In [18]:
#conducting for indices 300-500
current_airports = repair(300,500, current_airports)

current index: 300
current index: 301
current index: 302
current index: 303
current index: 304
current index: 305
current index: 306
current index: 307
current index: 308
current index: 309
current index: 310
current index: 311
current index: 312
current index: 313
current index: 314
current index: 315
current index: 316
current index: 317
current index: 318
current index: 319
current index: 320
current index: 321
current index: 322
current index: 323
current index: 324
current index: 325
current index: 326
current index: 327
current index: 328
current index: 329
current index: 330
current index: 331
current index: 332
current index: 333
current index: 334
current index: 335
current index: 336
current index: 337
current index: 338
current index: 339
current index: 340
current index: 341
current index: 342
current index: 343
current index: 344
current index: 345
current index: 346
current index: 347
current index: 348
current index: 349
current index: 350
current index: 351
current inde

In [19]:
#conducting for indices 500-700
current_airports = repair(500,700, current_airports)

current index: 500
current index: 501
current index: 502
current index: 503
current index: 504
current index: 505
current index: 506
current index: 507
current index: 508
current index: 509
current index: 510
current index: 511
current index: 512
current index: 513
current index: 514
current index: 515
current index: 516
current index: 517
current index: 518
current index: 519
current index: 520
current index: 521
current index: 522
current index: 523
current index: 524
current index: 525
current index: 526
current index: 527
current index: 528
current index: 529
current index: 530
current index: 531
current index: 532
current index: 533
current index: 534
current index: 535
current index: 536
current index: 537
current index: 538
current index: 539
current index: 540
current index: 541
current index: 542
current index: 543
current index: 544
current index: 545
current index: 546
current index: 547
current index: 548
current index: 549
current index: 550
current index: 551
current inde

In [20]:
#conducting for indices 700-
current_airports = repair(700, len(current_airports), current_airports)

current index: 700
current index: 701
current index: 702
current index: 703
current index: 704
current index: 705
current index: 706
current index: 707
current index: 708
current index: 709
current index: 710
current index: 711
current index: 712
current index: 713
current index: 714
current index: 715
current index: 716
current index: 717
current index: 718
current index: 719
current index: 720
current index: 721
current index: 722
current index: 723
current index: 724
current index: 725
current index: 726
current index: 727
current index: 728
current index: 729
current index: 730
current index: 731
current index: 732
current index: 733
current index: 734
current index: 735
current index: 736
current index: 737
current index: 738
current index: 739
current index: 740
current index: 741
current index: 742
current index: 743
current index: 744
current index: 745
current index: 746
current index: 747
current index: 748
current index: 749
current index: 750
current index: 751
current inde

In [21]:
# Count rows where any column is null
for col in current_airports.columns:
    print(f"current column {col}")
    num_rows_with_nulls = current_airports[current_airports[col]== ""]
    print(f"number of empty rows in the {col} column:", len(num_rows_with_nulls))

current column IATA
number of empty rows in the IATA column: 0
current column wiki_name
number of empty rows in the wiki_name column: 0
current column city
number of empty rows in the city column: 20
current column country
number of empty rows in the country column: 110
current column latitude
number of empty rows in the latitude column: 7
current column longitude
number of empty rows in the longitude column: 7


In [22]:
#save .to_csv()
current_airports.to_csv("./data/current_source_airports.csv", index=False, encoding='utf-8') #save without index

In [23]:
current_airports.head(n=10)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,ATLANTA,USA,33.64,-84.427
1,PEK,Beijing_Capital_International_Airport,BEIJING,CHINA,40.08,116.584
2,DXB,Dubai_International_Airport,DUBAI,UNITED ARAB EMIRATES,25.255,55.364
3,LAX,Los_Angeles_International_Airport,LOS ANGELES,USA,33.942,-118.408
4,ORD,O'Hare_International_Airport,CHICAGO,USA,41.979,-87.904
5,LHR,Heathrow_Airport,LONDON,ENGLAND,0.0,0.0
6,HND,Haneda_Airport,TOKYO,JAPAN,35.552,139.779
7,HKG,Hong_Kong_International_Airport,HONG KONG,HONG KONG,22.309,113.914
8,PVG,Shanghai_Pudong_International_Airport,Shanghai,China,31.143333,121.805278
9,CDG,Charles_de_Gaulle_Airport,PARIS,FRANCE,49.013,2.55


### manual repair

Final part: inspecting and manually repairing empty rows, first repairing those without a latitude/longitude, as this is the most important part (needed to calculate distances later for graph algorithm)

In [24]:

current_airports= pd.read_csv("./data/current_source_airports.csv", encoding='utf-8')
current_airports.head(n=7)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,ATLANTA,USA,33.64,-84.427
1,PEK,Beijing_Capital_International_Airport,BEIJING,CHINA,40.08,116.584
2,DXB,Dubai_International_Airport,DUBAI,UNITED ARAB EMIRATES,25.255,55.364
3,LAX,Los_Angeles_International_Airport,LOS ANGELES,USA,33.942,-118.408
4,ORD,O'Hare_International_Airport,CHICAGO,USA,41.979,-87.904
5,LHR,Heathrow_Airport,LONDON,ENGLAND,0.0,0.0
6,HND,Haneda_Airport,TOKYO,JAPAN,35.552,139.779


In [25]:
# Count rows where any column is null
for col in current_airports.columns:
    print(f"current column {col}")
    num_rows_with_nulls = current_airports[current_airports[col].isnull()]
    print(f"number of empty rows in the {col} column:", len(num_rows_with_nulls))

current column IATA
number of empty rows in the IATA column: 0
current column wiki_name
number of empty rows in the wiki_name column: 0
current column city
number of empty rows in the city column: 20
current column country
number of empty rows in the country column: 110
current column latitude
number of empty rows in the latitude column: 7
current column longitude
number of empty rows in the longitude column: 7


In [26]:
rows_with_nulls = current_airports[current_airports["latitude"].isnull()]
print(rows_with_nulls)

    IATA                            wiki_name city country  latitude  \
70   KIX         Kansai_International_Airport  NaN     NaN       NaN   
392  KUF       Kurumoch_International_Airport  NaN     NaN       NaN   
623  GNY                Şanlıurfa_GAP_Airport  NaN     NaN       NaN   
744  MWX           Muan_International_Airport  NaN     NaN       NaN   
790  CSY     Cheboksary_International_Airport  NaN     NaN       NaN   
818  IGT                        Magas_Airport  NaN     NaN       NaN   
947  TAG  Bohol–Panglao_International_Airport  NaN     NaN       NaN   

     longitude  
70         NaN  
392        NaN  
623        NaN  
744        NaN  
790        NaN  
818        NaN  
947        NaN  


In [27]:
current_airports.loc[70] = ["KIX", "Kansai_International_Airport", "Osaka", "Japan",34.4272,135.244]
current_airports.loc[392] = ["KUF", "Kurumoch_International_Airport", "Samara", "Russia",53.501667, 50.155]
current_airports.loc[623] = ["GNY", "Şanlıurfa_GAP_Airport", "Şanlıurfa", "Turkey",37.45, 38.9]

current_airports.loc[744] = ["MWX", "Muan_International_Airport", "Muan", "South Korea",34.991406, 126.382814]
current_airports.loc[790] = ["CSV", "Cheboksary_International_Airport", "Cheboksary", "Russia",56.0903, 47.3472]

current_airports.loc[818] = ["IGT", "Magas_Airport", "Magas", "Russia",43.3193, 45.013]
current_airports.loc[947] = ["TAG", "Bohol–Panglao_International_Airport", "Bohol", "Philippines",9.566667, 123.775]

In [28]:
current_airports.to_csv("./data/current_source_airports.csv", index=False, encoding='utf-8') #save without index

last step : standardize uppercase/lowercase format in the csv for city  and country

In [29]:
for index, row in current_airports.iterrows():
    city = row["city"]
    country = row["country"]
    print("current index", index)
    try: 
        #all lower case by spaces 
        city_list = city.split(" ")
        new_city_name = "" #new city name
        for part in city_list: 
            part = part.lower()
            part = part[0].upper() + part[1:] #string immutable, must be done this way
            new_city_name += part + " "
        #all lower case by spaces 
        country_list = country.split(" ")
        new_country_name = "" #new city name
        for part in country_list: 
            part = part.lower()
            part = part[0].upper() + part[1:]
            new_country_name += part + " "
        #trim the extra space
        new_city_name = new_city_name.strip()
        new_country_name = new_country_name.strip()
        current_airports.at[index, "city"] = new_city_name
        current_airports.at[index, "country"] = new_country_name
    except:
        continue
        

current index 0
current index 1
current index 2
current index 3
current index 4
current index 5
current index 6
current index 7
current index 8
current index 9
current index 10
current index 11
current index 12
current index 13
current index 14
current index 15
current index 16
current index 17
current index 18
current index 19
current index 20
current index 21
current index 22
current index 23
current index 24
current index 25
current index 26
current index 27
current index 28
current index 29
current index 30
current index 31
current index 32
current index 33
current index 34
current index 35
current index 36
current index 37
current index 38
current index 39
current index 40
current index 41
current index 42
current index 43
current index 44
current index 45
current index 46
current index 47
current index 48
current index 49
current index 50
current index 51
current index 52
current index 53
current index 54
current index 55
current index 56
current index 57
current index 58
current

In [30]:
current_airports.to_csv("./data/current_source_airports.csv", index=False, encoding='utf-8') #save without index

After saving and fixing the capitalization of Dallas-Fort Worth, we are done