In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re 
import pandas as pd
import numpy as np
import math
# Suppress just SettingWithCopyWarning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.ChainedAssignmentError)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # Disable the warning
import time


# Part 1: Get the most recent Wikipedia revision IDs for airports before 2020 and before 2022

This will help us analyze COVID recovery route trends. Additionally, in the process we create a detailed dataset of current routes. We first check whether any of the airport article names are redirects.

In [None]:
def redirectCheck(wiki_name):
    """Resolve a Wikipedia redirect for ``wiki_name``.

    Downloads the raw wikitext of the page. When the page is a short
    ``#REDIRECT [[Target]]`` stub, the target title is returned with spaces
    replaced by underscores; in every other case the input is returned as is.

    Parameters
    ----------
    wiki_name : str
        Page title as used in the ``title=`` URL parameter.

    Returns
    -------
    str
        The redirect target, or ``wiki_name`` when no usable redirect is found.
    """
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    text = requests.get(url).text

    # Pages longer than 10 lines are treated as real articles, not redirect stubs.
    if len(text.splitlines()) > 10:
        return wiki_name

    # Only the text in front of the first link may carry the "#redirect" marker.
    prefix, link_open, remainder = text.partition("[[")
    prefix = prefix.lower()
    if "redirect" not in prefix or "#" not in prefix:
        return wiki_name
    if not link_open:
        # Marker present but no [[link]] follows: nothing to resolve
        # (previously this raised IndexError).
        return wiki_name

    target = remainder.split("]]")[0].split("[[")[0]
    # Drop a "|label" or "#section" suffix so only the page title remains.
    target = target.split("|")[0].split("#")[0].strip()
    if not target:
        return wiki_name
    target = target.replace(" ", "_")  # titles use underscores in URLs
    print("redirect found:", target)
    return target

In [None]:
# Smoke test: the cell output shows the title returned by redirectCheck for this page.
redirectCheck("Beijing_Capital_International_Airport")

In [None]:
# Two more spot checks. Only the last call's return value is displayed as the cell
# output; the first call is visible only through redirectCheck's own print, if any.
redirectCheck("Malacca_International_Airport")
redirectCheck("Sultan_Muhammad_Salahuddin_Airport")

Running redirects code

In [None]:
# Resolve redirects for every source airport and store the result next to the
# original title in a new "redirects" column.
ref_data = pd.read_csv("./data/current_source_airports.csv", encoding='utf-8')
ref_data["redirects"] = ""  # stays empty when the title does not redirect
for index, wikiname in ref_data["wiki_name"].items():
    print("reparing index:", index)
    val = redirectCheck(wikiname)
    # Record the target only when it differs from the original title.
    if val != wikiname:
        ref_data.at[index, "redirects"] = val
# Persist the enriched airport table for the following steps.
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)
        

In [None]:
#function to find version code before a certain formated date 

def get_oldid_before(title, date):
    """Return the id of the newest revision of ``title`` at or before ``date``.

    Queries the MediaWiki ``revisions`` API for a single revision, walking
    backwards in time from ``date`` (``rvstart`` with ``rvdir="older"``).

    Parameters
    ----------
    title : str
        Page title passed as the ``titles`` query parameter.
    date : str
        Timestamp passed as ``rvstart``, e.g. ``"2020-01-01T00:00:00Z"``.

    Returns
    -------
    str
        The revision id as a string, or ``""`` when the response contains no
        revision for the page.
    """
    query = dict(
        action="query",
        format="json",
        prop="revisions",
        titles=title,
        rvlimit=1,
        rvstart=date,
        rvdir="older",
        rvprop="ids",
        formatversion=2,
    )
    payload = requests.get("https://en.wikipedia.org/w/api.php", params=query).json()

    try:
        newest = payload['query']['pages'][0]['revisions'][0]
        return str(newest['revid'])
    except (KeyError, IndexError):
        # No page entry or no revision list in the response.
        return ""

# Example usage (named so that the builtin `id` is not shadowed)
example_oldid = get_oldid_before("John_F._Kennedy_International_Airport", "2020-01-01T00:00:00Z")
print(example_oldid)

In [None]:
# Reload the airport table written by the redirect step (includes the "redirects" column)
# and preview its first row.
ref_data = pd.read_csv("./data/current_source_airports_details.csv", encoding='utf-8')
ref_data.head(n=1)

checking to see if it is possible to check for nulls

In [None]:
# Sanity check: pd.isna() on the first row's "redirects" value tells us whether
# a missing redirect can be detected this way after the CSV round trip.
test1 = ref_data.iloc[0]["redirects"]
print(pd.isna(test1))

iterate to find old ids.


In [None]:
# For every airport, look up the last revision id before 2020 and before 2022.
ref_data["pre2020_ids"] = None
ref_data["pre2022_ids"] = None
pre2020_ids = []
pre2022_ids = []
for index, row in ref_data.iterrows():
    print(index)
    redirect = row["redirects"]
    if pd.isna(redirect):
        wiki_name = row["wiki_name"]
    else:
        # A recorded redirect takes precedence over the original title.
        wiki_name = redirect
        print("redirect used:", wiki_name)
    pre2020_ids.append(get_oldid_before(wiki_name, "2020-01-01T00:00:00Z"))
    pre2022_ids.append(get_oldid_before(wiki_name, "2022-01-01T00:00:00Z"))
ref_data["pre2020_ids"] = pre2020_ids
ref_data["pre2022_ids"] = pre2022_ids

In [None]:
# Preview the first row, now including the two revision-id columns.
ref_data.head(n=1)

In [None]:
# Overwrite the details file so it also carries the revision-id columns.
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)

Checking that both new columns contain no missing (None) values

In [None]:
# Count missing and empty-string revision ids in both id columns (four printed numbers:
# nulls for 2020, nulls for 2022, then empty strings for 2020 and for 2022).
ref_data = pd.read_csv("./data/current_source_airports_details.csv")
id_columns = ["pre2020_ids", "pre2022_ids"]
for column in id_columns:
    print(ref_data[column].isnull().sum())

for column in id_columns:
    print((ref_data[column] == "").sum())


Checks passed; Part 1 is done. Keep in mind that during route generation, if an entry has a redirect, we use the redirect target instead of the original name.

# Part 2: Route generation for 2020

Additionally, this part improves the current_source_airports data and adds manually repaired entries for exceptions.

In [None]:
def get_destinations(iata_source, article_id, wiki_name, path_write):
    """Scrape one revision of an airport article and append its routes to a CSV.

    The page is fetched by title and ``oldid``, the first ``h2`` heading whose
    text contains "destination" is located, and the first ``wikitable`` after
    it is read. For every table row after the header, each direct ``<a>`` child
    of the second cell becomes one output row
    ``iata_source, wiki_name, destination title, airline, isSeasonal``.
    Links that follow a bold ``Seasonal:`` marker get ``isSeasonal = 1``.

    Parameters
    ----------
    iata_source : str
        IATA code written as the first column of every row.
    article_id : int | float | str
        Revision id used as ``oldid``.
    wiki_name : str
        Article title used in the URL and written as the second column.
    path_write : str
        CSV file that rows are appended to.

    Returns
    -------
    None
        Nothing is written when the heading text contains "former".

    Raises
    ------
    ValueError
        If the revision id is missing, or no destination heading / wikitable
        can be found on the page.
    """
    if pd.isna(article_id) or article_id == "":
        # Without a revision id the request would not be pinned to the wanted date.
        raise ValueError(f"no revision id available for {wiki_name}")
    if isinstance(article_id, float):
        # Ids read from a CSV column containing blanks arrive as floats;
        # render 123.0 as 123 so the oldid parameter stays valid.
        article_id = int(article_id)

    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&oldid={article_id}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Case-insensitive match on the section heading.
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no destination heading found for {wiki_name}")
    if "former" in heading.get_text().lower():
        # A "former destinations" heading indicates the airport is no longer in service.
        return

    table = heading.find_next("table", class_="wikitable")
    if table is None:
        raise ValueError(f"no destination table found for {wiki_name}")

    # Parse everything first so a failure never leaves a half-written airport.
    records = []
    for row in table.find_all("tr")[1:]:  # skip the header row
        cols = row.find_all(["td", "th"])
        if len(cols) < 2:
            continue  # no destinations cell in this row
        airline = cols[0].get_text(strip=True)
        isSeasonal = 0  # seasonal links come last, after a bold "Seasonal:" marker
        for child in cols[1].children:
            if child.name == "a":
                dest_title = child.get("title")
                if dest_title:  # anchors without a title are not article links
                    records.append(
                        [iata_source, wiki_name, dest_title.replace(" ", "_"), airline, isSeasonal]
                    )
            elif child.name == "b" and child.text == "Seasonal:":
                isSeasonal = 1

    # QUOTE_ALL reproduces the all-fields-quoted layout and escapes embedded quotes.
    with open(path_write, "a", encoding="utf-8") as file:
        csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n").writerows(records)

Get the reference airport list used for both years (from the detailed airport list created earlier)

In [None]:
# Load the airport table (titles, redirects, revision ids), print its row count
# and preview the first row.
data = pd.read_csv("./data/current_source_airports_details.csv")
print(len(data))
data.head(n=1)

checking if it is possible to check for nulls (to know when to use the redirect version)

In [None]:
# Confirm that pd.isna() can be applied to a "redirects" value (first row used as a sample).
test1 = data.iloc[0]["redirects"]
print(pd.isna(test1))

Start files to start route data

In [None]:
file_append_path = "./data/pre2020_routes.csv"
# Start a fresh routes file that contains only the column names.
with open(file_append_path, "w") as f:
    f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n")

Iteration

In [None]:

# Scrape the pre-2020 revision of every airport and append its routes to the CSV.
for index, row in data.iterrows():
    print("airport index:", index)
    # Prefer the redirect target when one was recorded for this airport.
    redirect = row["redirects"]
    wikiname = row["wiki_name"] if pd.isna(redirect) else redirect
    revision_id = row["pre2020_ids"]
    code = row["IATA"]
    try:
        get_destinations(code, revision_id, wikiname, file_append_path)
    except Exception as exc:
        # Best effort: report and continue. `Exception` (not a bare except)
        # keeps Ctrl-C able to stop this long-running loop.
        print(f"failed airport:{wikiname}")
        print("  reason:", repr(exc))


In [None]:
# Round-trip the scraped routes through pandas as UTF-8.
# NOTE: this rebinds `data`, which held the airport reference table until now.
data = pd.read_csv(file_append_path, encoding='utf-8') #for windows
data.to_csv(file_append_path, encoding='utf-8', index=False)

### locating the missing airports

In [None]:
# IATA codes of every airport we tried to scrape; used below to find the ones with no routes.
ref_data = pd.read_csv("./data/current_source_airports_details.csv") #get current airports 
all_airports = ref_data["IATA"]
print("number of total airports:", len(all_airports))

In [None]:

# Source airports that actually produced at least one route row.
scraped_routes = pd.read_csv(file_append_path, encoding='utf-8') #for windows
curr_airports = scraped_routes["iata_source"].unique()
print("number of airports currently in routes data:", len(curr_airports))

# Everything expected but absent from the routes file is considered missing.
missing_airports = set(all_airports).difference(curr_airports)
print("number of missing airports:", len(missing_airports))

In [None]:
# Display the IATA codes that have no route rows yet.
missing_airports

Most of these airports either did not exist back then or had no usable Wikipedia article at the time, with the exceptions of:

ITM and TAO. These are all relatively small airports. We define a function that manually adds entries when a valid Wikipedia article is found, by passing the exact link of the old article revision.

In [None]:
def add_routes_exact(iata_source, link, path_write, match="destination"):
    """Append routes scraped from an exact article URL to a CSV.

    The page at ``link`` is fetched, the first text node matching ``match``
    (case-insensitive regular expression) is located, and the first table
    after it is read. For every table row after the header, each direct
    ``<a>`` child of the second cell becomes one output row
    ``iata_source, wiki_name, destination title, airline, isSeasonal``.
    Links that follow a bold ``Seasonal:`` marker get ``isSeasonal = 1``.

    Parameters
    ----------
    iata_source : str
        IATA code written as the first column of every row.
    link : str
        Full article URL; must contain a ``title=`` query parameter, whose
        value is written as the second column.
    path_write : str
        CSV file that rows are appended to.
    match : str, optional
        Regular expression used to find the text that precedes the table.

    Returns
    -------
    None
        Nothing is written when the matched text contains "former".

    Raises
    ------
    ValueError
        If ``link`` has no ``title=`` parameter, or the matching text / a
        following table cannot be found.
    """
    if "title=" not in link:
        raise ValueError(f"link has no title= parameter: {link}")
    # The article name is the value of the title= parameter.
    wiki_name = link.split("title=")[1].split("&")[0]

    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Case-insensitive match on any text node of the page.
    heading = soup.find(string=re.compile(r"({})".format(match), re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no text matching {match!r} found in {link}")
    if "former" in heading.get_text().lower():
        # "former" in the matched text indicates the airport is no longer in service.
        return

    table = heading.find_next("table")
    if table is None:
        raise ValueError(f"no table found after {match!r} in {link}")

    # Parse everything first so a failure never leaves a half-written airport.
    records = []
    for row in table.find_all("tr")[1:]:  # skip the header row
        cols = row.find_all(["td", "th"])
        if len(cols) < 2:
            continue  # no destinations cell in this row
        airline = cols[0].get_text(strip=True)
        isSeasonal = 0  # seasonal links come last, after a bold "Seasonal:" marker
        for child in cols[1].children:
            if child.name == "a":
                dest_title = child.get("title")
                if dest_title:  # anchors without a title are not article links
                    records.append(
                        [iata_source, wiki_name, dest_title.replace(" ", "_"), airline, isSeasonal]
                    )
            elif child.name == "b" and child.text == "Seasonal:":
                isSeasonal = 1

    # QUOTE_ALL reproduces the all-fields-quoted layout and escapes embedded quotes.
    with open(path_write, "a", encoding="utf-8") as file:
        csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n").writerows(records)

In [None]:
# Repair for FRS: scrape a pinned revision, locating the table via the text "airlines".
found_link = "https://en.wikipedia.org/w/index.php?title=Mundo_Maya_International_Airport&oldid=933223161"
add_routes_exact("FRS",link=found_link,path_write=file_append_path, match="airlines")

In [None]:
# Repair for HIN: scrape a pinned revision, locating the table via the text "airlines".
# NOTE(review): this oldid is far larger than the other ids pinned in this section
# (e.g. 933223161 for FRS) -- confirm the revision really predates 2020.
found_link = "https://en.wikipedia.org/w/index.php?title=Sacheon_Airport&oldid=1265903527"
add_routes_exact("HIN",link=found_link,path_write=file_append_path, match="airlines")

In [None]:
#HSR did not exist back then

In [None]:
# Repair for ITM: the table is located by matching a sentence of the article text
# instead of a heading word.
found_link = "https://en.wikipedia.org/w/index.php?title=Itami_Airport&oldid=929306656"
add_routes_exact("ITM",link=found_link,path_write=file_append_path, match=" terminal is planned to be extensively renovated by August 2020 to include a new pier for additional aircraft,")

In [None]:
# Repair for ROT: scrape a pinned revision, locating the table via the text "destinations".
# NOTE(review): this oldid is larger than the FRS one (933223161) -- confirm it predates 2020.
found_link = "https://en.wikipedia.org/w/index.php?title=Rotorua_Airport&oldid=996409149#Airlines_and_destinations"
add_routes_exact("ROT",link=found_link,path_write=file_append_path, match="destinations")

In [None]:
# Repair for TAO: the current code is filled from the article of the old airport,
# which had a different name and was at a different location.
# NOTE(review): this oldid is larger than the FRS one (933223161) -- confirm it predates 2020.
found_link =  "https://en.wikipedia.org/w/index.php?title=Qingdao_Liuting_International_Airport&oldid=992291152"
add_routes_exact("TAO",link=found_link,path_write=file_append_path, match="destinations")

# Part 3: Route generation for 2022

In [None]:
def get_destinations(iata_source, article_id, wiki_name, path_write):
    """Scrape one revision of an airport article and append its routes to a CSV.

    The page is fetched by title and ``oldid``, the first ``h2`` heading whose
    text contains "destination" is located, and the first ``wikitable`` after
    it is read. For every table row after the header, each direct ``<a>`` child
    of the second cell becomes one output row
    ``iata_source, wiki_name, destination title, airline, isSeasonal``.
    Links that follow a bold ``Seasonal:`` marker get ``isSeasonal = 1``.

    Parameters
    ----------
    iata_source : str
        IATA code written as the first column of every row.
    article_id : int | float | str
        Revision id used as ``oldid``.
    wiki_name : str
        Article title used in the URL and written as the second column.
    path_write : str
        CSV file that rows are appended to.

    Returns
    -------
    None
        Nothing is written when the heading text contains "former".

    Raises
    ------
    ValueError
        If the revision id is missing, or no destination heading / wikitable
        can be found on the page.
    """
    if pd.isna(article_id) or article_id == "":
        # Without a revision id the request would not be pinned to the wanted date.
        raise ValueError(f"no revision id available for {wiki_name}")
    if isinstance(article_id, float):
        # Ids read from a CSV column containing blanks arrive as floats;
        # render 123.0 as 123 so the oldid parameter stays valid.
        article_id = int(article_id)

    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&oldid={article_id}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Case-insensitive match on the section heading.
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    if heading is None:
        raise ValueError(f"no destination heading found for {wiki_name}")
    if "former" in heading.get_text().lower():
        # A "former destinations" heading indicates the airport is no longer in service.
        return

    table = heading.find_next("table", class_="wikitable")
    if table is None:
        raise ValueError(f"no destination table found for {wiki_name}")

    # Parse everything first so a failure never leaves a half-written airport.
    records = []
    for row in table.find_all("tr")[1:]:  # skip the header row
        cols = row.find_all(["td", "th"])
        if len(cols) < 2:
            continue  # no destinations cell in this row
        airline = cols[0].get_text(strip=True)
        isSeasonal = 0  # seasonal links come last, after a bold "Seasonal:" marker
        for child in cols[1].children:
            if child.name == "a":
                dest_title = child.get("title")
                if dest_title:  # anchors without a title are not article links
                    records.append(
                        [iata_source, wiki_name, dest_title.replace(" ", "_"), airline, isSeasonal]
                    )
            elif child.name == "b" and child.text == "Seasonal:":
                isSeasonal = 1

    # QUOTE_ALL reproduces the all-fields-quoted layout and escapes embedded quotes.
    with open(path_write, "a", encoding="utf-8") as file:
        csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n").writerows(records)

In [None]:
file_append_path = "./data/pre2022_routes.csv"
# Start a fresh routes file that contains only the column names.
with open(file_append_path, "w") as f:
    f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n")

Getting reference data

In [None]:
# Reload the airport table (titles, redirects, revision ids), print its row count
# and preview the first row.
data = pd.read_csv("./data/current_source_airports_details.csv")
print(len(data))
data.head(n=1)

Looping through to try to add data

In [None]:
# Scrape the pre-2022 revision of every airport and append its routes to the CSV.
for index, row in data.iterrows():
    print("airport index:", index)
    # Prefer the redirect target when one was recorded for this airport.
    redirect = row["redirects"]
    wikiname = row["wiki_name"] if pd.isna(redirect) else redirect
    revision_id = row["pre2022_ids"]
    code = row["IATA"]
    try:
        get_destinations(code, revision_id, wikiname, file_append_path)
    except Exception as exc:
        # Best effort: report and continue. `Exception` (not a bare except)
        # keeps Ctrl-C able to stop this long-running loop.
        print(f"failed airport:{wikiname}")
        print("  reason:", repr(exc))

Find missing airports, if any

In [None]:
# IATA codes of every airport we tried to scrape; used below to find the ones with no routes.
ref_data = pd.read_csv("./data/current_source_airports_details.csv") #get current airports 
all_airports = ref_data["IATA"]
print("number of total airports:", len(all_airports))

In [None]:

# Source airports that actually produced at least one route row.
scraped_routes = pd.read_csv(file_append_path, encoding='utf-8') #for windows
curr_airports = scraped_routes["iata_source"].unique()
print("number of airports currently in routes data:", len(curr_airports))

# Everything expected but absent from the routes file is considered missing.
missing_airports = set(all_airports).difference(curr_airports)
print("number of missing airports:", len(missing_airports))

In [None]:
# Display the IATA codes that have no route rows yet.
missing_airports

Add the missing airport information using a similar repair function, but one that is more exact: it starts directly at the specific table used for destinations.

In [None]:
def jump_to_table(iata_source, link, path_write, match="destination"):
    """Append routes from the first ``wikitable`` of an article URL to a CSV.

    The page at ``link`` is fetched and its first table with class
    ``wikitable`` is read. For every table row after the header, each direct
    ``<a>`` child of the second cell becomes one output row
    ``iata_source, wiki_name, destination title, airline, isSeasonal``.
    Links that follow a bold ``Seasonal:`` marker get ``isSeasonal = 1``.

    Parameters
    ----------
    iata_source : str
        IATA code written as the first column of every row.
    link : str
        Full article URL; must contain a ``title=`` query parameter, whose
        value is written as the second column.
    path_write : str
        CSV file that rows are appended to.
    match : str, optional
        Not used by this function; accepted so existing calls keep working.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``link`` has no ``title=`` parameter or the page has no wikitable.
    """
    if "title=" not in link:
        raise ValueError(f"link has no title= parameter: {link}")
    # The article name is the value of the title= parameter.
    wiki_name = link.split("title=")[1].split("&")[0]

    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', class_='wikitable')
    if table is None:
        raise ValueError(f"no wikitable found in {link}")

    # Parse everything first so a failure never leaves a half-written airport.
    records = []
    for row in table.find_all("tr")[1:]:  # skip the header row
        cols = row.find_all(["td", "th"])
        if len(cols) < 2:
            continue  # no destinations cell in this row
        airline = cols[0].get_text(strip=True)
        isSeasonal = 0  # seasonal links come last, after a bold "Seasonal:" marker
        for child in cols[1].children:
            if child.name == "a":
                dest_title = child.get("title")
                if dest_title:  # anchors without a title are not article links
                    records.append(
                        [iata_source, wiki_name, dest_title.replace(" ", "_"), airline, isSeasonal]
                    )
            elif child.name == "b" and child.text == "Seasonal:":
                isSeasonal = 1

    # QUOTE_ALL reproduces the all-fields-quoted layout and escapes embedded quotes.
    with open(path_write, "a", encoding="utf-8") as file:
        csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n").writerows(records)

In [None]:
# Repair for CJJ: read the first wikitable of a pinned revision.
# (jump_to_table does not use the `match` argument.)
found_link = "https://en.wikipedia.org/w/index.php?title=Cheongju_International_Airport&oldid=1061569779"
jump_to_table("CJJ",link=found_link,path_write=file_append_path, match="destinations")

In [None]:
# Repair for GAU: read the first wikitable of a pinned revision.
found_link = "https://en.wikipedia.org/w/index.php?title=Lokpriya_Gopinath_Bordoloi_International_Airport&oldid=1059628744"
jump_to_table("GAU",link=found_link,path_write=file_append_path, match="destinations")

In [None]:
#HSR did not exist then


In [None]:
# Repair for KUV: read the first wikitable of the pinned Gunsan Airport revision below.

found_link = "https://en.wikipedia.org/w/index.php?title=Gunsan_Airport&oldid=1062262046"
jump_to_table("KUV",link=found_link,path_write=file_append_path, match="destinations")

In [None]:
# Repair for KWJ: read the first wikitable of the pinned Gwangju Airport revision below.

found_link = "https://en.wikipedia.org/w/index.php?title=Gwangju_Airport&oldid=1061570946"
jump_to_table("KWJ",link=found_link,path_write=file_append_path, match="destinations")

In [None]:
# repairing KWJ -- no listed destination on those pages 

# Part 4: Fixing formatting and adding an IATA destination column to the two datasets

## Working on the pre2020 dataset

In [None]:
# Load the pre-2020 routes through the `path` variable (it was previously
# assigned but never used) and preview the first row.
path = "./data/pre2020_routes.csv"
routes_data = pd.read_csv(path)
routes_data.head(n=1)

#### Fixing airline names that carry bracketed [] citation markers by removing the markers

In [None]:
# Keep only the text before the first "[" (drops trailing citation markers).
# The vectorised string accessor leaves missing airline values as missing
# instead of raising on them.
routes_data["airline"] = routes_data["airline"].str.split("[", regex=False).str[0]
routes_data.head()

#### attempting to get iata code using our existing database


iterating 

In [None]:
airports = pd.read_csv("./data/current_served_airports.csv", on_bad_lines="skip")
# First IATA code listed for each wiki_name (same choice as taking the first matching row).
iata_by_wiki_name = airports.drop_duplicates(subset="wiki_name").set_index("wiki_name")["IATA"]
# One vectorised lookup replaces the per-row filter and the chained assignment;
# destinations without a match end up missing (null) in iata_dest.
routes_data["iata_dest"] = routes_data["dest_wikipedia_name"].map(iata_by_wiki_name)

In [None]:
# Persist the routes including the new iata_dest column.
routes_data.to_csv("./data/pre2020_routes.csv", index=False)


In [None]:
# Reload from disk and preview the first row.
routes_data = pd.read_csv("./data/pre2020_routes.csv")
routes_data.head(n=1)

Counting the number of destinations without iata information

In [None]:
# Rows whose destination could not be mapped to an IATA code, and how many there are.
none_dest_Data = routes_data[routes_data["iata_dest"].isnull()]
print(len(none_dest_Data))

In [None]:
# Display the unmatched rows.
none_dest_Data

This is far too many missing iata codes

In [None]:
#get the set of wikipedia
missing_wikinames = none_dest_Data["dest_wikipedia_name"].unique()
print(len(missing_wikinames))

### create a new database of all airports (including past airports) from the current_served airports.csv data

Use a set of functions (inspired by Part 2) to get coordinate and IATA data from dest_wikipedia_name. We add the results to the current served airports database.

In [None]:
def redirectCheck(wiki_name):
    """Resolve a Wikipedia redirect for ``wiki_name``.

    Downloads the raw wikitext of the page. When the page is a short
    ``#REDIRECT [[Target]]`` stub, the target title is returned with spaces
    replaced by underscores; in every other case the input is returned as is.

    Parameters
    ----------
    wiki_name : str
        Page title as used in the ``title=`` URL parameter.

    Returns
    -------
    str
        The redirect target, or ``wiki_name`` when no usable redirect is found.
    """
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    text = requests.get(url).text

    # Pages longer than 10 lines are treated as real articles, not redirect stubs.
    if len(text.splitlines()) > 10:
        return wiki_name

    # Only the text in front of the first link may carry the "#redirect" marker.
    prefix, link_open, remainder = text.partition("[[")
    prefix = prefix.lower()
    if "redirect" not in prefix or "#" not in prefix:
        return wiki_name
    if not link_open:
        # Marker present but no [[link]] follows: nothing to resolve
        # (previously this raised IndexError).
        return wiki_name

    target = remainder.split("]]")[0].split("[[")[0]
    # Drop a "|label" or "#section" suffix so only the page title remains.
    target = target.split("|")[0].split("#")[0].strip()
    if not target:
        return wiki_name
    target = target.replace(" ", "_")  # titles use underscores in URLs
    print("redirect found:", target)
    return target

def getRow(text, key, _depth=0):
    """Extract the value of the first ``key ... = value`` line in wikitext.

    The value is the rest of the line after the first ``=`` that follows
    ``key``, cut at the first ``<`` (drops ``<ref>`` tags) and stripped.
    When ``key`` is absent and ``text`` is a redirect stub, the redirect
    target is downloaded and searched instead.

    Parameters
    ----------
    text : str
        Raw wikitext to search.
    key : str
        Field name; it is interpolated into a regular expression.
    _depth : int, optional
        Internal counter bounding how many redirects are followed.

    Returns
    -------
    str
        The extracted value, or ``""`` when nothing is found.
    """
    try:
        found = re.search(rf"{key}.*?=", text)
    except re.error:
        found = None  # unusable pattern: treat as "not found"

    if found is not None:
        start = found.end()  # value begins right after the "="
        end = text.find("\n", start)
        if end == -1:
            # Value sits on the last line: keep it whole instead of
            # slicing off its final character.
            end = len(text)
        return text[start:end].split("<")[0].strip()  # drop <ref> tags

    # Key not present: follow a redirect stub, if this is one.
    prefix, link_open, remainder = text.partition("[[")
    if link_open and "redirect" in prefix.lower() and _depth < 5:
        redirect = remainder.split("]]")[0]
        redirect = redirect.split("|")[0].split("#")[0].strip().replace(" ", "_")
        if redirect:
            # Fetch the redirect target itself (the URL used to be built from
            # an undefined `wiki_name`).
            url = f"https://en.wikipedia.org/w/index.php?title={redirect}&action=raw"
            return getRow(requests.get(url).text, key, _depth + 1)
    return ""  # nothing found

#function to convert DMS coordinates on wiki to decimal ones
def dms_to_decimal(degrees, minutes, seconds, direction):
    """Convert a degrees/minutes/seconds coordinate to decimal degrees.

    ``degrees``, ``minutes`` and ``seconds`` are passed through ``float``;
    ``direction`` is a compass letter, where ``S`` and ``W`` (any case) make
    the result negative. The decimal value is returned as a string.
    """
    magnitude = float(degrees) + float(minutes) / 60 + float(seconds) / 3600
    sign = -1 if direction.upper() in ("S", "W") else 1
    return str(magnitude * sign)

def getIataFromWikiName(wiki_name):
    """Read the IATA field from the raw wikitext of an article.

    Downloads the page and returns the rest of the line after the first
    ``IATA ... =``, cut at the first ``<`` (drops ``<ref>`` tags) and stripped.

    Parameters
    ----------
    wiki_name : str
        Page title as used in the ``title=`` URL parameter.

    Returns
    -------
    str
        The extracted value, or ``""`` when the page has no such field.
    """
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    text = requests.get(url).text

    found = re.search(r'IATA.*?=', text)
    if found is None:
        return ""

    start = found.end()  # value begins right after the "="
    end = text.find("\n", start)
    if end == -1:
        # Value sits on the last line: keep it whole instead of slicing off
        # its final character.
        end = len(text)
    return text[start:end].split("<")[0].strip()  # drop <ref> tags

def getDetailsFromWikiName(wiki_name):
    """Collect IATA code, city, country and coordinates for an airport article.

    The title is first passed through ``redirectCheck``. City and coordinates
    are read from the article's wikitext; the country is read from the
    ``subdivision_name`` field of the city's own article. Extraction is best
    effort: when a step fails, the fields gathered so far are returned and
    the remaining ones stay empty strings.

    Parameters
    ----------
    wiki_name : str
        Page title as used in the ``title=`` URL parameter.

    Returns
    -------
    dict
        Keys ``IATA``, ``wiki_name`` (after redirect resolution), ``city``,
        ``country``, ``latitude`` and ``longitude``.
    """
    # Work with the redirect target when the title is a redirect.
    wiki_name = redirectCheck(wiki_name)

    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    text = requests.get(url).text

    # Every field defaults to an empty string.
    city = ""
    country = ""
    lat = ""
    long = ""
    iata = getIataFromWikiName(wiki_name)
    try:
        # City: take the text between [[ and ]] when it is a link.
        city = getRow(text, "city-served")
        if "[[" in city:
            city = city.split("[[")[1]
        city = city.split("]]")[0]

        # Coordinates: split the {{...}} template on "|"; positions 1-4 are
        # used for the latitude and 5-8 for the longitude.
        coor = getRow(text, "coordinates")
        coor = coor.split("}}")[0]
        coor = coor.split("{{")[1]
        coor = coor.split("|")
        lat = dms_to_decimal(coor[1], coor[2], coor[3], coor[4])
        long = dms_to_decimal(coor[5], coor[6], coor[7], coor[8])

        # Country: read from the city's own article.
        url2 = f"https://en.wikipedia.org/w/index.php?title={city}&action=raw"
        text2 = requests.get(url2).text
        country = getRow(text2, "subdivision_name")
        # Unwrap [[...]] or {{...}}, depending on the enclosing symbol.
        if "[" in country:
            country = country.split("[[")[1]
            country = country.split("]]")[0]
        elif "{" in country:
            country = country.split("{{")[1]
            country = country.split("}}")[0]
        if "|" in country:
            # Keep the last "|"-separated part.
            country = country.split("|")[-1]
    except Exception:
        # Best effort: keep whatever was filled in before the failure.
        # `Exception` (not a bare except) lets Ctrl-C interrupt long runs.
        pass
    return {"IATA": iata, "wiki_name": wiki_name, "city":city, "country":country, "latitude":lat, "longitude":long}
        
        



Checking repair on missing data (we really only care about IATA)

In [None]:
# Spot check on one airport; the returned dict is shown as the cell output.
getDetailsFromWikiName("Tajima_Airport")

iterating to create

In [None]:
# Start from the currently served airports; new rows are appended to this table below.
# NOTE: this rebinds `ref_data`, which earlier held the source-airport details table.
ref_data = pd.read_csv("./data/current_served_airports.csv")
save_path = "./data/all_airports.csv"#define the save path
print(len(ref_data))
ref_data.head(n=1)

creating a function to run in chunks

In [None]:
def fill_missing(missing_wikinames, ref_data, start, end):
    """Look up details for one slice of article names and append them to ``ref_data``.

    For every index in ``range(start, end)`` the details of
    ``missing_wikinames[i]`` are fetched with ``getDetailsFromWikiName``. The
    new rows are appended to ``ref_data`` and the combined table is written
    to the module-level ``save_path``.

    Parameters
    ----------
    missing_wikinames : sequence of str
        Article names to look up, indexable by position.
    ref_data : pandas.DataFrame
        Table the new rows are appended to.
    start, end : int
        Half-open index range of the slice to process.

    Returns
    -------
    pandas.DataFrame
        ``ref_data`` with the new rows appended and a fresh index.
    """
    new_rows = []
    for i in range(start, end):
        wiki_name = missing_wikinames[i]
        print("current count repaired:",i, "with name:", wiki_name)
        # Each result is already a dict keyed by column name.
        new_rows.append(getDetailsFromWikiName(wiki_name))

    ref_data = pd.concat([ref_data, pd.DataFrame(new_rows)], ignore_index=True)
    # Save after every chunk (relies on the global `save_path`).
    ref_data.to_csv(save_path, index=False)
    return ref_data

Run in chunks of 20

In [None]:

# Process the missing names 20 at a time, pausing 1.2 s between chunks.
total_missing = len(missing_wikinames)
for chunk_start in range(0, total_missing, 20):
    chunk_end = min(chunk_start + 20, total_missing)
    print("partition:", chunk_start, chunk_end)
    ref_data = fill_missing(missing_wikinames, ref_data, chunk_start, chunk_end)
    time.sleep(1.2)

save new airport data

In [None]:
# Row count after appending the looked-up airports, then a final save of the full table.
print(len(ref_data))
ref_data.to_csv("./data/all_airports.csv", index =False)

Note that this new dataset has a few IATA codes that map to different wiki names (due to redirects or new airports with the same IATA code).

running again on the new dataset to get destination iata codes

In [None]:
routes_data = pd.read_csv("./data/pre2020_routes.csv")

airports = pd.read_csv("./data/all_airports.csv", on_bad_lines="skip")
# First IATA code listed for each wiki_name (same choice as taking the first matching row).
iata_by_wiki_name = airports.drop_duplicates(subset="wiki_name").set_index("wiki_name")["IATA"]
# One vectorised lookup replaces the per-row filter and the chained assignment;
# destinations without a match end up missing (null) in iata_dest.
routes_data["iata_dest"] = routes_data["dest_wikipedia_name"].map(iata_by_wiki_name)

routes_data.to_csv("./data/pre2020_routes.csv", index =False)

Count missing iata codes again

In [None]:
# Rows still lacking a destination IATA code after the second pass: count and first ten.
none_dest_Data = routes_data[routes_data["iata_dest"].isnull()]
print(len(none_dest_Data))
none_dest_Data.head(n=10)

In [None]:
# Spot check of the detail lookup on a single article name.
getDetailsFromWikiName("Orlando_Melbourne_International_Airport")

## Working on the pre2022 dataset

In [None]:
routes_data = pd.read_csv("./data/pre2022_routes.csv")

airports = pd.read_csv("./data/all_airports.csv", on_bad_lines="skip")
# First IATA code listed for each wiki_name (same choice as taking the first matching row).
iata_by_wiki_name = airports.drop_duplicates(subset="wiki_name").set_index("wiki_name")["IATA"]
# One vectorised lookup replaces the per-row filter and the chained assignment;
# destinations without a match end up missing (null) in iata_dest.
routes_data["iata_dest"] = routes_data["dest_wikipedia_name"].map(iata_by_wiki_name)

routes_data.to_csv("./data/pre2022_routes.csv", index =False)

checking iata codes

In [None]:
# Rows of the pre-2022 routes lacking a destination IATA code: count and first ten.
none_dest_Data = routes_data[routes_data["iata_dest"].isnull()]
print(len(none_dest_Data))
none_dest_Data.head(n=10)