In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re 
import pandas as pd
import numpy as np
import math
# Suppress just SettingWithCopyWarning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.ChainedAssignmentError)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # Disable the warning


# Part 1: Get most recent wikipedia urls ids for airports before 2020, and 2022

This will help us analyze covid recovery route trends. Additionally, in the process we create a detaiiled dataset of current routes. We first check for redirects in the names

In [None]:
def redirectCheck(wiki_name):
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(url)
    #find the text between
    text = response.text

    #check for redirect
    check_text = text.split("[[")[0] #get section between [[
    check_text = check_text.lower()
    #if the text is clearly too long (>10 lines, there is clearly no redirect)
    lines = text.splitlines()
    number_of_lines = len(lines)
    print(number_of_lines)
    if number_of_lines > 10:
        return wiki_name

    if ("redirect" in check_text):
        #get the text in between [[]]
        redirect = text.split("]]")[0]
        #if the text does not contain a key #, we reject
        if "#" not in check_text:
            return wiki_name
        redirect = redirect.split("[[")[1]
        redirect = redirect.replace(" ", "_") #replace spaces
        print("redirect found:", redirect)
        return redirect
    else:
        return wiki_name #return the same name back

In [7]:
redirectCheck("Beijing_Capital_International_Airport")

604


'Beijing_Capital_International_Airport'

In [3]:
redirectCheck("Malacca_International_Airport")
redirectCheck("Sultan_Muhammad_Salahuddin_Airport")

redirect found: Malacca_Airport
redirect found: Sultan_Muhammad_Salahudin_Airport


'Sultan_Muhammad_Salahudin_Airport'

Running redirects code

In [4]:
ref_data = pd.read_csv("./data/current_source_airports.csv", encoding='utf-8')
ref_data["redirects"] = "" #add columns for directs 
for index, row in ref_data.iterrows():
    iata = row["IATA"]
    wikiname = row["wiki_name"]
    print("reparing index:", index)
    val = redirectCheck(wikiname)
    #add to redirects just in case, the original wikiname is different
    if val != wikiname:
        ref_data.at[index, "redirects"] = val
#write to the new csv about the details of the airport
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)
        

reparing index: 0
reparing index: 1
reparing index: 2
reparing index: 3
reparing index: 4
reparing index: 5
reparing index: 6
reparing index: 7
reparing index: 8
reparing index: 9
reparing index: 10
reparing index: 11
reparing index: 12
reparing index: 13
reparing index: 14
reparing index: 15
reparing index: 16
reparing index: 17
reparing index: 18
reparing index: 19
reparing index: 20
reparing index: 21
reparing index: 22
reparing index: 23
redirect found: Madrid–Barajas_Airport
reparing index: 24
reparing index: 25
reparing index: 26
reparing index: 27
reparing index: 28
reparing index: 29
reparing index: 30
reparing index: 31
reparing index: 32
reparing index: 33
reparing index: 34
reparing index: 35
redirect found: Kunming
reparing index: 36
reparing index: 37
reparing index: 38
reparing index: 39
reparing index: 40
reparing index: 41
reparing index: 42
reparing index: 43
reparing index: 44
reparing index: 45
reparing index: 46
reparing index: 47
reparing index: 48
reparing index: 

KeyboardInterrupt: 

In [None]:
#function to find version code before a certain formated date 

def get_oldid_before(title, date):
    """Get the revision ID (oldid) of the latest version before a given date."""
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "rvlimit": 1,
        "rvstart": date,
        "rvdir": "older",
        "rvprop": "ids",
        "formatversion": 2
    }

    response = requests.get(api_url, params=params)
    data = response.json()

    try:
        return str(data['query']['pages'][0]['revisions'][0]['revid'])
    except (KeyError, IndexError):
        return ""

# Example usage
id = get_oldid_before("John_F._Kennedy_International_Airport", "2020-01-01T00:00:00Z")
print(id)

In [None]:
#use the current routes airports source data
ref_data = pd.read_csv("./data/current_source_airports.csv", encoding='utf-8')
ref_data.head(n=1)

iterate to find old ids.


In [None]:
ref_data["pre2020_ids"] = None
ref_data["pre2022_ids"] = None
pre2020_ids = []
pre2022_ids = []
i = 0
for wiki_name in ref_data["wiki_name"]:
    print(i)
    id1 = get_oldid_before(wiki_name, "2020-01-01T00:00:00Z")
    pre2020_ids.append(id1)
    id2 = get_oldid_before(wiki_name, "2022-01-01T00:00:00Z")
    pre2022_ids.append(id2)
    i += 1
ref_data["pre2020_ids"] = pre2020_ids
ref_data["pre2022_ids"] = pre2022_ids

In [None]:
ref_data.head(n=1)

In [None]:
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)

checking both new columns have all non - None values

In [14]:
ref_data = pd.read_csv("./data/current_source_airports_details.csv")
print( len(ref_data[ref_data["pre2020_ids"].isnull()])  )
print( len(ref_data[ref_data["pre2022_ids"].isnull()])  )


0
0


isolating data needing to be repaired

In [15]:
subset =ref_data[ref_data["pre2020_ids"].isnull() | ref_data["pre2022_ids"].isnull()] 
print(len(subset))
subset.head(n=46)

0


Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,pre2020_ids,pre2022_ids,redirects


### repairing redirect data

analyzing further, creating a new function to try to extract values, by considering redirects

In [9]:

def redirectCheck(wiki_name):
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(url)
    #find the text between
    text = response.text

    #check for redirect
    check_text = text.split("[[")[0] #get section between [[
    check_text = check_text.lower()
    if ("redirect" in check_text):
        #get the text in between [[]]
        redirect = text.split("]]")[0]
        redirect = redirect.split("[[")[1]
        redirect = redirect.replace(" ", "_") #replace spaces
        print("redirect found:", redirect)
        return redirect
    else:
        return wiki_name #return the same name back


##### Attempting to fill in any missing data, adding redirect column
Testing new modified function, looping through missing values and modifying to try to add values where the id is not valid, add redirect information to table. If none, etc.

In [12]:
redirectCheck("Malacca_International_Airport")
redirectCheck("Sultan_Muhammad_Salahuddin_Airport")
#, "2020-01-01T00:00:00Z"

redirect found: Malacca_Airport
redirect found: Sultan_Muhammad_Salahudin_Airport


'Sultan_Muhammad_Salahudin_Airport'

In [None]:
#running missing values
missing_iata = set(subset["IATA"])#isolate missing iata codes
print(len(missing_iata))
ref_data["redirects"] = "" #add columns for directs 
for index, row in ref_data.iterrows():
    iata = row["IATA"]
    wikiname = row["wiki_name"]
    if iata in missing_iata:
        print("reparing index:", index)
        val = redirectCheck(wikiname)
        ref_data.at[index, "pre2020_ids"] = get_oldid_before(val, "2020-01-01T00:00:00Z") #modify the column
        ref_data.at[index, "pre2022_ids"] = get_oldid_before(val, "2022-01-01T00:00:00Z")
        #add to redirects just in case, the original wikiname is different
        if val != wikiname:
            ref_data.at[index, "redirects"] = val
#write again to data
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)
        


reparing index: 0
redirect found: Metro_Atlanta


NameError: name 'get_oldid_before' is not defined

check for rows where either is none or is empty string

In [None]:
ref_data = pd.read_csv("./data/current_source_airports_details.csv", encoding="utf-8")
subset =ref_data[(ref_data["pre2020_ids"]=="") | (ref_data["pre2022_ids"]=="")] 
print(len(subset))
subset =ref_data[ref_data["pre2020_ids"].isnull() | ref_data["pre2022_ids"].isnull()] 
print(len(subset))

checks passed, done with part 1. Keep in mind that during routes generation, if an entry has an redirect, we use that 

# Part 2: Route generation for 2020, 

Additionally, improvements to current_source_airports data, new modified data for exceptions

In [2]:
def get_destinations(iata_source, article_id, wiki_name, path_write):
    file = open(path_write, "a") #file to append to
    
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&oldid={article_id}"
    response = requests.get(url)
    
    soup = BeautifulSoup(response.text, 'html.parser')
    #find the related destination table
    # Case-insensitive string match
    heading = soup.find("h2", string=re.compile(r"destination", re.IGNORECASE))
    #check text in heading
    heading_text =  heading.get_text()
    if  "former" in heading_text or "Former" in heading_text: #if either text is found, abort the function. This indicate the airport is no longer in service
        file.close() #close
        return
        
    
    table = heading.find_next("table") 
    while ('wikitable' not in table.get("class")): #find the next table matching a predictable class, if one has not been found
        table = table.find_next("table") 
    rows = table.find_all("tr")

    
    for i in range(1,len(rows)): #exclude the first row
        row = rows[i]
        # Extract all cells (td or th)
        cols = row.find_all(["td", "th"])
        # Write the row text content to CSV
        #first column is the airline
        airline = cols[0].get_text(strip=True)
        #get the list of destinations in the 2nd  
        destinations = cols[1]
        isSeasonal = 0 #iterate over subcomponents (seasonal always comes last, so set is seasonal to be false for now)
        for child in destinations.children: 
            #anchor components are the only destinations
            if (child.name == "a"):
                dest_name = child.get('title') #the title is the official wikipedia airport name (without _ in place of spaces)
                dest_name = dest_name.replace(" ", "_") 
                output = f"\"{iata_source}\",\"{wiki_name}\",\"{dest_name}\",\"{airline}\",\"{isSeasonal}\"\n" #final output to append to the file
                file.write(output)#write file
            elif ((child.name == "b") and (child.text == "Seasonal:")):
                isSeasonal = 1 #get seasonal to be 1 for future destinations
    file.close() #close
    return

get reference, airport lists for both years (from the detailed airport list earlier)

In [3]:
data = pd.read_csv("./data/current_source_airports_details.csv")
print(len(data))
data.head(n=1)

980


Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,pre2020_ids,pre2022_ids,redirects
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427,932935279,1063034925,


checking if it is possible to check for nulls (to know when to use the redirect version)

In [4]:
test1 = data.iloc[0]["redirects"]
print(pd.isna(test1))

True


Start files to start route data

In [5]:
file_append_path = "./data/pre2020_routes.csv"
f = open(file_append_path, "w")
f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n") 
f.close() #add column names

Iteration

In [6]:

for index, row in data.iterrows():
    print("airport index:", index)
    wikiname = row["wiki_name"]
    #check for redirects 
    redirect = row["redirects"]
    if (pd.isna(redirect) == False): 
        wikiname = redirect
    #get the id for the year
    id = row["pre2020_ids"]
    code = row["IATA"]
    try:
        get_destinations(code,id,wikiname,file_append_path)
    except:
        print(f"failed airport:{wikiname}")


airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49
airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
ai

In [None]:
data = pd.read_csv(file_append_path, encoding='utf-8') #for windows
data.to_csv(file_append_path, encoding='utf-8', index=False)

### Another redirect repair: Running code to add redirects to all columns

From before, there are a few ~30 failed airports because redirect information is missing from airports that need it to redirect to the main wikipedia page.

In [16]:
def redirectCheck(wiki_name):
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(url)
    #find the text between
    text = response.text

    #check for redirect
    check_text = text.split("[[")[0] #get section between [[
    check_text = check_text.lower()
    if ("redirect" in check_text):
        #get the text in between [[]]
        redirect = text.split("]]")[0]
        redirect = redirect.split("[[")[1]
        redirect = redirect.replace(" ", "_") #replace spaces
        print("redirect found:", redirect)
        return redirect
    else:
        return wiki_name #return the same name back


In [19]:
#check 

redirectCheck("Humberto_Delgado_Airport")

redirect found: Lisbon_Airport


'Lisbon_Airport'

### locating the exact missing values

In [25]:
ref_data = pd.read_csv("./data/current_source_airports_details.csv") #get current airports 
all_airports = ref_data["IATA"]
print("number of total airports:", len(all_airports))

number of total airports: 980


In [39]:
#find missing current airports
curr_airports = pd.read_csv(file_append_path, encoding='utf-8') #for windows
curr_airports = curr_airports["iata_source"].unique()
print("number of airports currently in routes data:", len(curr_airports))

#find missing airports by set difference
missing_airports = set(all_airports) - set(curr_airports)
print("number of missing airports:", len(missing_airports))

number of airports currently in routes data: 944
number of missing airports: 36


In [27]:
#running missing values
ref_data["redirects"] = "" #add columns for directs 
for index, row in ref_data.iterrows():
    iata = row["IATA"]
    wikiname = row["wiki_name"]
    if iata in missing_airports:
        val = redirectCheck(wikiname)
        #add to redirects just in case, the original wikiname is different
        if val != wikiname:
            print("reparing index:", index,"and iata name:", iata)
            ref_data.at[index, "redirects"] = val
#write again to data
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)
    

redirect found: Lisbon_Airport
reparing index: 74 and iata name: LIS
redirect found: Itami,_Hyogo
reparing index: 146 and iata name: ITM
redirect found: Sultan_Aji_Muhammad_Sulaiman_Sepinggan_Airport
reparing index: 235 and iata name: BPN
redirect found: Ahmedabad_Airport
reparing index: 239 and iata name: AMD
redirect found: Fortaleza_Airport
reparing index: 268 and iata name: FOR
redirect found: Sochi_International_Airport
reparing index: 276 and iata name: AER
redirect found: Simferopol_International_Airport
reparing index: 291 and iata name: SIP
redirect found: Leeds_Bradford_Airport
reparing index: 329 and iata name: LBA
redirect found: Xishuangbanna_Gasa_International_Airport
reparing index: 332 and iata name: JHG
redirect found: Félix-Houphouët-Boigny_International_Airport
reparing index: 431 and iata name: ABJ
redirect found: Bagdogra_Airport
reparing index: 436 and iata name: IXB
redirect found: Ahmed_Ben_Bella_Airport
reparing index: 455 and iata name: ORN
redirect found: Chi

Attempting to run the routes operation on all the rows with the missing iata codes

In [36]:
#getting new ref data
ref_data = pd.read_csv("./data/current_source_airports_details.csv") #get current airports 
all_airports = ref_data["IATA"]

In [37]:
data = ref_data
for index, row in data.iterrows():
    print("airport index:", index)
    wikiname = row["wiki_name"]
    #check for redirects 
    redirect = row["redirects"]
    if (pd.isna(redirect) == False): 
        wikiname = redirect
    #get the id for the year
    id = row["pre2020_ids"]
    code = row["IATA"]
    if code in missing_airports:
        try:
            print("reparing airport with name:", wikiname)
            get_destinations(code,id,wikiname,file_append_path)
        except:
            print(f"failed airport:{wikiname}")

airport index: 0
airport index: 1
airport index: 2
airport index: 3
airport index: 4
airport index: 5
airport index: 6
airport index: 7
airport index: 8
airport index: 9
airport index: 10
airport index: 11
airport index: 12
airport index: 13
airport index: 14
airport index: 15
airport index: 16
airport index: 17
airport index: 18
airport index: 19
airport index: 20
airport index: 21
airport index: 22
airport index: 23
airport index: 24
airport index: 25
airport index: 26
airport index: 27
airport index: 28
airport index: 29
airport index: 30
airport index: 31
airport index: 32
airport index: 33
airport index: 34
airport index: 35
airport index: 36
airport index: 37
airport index: 38
airport index: 39
airport index: 40
airport index: 41
airport index: 42
airport index: 43
airport index: 44
airport index: 45
airport index: 46
airport index: 47
airport index: 48
airport index: 49
airport index: 50
airport index: 51
airport index: 52
airport index: 53
airport index: 54
airport index: 55
ai

In [29]:
data = pd.read_csv(file_append_path, encoding='utf-8') #for windows
data.to_csv(file_append_path, encoding='utf-8', index=False)

Trying the locate the list of missing airports now

In [35]:
#find missing current airports
curr_airports = pd.read_csv(file_append_path, encoding='utf-8') #for windows
curr_airports = curr_airports["iata_source"].unique()
print("number of airports currently in routes data:", len(curr_airports))

#find missing airports by set difference
missing_airports = set(all_airports) - set(curr_airports)
print("number of missing airports:", len(missing_airports))

number of airports currently in routes data: 944
number of missing airports: 36


# part 3: 2022 data 

In [None]:
file_append_path = "./data/pre2022_routes.csv"
f = open(file_append_path, "w")
f.write("iata_source,starting_wiki_name,dest_wikipedia_name,airline,isSeasonal\n") 
f.close() #add column names