In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re 
import pandas as pd

# Suppress just SettingWithCopyWarning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.ChainedAssignmentError)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # Disable the warning


# Part 1: Get most recent wikipedia urls ids for airports before 2020, and 2022

This will help us analyze covid recovery route trends.

In [2]:
#function to find version code before a certain formated date 

def get_oldid_before(title, date):
    """Get the revision ID (oldid) of the latest version before a given date."""
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "rvlimit": 1,
        "rvstart": date,
        "rvdir": "older",
        "rvprop": "ids",
        "formatversion": 2
    }

    response = requests.get(api_url, params=params)
    data = response.json()

    try:
        return str(data['query']['pages'][0]['revisions'][0]['revid'])
    except (KeyError, IndexError):
        return ""

# Example usage
id = get_oldid_before("John_F._Kennedy_International_Airport", "2020-01-01T00:00:00Z")
print(id)

933044375


In [3]:
#use the current routes airports source data
source_link = "./data/current_source_airports.csv"
ref_data = pd.read_csv(source_link, encoding='utf-8')
ref_data.head(n=1)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427


iterate to find old ids.


In [4]:
ref_data["pre2020_ids"] = None
ref_data["pre2022_ids"] = None
pre2020_ids = []
pre2022_ids = []
i = 0
for wiki_name in ref_data["wiki_name"]:
    print(i)
    id1 = get_oldid_before(wiki_name, "2020-01-01T00:00:00Z")
    pre2020_ids.append(id1)
    id2 = get_oldid_before(wiki_name, "2022-01-01T00:00:00Z")
    pre2022_ids.append(id2)
    i += 1
ref_data["pre2020_ids"] = pre2020_ids
ref_data["pre2022_ids"] = pre2022_ids

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [5]:
ref_data.head(n=1)

Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,pre2020_ids,pre2022_ids
0,ATL,Hartsfield–Jackson_Atlanta_International_Airport,Atlanta,Usa,33.64,-84.427,932935279,1063034925


In [6]:
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)

checking both new columns have all non - None values

In [7]:
print( len(ref_data[ref_data["pre2020_ids"].isnull()])  )

46


analyzing further, creating a new function to try to extract values, by considering redirects

In [18]:

def redirectCheck(wiki_name):
    url = f"https://en.wikipedia.org/w/index.php?title={wiki_name}&action=raw"
    response = requests.get(url)
    #find the text between
    text = response.text

    #check for redirect
    check_text = text.split("[[")[0] #get section between [[
    check_text = check_text.lower()
    if ("redirect" in check_text):
        #get the text in between [[]]
        redirect = text.split("]]")[0]
        redirect = redirect.split("[[")[1]
        redirect = redirect.replace(" ", "_") #replace spaces
        print("redirect found:", redirect)
        return redirect
    else:
        return wiki_name #return the same name back


In [23]:
subset =ref_data[ref_data["pre2020_ids"].isnull() | ref_data["pre2022_ids"].isnull()] 
print(len(subset))
subset.head(n=46)

46


Unnamed: 0,IATA,wiki_name,city,country,latitude,longitude,pre2020_ids,pre2022_ids
23,MAD,Adolfo_Suárez_Madrid–Barajas_Airport,Madrid,Spain,40.472,-3.561,,
110,URC,Ürümqi_Diwopu_International_Airport,Urumqi,China,43.907,87.474,,
188,MHD,Mashhad_International_Airport,Mashhad,Iran,36.235,59.640833,,
200,LIN,Linate_Airport,Milan,Italy,45.445,9.277,,
217,FAO,Gago_Coutinho_Airport,Faro,Acores,37.014,-7.966,,
238,HER,Heraklion_International_Airport,Heraklion,Greece,35.339,25.18,,
249,WUX,Sunan_Shuofang_International_Airport,{{ubl|class=nowrap,,31.494444,120.429444,,
251,SOF,Sofia_Airport,Sofia,Bulgaria,42.695,23.406,,
274,PMO,Falcone_Borsellino_Airport,Palermo,Italy,38.176,13.091,,
280,XNN,Xining_Caojiabao_International_Airport,Xining,China,36.5275,102.042778,,


### Attempting to fill in any missing data, adding redirect column
Testing new modified function, looping through missing values and modifying to try to add values where the id is not valid, add redirect information to table. If none, etc.

In [24]:
val = redirectCheck("Malacca_International_Airport")
get_oldid_before(val, "2020-01-01T00:00:00Z")
#, "2020-01-01T00:00:00Z"

redirect found: Malacca_Airport


'929947665'

In [29]:
#runnign missing values
missing_iata = set(subset["IATA"])#isolate missing iata codes
print(len(missing_iata))
ref_data["redirects"] = "" #add columns for directs 
for index, row in ref_data.iterrows():
    iata = row["IATA"]
    wikiname = row["wiki_name"]
    if iata in missing_iata:
        print("reparing index:", index)
        val = redirectCheck(wikiname)
        ref_data.at[index, "pre2020_ids"] = get_oldid_before(val, "2020-01-01T00:00:00Z") #modify the column
        ref_data.at[index, "pre2022_ids"] = get_oldid_before(val, "2022-01-01T00:00:00Z")
        #add to redirects just in case, the original wikiname is different
        if val != wikiname:
            ref_data.at[index, "redirects"] = val
        
        


46
reparing index: 23
redirect found: Madrid–Barajas_Airport
reparing index: 110
redirect found: Ürümqi_Tianshan_International_Airport
reparing index: 188
redirect found: Mashhad_Shahid_Hasheminejad_International_Airport
reparing index: 200
redirect found: Milan_Linate_Airport
reparing index: 217
redirect found: Faro_Airport
reparing index: 238
redirect found: Heraklion_International_Airport_"Nikos_Kazantzakis"
reparing index: 249
redirect found: Wuxi_Shuofang_Airport
reparing index: 251
redirect found: Vasil_Levski_Sofia_Airport
reparing index: 274
redirect found: Palermo_Airport
reparing index: 280
redirect found: Xining_Caojiapu_International_Airport
reparing index: 281
redirect found: Xining_Caojiapu_International_Airport
reparing index: 311
redirect found: Trondheim_Airport
reparing index: 318
redirect found: Chaudhary_Charan_Singh_International_Airport
reparing index: 339
redirect found: Lhasa_Gonggar_International_Airport
reparing index: 350
redirect found: Supadio_International

write again to data 

In [30]:
ref_data.to_csv("./data/current_source_airports_details.csv", encoding='utf-8', index=False)

check for rows where either is none or is empty string

In [35]:
subset =ref_data[(ref_data["pre2020_ids"]=="") | (ref_data["pre2022_ids"]=="")] 
print(len(subset))
subset =ref_data[ref_data["pre2020_ids"].isnull() | ref_data["pre2022_ids"].isnull()] 
print(len(subset))

0
0


checks passed, done with part 1. Keep in mind that during routes generation, if an entry has an redirect, we use that 