# Gather passenger traffic data for EU & non-EU airports from the web

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page that lists the busiest airports by passenger traffic.
wikiurl = 'https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic#2019_statistics'
# NOTE(review): table_class is not referenced by any cell visible in this notebook;
# kept in case something else relies on it — TODO confirm and remove.
table_class = "wikitable sortable jquery-tablesorter"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(wikiurl, timeout=30)
print(response.status_code)
# Fail here on a 4xx/5xx reply rather than parsing an error page in later cells.
response.raise_for_status()

200


In [3]:
# Parse the downloaded HTML into a BeautifulSoup tree and select the table to load.
soup = BeautifulSoup(response.text, 'html.parser')
# find_all is the current spelling of the deprecated findAll alias.
wikitables = soup.find_all('table', {'class': "wikitable"})
# Index 1 selects the second "wikitable" on the page; this is position-dependent
# and will pick a different table if the page layout changes.
# NOTE(review): the name `indiatable` is kept because the following cell reads it,
# although the table holds worldwide airport data.
indiatable = wikitables[1]

In [4]:
# Convert the selected HTML table into a DataFrame.
# read_html expects a path, URL or file-like object; passing a literal HTML string
# is deprecated in recent pandas, so the markup is wrapped in StringIO.
# read_html returns a list of DataFrames; take the first one.
df = pd.read_html(StringIO(str(indiatable)))[0]
df.head()

Unnamed: 0,Rank,Airport,Location,Country,Code(IATA/ICAO),Totalpassengers,Rankchange,%change
0,1.0,Hartsfield–Jackson Atlanta International Airport,"Atlanta, Georgia",United States,ATL/KATL,110531300,,2.9%
1,2.0,Beijing Capital International Airport,"Chaoyang-Shunyi, Beijing",China,PEK/ZBAA,100011438,,1.0%
2,3.0,Los Angeles International Airport,"Los Angeles, California",United States,LAX/KLAX,88068013,1.0,0.6%
3,4.0,Dubai International Airport,"Garhoud, Dubai",United Arab Emirates,DXB/OMDB,86396757,1.0,3.1%
4,5.0,Tokyo Haneda Airport,"Ōta, Tokyo",Japan,HND/RJTT,85505054,,1.7%


In [5]:
# Derive the IATA code: the piece of 'Code(IATA/ICAO)' that precedes the first '/'.
iata_icao_parts = df['Code(IATA/ICAO)'].str.split('/')
df['IATA'] = iata_icao_parts.str.get(0)
# Keep only the airport code and its passenger total.
df_final = df.loc[:, ['IATA', 'Totalpassengers']]
df_final.head()

Unnamed: 0,IATA,Totalpassengers
0,ATL,110531300
1,PEK,100011438
2,LAX,88068013
3,DXB,86396757
4,HND,85505054


In [18]:
# Check which airports in the processed passenger data lack a scraped total.
passengersWW = pd.read_csv('../data/processed/passengersWW.csv', index_col=0)
# One row per distinct 'shopped_at' value, stored under the column name 'IATA'.
airportsWW = pd.DataFrame({'IATA': passengersWW['shopped_at'].unique()})

# A right join keeps every airport from airportsWW; airports with no match in
# the scraped table end up with NaN in 'Totalpassengers'.
traffic = df_final.merge(airportsWW, how='right', on='IATA')
traffic

Unnamed: 0,IATA,Totalpassengers
0,HND,85505054.0
1,KUL,62336469.0
2,SFO,57418574.0
3,JFK,62551072.0
4,SIN,68283000.0
5,HKG,71415245.0
6,DFW,75066956.0
7,PVG,76153455.0
8,PEK,100011438.0
9,EZE,


In [20]:
# Fill in passenger totals by hand for airports that the scraped table does not cover.
# NOTE(review): the source of these figures is not recorded in the notebook — TODO confirm.
manual_totals = {
    'EZE': 12708446,
    'MEL': 32311684,
}
for iata, total in manual_totals.items():
    # Rows are matched by code; a code absent from `traffic` leaves the frame unchanged.
    traffic.loc[traffic['IATA'] == iata, 'Totalpassengers'] = total

# The integer cast below cannot represent NaN, so report exactly which airports
# still lack a value instead of failing with an opaque casting error.
still_missing = traffic.loc[traffic['Totalpassengers'].isna(), 'IATA'].tolist()
if still_missing:
    raise ValueError(f"No passenger total available for airports: {still_missing}")

traffic['Totalpassengers'] = traffic['Totalpassengers'].astype(int)
traffic

Unnamed: 0,IATA,Totalpassengers
0,HND,85505054
1,KUL,62336469
2,SFO,57418574
3,JFK,62551072
4,SIN,68283000
5,HKG,71415245
6,DFW,75066956
7,PVG,76153455
8,PEK,100011438
9,EZE,12708446


In [None]:
# Persist the completed traffic table as CSV (the row index is written as the first column).
output_path = '../data/processed/passenger_traffic_2019.csv'
traffic.to_csv(output_path)