In [1]:
## IDEAS
# allow user to choose to display arrvials, or departures, domestic or international
# display some graphics
# image path https://www.sydneyairport.com.au/flights/logo/VA
# combine all (arrivals, departures, international, domestic) into one data frame
# add two columns - flight_type, terminal_type

In [2]:
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [3]:
chrome_options = Options()  
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

In [24]:
# Define query strings for url
today = datetime.today().strftime('%Y-%m-%d')
flight_types  = ["arrival", "departure"]
terminal_types = ["international", "domestic"]

# Initialise empty lists
journies = []
types = []
terminals = []
stopovers = []
airlines = []
airline_logos = []
flight_numbers = []
other_flight_numbers = []
statuses = []
scheduled_times = []
estimated_times = []

# Loop through each combination of (domestic/international) x (arrival/departure)
for flight_type in flight_types:
    for terminal_type in terminal_types:
        url = "https://www.sydneyairport.com.au/flights/?query=&flightType=" + flight_type + "&terminalType=" + terminal_type + "&date=" + today + "&sortColumn=scheduled_time&ascending=true&showAll=true"
        driver.get(url)
        
        html_soup = BeautifulSoup(driver.page_source, "html.parser")
        flight_containers = html_soup.find_all("div", attrs={"class": "flight-card"})[2:]
        
        # Loop through all containers (one for each flight) to extract and store info
        for container in flight_containers:
            # Arrival / departure
            types.append(flight_type)
            
            # International / domestic
            terminals.append(terminal_type)
            
            # Origin / Destinations
            journies.append(container.find("div", attrs={"class": "destination-name"}).text)

            # Stopovers (if any)
            if container.find("div", attrs={"class": "city-via"}) is not None:
                stopovers.append(container.find("div", attrs={"class": "city-via"}).text)
            else:
                stopovers.append(None)

            # Airlines
            airlines.append(container.find("span", attrs={"class": "with-image"}).text)

            # Airline logos
            #airline_logos.append("https://www.sydneyairport.com.au" + container.img['src'])
            logo_url = "https://www.sydneyairport.com.au" + container.img['src']
            driver.get(logo_url)
            logo_soup = BeautifulSoup(driver.page_source, "html.parser")
            airline_logos.append(logo_soup.img['src'])

            # Flight numbers
            flight_numbers.append(container.find("div", attrs={"class": "heading-medium"}).text)

            # Alternative flight numbers (if any)
            if container.find("div", attrs={"class": "body-xsmall"}) is not None:
                other_flight_numbers.append(container.find("div", attrs={"class": "body-xsmall"}).text)
            else:
                other_flight_numbers.append(None)

            # Statuses
            statuses.append(container.find("div", attrs={"class": "status"}).text)

            # Scheduled times
            scheduled_times.append(container.find("div", attrs={"class": "large-scheduled-time"}).text)

            # Estimated times
            estimated_times.append(container.find("div", attrs={"class": "estimated-time"}).text)
    
    
# Create dataframe from lists
flights = pd.DataFrame({'journey': journies,
                        'type': types,
                        'terminal': terminals,
                        'stopover': stopovers,
                        'airline': airlines,
                        'airline_logo': airline_logos,
                        'flight_number': flight_numbers,
                        'other_flight_number': other_flight_numbers,
                        'status': statuses,
                        'scheduled_time': scheduled_times,
                        'estimated_time': estimated_times})
# Reorder columns
flights = flights[['type', 'terminal', 'journey', 'stopover', 'airline', 'airline_logo', 'flight_number', 'other_flight_number', 'status', 'scheduled_time', 'estimated_time']]

flights

Unnamed: 0,type,terminal,journey,stopover,airline,airline_logo,flight_number,other_flight_number,status,scheduled_time,estimated_time
0,arrival,international,London,Via Singapore,Qantas,https://www.sydneyairport.com.au/flights/logo/QF,QF2,EK5002,Arrived,05:10,08:31
1,arrival,international,London,Via Singapore,British Airways,https://www.sydneyairport.com.au/flights/logo/BA,BA15,"AY5915, IB4745",Arrived,05:10,05:53
2,arrival,international,Singapore,,Singapore Airlines,https://www.sydneyairport.com.au/flights/logo/SQ,SQ221,"A31213, AF7232, ET1317, LH9780, LX4172, TK9312, VA5515",Arrived,05:55,05:40
3,arrival,international,San Francisco,,Qantas,https://www.sydneyairport.com.au/flights/logo/QF,QF74,AA7396,Cancelled,06:05,06:05
4,arrival,international,Jakarta,,Qantas,https://www.sydneyairport.com.au/flights/logo/QF,QF42,,Arrived,06:05,06:01
5,arrival,international,Dubai,,Emirates,https://www.sydneyairport.com.au/flights/logo/EK,EK412,QF8412,Arrived,06:05,06:41
6,arrival,international,Delhi,,Air India,https://www.sydneyairport.com.au/flights/logo/AI,AI302,,Arrived,06:10,06:46
7,arrival,international,Denpasar,,Virgin Australia,https://www.sydneyairport.com.au/flights/logo/VA,VA36,,Arrived,06:10,06:06
8,arrival,international,Tokyo,,Japan Airlines,https://www.sydneyairport.com.au/flights/logo/JL,JL771,LA7467,Arrived,06:10,06:18
9,arrival,international,Manila,,Qantas,https://www.sydneyairport.com.au/flights/logo/QF,QF20,,Arrived,06:10,06:13


In [89]:
flights[flights['status']=='Cancelled']

Unnamed: 0,type,terminal,journey,stopover,airline,airline_logo,flight_number,other_flight_number,status,scheduled_time,estimated_time
118,arrival,domestic,Melbourne,,Jetstar,/flights/logo/JQ,JQ500,,Cancelled,07:25,-
165,arrival,domestic,Cooma,,Rex,/flights/logo/ZL,ZL722,,Cancelled,10:40,-
177,arrival,domestic,Melbourne,,Qantas,/flights/logo/QF,QF422,,Cancelled,11:25,-
181,arrival,domestic,Adelaide,,Jetstar,/flights/logo/JQ,JQ763,,Cancelled,11:30,-
244,arrival,domestic,Gold Coast,,Jetstar,/flights/logo/JQ,JQ411,,Cancelled,14:50,-
288,arrival,domestic,Melbourne,,Jetstar,/flights/logo/JQ,JQ514,,Cancelled,16:45,-
302,arrival,domestic,Melbourne,,Jetstar,/flights/logo/JQ,JQ518,,Cancelled,17:30,-
343,arrival,domestic,Melbourne,,Virgin Australia,/flights/logo/VA,VA869,EY6707,Cancelled,18:40,-
358,arrival,domestic,Brisbane,,Qantas,/flights/logo/QF,QF547,,Cancelled,19:10,-
373,arrival,domestic,Melbourne,,Virgin Australia,/flights/logo/VA,VA879,EY6949,Cancelled,19:55,-


In [6]:
flights.iloc[1, 1]

'international'

In [11]:
flights.sort_values(by=['scheduled_time'], inplace=True)
flights

Unnamed: 0,type,terminal,journey,stopover,airline,airline_logo,flight_number,other_flight_number,status,scheduled_time,estimated_time
0,arrival,international,London,Via Singapore,Qantas,/flights/logo/QF,QF2,EK5002,Arrived,05:10,08:31
1,arrival,international,London,Via Singapore,British Airways,/flights/logo/BA,BA15,"AY5915, IB4745",Arrived,05:10,05:53
2,arrival,international,Singapore,,Singapore Airlines,/flights/logo/SQ,SQ221,"A31213, AF7232, ET1317, LH9780, LX4172, TK9312...",Arrived,05:55,05:40
476,departure,domestic,Melbourne,,Virgin Australia,/flights/logo/VA,VA800,"EY6823, HU8846",Departed,06:00,06:00
477,departure,domestic,Melbourne,,Qantas,/flights/logo/QF,QF401,,Departed,06:00,-
478,departure,domestic,Cairns,,Jetstar,/flights/logo/JQ,JQ952,,Departed,06:00,-
479,departure,domestic,Hobart,,Jetstar,/flights/logo/JQ,JQ719,,Departed,06:00,-
367,departure,international,Dubai,,Emirates,/flights/logo/EK,EK415,QF8415,Departed,06:00,-
366,departure,international,Queenstown,,Air New Zealand,/flights/logo/NZ,NZ834,,Departed,06:00,06:04
480,departure,domestic,Brisbane,,Jetstar,/flights/logo/JQ,JQ810,,Departed,06:05,-


In [8]:
from datetime import datetime
timestr = flights.iloc[1, -1]
timeobj = datetime.strptime(flights.iloc[1, -1], "%H:%M")

type(timestr)
print(timestr)

type(timeobj)
print(timeobj)

05:53
1900-01-01 05:53:00


In [26]:
print(flights.head(1).to_html(classes=['flights'], escape=False ,formatters=dict(airline_logo=path_to_image_html)))

<table border="1" class="dataframe flights">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>type</th>
      <th>terminal</th>
      <th>journey</th>
      <th>stopover</th>
      <th>airline</th>
      <th>airline_logo</th>
      <th>flight_number</th>
      <th>other_flight_number</th>
      <th>status</th>
      <th>scheduled_time</th>
      <th>estimated_time</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>arrival</td>
      <td>international</td>
      <td>London</td>
      <td>Via Singapore</td>
      <td>Qantas</td>
      <td><img src="https://www.sydneyairport.com.au/flights/logo/QF" width="60" ></td>
      <td>QF2</td>
      <td>EK5002</td>
      <td>Arrived</td>
      <td>05:10</td>
      <td>08:31</td>
    </tr>
  </tbody>
</table>


In [25]:
from IPython.core.display import HTML
def path_to_image_html(path):
    return '<img src="'+ path + '" width="60" >'

pd.set_option('display.max_colwidth', -1)

HTML(flights.to_html(escape=False ,formatters=dict(airline_logo=path_to_image_html)))

Unnamed: 0,type,terminal,journey,stopover,airline,airline_logo,flight_number,other_flight_number,status,scheduled_time,estimated_time
0,arrival,international,London,Via Singapore,Qantas,,QF2,EK5002,Arrived,05:10,08:31
1,arrival,international,London,Via Singapore,British Airways,,BA15,"AY5915, IB4745",Arrived,05:10,05:53
2,arrival,international,Singapore,,Singapore Airlines,,SQ221,"A31213, AF7232, ET1317, LH9780, LX4172, TK9312, VA5515",Arrived,05:55,05:40
3,arrival,international,San Francisco,,Qantas,,QF74,AA7396,Cancelled,06:05,06:05
4,arrival,international,Jakarta,,Qantas,,QF42,,Arrived,06:05,06:01
5,arrival,international,Dubai,,Emirates,,EK412,QF8412,Arrived,06:05,06:41
6,arrival,international,Delhi,,Air India,,AI302,,Arrived,06:10,06:46
7,arrival,international,Denpasar,,Virgin Australia,,VA36,,Arrived,06:10,06:06
8,arrival,international,Tokyo,,Japan Airlines,,JL771,LA7467,Arrived,06:10,06:18
9,arrival,international,Manila,,Qantas,,QF20,,Arrived,06:10,06:13


In [27]:
print("\"")

"


In [29]:
print('<img src=\"'+ "hello" + '\" width=\"60\" >')

<img src="hello" width="60" >


In [30]:
def flightscraper():

    from datetime import datetime
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup
    import numpy as np
    import pandas as pd

    # Initialise instance of Chrome driver
    chrome_options = Options()  
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    # Define query strings for url
    today = datetime.today().strftime('%Y-%m-%d')
    flight_types  = ["arrival", "departure"]
    terminal_types = ["international", "domestic"]

    # Initialise empty lists
    journies = []
    types = []
    terminals = []
    stopovers = []
    airlines = []
    airline_logos = []
    flight_numbers = []
    other_flight_numbers = []
    statuses = []
    scheduled_times = []
    estimated_times = []

    # Loop through each combination of (domestic/international) x (arrival/departure)
    for flight_type in flight_types:
        for terminal_type in terminal_types:
            url = "https://www.sydneyairport.com.au/flights/?query=&flightType=" + flight_type + "&terminalType=" + terminal_type + "&date=" + today + "&sortColumn=scheduled_time&ascending=true&showAll=true"
            driver.get(url)
            
            html_soup = BeautifulSoup(driver.page_source, "html.parser")
            flight_containers = html_soup.find_all("div", attrs={"class": "flight-card"})[2:]
            
            # Loop through all containers (one for each flight) to extract and store info
            for container in flight_containers:
                # Arrival / departure
                types.append(flight_type)
                
                # International / domestic
                terminals.append(terminal_type)
                
                # Origin / Destinations
                journies.append(container.find("div", attrs={"class": "destination-name"}).text)

                # Stopovers (if any)
                if container.find("div", attrs={"class": "city-via"}) is not None:
                    stopovers.append(container.find("div", attrs={"class": "city-via"}).text)
                else:
                    stopovers.append(None)

                # Airlines
                airlines.append(container.find("span", attrs={"class": "with-image"}).text)

                # Airline logos
                airline_logos.append("https://www.sydneyairport.com.au" + container.img['src'])

                # Flight numbers
                flight_numbers.append(container.find("div", attrs={"class": "heading-medium"}).text)

                # Alternative flight numbers (if any)
                if container.find("div", attrs={"class": "body-xsmall"}) is not None:
                    other_flight_numbers.append(container.find("div", attrs={"class": "body-xsmall"}).text)
                else:
                    other_flight_numbers.append(None)

                # Statuses
                statuses.append(container.find("div", attrs={"class": "status"}).text)

                # Scheduled times
                scheduled_times.append(container.find("div", attrs={"class": "large-scheduled-time"}).text)

                # Estimated times
                estimated_times.append(container.find("div", attrs={"class": "estimated-time"}).text)
        
        
    # Create dataframe from lists
    flights = pd.DataFrame({'journey': journies,
                            'type': types,
                            'terminal': terminals,
                            'stopover': stopovers,
                            'airline': airlines,
                            'airline_logo': airline_logos,
                            'flight_number': flight_numbers,
                            'other_flight_number': other_flight_numbers,
                            'status': statuses,
                            'scheduled_time': scheduled_times,
                            'estimated_time': estimated_times})
    # Reorder columns
    flights = flights[['airline_logo', 'type', 'terminal', 'journey', 'stopover', 'airline', 'flight_number', 'other_flight_number', 'status', 'scheduled_time', 'estimated_time']]
    
    # Order flights by scheduled time
    flights.sort_values(by=['scheduled_time'], inplace=True)
    flights.reset_index(drop=True, inplace=True)
    
    return flights

In [32]:
HTML(flightscraper().to_html(classes="flights", escape=False, formatters=dict(airline_logo=path_to_image_html)))

Unnamed: 0,airline_logo,type,terminal,journey,stopover,airline,flight_number,other_flight_number,status,scheduled_time,estimated_time
0,,arrival,international,London,Via Singapore,Qantas,QF2,EK5002,Arrived,05:10,08:31
1,,arrival,international,London,Via Singapore,British Airways,BA15,"AY5915, IB4745",Arrived,05:10,05:53
2,,arrival,international,Singapore,,Singapore Airlines,SQ221,"A31213, AF7232, ET1317, LH9780, LX4172, TK9312, VA5515",Arrived,05:55,05:40
3,,departure,domestic,Melbourne,,Virgin Australia,VA800,"EY6823, HU8846",Departed,06:00,06:00
4,,departure,domestic,Melbourne,,Qantas,QF401,,Departed,06:00,-
5,,departure,domestic,Cairns,,Jetstar,JQ952,,Departed,06:00,-
6,,departure,domestic,Hobart,,Jetstar,JQ719,,Departed,06:00,-
7,,departure,international,Queenstown,,Air New Zealand,NZ834,,Departed,06:00,06:04
8,,departure,international,Dubai,,Emirates,EK415,QF8415,Departed,06:00,-
9,,arrival,domestic,Perth,,Virgin Australia,VA572,"DL7259, EY6725",Arrived,06:05,06:04
