# Web Scrape

#### Step 1: Import libraries

#### Step 2: Create a list with all URLs

#### Step 3: Loop through each link:
* Get html
* Build table
* Concat all tables

#### Step 4: Expand rows with compact values (one row per flight number / airline when a cell lists several)

# Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create list with all URLs

In [2]:
# Values substituted into the `day` query parameter.
filterDay = ['yesterday', 'today', 'tomorrow']

# Values substituted into the `tp` query parameter: every sixth hour of the
# day, as strings ('0', '6', '12', '18').
filterHour = list(map(str, range(0, 24, 6)))

# One departures URL per (day, hour) combination, ordered day-major.
urls = [
    f'https://www.santiago-airport.com/scl-departures?day={day}&tp={hour}'
    for day in filterDay
    for hour in filterHour
]

# Loop through each link: build table

In [3]:
# Scrape every departures page in `urls` into one DataFrame per page and
# stack them into `df`. Each row is one flight-board entry; when an entry
# lists several flight numbers / airlines they are still newline-separated
# inside a single cell at this stage.
dataframes = []

for url in urls:

    # A timeout keeps one stalled request from hanging the whole run, and
    # raise_for_status() stops an HTTP error page from being parsed as if it
    # were a flight board.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Day filter read back from the URL's own query string
    # ('...?day=yesterday&tp=0' -> 'yesterday').
    Reference_Day = url.split('day=')[1].split('&')[0]

    # Date = first space-separated token of the selected <option>'s text.
    # Reference_Day is always lowercase, so a comparison against 'Tomorrow'
    # can never match; the selected option is what every page has used.
    Date = soup.find('option', {'selected': 'selected'}).text.split(' ')[0]

    rows = []

    # The first 'flight-row' div is skipped -- presumably the column-header
    # row; confirm against the page markup.
    for flight in soup.find_all('div', class_='flight-row')[1:]:
        if flight.find('div', class_='adsense'):  # advertisement row
            continue

        # Look the destination cell up once; it holds both the name (<b>)
        # and the code (<span>).
        destination_cell = flight.find('div', class_='flight-col flight-col__dest-term')

        rows.append({
            'Destination': destination_cell.find('b').text,
            'Destination Code': destination_cell.find('span').text,
            'Departure': flight.find('div', class_='flight-col flight-col__hour').text.strip(),
            'Flight': flight.find('div', class_='flight-col flight-col__flight').text.strip(),
            'Airline': flight.find('div', class_='flight-col flight-col__airline').text.strip(),
            'Terminal': flight.find('div', class_='flight-col flight-col__terminal').text.strip(),
            # Status is the text of the last link in the row.
            'Status': flight.find_all('a')[-1].text,
            'Date': Date,
            'Reference Day': Reference_Day,
            'url': url,
        })

    dataframes.append(pd.DataFrame(rows))

df = pd.concat(dataframes)

# Expand rows with compact values

In [7]:
# df_departures = pd.DataFrame(columns=['Destination','Destination Code','Departure','Flight','Airline','Terminal','Status','Date','Reference Day','url'])

# for index, row in df.iterrows():
#     values_b = row['Flight'].split('\n')
#     values_c = row['Airline'].split('\n')
#     n_values = max(len(values_b), len(values_c))
#     rows = []
#     for i in range(n_values):
#         new_row = {
#             'Destination': row['Destination'],
#             'Destination Code': row['Destination Code'],
#             'Departure': row['Departure'],
#             'Terminal': row['Terminal'],
#             'Status': row['Status'],
#             'Date': row['Date'],
#             'Reference Day': row['Reference Day'],
#             'url': row['url']
#             }
#         if i < len(values_b):
#             new_row['Flight'] = values_b[i]
#         else:
#             new_row['Flight'] = ''
#         if i < len(values_c):
#             new_row['Airline'] = values_c[i]
#         else:
#             new_row['Airline'] = ''
#         rows.append(new_row)
#     df_departures = pd.concat([df_departures, pd.DataFrame(rows)], ignore_index=True)

# Expand `df` so that every flight number / airline listed in one
# newline-separated cell gets its own row; all other columns are repeated.
departure_columns = ['Destination', 'Destination Code', 'Departure', 'Flight', 'Airline', 'Terminal', 'Status', 'Date', 'Reference Day', 'url']

# Collect plain dicts and build the frame once at the end: concatenating
# onto a growing DataFrame inside the loop re-copies all previous rows on
# every iteration.
expanded_rows = []

for record in df.to_dict('records'):
    # Strip each piece: text around the line breaks can carry stray
    # whitespace that would otherwise end up in the values.
    flight_numbers = [value.strip() for value in record['Flight'].split('\n')]
    airlines = [value.strip() for value in record['Airline'].split('\n')]

    # If the two lists differ in length, pad the shorter one with ''.
    for i in range(max(len(flight_numbers), len(airlines))):
        new_row = {key: record[key] for key in departure_columns}
        new_row['Flight'] = flight_numbers[i] if i < len(flight_numbers) else ''
        new_row['Airline'] = airlines[i] if i < len(airlines) else ''
        expanded_rows.append(new_row)

# `columns=` fixes the column order and keeps the columns even when no rows
# were scraped.
df_departures = pd.DataFrame(expanded_rows, columns=departure_columns)


In [9]:
# Display the expanded departures table (one row per flight number / airline pair).
df_departures

Unnamed: 0,Destination,Destination Code,Departure,Flight,Airline,Terminal,Status,Date,Reference Day,url
0,Panama City,(PTY),00:26,CM118,Copa Airlines,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-departure...
1,Bogota,(BOG),00:30,LA578,LATAM Airlines,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-departure...
2,Bogota,(BOG),00:30,DL7387,Delta Air Lines,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-departure...
3,Bogota,(BOG),00:30,QF4207,Qantas,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-departure...
4,Auckland,(AKL),00:35,LA801,LATAM Airlines,2,Landed - Delayed [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-departure...
...,...,...,...,...,...,...,...,...,...,...
776,New York,(JFK),22:55,LA532,LATAM Airlines,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-departure...
777,New York,(JFK),22:55,DL6061,Delta Air Lines,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-departure...
778,New York,(JFK),22:55,JL7627,JAL Japan Airlines,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-departure...
779,Miami,(MIA),23:20,LA500,LATAM Airlines,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-departure...


# Arrivals

In [10]:
# Arrivals: build the page URLs, scrape each page into `df`, then expand
# newline-separated flight numbers / airlines into one row each
# (`df_arrivals`).
filterDay = ['yesterday', 'today', 'tomorrow']

filterHour = ['0', '6', '12', '18']

urls = [f'https://www.santiago-airport.com/scl-arrivals?day={day}&tp={hour}' for day in filterDay for hour in filterHour]

dataframes = []

for url in urls:

    # A timeout keeps one stalled request from hanging the whole run, and
    # raise_for_status() stops an HTTP error page from being parsed as if it
    # were a flight board.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Day filter read back from the URL's own query string
    # ('...?day=yesterday&tp=0' -> 'yesterday').
    Reference_Day = url.split('day=')[1].split('&')[0]

    # Date = first space-separated token of the selected <option>'s text.
    # Reference_Day is always lowercase, so a comparison against 'Tomorrow'
    # can never match; the selected option is what every page has used.
    Date = soup.find('option', {'selected': 'selected'}).text.split(' ')[0]

    rows = []

    # The first 'flight-row' div is skipped -- presumably the column-header
    # row; confirm against the page markup.
    for flight in soup.find_all('div', class_='flight-row')[1:]:
        if flight.find('div', class_='adsense'):  # advertisement row
            continue

        # Look the origin cell up once; it holds both the name (<b>) and the
        # code (<span>).
        origin_cell = flight.find('div', class_='flight-col flight-col__dest-term')

        rows.append({
            'Origin': origin_cell.find('b').text,
            'Origin Code': origin_cell.find('span').text,
            # NOTE(review): the key is 'Departure' although this is the
            # arrivals board -- presumably the scheduled arrival time. The
            # name is kept so existing consumers of the column keep working.
            'Departure': flight.find('div', class_='flight-col flight-col__hour').text.strip(),
            'Flight': flight.find('div', class_='flight-col flight-col__flight').text.strip(),
            'Airline': flight.find('div', class_='flight-col flight-col__airline').text.strip(),
            'Terminal': flight.find('div', class_='flight-col flight-col__terminal').text.strip(),
            # Status is the text of the last link in the row.
            'Status': flight.find_all('a')[-1].text,
            'Date': Date,
            'Reference Day': Reference_Day,
            'url': url,
        })

    dataframes.append(pd.DataFrame(rows))

df = pd.concat(dataframes)

arrival_columns = ['Origin', 'Origin Code', 'Departure', 'Flight', 'Airline', 'Terminal', 'Status', 'Date', 'Reference Day', 'url']

# Collect plain dicts and build the frame once at the end: concatenating
# onto a growing DataFrame inside the loop re-copies all previous rows on
# every iteration.
expanded_rows = []

for record in df.to_dict('records'):
    # Strip each piece: text around the line breaks can carry stray
    # whitespace that would otherwise end up in the values.
    flight_numbers = [value.strip() for value in record['Flight'].split('\n')]
    airlines = [value.strip() for value in record['Airline'].split('\n')]

    # If the two lists differ in length, pad the shorter one with ''.
    for i in range(max(len(flight_numbers), len(airlines))):
        new_row = {key: record[key] for key in arrival_columns}
        new_row['Flight'] = flight_numbers[i] if i < len(flight_numbers) else ''
        new_row['Airline'] = airlines[i] if i < len(airlines) else ''
        expanded_rows.append(new_row)

# `columns=` fixes the column order and keeps the columns even when no rows
# were scraped.
df_arrivals = pd.DataFrame(expanded_rows, columns=arrival_columns)

In [15]:
# Display the expanded arrivals table (one row per flight number / airline pair).
df_arrivals

Unnamed: 0,Origin,Origin Code,Departure,Flight,Airline,Terminal,Status,Date,Reference Day,url
0,Puerto Montt,(PMC),00:14,JA156,JetSMART,1,Landed - Delayed [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
1,Sao Paulo,(GRU),00:25,LA751,LATAM Airlines,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
2,Sao Paulo,(GRU),00:25,QF3896,Qantas,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
3,Buenos Aires,(AEP),00:35,JA1006,JetSMART,2,En Route [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
4,Antofagasta,(ANF),00:51,LA347,LATAM Airlines,1,En Route [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
...,...,...,...,...,...,...,...,...,...,...
775,Lima,(LIM),23:30,JA401,JetSMART,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
776,Panama City,(PTY),23:35,CM497,Copa Airlines,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
777,La Serena,(LSC),23:36,JA127,JetSMART,1,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
778,Buenos Aires,(EZE),23:50,5Y34,Atlas Air,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18


In [25]:
# Single-pass arrivals scrape: fetch every (day, hour) page and write one
# output row per flight number / airline pair straight away, so no separate
# expansion step is needed. Relies on `filterDay` and `filterHour` being
# defined by an earlier cell.
arrival_columns = ['Origin', 'Origin Code', 'Departure', 'Flight', 'Airline', 'Terminal', 'Status', 'Date', 'Reference Day', 'url']

# Rows from all pages are collected here and turned into a DataFrame once,
# instead of concatenating onto an (initially empty) frame after every page.
arrival_rows = []

for day in filterDay:
    for hour in filterHour:
        url = f'https://www.santiago-airport.com/scl-arrivals?day={day}&tp={hour}'

        # A timeout keeps one stalled request from hanging the whole run, and
        # raise_for_status() stops an HTTP error page from being parsed as if
        # it were a flight board.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Date = first space-separated token of the selected <option>'s text.
        # The day filter is always lowercase, so a comparison against
        # 'Tomorrow' can never match; the selected option is what every page
        # has used.
        date = soup.find('option', {'selected': 'selected'}).text.split(' ')[0]

        # The first 'flight-row' div is skipped -- presumably the
        # column-header row; confirm against the page markup.
        for flight in soup.find_all('div', class_='flight-row')[1:]:
            if flight.find('div', class_='adsense'):  # advertisement row
                continue

            # Look the origin cell up once; it holds both the name (<b>) and
            # the code (<span>).
            origin_cell = flight.find('div', class_='flight-col flight-col__dest-term')
            origin = origin_cell.find('b').text
            origin_code = origin_cell.find('span').text
            # NOTE(review): stored under 'Departure' although this is the
            # arrivals board -- presumably the scheduled arrival time. The
            # column name is kept for existing consumers.
            departure = flight.find('div', class_='flight-col flight-col__hour').text.strip()
            terminal = flight.find('div', class_='flight-col flight-col__terminal').text.strip()
            # Status is the text of the last link in the row.
            status = flight.find_all('a')[-1].text

            # One entry per line of the cell; strip each piece so stray
            # whitespace around the line breaks does not end up in the values.
            flight_numbers = [
                value.strip()
                for value in flight.find('div', class_='flight-col flight-col__flight').text.strip().split('\n')
            ]
            airlines = [
                value.strip()
                for value in flight.find('div', class_='flight-col flight-col__airline').text.strip().split('\n')
            ]

            # If the two lists differ in length, pad the shorter one with ''.
            for i in range(max(len(flight_numbers), len(airlines))):
                arrival_rows.append({
                    'Origin': origin,
                    'Origin Code': origin_code,
                    'Departure': departure,
                    'Flight': flight_numbers[i] if i < len(flight_numbers) else '',
                    'Airline': airlines[i] if i < len(airlines) else '',
                    'Terminal': terminal,
                    'Status': status,
                    'Date': date,
                    # Same value the URL carries in its `day` parameter.
                    'Reference Day': day,
                    'url': url
                })

# `columns=` fixes the column order and keeps the columns even when no rows
# were scraped.
df_arrivals = pd.DataFrame(arrival_rows, columns=arrival_columns)

df_arrivals

Unnamed: 0,Origin,Origin Code,Departure,Flight,Airline,Terminal,Status,Date,Reference Day,url
0,Sao Paulo,(GRU),00:25,LA751,LATAM Airlines,2,Landed - On-time [+],2023-04-25,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
1,Sao Paulo,(GRU),00:25,DL6293,Delta Air Lines,2,Landed - On-time [+],2023-04-25,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
2,Sao Paulo,(GRU),00:25,QF3896,Qantas,2,Landed - On-time [+],2023-04-25,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
3,Calama,(CJC),00:33,H2245,Sky Airline,1,Landed - On-time [+],2023-04-25,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
4,Puerto Montt,(PMC),00:35,H2402,Sky Airline,1,Landed - On-time [+],2023-04-25,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
...,...,...,...,...,...,...,...,...,...,...
788,Iquique,(IQQ),23:32,LA901,LATAM Airlines,1,Scheduled [+],2023-04-27,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
789,Panama City,(PTY),23:35,CM497,Copa Airlines,2,Scheduled [+],2023-04-27,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
790,Florianopolis,(FLN),23:40,H2631,Sky Airline,2,Scheduled [+],2023-04-27,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
791,Iquique,(IQQ),23:49,JA117,JetSMART,1,Scheduled [+],2023-04-27,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18


In [24]:
# Display df_arrivals again.
# NOTE(review): this cell's execution count (24) is lower than that of the
# scraping cell above it (25), so the saved output presumably comes from an
# earlier run -- restart the kernel and run all cells to confirm.
df_arrivals

Unnamed: 0,Origin,Origin Code,Departure,Flight,Airline,Terminal,Status,Date,Reference Day,url
0,Puerto Montt,(PMC),00:14,JA156,JetSMART,1,Landed - Delayed [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
1,Sao Paulo,(GRU),00:25,LA751,LATAM Airlines,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
2,Sao Paulo,(GRU),00:25,QF3896,Qantas,2,Landed - On-time [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
3,Buenos Aires,(AEP),00:35,JA1006,JetSMART,2,En Route [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
4,Antofagasta,(ANF),00:51,LA347,LATAM Airlines,1,En Route [+],2023-04-24,yesterday,https://www.santiago-airport.com/scl-arrivals?day=yesterday&tp=0
...,...,...,...,...,...,...,...,...,...,...
775,Lima,(LIM),23:30,JA401,JetSMART,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
776,Panama City,(PTY),23:35,CM497,Copa Airlines,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
777,La Serena,(LSC),23:36,JA127,JetSMART,1,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18
778,Buenos Aires,(EZE),23:50,5Y34,Atlas Air,2,Scheduled [+],2023-04-26,tomorrow,https://www.santiago-airport.com/scl-arrivals?day=tomorrow&tp=18


In [27]:
# Number of rows per airline label in df_arrivals, most frequent first.
# Labels are counted verbatim, so any stray whitespace or spelling variant
# forms its own entry.
df_arrivals.Airline.value_counts()

LATAM Airlines              319
Sky Airline                 114
Delta Air Lines              80
Qantas                       57
JetSMART                     46
Iberia                       39
Copa Airlines                15
KLM Royal Dutch Airlines     15
British Airways              13
Avianca                      10
Aerolineas Argentinas         8
JAL Japan Airlines            7
Cathay Pacific                6
ITA Airways                   6
Air Canada                    4
Korean Air                    4
American Airlines             4
Aeromexico                    3
Jetsmart Airlines             3
Malaysia Airlines             3
Air Europa                    3
 Finnair                      3
Air France                    3
Qatar Airways                 3
El Al Israel Airlines         3
ANA All Nippon Airways        3
United Airlines               3
Turkish Airlines              3
JetSMART Airlines Peru        3
LATAM Cargo Brasil            3
LATAM Cargo Chile             2
Ethiopia