# Web Scrape

#### Step 1: Import libraries

#### Step 2: Create a list with all URLs

#### Step 3: Loop through each link:
* Get the HTML
* Build a table
* Concatenate all tables

#### Step 4: Expand rows with compact values

# Import Libraries

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create list with all URLs

In [15]:
# Day keywords substituted into the `day` query parameter.
filterDay = ['yesterday', 'today', 'tomorrow']

# Values for the `tp` query parameter: '0', '6', '12', '18'
# (presumably the start hour of a six-hour window -- confirm against the site).
filterHour = list(map(str, range(0, 24, 6)))

# One departures URL per (day, hour) pair, with the day varying slowest.
urls = [
    'https://www.santiago-airport.com/scl-departures?day=' + day_key + '&tp=' + hour_key
    for day_key in filterDay
    for hour_key in filterHour
]

# Loop through each link: build table

In [16]:
# Scrape every departures page listed in `urls`, build one DataFrame per page
# in `dataframes`, and concatenate them into `df` at the end.
dataframes = []

for url in urls:

    # Bound the wait on a stalled connection and fail right here on an HTTP
    # error, rather than later with an AttributeError on a missing tag.
    request = requests.get(url, timeout=30)
    request.raise_for_status()
    soup = BeautifulSoup(request.text, 'html.parser')

    # '...?day=today&tp=6'.split('=')[1] is 'today&tp'; dropping the last
    # three characters ('&tp') leaves the day keyword.
    Reference_Day = url.split('=')[1][:-3]

    # The date is the first space-separated token of an <option>'s text.
    # The previous check compared Reference_Day with 'Tomorrow', which never
    # matches the lowercase keywords, so the selected <option> was always the
    # one used. That stays the primary lookup; the by-value lookup is kept as
    # a fallback instead of dead code.
    # NOTE(review): the fallback assumes option values look like '?day=<day>',
    # as the original did for tomorrow -- confirm against the page markup.
    date_option = soup.find('option', {'selected': 'selected'})
    if date_option is None:
        date_option = soup.find('option', {'value': f'?day={Reference_Day}'})
    if date_option is None:
        raise ValueError(f'No date <option> found on {url}')
    Date = date_option.text.split(' ')[0]

    flights = soup.find_all('div', class_='flight-row')
    rows = []

    # The first 'flight-row' is skipped (presumably the table header --
    # confirm against the page markup).
    for flight in flights[1:]:
        # `attrs` must be a dict: the former set literal {'class', '...'} was
        # treated as a list of class names and also matched class="class".
        if flight.find('div', {'class': 'adsense'}):  # advertisement row
            continue

        # Look the destination cell up once; it holds both name and code.
        dest_cell = flight.find('div', {'class': 'flight-col flight-col__dest-term'})

        row = {
            'Destination': dest_cell.find('b').text,
            'Destination Code': dest_cell.find('span').text,
            'Departure': flight.find('div', {'class': 'flight-col flight-col__hour'}).text.strip(),
            # Flight and Airline may hold several newline-separated values
            # (code-shares); they are expanded in a later step.
            'Flight': flight.find('div', {'class': 'flight-col flight-col__flight'}).text.strip(),
            'Airline': flight.find('div', {'class': 'flight-col flight-col__airline'}).text.strip(),
            'Terminal': flight.find('div', {'class': 'flight-col flight-col__terminal'}).text.strip(),
            # The status is the text of the last link in the row.
            'Status': flight.find_all('a')[-1].text,
            'Date': Date,
            'Reference Day': Reference_Day,
            'url': url,
        }
        rows.append(row)

    data = pd.DataFrame(rows)

    dataframes.append(data)

df = pd.concat(dataframes)

# Expand rows with compact values

In [17]:
# Expand rows whose 'Flight' / 'Airline' cells hold several newline-separated
# values into one row per value; every other column is copied unchanged.
departure_columns = ['Destination', 'Destination Code', 'Departure', 'Flight', 'Airline', 'Terminal', 'Status', 'Date', 'Reference Day', 'url']

# Collect all expanded rows first and build the DataFrame once: calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which is quadratic in the number of rows.
expanded_rows = []

for index, row in df.iterrows():
    flight_codes = row['Flight'].split('\n')
    airline_names = row['Airline'].split('\n')

    # When the two lists differ in length, the shorter one is padded with ''.
    for i in range(max(len(flight_codes), len(airline_names))):
        new_row = {key: row[key] for key in departure_columns}
        new_row['Flight'] = flight_codes[i] if i < len(flight_codes) else ''
        new_row['Airline'] = airline_names[i] if i < len(airline_names) else ''
        expanded_rows.append(new_row)

# Passing `columns` keeps the column order and yields an empty frame with the
# right columns when there is nothing to expand.
df_departures = pd.DataFrame(expanded_rows, columns=departure_columns)


In [20]:
# Display the expanded departures table as the cell output.
df_departures

Unnamed: 0,Destination,Destination Code,Departure,Flight,Airline,Terminal,Status,Date,Reference Day,url
0,Panama City,(PTY),00:26,CM118,Copa Airlines,2,Landed - On-time [+],2023-04-26,yesterday,https://www.santiago-airport.com/scl-departure...
1,Bogota,(BOG),00:30,LA572,LATAM Airlines,2,Landed - On-time [+],2023-04-26,yesterday,https://www.santiago-airport.com/scl-departure...
2,Bogota,(BOG),00:30,DL7387,Delta Air Lines,2,Landed - On-time [+],2023-04-26,yesterday,https://www.santiago-airport.com/scl-departure...
3,Bogota,(BOG),00:30,QF4207,Qantas,2,Landed - On-time [+],2023-04-26,yesterday,https://www.santiago-airport.com/scl-departure...
4,Auckland,(AKL),00:35,LA801,LATAM Airlines,2,Landed - Delayed [+],2023-04-26,yesterday,https://www.santiago-airport.com/scl-departure...
...,...,...,...,...,...,...,...,...,...,...
810,New York,(JFK),22:55,CX7822,Cathay Pacific,2,Scheduled [+],2023-04-28,tomorrow,https://www.santiago-airport.com/scl-departure...
811,New York,(JFK),22:55,DL6061,Delta Air Lines,2,Scheduled [+],2023-04-28,tomorrow,https://www.santiago-airport.com/scl-departure...
812,New York,(JFK),22:55,JL7627,JAL Japan Airlines,2,Scheduled [+],2023-04-28,tomorrow,https://www.santiago-airport.com/scl-departure...
813,Miami,(MIA),23:20,LA500,LATAM Airlines,2,Scheduled [+],2023-04-28,tomorrow,https://www.santiago-airport.com/scl-departure...


# Arrivals

In [32]:
# Day keywords substituted into the `day` query parameter.
filterDay = ['yesterday', 'today', 'tomorrow']

# Values for the `tp` query parameter: '0', '6', '12', '18'
# (presumably the start hour of a six-hour window -- confirm against the site).
filterHour = [str(start_hour) for start_hour in range(0, 24, 6)]

# One arrivals URL per (day, hour) pair, with the day varying slowest.
urls = [
    'https://www.santiago-airport.com/scl-arrivals?day=' + day_key + '&tp=' + hour_key
    for day_key in filterDay
    for hour_key in filterHour
]

# Scrape every arrivals page listed in `urls`, build one DataFrame per page
# in `dataframes`, and concatenate them into `df` at the end.
dataframes = []

for url in urls:

    # Bound the wait on a stalled connection and fail right here on an HTTP
    # error, rather than later with an AttributeError on a missing tag.
    request = requests.get(url, timeout=30)
    request.raise_for_status()
    soup = BeautifulSoup(request.text, 'html.parser')

    # '...?day=today&tp=6'.split('=')[1] is 'today&tp'; dropping the last
    # three characters ('&tp') leaves the day keyword.
    Reference_Day = url.split('=')[1][:-3]

    # The date is the first space-separated token of an <option>'s text.
    # The previous check compared Reference_Day with 'Tomorrow', which never
    # matches the lowercase keywords, so the selected <option> was always the
    # one used. That stays the primary lookup; the by-value lookup is kept as
    # a fallback instead of dead code.
    # NOTE(review): the fallback assumes option values look like '?day=<day>',
    # as the original did for tomorrow -- confirm against the page markup.
    date_option = soup.find('option', {'selected': 'selected'})
    if date_option is None:
        date_option = soup.find('option', {'value': f'?day={Reference_Day}'})
    if date_option is None:
        raise ValueError(f'No date <option> found on {url}')
    Date = date_option.text.split(' ')[0]

    flights = soup.find_all('div', class_='flight-row')
    rows = []

    # The first 'flight-row' is skipped (presumably the table header --
    # confirm against the page markup).
    for flight in flights[1:]:
        # `attrs` must be a dict: the former set literal {'class', '...'} was
        # treated as a list of class names and also matched class="class".
        if flight.find('div', {'class': 'adsense'}):  # advertisement row
            continue

        # Look the origin cell up once; it holds both name and code.
        origin_cell = flight.find('div', {'class': 'flight-col flight-col__dest-term'})

        row = {
            'Origin': origin_cell.find('b').text,
            'Origin Code': origin_cell.find('span').text,
            # NOTE(review): the key is 'Departure' although this is the
            # arrivals page; kept unchanged so downstream column names still
            # match -- confirm whether it should be renamed.
            'Departure': flight.find('div', {'class': 'flight-col flight-col__hour'}).text.strip(),
            # Flight and Airline may hold several newline-separated values
            # (code-shares); they are expanded in a later step.
            'Flight': flight.find('div', {'class': 'flight-col flight-col__flight'}).text.strip(),
            'Airline': flight.find('div', {'class': 'flight-col flight-col__airline'}).text.strip(),
            'Terminal': flight.find('div', {'class': 'flight-col flight-col__terminal'}).text.strip(),
            # The status is the text of the last link in the row.
            'Status': flight.find_all('a')[-1].text,
            'Date': Date,
            'Reference Day': Reference_Day,
            'url': url,
        }
        rows.append(row)

    data = pd.DataFrame(rows)

    dataframes.append(data)

df = pd.concat(dataframes)

# Expand rows whose 'Flight' / 'Airline' cells hold several newline-separated
# values into one row per value; every other column is copied unchanged.
arrival_columns = ['Origin', 'Origin Code', 'Departure', 'Flight', 'Airline', 'Terminal', 'Status', 'Date', 'Reference Day', 'url']

# Collect all expanded rows first and build the DataFrame once: calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which is quadratic in the number of rows.
expanded_rows = []

for index, row in df.iterrows():
    flight_codes = row['Flight'].split('\n')
    airline_names = row['Airline'].split('\n')

    # When the two lists differ in length, the shorter one is padded with ''.
    for i in range(max(len(flight_codes), len(airline_names))):
        new_row = {key: row[key] for key in arrival_columns}
        new_row['Flight'] = flight_codes[i] if i < len(flight_codes) else ''
        new_row['Airline'] = airline_names[i] if i < len(airline_names) else ''
        expanded_rows.append(new_row)

# Passing `columns` keeps the column order and yields an empty frame with the
# right columns when there is nothing to expand.
df_arrivals = pd.DataFrame(expanded_rows, columns=arrival_columns)