In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from datetime import date
from datetime import datetime
# from selenium.webdriver.support.ui import Select
import os
import pandas as pd
import time
from tqdm import tqdm
# Start a Chrome session (the driver binary is resolved by webdriver_manager)
# and open the abay.vn home page.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.abay.vn')

# Click the home-page search button so the browser lands on the results page
# that hosts the search form used by the scraping functions. The button may
# not be present/clickable right away, so keep trying until the click works.
while True:
    try:
        search_button = driver.find_element(By.ID, 'cphMain_ctl00_usrSearchFormD2_btnSearch')
        search_button.click()
        break
    except Exception:
        # `except Exception` instead of a bare `except` so that Ctrl-C /
        # "interrupt kernel" can still stop the loop; the short pause avoids
        # hammering the browser in a tight busy-loop.
        time.sleep(0.2)

# Lookup tables used by the scraping functions defined below.

# Airport code -> city-name slug. Not referenced by the rest of the visible code.
abay_routes = {'SGN':'Tp_Ho_Chi_Minh', 'HAN':'Ha_Noi', 'DAD':'Da_Nang', 'UIH':'Quy_Nhon', 'PQC':'Phu_Quoc'}
# Earlier variant of abay_station that mapped each code to a full display label (kept for reference):
# abay_station = {'SGN':'TP Hồ Chí Minh (SGN)', 'HAN':'Hà Nội (HAN)', 'DAD':'Đà Nẵng (DAD)', 'UIH':'Quy Nhơn (UIH)', 'PQC':'Phú Quốc (PQC)', 'BKK':'Bangkok (BKK)'}
# Airport code -> value the scraping functions write into the `hdfstartplace` / `hdfendplace` form inputs.
abay_station = {'SGN':'SGN', 'HAN':'HAN', 'DAD':'DAD', 'UIH':'UIH', 'PQC':'PQC', 'BKK':'BKK', 'CXR':'CXR'}

def create_dir(route: str, parent_dir: str = r'C:\Users\VTA-HAN\Desktop\VTA Jupyter\webscrapping'):
    """Create the two per-leg output folders of a round-trip route.

    ``route`` is expected in the form ``'AAA-BBB-AAA'``; the folders are named
    after ``route[:7]`` (``'AAA-BBB'``) and ``route[4:]`` (``'BBB-AAA'``).

    Args:
        route: Round-trip route string, e.g. ``'SGN-UIH-SGN'``.
        parent_dir: Folder under which the two leg folders are created.
            Missing intermediate folders are created as well.

    Returns:
        ``[outbound_dir, return_dir]`` as path strings.

    Raises:
        FileExistsError: If one of the target paths exists but is not a
            directory.
    """
    leg_dirs = [
        os.path.join(parent_dir, route[:7]),
        os.path.join(parent_dir, route[4:]),
    ]
    for leg_dir in leg_dirs:
        # makedirs(exist_ok=True) is idempotent and, unlike mkdir, also works
        # when parent_dir itself does not exist yet.
        os.makedirs(leg_dir, exist_ok=True)
    return leg_dirs


def route_pricing(from_date: str, to_date: str, route: str, parent_dir: str = r'C:\Users\VTA-HAN\Desktop\VTA Jupyter\webscrapping'):
    """Build the list of scraping jobs for a round-trip route over a date range.

    For every calendar day from ``from_date`` to ``to_date`` (both inclusive)
    two jobs are produced: the outbound leg followed by the return leg. The
    per-leg output folders are created under ``parent_dir`` as a side effect.

    Args:
        from_date: First day to scrape, formatted ``YYYYMMDD``.
        to_date: Last day to scrape, formatted ``YYYYMMDD``.
        route: Round-trip route in the form ``'AAA-BBB-AAA'``.
        parent_dir: Folder under which the ``'AAA-BBB'`` and ``'BBB-AAA'``
            output folders are created (including missing parents).

    Returns:
        A list of ``[date, dep, arr, output_dir]`` lists ordered by date, with
        the outbound leg before the return leg on each date. ``date`` is a
        ``pandas.Timestamp``. The list is empty when ``from_date`` is after
        ``to_date``.

    Raises:
        ValueError: If a date string does not match ``YYYYMMDD``.
    """
    date_format = '%Y%m%d'
    first_day = datetime.strptime(from_date, date_format)
    last_day = datetime.strptime(to_date, date_format)

    origin = route[:3]
    destination = route[4:7]

    # One output folder per direction, named after the leg ('AAA-BBB' / 'BBB-AAA').
    outbound_dir = os.path.join(parent_dir, route[:7])
    return_dir = os.path.join(parent_dir, route[4:])
    for leg_dir in (outbound_dir, return_dir):
        os.makedirs(leg_dir, exist_ok=True)

    # Emit the jobs directly in their final order. Sorting a concatenated frame
    # by date with the default (non-stable) sort does not guarantee which leg
    # comes first within a date.
    jobs = []
    for day in pd.date_range(start=first_day, end=last_day):
        jobs.append([day, origin, destination, outbound_dir])
        jobs.append([day, destination, origin, return_dir])
    return jobs




In [2]:
def abay_scrapping_domestic(ngay, thang, nam, dep_station, arr_station, location,
                            max_wait=None, poll_interval=0.2):
    """Search one one-way leg on abay.vn and save the listed flights to a CSV file.

    Works on the module-level Selenium ``driver``, which must currently show a
    page containing the ``cphSubColumn_ctl01_*`` search form. The form is
    filled via JavaScript, submitted, and the rows of the ``OutBound`` result
    table are written to
    ``<location>/Abay_scrapping_price_<dep>-<arr>_<YYYYMMDD>.csv`` with the
    columns ``name, time, bag, meal, price, date``.

    Args:
        ngay: Day of month of the departure date.
        thang: Month of the departure date.
        nam: Year of the departure date.
        dep_station: Departure airport code; must be a key of ``abay_station``.
        arr_station: Arrival airport code; must be a key of ``abay_station``.
        location: Existing directory the CSV file is written into.
        max_wait: Maximum number of seconds to keep retrying each browser step
            (submitting the form, reading the result table). ``None`` (the
            default) retries until the step succeeds.
        poll_interval: Seconds to sleep between two attempts of a step.

    Raises:
        KeyError: If a station code is not in ``abay_station``.
        TimeoutError: If ``max_wait`` is set and a browser step keeps failing.
        OSError: If the CSV file cannot be written.
    """

    def _retry(step, action):
        """Call ``action`` until it stops raising, honouring ``max_wait``."""
        deadline = None if max_wait is None else time.monotonic() + max_wait
        while True:
            try:
                return action()
            except Exception as error:  # not a bare except: Ctrl-C must still work
                if deadline is not None and time.monotonic() >= deadline:
                    raise TimeoutError(
                        f'abay.vn {step} still failing after {max_wait}s'
                    ) from error
                time.sleep(poll_interval)

    # Resolved outside the retry loops so that an unknown code fails fast
    # instead of being retried forever.
    dep_value = abay_station[dep_station]
    arr_value = abay_station[arr_station]
    search_date = f'{str(ngay).zfill(2)}/{str(thang).zfill(2)}/{nam}'  # dd/mm/yyyy

    def _submit_search():
        """Fill departure, arrival and date inputs, then click the search button."""
        # Values are passed as script arguments rather than interpolated into
        # the JavaScript source, so no quoting issues can arise.
        set_value = "arguments[0].value = arguments[1];"
        driver.execute_script(set_value, driver.find_element(By.ID, 'hdfstartplace'), dep_value)
        driver.execute_script(set_value, driver.find_element(By.ID, 'hdfendplace'), arr_value)
        date_input = driver.find_element(By.XPATH, '//*[@id="cphSubColumn_ctl01_txtDepartureDate"]')
        driver.execute_script(set_value, date_input, search_date)

        search_button = driver.find_element(By.ID, 'cphSubColumn_ctl01_btnSearchFlight')
        # Scroll to the end of the page before clicking the button.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        search_button.click()

    def _read_rows():
        """Return one ``(name, time, icon_names, price)`` tuple per result row."""
        flights_table = driver.find_element(By.XPATH, '//*[@id="OutBound"]')
        # NOTE(review): an XPath starting with '//' is evaluated against the
        # whole document, not only inside `flights_table`. Kept as in the
        # original; switch to './/tr[...]' once the page structure is confirmed.
        flight_rows = flights_table.find_elements(By.XPATH, '//tr[@class="i-result"]')

        rows = []
        for row in flight_rows:
            icons = row.find_element(By.XPATH, './td[4]').find_elements(By.TAG_NAME, 'img')
            # File names of the icons in the 4th cell, each prefixed with '-'.
            icon_names = ''.join('-' + icon.get_attribute('src').split('/')[-1] for icon in icons)
            rows.append((
                row.find_element(By.XPATH, './td[2]').text,
                row.find_element(By.XPATH, './td[3]').text,
                icon_names,
                row.find_element(By.XPATH, './td[5]').text,
            ))
        return rows

    _retry('search form', _submit_search)
    rows = _retry('result table', _read_rows)

    # Only browser access is retried; building and writing the table happens
    # once, so a genuine error here (e.g. an unwritable path) surfaces to the
    # caller instead of looping forever.
    df = pd.DataFrame(rows, columns=['name', 'time', 'baggage_meal', 'price'])
    df['date'] = f'{nam}-{thang}-{ngay}'
    # Drop the last character of the price text (presumably a currency symbol -- TODO confirm).
    df['price'] = df['price'].str[:-1]

    # 1/0 flags derived from the icon file names ('hanhly' / 'suatan').
    df['bag'] = df['baggage_meal'].str.contains('hanhly')*1
    df['meal'] = df['baggage_meal'].str.contains('suatan')*1
    df = df.reindex(columns=['name', 'time', 'bag', 'meal', 'price', 'date'])

    csv_name = f'Abay_scrapping_price_{dep_station}-{arr_station}_{nam}{str(thang).zfill(2)}{str(ngay).zfill(2)}.csv'
    df.to_csv(os.path.join(location, csv_name), index=False)




def abay_scrapping_international(ngay, thang, nam, dep_station, arr_station, location,
                                 max_wait=None, poll_interval=0.2):
    """Search one one-way leg on abay.vn's international result page and save it to CSV.

    Works on the module-level Selenium ``driver``, which must currently show a
    page containing the ``cphSubColumn_ctl01_*`` search form. After submitting
    the form it waits for the result list under ``#main/section[3]``, reads
    every flight block, keeps only flights whose ``data-stop-points`` is
    ``'0'`` and writes them to
    ``<location>/Abay_scrapping_price_<dep>-<arr>_<YYYYMMDD>.csv`` with the
    columns ``name, time, bag, meal, price, date``.

    Args:
        ngay: Day of month of the departure date.
        thang: Month of the departure date.
        nam: Year of the departure date.
        dep_station: Departure airport code; must be a key of ``abay_station``.
        arr_station: Arrival airport code; must be a key of ``abay_station``.
        location: Existing directory the CSV file is written into.
        max_wait: Maximum number of seconds to keep retrying each waiting step
            (submitting the form, waiting for the result list). ``None`` (the
            default) retries until the step succeeds.
        poll_interval: Seconds to sleep between two attempts of a step.

    Raises:
        KeyError: If a station code is not in ``abay_station``.
        TimeoutError: If ``max_wait`` is set and a waiting step keeps failing.
        OSError: If the CSV file cannot be written.
        Exception: Any Selenium error raised while reading a flight block that
            was found (these lookups are not retried, as in the original).
    """

    def _retry(step, action):
        """Call ``action`` until it stops raising, honouring ``max_wait``."""
        deadline = None if max_wait is None else time.monotonic() + max_wait
        while True:
            try:
                return action()
            except Exception as error:  # not a bare except: Ctrl-C must still work
                if deadline is not None and time.monotonic() >= deadline:
                    raise TimeoutError(
                        f'abay.vn {step} still failing after {max_wait}s'
                    ) from error
                time.sleep(poll_interval)

    # Resolved outside the retry loops so that an unknown code fails fast
    # instead of being retried forever.
    dep_value = abay_station[dep_station]
    arr_value = abay_station[arr_station]
    search_date = f'{str(ngay).zfill(2)}/{str(thang).zfill(2)}/{nam}'  # dd/mm/yyyy

    def _submit_search():
        """Fill departure, arrival and date inputs, then click the search button."""
        # Values are passed as script arguments rather than interpolated into
        # the JavaScript source, so no quoting issues can arise.
        set_value = "arguments[0].value = arguments[1];"
        driver.execute_script(set_value, driver.find_element(By.ID, 'hdfstartplace'), dep_value)
        driver.execute_script(set_value, driver.find_element(By.ID, 'hdfendplace'), arr_value)
        date_input = driver.find_element(By.XPATH, '//*[@id="cphSubColumn_ctl01_txtDepartureDate"]')
        driver.execute_script(set_value, date_input, search_date)

        search_button = driver.find_element(By.ID, 'cphSubColumn_ctl01_btnSearchFlight')
        # Scroll to the end of the page before clicking the button.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        search_button.click()

    def _results_loaded():
        """Raise until the first block of the result section is the captions container."""
        first_block = driver.find_element(By.XPATH, '//*[@id="main"]/section[3]/div[1]')
        if first_block.get_attribute('class') != 'captions-container':
            raise RuntimeError('result list not rendered yet')

    _retry('search form', _submit_search)
    # Wait until all flights are available.
    _retry('result list', _results_loaded)

    name = []
    dep_time = []
    stop_point = []
    price_adt = []

    # Flight blocks are div[2], div[3], ... of the result section (div[1] is
    # the captions container); stop at the first index that does not exist.
    position = 2
    while True:
        try:
            flight = driver.find_element(By.XPATH, f'//*[@id="main"]/section[3]/div[{position}]')
        except Exception:
            break

        # textContent is used for the name (instead of .text), which also
        # returns text of elements that are not displayed.
        name.append(driver.find_element(By.XPATH, f'//*[@id="main"]/section[3]/div[{position}]/div[2]/div/ul[4]/li[2]/code').get_attribute("textContent"))
        dep_time.append(flight.find_element(By.XPATH, './/div[1]/ul/li/div[2]/strong[1]').text)
        stop_point.append(flight.get_attribute('data-stop-points'))
        price_adt.append(flight.get_attribute('data-base-price-adt'))
        position += 1

    # bag/meal are fixed to 0 for every row; the date is not zero-padded.
    df = pd.DataFrame({'name':name,'time': dep_time,'bag':0,'meal':0,'stop_point': stop_point,'price': price_adt, 'date':f'{nam}-{thang}-{ngay}'})
    df['time'] = df['time'].str.replace('|', '', regex=False)
    # The departure time is repeated on both sides of ' - '
    # (presumably to mimic the 'dep - arr' layout of the domestic CSV -- TODO confirm).
    df['time'] = df['time'] + ' - ' + df['time']
    df['stop_point'] = df['stop_point'].str.replace('|', '', regex=False)
    # Keep only flights whose stop-point attribute is '0'.
    df = df[df['stop_point'] == '0']
    df = df[['name', 'time', 'bag', 'meal', 'price', 'date']]

    csv_name = f'Abay_scrapping_price_{dep_station}-{arr_station}_{nam}{str(thang).zfill(2)}{str(ngay).zfill(2)}.csv'
    df.to_csv(os.path.join(location, csv_name), index=False)

In [3]:
# Date range (YYYYMMDD, inclusive) scraped for every route.
SCRAPE_FROM = '20230620'
SCRAPE_TO = '20230731'

# (route, pause in seconds after each successful scrape).
# The pauses mirror the original cell: only the first two routes waited 0.5 s.
ROUTE_PLAN = [
    ('SGN-UIH-SGN', 0.5),
    ('SGN-PQC-SGN', 0.5),
    ('SGN-DAD-SGN', 0),
    ('HAN-CXR-HAN', 0),
]


def _scrape_with_retry(task, attempts=2, pause=0):
    """Run abay_scrapping_domestic(*task), retrying on failure.

    Args:
        task: ``[day, month, year, dep, arr, output_dir]``.
        attempts: Total number of tries before giving up.
        pause: Seconds to sleep after a successful scrape.

    Returns:
        True if one attempt succeeded, False if all of them raised.
    """
    for _ in range(attempts):
        try:
            abay_scrapping_domestic(*task)
        except Exception:
            # `except Exception` (not bare) so Ctrl-C still aborts the run.
            continue
        if pause:
            time.sleep(pause)
        return True
    return False


err_list = []   # jobs that failed during the main pass
err_list2 = []  # jobs that failed again during the second pass
try:
    for route, pause in ROUTE_PLAN:
        for x in tqdm(route_pricing(SCRAPE_FROM, SCRAPE_TO, route), desc=route):
            task = [x[0].day, x[0].month, x[0].year, x[1], x[2], x[3]]
            if not _scrape_with_retry(task, pause=pause):
                err_list.append(task)

    # Second pass over everything that failed above.
    for x in tqdm(err_list, desc='err_list'):
        if not _scrape_with_retry(x):
            err_list2.append(list(x))

    print(err_list2)
finally:
    # Always close the browser, even when the run is interrupted or fails,
    # so no Chrome/chromedriver process is left behind.
    driver.quit()

SGN-UIH-SGN: 100%|█████████████████████████████████████████████████████████████████████| 84/84 [02:07<00:00,  1.52s/it]
SGN-PQC-SGN: 100%|█████████████████████████████████████████████████████████████████████| 84/84 [03:58<00:00,  2.84s/it]
SGN-DAD-SGN: 100%|█████████████████████████████████████████████████████████████████████| 84/84 [04:18<00:00,  3.08s/it]
HAN-CXR-HAN: 100%|█████████████████████████████████████████████████████████████████████| 84/84 [03:23<00:00,  2.42s/it]
err_list: 0it [00:00, ?it/s]


[]
