In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from datetime import date
from datetime import datetime
from selenium.webdriver.support.ui import Select
import os
import pandas as pd
import time
from tqdm import tqdm
# Start a single Chrome session when this cell runs. The value returned by
# ChromeDriverManager().install() is handed to Service. The resulting
# module-level `driver` is the browser used by abay_scrapping() and is closed
# by the driver.quit() call at the end of the notebook.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Lookup tables and functions used for scraping

# Airport code -> ASCII city slug. Not referenced by the code visible in this
# notebook.
abay_routes = {
    'SGN': 'Tp_Ho_Chi_Minh',
    'HAN': 'Ha_Noi',
    'DAD': 'Da_Nang',
    'UIH': 'Quy_Nhon',
    'PQC': 'Phu_Quoc',
}

# Airport code -> label written into the abay.vn search form by abay_scrapping().
abay_station = {
    'SGN': 'TP Hồ Chí Minh (SGN)',
    'HAN': 'Hà Nội (HAN)',
    'DAD': 'Đà Nẵng (DAD)',
    'UIH': 'Quy Nhơn (UIH)',
    'PQC': 'Phú Quốc (PQC)',
}

def create_dir(route:str, parent_dir:str = r'C:\Users\admin\Desktop\VTA Jupyter\webscrapping'):
    """Create the two output folders for a round-trip route and return their paths.

    The route string is sliced by position: characters 0-6 name the outbound
    leg folder and everything from index 4 onward names the return leg folder,
    so 'HAN-DAD-HAN' yields 'HAN-DAD' and 'DAD-HAN'.

    Args:
        route: Route string in the form 'AAA-BBB-AAA'.
        parent_dir: Folder under which the two leg folders are created.
            Missing intermediate folders are created as well.

    Returns:
        A two-element list ``[outbound_dir, return_dir]``.

    Raises:
        FileExistsError: If one of the target paths exists but is not a
            directory.
    """
    dir_1 = os.path.join(parent_dir, route[:7])
    dir_2 = os.path.join(parent_dir, route[4:])

    # makedirs(exist_ok=True) tolerates re-runs and, unlike mkdir, also works
    # when parent_dir itself has not been created yet.
    for leg_dir in (dir_1, dir_2):
        os.makedirs(leg_dir, exist_ok=True)

    return [dir_1, dir_2]


def route_pricing(from_date:str, to_date:str, route:str, parent_dir:str = r'C:\Users\admin\Desktop\VTA Jupyter\webscrapping'):
    """Build the list of scraping jobs for both legs of a route over a date range.

    Also makes sure the output folder of each leg exists under ``parent_dir``.

    Args:
        from_date: First departure date, formatted 'YYYYMMDD'.
        to_date: Last departure date (inclusive), formatted 'YYYYMMDD'.
        route: Route string in the form 'AAA-BBB-AAA'. It is sliced by
            position: [:3] is the origin, [4:7] the destination, [:7] the
            outbound folder name and [4:] the return folder name.
        parent_dir: Folder under which the two leg folders are created.

    Returns:
        A list of ``[date, dep, arr, out_dir]`` lists, where ``date`` is a
        pandas Timestamp. All outbound dates come first, followed by all
        return-leg dates. Empty when ``from_date`` is after ``to_date``.

    Raises:
        ValueError: If a date string does not match 'YYYYMMDD'.
        FileExistsError: If a leg folder path exists but is not a directory.
    """
    input_format_date = '%Y%m%d'
    scrapping_dates = pd.date_range(
        start=datetime.strptime(from_date, input_format_date),
        end=datetime.strptime(to_date, input_format_date),
    )

    dep1 = route[:3]
    arr1 = route[4:7]

    # (departure, arrival, output folder) for the outbound and the return leg
    legs = [
        (dep1, arr1, os.path.join(parent_dir, route[:7])),
        (arr1, dep1, os.path.join(parent_dir, route[4:])),
    ]

    # makedirs(exist_ok=True) tolerates re-runs and, unlike mkdir, does not
    # fail when parent_dir itself is missing.
    for _, _, leg_dir in legs:
        os.makedirs(leg_dir, exist_ok=True)

    # One job per (leg, date); iterating the DatetimeIndex yields Timestamps.
    return [
        [day, dep, arr, leg_dir]
        for dep, arr, leg_dir in legs
        for day in scrapping_dates
    ]

def abay_scrapping(ngay, thang, nam, dep_station, arr_station, location):
    """Scrape one departure date of one leg from abay.vn and save it as a CSV.

    Uses the module-level Selenium ``driver``: loads the home page, fills in
    the search form, submits it, waits one second, reads every result row and
    writes the rows to a CSV file inside ``location``.

    Args:
        ngay: Day of the departure date.
        thang: Month of the departure date.
        nam: Year of the departure date.
        dep_station: Departure airport code; must be a key of ``abay_station``.
        arr_station: Arrival airport code; must be a key of ``abay_station``.
        location: Existing folder in which the CSV file is written.

    Returns:
        None. The output file is named
        ``Abay_scrapping_price_<dep>-<arr>_<YYYYMMDD>.csv`` and has the columns
        name, time, bag, meal, price, date.

    Raises:
        KeyError: If a station code is not in ``abay_station``.
        Exception: Selenium errors (for example a missing form element or
            results table) propagate to the caller, which retries.
    """
    # Resolve the labels first so an unknown code fails before any page load.
    dep_label = abay_station[dep_station]
    arr_label = abay_station[arr_station]

    driver.get('https://www.abay.vn')

    # choose the route: the labels are passed as script arguments instead of
    # being pasted into the JavaScript source, so quoting can never break it
    set_value_js = "document.getElementById(arguments[0]).setAttribute('value', arguments[1]);"
    driver.execute_script(set_value_js, 'cphMain_ctl00_usrSearchFormDV2_txtFrom', dep_label)
    driver.execute_script(set_value_js, 'cphMain_ctl00_usrSearchFormDV2_txtTo', arr_label)

    # select day
    dep_day = Select(driver.find_element(By.ID, 'cphMain_ctl00_usrSearchFormDV2_cboDepartureDay'))
    dep_day.select_by_value(str(ngay))
    # select month (option values look like 'MM/YYYY')
    dep_month = Select(driver.find_element(By.ID, 'cphMain_ctl00_usrSearchFormDV2_cboDepartureMonth'))
    dep_month.select_by_value(f'{str(thang).zfill(2)}/{nam}')
    # click the button
    driver.find_element(By.ID, 'cphMain_ctl00_usrSearchFormDV2_btnSearch').click()

    # NOTE(review): fixed pause kept from the original; an explicit wait on the
    # results table would be more reliable on slow page loads.
    time.sleep(1)

    # scrape information
    flights_table = driver.find_element(By.XPATH, '//table[@class="f-result"]')
    # The leading '.' scopes the search to this table; a bare '//' would search
    # the whole document even when called on an element.
    flight_rows = flights_table.find_elements(By.XPATH, './/tr[@class="i-result"]')

    f_name = []
    f_time = []
    f_bag = []
    f_meal = []
    f_price = []

    for row in flight_rows:
        f_name.append(row.find_element(By.XPATH, './td[2]').text)
        f_time.append(row.find_element(By.XPATH, './td[3]').text)

        # Column 4 holds icons; their image file names tell whether baggage
        # ('hanhly') and/or a meal ('suatan') is included.
        icons = row.find_element(By.XPATH, './td[4]').find_elements(By.TAG_NAME, 'img')
        icon_names = ''.join('-' + icon.get_attribute('src').split('/')[-1] for icon in icons)
        f_bag.append(int('hanhly' in icon_names))
        f_meal.append(int('suatan' in icon_names))

        # Drop the last character of the price text (presumably the currency
        # symbol - confirm against the live page).
        f_price.append(row.find_element(By.XPATH, './td[5]').text[:-1])

    df = pd.DataFrame({
        'name': f_name,
        'time': f_time,
        'bag': f_bag,
        'meal': f_meal,
        'price': f_price,
        'date': [f'{nam}-{thang}-{ngay}'] * len(f_name),
    })

    csv_name = f'Abay_scrapping_price_{dep_station}-{arr_station}_{nam}{str(thang).zfill(2)}{str(ngay).zfill(2)}.csv'
    df.to_csv(os.path.join(location, csv_name), index=False)

In [None]:
# Scrape every date of every route, retrying failed days, then retry the
# leftovers once more and report what still could not be scraped.

FROM_DATE = '20221026'
TO_DATE = '20221130'

# Number of tries per job. The original loops gave up when their counter
# reached 5, which happened after the 4th failed attempt.
MAX_ATTEMPTS = 4

# (route, pause in seconds after a successful scrape)
ROUTES = (
    ('HAN-DAD-HAN', 0),
    ('HAN-UIH-HAN', 1.5),
    ('HAN-SGN-HAN', 0),
)


def _scrape_with_retry(job, pause=0, max_attempts=MAX_ATTEMPTS):
    """Run abay_scrapping(*job) up to ``max_attempts`` times.

    Args:
        job: ``[day, month, year, dep, arr, out_dir]``.
        pause: Seconds to sleep after a successful scrape.
        max_attempts: Maximum number of tries before giving up.

    Returns:
        True if one attempt succeeded, False if all of them failed.
    """
    for _ in range(max_attempts):
        try:
            abay_scrapping(*job)
        except Exception:
            # Any scraping error triggers another try. KeyboardInterrupt and
            # SystemExit are deliberately not caught so the run can be stopped.
            continue
        if pause:
            time.sleep(pause)
        return True
    return False


err_list = []
err_list2 = []
try:
    # first pass over every route and date
    for route, pause in ROUTES:
        for day, dep, arr, out_dir in tqdm(route_pricing(FROM_DATE, TO_DATE, route), desc=route):
            job = [day.day, day.month, day.year, dep, arr, out_dir]
            if not _scrape_with_retry(job, pause):
                err_list.append(job)

    # second pass over the jobs that failed the first time
    for job in tqdm(err_list, desc='err_list'):
        if not _scrape_with_retry(job):
            err_list2.append(list(job))

    print(err_list2)
finally:
    # Always close the browser, even when the run is interrupted or fails.
    driver.quit()