In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import os
import requests
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import time



In [None]:
# --- Scrape configuration ---------------------------------------------------
# Route whose rows are requested through the grid's value filter.
route_id = '9847'
# Resource view page that hosts the data grid.
page_url = 'https://data.gov.il/dataset/arrivaltostationdayandhours/resource/c96b9865-006d-4c69-8d6a-bfbdea5d6da5/view/1e7d5e63-e899-46f0-84ec-ad225fadae36'
# Value sent as the browser's User-Agent (see scrape_data).
user_agent = 'datagov-external-client'
# (field name, value) pairs consumed by add_filters.
filters_list = [('route_id', route_id)]

# --- Column accumulators ----------------------------------------------------
# One independent list per scraped column; extract_data_from_row_element
# appends one entry to each of them for every grid row.
route_id_ls = list()
month_ls = list()
day_of_week_ls = list()
hour_source_time_ls = list()
stop_sequence_ls = list()
stop_code_ls = list()
arrival_time_ls = list()
distance_from_src_ls = list()

In [None]:
def set_filters(driver):
    """Open the grid's value-filter panel and apply the module-level ``filters_list``.

    Args:
        driver: Selenium WebDriver with the dataset page already loaded.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by ``WebDriverWait``
            when the filter button does not become clickable within 10 seconds.
    """
    wait = WebDriverWait(driver, 10)  # Maximum wait time of 10 seconds
    # Wait until the button is clickable (visible and enabled), not merely
    # present in the DOM: clicking a present-but-not-yet-interactable element
    # raises instead of opening the panel.
    filter_btn = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, ".menu-right .btn-group button[data-action='valueFilter']")
        )
    )
    filter_btn.click()

    add_filters(driver, filters_list)

In [None]:
def add_filters(driver, filters_list):
    """Add one value filter per ``(field, value)`` pair through the filter panel UI.

    For every pair the function clicks "add filter", picks the field in the
    ``<select>``, submits it, types the value into the field's input and clicks
    "update". After all pairs are processed it sleeps for 5 seconds.

    Args:
        driver: Selenium WebDriver with the filter panel already open.
        filters_list: iterable of ``(option_name, value)`` tuples.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if any of the panel
            controls cannot be found.
    """
    # NOTE: the find_element_by_* helpers were removed in Selenium 4.3; the
    # By-locator form below works on both Selenium 3 and 4.
    for option_name, value in filters_list:
        # Click "add filter" to open the select element.
        driver.find_element(By.CSS_SELECTOR, 'button.add-filter').click()

        # Open the field dropdown.
        select_element = driver.find_element(By.CSS_SELECTOR, 'fieldset select')
        select_element.click()

        # Pick the field. The attribute value is quoted so that field names
        # which are not plain CSS identifiers still form a valid selector.
        field_option = select_element.find_element(
            By.CSS_SELECTOR, f'option[value="{option_name}"]'
        )
        field_option.click()
        # Second click on the select; presumably closes the dropdown again.
        select_element.click()

        driver.find_element(By.CSS_SELECTOR, 'fieldset button[type="submit"]').click()

        input_element = driver.find_element(
            By.CSS_SELECTOR, f'fieldset input[data-filter-field="{option_name}"]'
        )
        input_element.clear()
        input_element.send_keys(value)

        driver.find_element(By.CSS_SELECTOR, 'button.update-filter').click()

    # Fixed pause after the filters are submitted; presumably gives the grid
    # time to reload before the caller reads it.
    time.sleep(5)

In [None]:
def next_page(driver, css_selector):
    """Click the element matched by ``css_selector`` (the pagination control).

    Args:
        driver: Selenium WebDriver.
        css_selector: CSS selector of the element to click.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if nothing matches.
    """
    # find_element_by_css_selector was removed in Selenium 4.3; use the
    # By-locator API, which is available in Selenium 3 and 4 alike.
    driver.find_element(By.CSS_SELECTOR, css_selector).click()

In [None]:
def extract_data_from_row_element(el):
    """Append the text of one grid row's cells to the module-level column lists.

    The cells are located by their CSS classes (``.l1`` ... ``.l10``) inside
    ``el`` and each value is appended to the matching ``*_ls`` list.

    Args:
        el: Selenium WebElement for a single grid row.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if a cell is missing.
            In that case none of the lists is modified.
    """
    # (cell selector, destination list), in the order the cells are read.
    columns = (
        ('.l2', route_id_ls),
        ('.l1', month_ls),
        ('.l3', day_of_week_ls),
        ('.l4', hour_source_time_ls),
        ('.l5', stop_sequence_ls),
        ('.l6', stop_code_ls),
        ('.l8', arrival_time_ls),
        ('.l10', distance_from_src_ls),
    )
    # Read every cell BEFORE appending anything: if one lookup fails, the lists
    # stay the same length instead of ending up with a half-recorded row.
    values = [el.find_element(By.CSS_SELECTOR, selector).text for selector, _ in columns]
    for (_, target), value in zip(columns, values):
        target.append(value)

In [None]:
def get_data_from_current_page(driver):
    """Walk the grid rows of the current page and record each one.

    Starts at the first ``.grid-canvas .slick-row`` element, outlines it in
    red, scrolls it into view, hands it to ``extract_data_from_row_element``
    and then moves on to the next sibling element. The walk ends when a
    ``NoSuchElementException`` is raised, i.e. when there is no first row, no
    further sibling, or a row lacks an expected cell.

    Args:
        driver: Selenium WebDriver showing the filtered grid.
    """
    # find_element_by_* helpers were removed in Selenium 4.3; the By-locator
    # calls below work on Selenium 3 and 4.
    try:
        # get first row element
        row = driver.find_element(By.CSS_SELECTOR, '.grid-canvas .slick-row')
        while True:
            # mark the row and scroll it into view
            driver.execute_script("arguments[0].style.border='2px solid red';", row)
            driver.execute_script("arguments[0].scrollIntoView();", row)

            extract_data_from_row_element(row)

            # get next row (find_element returns the first following sibling)
            row = row.find_element(By.XPATH, "following-sibling::*")
    except NoSuchElementException:
        # Normal loop exit: nothing (more) to read on this page.
        pass



In [None]:
def get_total_results_number(driver, css_selector):
    """Return the integer shown in the element matched by ``css_selector``.

    Args:
        driver: Selenium WebDriver.
        css_selector: CSS selector of the element whose text holds the count.

    Returns:
        int: the element's text parsed as an integer.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if nothing matches.
        ValueError: if the text is not an integer.
    """
    # find_element_by_css_selector was removed in Selenium 4.3; use the
    # By-locator API instead.
    el = driver.find_element(By.CSS_SELECTOR, css_selector)
    # Accept counts rendered with thousands separators, e.g. "1,234".
    return int(el.text.strip().replace(',', ''))

In [None]:
def _create_chrome_driver(user_agent):
    """Start Chrome via ./chromedriver with the given User-Agent string."""
    options = webdriver.ChromeOptions()
    options.add_argument(f"--user-agent={user_agent}")
    try:
        # Selenium 4: the driver binary is supplied through a Service object
        # (the executable_path keyword was removed in Selenium 4.10).
        service = webdriver.chrome.service.Service(executable_path='./chromedriver')
        return webdriver.Chrome(service=service, options=options)
    except TypeError:
        # Selenium 3: Chrome() does not accept ``service``.
        return webdriver.Chrome(executable_path='./chromedriver', options=options)


def scrape_data(page_url,user_agent,css_selectors=[]):
    """Scrape every filtered grid row of ``page_url`` into the module-level lists.

    Opens Chrome, loads the page, applies the filters, reads the total result
    count from ``.doc-count`` and then pages through the grid until that many
    rows have been collected in ``route_id_ls`` (and the sibling ``*_ls`` lists).

    Args:
        page_url: URL of the resource view page that hosts the grid.
        user_agent: value for the browser's ``--user-agent`` argument.
        css_selectors: unused; kept so existing callers keep working.

    Returns:
        None. Results are accumulated in the module-level ``*_ls`` lists.
    """
    import warnings

    driver = _create_chrome_driver(user_agent)
    # try/finally so the browser is closed even when a step below raises.
    try:
        driver.get(page_url)

        # set filters
        set_filters(driver)
        desired_number_of_rows = get_total_results_number(driver,'.doc-count')

        while len(route_id_ls) < desired_number_of_rows:
            collected_before = len(route_id_ls)
            get_data_from_current_page(driver)
            collected_after = len(route_id_ls)

            if collected_after == collected_before:
                # A page that adds nothing would otherwise loop forever.
                warnings.warn(
                    f"Scraping stopped early: collected {collected_after} "
                    f"of {desired_number_of_rows} rows."
                )
                break
            if collected_after >= desired_number_of_rows:
                # Everything collected: do not click "next" past the last page.
                break

            next_page(driver, 'li.next a')
            time.sleep(3)
    finally:
        driver.quit()

In [None]:
# Rebind every accumulator to a fresh empty list so that re-running this cell
# does not append to rows collected by an earlier run.
route_id_ls, month_ls, day_of_week_ls, hour_source_time_ls = [], [], [], []
stop_sequence_ls, stop_code_ls, arrival_time_ls, distance_from_src_ls = [], [], [], []

# Run the scraper; it fills the lists above as a side effect.
scrape_data(page_url, user_agent)

# Cell output: number of rows collected.
len(route_id_ls)

In [None]:
# Column name -> list of scraped values (all lists are filled row by row by
# extract_data_from_row_element). Key order fixes the DataFrame column order.
dic = dict(
    route_id=route_id_ls,
    month=month_ls,
    day_of_week=day_of_week_ls,
    hour_source_time=hour_source_time_ls,
    stop_sequence=stop_sequence_ls,
    stop_code=stop_code_ls,
    distance_from_src=distance_from_src_ls,
    arrival_time=arrival_time_ls,
)

# One DataFrame row per scraped grid row.
df = pd.DataFrame(data=dic)

In [None]:
# Persist the scraped table next to the notebook; the index is not written.
df.to_csv(path_or_buf='dataset.csv', index=False)