In [1]:
try:
    import selenium
except ModuleNotFoundError:
    %pip install selenium

import os
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from tqdm.notebook import tqdm, trange
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

# Start every run with a fresh log file. Done in pure Python (instead of the
# shell `rm`) so that a missing log.txt on the first run is not an error and
# the cell also works on platforms without `rm`.
try:
    os.remove("log.txt")
except FileNotFoundError:
    pass

def print_it(*args, **kwargs):
    """Print to the notebook output and append the same text to ``log.txt``.

    Accepts exactly the positional and keyword arguments of the built-in
    ``print`` (``sep``, ``end``, ``flush``), except ``file``: the function
    supplies its own ``file`` for the log copy, so passing one raises
    ``TypeError``.

    The log is opened in append mode on every call, so each message is on
    disk as soon as the call returns. The encoding is pinned to UTF-8 so that
    non-ASCII text is written identically regardless of the machine's locale.
    """
    print(*args, **kwargs)
    with open('log.txt', 'a', encoding='utf-8') as log_file:
        print(*args, **kwargs, file=log_file)

# Wall-clock timestamp taken when this first cell runs; the last cell of the
# notebook subtracts it from the end time to report the total run duration.
global_init = time()

In [2]:
# Set up a headless Chrome WebDriver (no visible window, so nothing on the
# page can be interacted with by hand).
op = webdriver.ChromeOptions()
op.add_argument('headless')
# Tell Chrome where to save downloaded files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
op.add_experimental_option("prefs", {
    "download.default_directory": "/home/patel_zeel/cpcb_helper/another_files",
})

driver = webdriver.Chrome(options=op)

# Open the CPCB data-repository page.
driver.get("https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/caaqm-data-repository")
# Wait (up to 10 s) until at least one "ng-select" dropdown is present in the DOM.
# As the last expression of the cell, the located element is echoed as cell output.
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ng-select")))

<selenium.webdriver.remote.webelement.WebElement (session="2ddc4893dc7628db5aceb207595b065d", element="03A0536ACDE0BEAAC9C56669C876469E_element_7")>

## Create a database

In [3]:
def initialize():
    """Pick the first option in the first two ``ng-select`` dropdowns.

    Uses the module-level ``driver``. According to the original comments,
    dropdown 0 is the data type (first option: raw data) and dropdown 1 is
    the frequency (first option: "15 min") — this depends on the live page
    layout and is not verified here.

    Returns:
        The full list of ``ng-select`` elements found on the page, so callers
        can keep working with the remaining dropdowns.
    """
    dropdowns = driver.find_elements(By.CSS_SELECTOR, "ng-select")

    # Position 0: data type, position 1: frequency. For each, open the
    # dropdown and click its first <li> option.
    for position in (0, 1):
        dropdown = dropdowns[position]
        dropdown.click()
        first_option = dropdown.find_elements(By.TAG_NAME, "li")[0]
        first_option.click()

    return dropdowns

def select_nth_state(n):
    """Re-run ``initialize()`` and choose option ``n`` of the state dropdown.

    The state dropdown is the ``ng-select`` element at index 2.

    Args:
        n: Zero-based position of the state in the dropdown's option list.

    Returns:
        The list of ``ng-select`` elements returned by ``initialize()``.
    """
    dropdowns = initialize()
    state_dropdown = dropdowns[2]
    # Open the dropdown, then click the requested <li> option.
    state_dropdown.click()
    state_dropdown.find_elements(By.TAG_NAME, "li")[n].click()
    return dropdowns

def select_nth_city(state_idx, city_idx):
    """Select a state, then choose option ``city_idx`` of the city dropdown.

    The city dropdown is the ``ng-select`` element at index 3.

    Args:
        state_idx: Zero-based position of the state option.
        city_idx: Zero-based position of the city option within that state.

    Returns:
        The list of ``ng-select`` elements returned by ``select_nth_state``.
    """
    dropdowns = select_nth_state(state_idx)
    city_dropdown = dropdowns[3]
    # Open the dropdown, then click the requested <li> option.
    city_dropdown.click()
    city_dropdown.find_elements(By.TAG_NAME, "li")[city_idx].click()
    return dropdowns

def select_nth_station(state_idx, city_idx, station_idx):
    """Select a state and city, then choose option ``station_idx`` of the station dropdown.

    The station dropdown is the ``ng-select`` element at index 4.

    Args:
        state_idx: Zero-based position of the state option.
        city_idx: Zero-based position of the city option within that state.
        station_idx: Zero-based position of the station option within that city.

    Returns:
        The list of ``ng-select`` elements returned by ``select_nth_city``.
    """
    dropdowns = select_nth_city(state_idx, city_idx)
    station_dropdown = dropdowns[4]
    # Open the dropdown, then click the requested <li> option.
    station_dropdown.click()
    station_dropdown.find_elements(By.TAG_NAME, "li")[station_idx].click()
    return dropdowns

# Build (or load from the "data.pkl" cache) a nested index of the dropdown contents:
#   data[state_idx]  = {"name": <state text>, "cities": {...}}
#   ...["cities"][city_idx]   = {"name": <city text>, "stations": {...}}
#   ...["stations"][station_idx] = {"name": <station text>}
# All keys are the zero-based positions of the options in their dropdown, which is
# exactly what select_nth_state / select_nth_city / select_nth_station expect.
if not os.path.exists("data.pkl"):    
    # Open the state dropdown (index 2) once to read all state names.
    elements = initialize()
    state_element = elements[2]
    state_element.click()
    states = state_element.find_elements(By.TAG_NAME, "li")
    data = {state_idx: {"name": state.text, "cities": {}} for state_idx, state in enumerate(states)}
    for state_idx in trange(len(states)):
        # Selecting by position re-runs initialize() and re-queries the dropdowns
        # (presumably to avoid holding stale element references — not verified here).
        elements = select_nth_state(state_idx)
        
        # With the state selected, open the city dropdown (index 3) and read its options.
        city_element = elements[3]
        city_element.click()
        cities = city_element.find_elements(By.TAG_NAME, "li")
        data[state_idx]["cities"] = {city_idx: {"name": city.text, "stations": {}} for city_idx, city in enumerate(cities)}
        for city_idx in range(len(data[state_idx]["cities"])):
            elements = select_nth_city(state_idx, city_idx)
            
            # With the city selected, open the station dropdown (index 4) and read its options.
            station_element = elements[4]
            station_element.click()
            stations = station_element.find_elements(By.TAG_NAME, "li")
            data[state_idx]["cities"][city_idx]["stations"] = {station_idx: {"name": station.text} for station_idx, station in enumerate(stations)}
    # Log the first state's entry as a sanity check, then cache the whole index.
    print_it(data[0])
    pd.to_pickle(data, "data.pkl")
else:
    print_it("Data already exists")
    # NOTE: unpickling can execute arbitrary code — only load a data.pkl that
    # was produced by this notebook, never one from an untrusted source.
    data = pd.read_pickle("data.pkl")
    print_it(data[0])

Data already exists
{'name': 'Andhra Pradesh', 'cities': {0: {'name': 'Amaravati', 'stations': {0: {'name': 'Secretariat, Amaravati - APPCB'}}}, 1: {'name': 'Anantapur', 'stations': {0: {'name': 'Gulzarpet, Anantapur - APPCB'}}}, 2: {'name': 'Chittoor', 'stations': {0: {'name': 'Gangineni Cheruvu, Chittoor - APPCB'}}}, 3: {'name': 'Kadapa', 'stations': {0: {'name': 'Yerramukkapalli, Kadapa - APPCB'}}}, 4: {'name': 'Rajamahendravaram', 'stations': {0: {'name': 'Anand Kala Kshetram, Rajamahendravaram - APPCB'}}}, 5: {'name': 'Tirupati', 'stations': {0: {'name': 'Tirumala, Tirupati - APPCB'}, 1: {'name': 'Vaikuntapuram, Tirupati - APPCB'}}}, 6: {'name': 'Vijayawada', 'stations': {0: {'name': 'PWD Grounds, Vijayawada - APPCB'}, 1: {'name': 'Rajiv Nagar, Vijayawada - APPCB'}, 2: {'name': 'HB Colony, Vijayawada - APPCB'}, 3: {'name': 'Kanuru, Vijayawada - APPCB'}, 4: {'name': 'Rajiv Gandhi Park, Vijayawada - APPCB'}}}, 7: {'name': 'Visakhapatnam', 'stations': {0: {'name': 'GVM Corporation, V

In [4]:
# Close the browser that was used to build the station index; the download
# loop further down creates its own browser for every station.
driver.quit()

## Download

In [5]:
def download(config_tuple):
    """Select one station on the page, submit the form and click every download link.

    Operates on the module-level ``driver``, which must already have the
    data-repository page loaded.

    Args:
        config_tuple: ``(state_idx, city_idx, station_idx)`` dropdown positions.

    Raises:
        Whatever the Selenium calls raise; in particular the 5-second wait
        fails if no element with class ``fa-download`` appears after submitting.
    """
    state_idx, city_idx, station_idx = config_tuple
    select_nth_station(state_idx, city_idx, station_idx)

    # The second <button> on the page is treated as the Submit button
    # (per the original comment; depends on the live page layout).
    submit_button = driver.find_elements(By.TAG_NAME, "button")[1]
    submit_button.click()

    # Block until at least one download icon is present in the DOM.
    icon_locator = (By.CLASS_NAME, "fa-download")
    WebDriverWait(driver, 5).until(EC.presence_of_element_located(icon_locator))

    # Click each download icon, pausing one second between clicks.
    for icon in driver.find_elements(*icon_locator):
        icon.click()
        sleep(1)
            
def wait_for_downloads(download_dir, timeout=120):
    """
    Wait until all downloads are finished in the specified download directory.

    A download counts as unfinished while a file whose name ends in
    ``.crdownload`` exists in the directory. The directory is polled once per
    second.

    Args:
    - download_dir: The directory where downloads are saved.
    - timeout: The maximum time to wait for downloads to finish, in seconds. Default is 120 seconds.

    Returns:
    - True if a poll found no ``.crdownload`` files before the timeout elapsed,
      False if the timeout elapsed first (the timeout is also logged, so an
      incomplete download is no longer indistinguishable from success).
    """
    start_time = time()
    while time() - start_time < timeout:
        if not any(filename.endswith('.crdownload') for filename in os.listdir(download_dir)):
            # If there are no files with the .crdownload extension, all downloads are finished
            return True
        print_it("Waiting for downloads to finish...")
        sleep(1)  # Wait for 1 second before checking again
    print_it("Timed out after", timeout, "seconds waiting for downloads in", download_dir)
    return False

In [6]:
# Flatten the nested state -> city -> station index into one list of
# (state_idx, city_idx, station_idx) tuples, one per station, in index order.
config_tuples = [
    (state_idx, city_idx, station_idx)
    for state_idx, state in data.items()
    for city_idx, city in state["cities"].items()
    for station_idx in city["stations"]
]

# Echo the number of stations as the cell output.
len(config_tuples)

544

In [7]:
# Download the files of every station, one fresh headless browser per station so
# that each station's files land in their own directory. A failure for one
# station is logged and the loop moves on to the next.
for config_tuple in tqdm(config_tuples):
    state_idx, city_idx, station_idx = config_tuple
    state_name = data[state_idx]["name"]
    city_name = data[state_idx]["cities"][city_idx]["name"]
    station_name = data[state_idx]["cities"][city_idx]["stations"][station_idx]["name"]
    
    print_it("Downloading", "State:", state_name, "City:", city_name, "Station:", station_name)
    
    # Reset before the try so the cleanup below only ever closes the browser
    # started in THIS iteration (and is skipped if Chrome failed to start).
    driver = None
    try:
        # Set up WebDriver
        op = webdriver.ChromeOptions()
        op.add_argument('headless')
        
        # NOTE(review): absolute, machine-specific root path — adjust before running elsewhere.
        download_path = f"/home/patel_zeel/cpcb_helper/another_files/{state_name}/{city_name}/{station_name}"
        os.makedirs(download_path, exist_ok=True)
        
        # add download location
        op.add_experimental_option("prefs", {
            "download.default_directory": download_path,
        })

        driver = webdriver.Chrome(options=op)

        # Open the CPCB data-repository page
        driver.get("https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/caaqm-data-repository")
        # wait until "ng-select" becomes available
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ng-select")))
        
        download(config_tuple)
        wait_for_downloads(download_path)
        
    except Exception as e:
        print_it("#"*50)
        print_it(e)
        print_it("State:", state_name, "City:", city_name, "Station:", station_name, "failed")
        print_it("#"*50)
    finally:
        # Always close the browser — also after a failure or a manual interrupt —
        # otherwise every failed station leaves a headless Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                # Best effort: a browser that cannot be closed must not stop the run.
                print_it("Failed to close browser:", quit_error)

  0%|          | 0/544 [00:00<?, ?it/s]

Downloading State: Andhra Pradesh City: Amaravati Station: Secretariat, Amaravati - APPCB
Waiting for downloads to finish...
Waiting for downloads to finish...
Downloading State: Andhra Pradesh City: Anantapur Station: Gulzarpet, Anantapur - APPCB
Downloading State: Andhra Pradesh City: Chittoor Station: Gangineni Cheruvu, Chittoor - APPCB
Downloading State: Andhra Pradesh City: Kadapa Station: Yerramukkapalli, Kadapa - APPCB


KeyboardInterrupt: 

: 

In [None]:
# Report the total wall-clock time since the first cell ran, in three units.
global_end = time()
time_taken = global_end - global_init
for elapsed, unit_label in (
    (time_taken, "in seconds"),
    (time_taken/60, "in minutes"),
    (time_taken/3600, "in hours"),
):
    print_it("Time taken:", elapsed, unit_label)