---
author: Zeel B Patel
badges: true
categories:
- Data
date: '2024-12-10'
description: Download CPCB data with selenium
title: Download CPCB live data
toc: true
---

In [8]:
import pandas as pd
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

HOME_URL = "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing"
DOWNLOAD_PAGE_URL = "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/data"

## Dry run to get metadata

In [219]:
driver = webdriver.Chrome()
driver.get(HOME_URL)

Enter Captcha manually before moving ahead

In [214]:
driver.get(DOWNLOAD_PAGE_URL)

In [215]:
dropdowns = driver.find_elements("css selector", ".select-box")
assert len(dropdowns) == 3, len(dropdowns)

AssertionError: 0

In [None]:
dropdowns[0].click() # Open
states = dropdowns[0].text.replace("Select ...\n▲\n", "").split("\n")
dropdowns[0].click() # Close
assert len(states) == 31
for state in states:
    metadata_dict[state] = {}

In [None]:
metadata_df = pd.DataFrame(columns=["State", "City", "Station"])
for state in tqdm(metadata_dict):
    print(f"{state=}")
    dropdowns = driver.find_elements("css selector", ".select-box")
    dropdowns[0].click() # Open
    # select state
    option = driver.find_element("xpath", f"//li[contains(text(), '{state}')]")
    option.click() # Select and Close
    
    sleep(0.1)
    
    dropdowns = driver.find_elements("css selector", ".select-box")
    dropdowns[1].click() # Open
    # Get all cities
    cities = dropdowns[1].text.replace("Select ...\n▲\n", "").split("\n")
    print(f"{cities=}")
    metadata_dict[state] = {city: {} for city in cities}
    dropdowns[1].click() # Close
    
    sleep(0.1)
    
    for city in cities:
        print(f"{state=}, {city=}")
        dropdowns = driver.find_elements("css selector", ".select-box")
        dropdowns[1].click()
        option = driver.find_element("xpath", f"//li[contains(text(), '{city}')]")
        option.click() # Select and Close
        
        dropdowns = driver.find_elements("css selector", ".select-box")
        
        sleep(0.1)

        # Get all stations
        dropdowns[2].click()
        stations = dropdowns[2].text.replace("Select ...\n▲\n", "").split("\n")
        for station in stations:
            metadata_df.loc[len(metadata_df)] = [state, city, station]
        sleep(0.1)

  0%|          | 0/31 [00:00<?, ?it/s]

state='Andhra Pradesh'
cities=['Amaravati', 'Anantapur', 'Chittoor', 'Kadapa', 'Rajamahendravaram', 'Tirupati', 'Vijayawada', 'Visakhapatnam']
state='Andhra Pradesh', city='Amaravati'
state='Andhra Pradesh', city='Anantapur'
state='Andhra Pradesh', city='Chittoor'
state='Andhra Pradesh', city='Kadapa'
state='Andhra Pradesh', city='Rajamahendravaram'
state='Andhra Pradesh', city='Tirupati'
state='Andhra Pradesh', city='Vijayawada'
state='Andhra Pradesh', city='Visakhapatnam'
state='Arunachal Pradesh'
cities=['Naharlagun']
state='Arunachal Pradesh', city='Naharlagun'
state='Assam'
cities=['Byrnihat', 'Guwahati', 'Nagaon', 'Nalbari', 'Silchar', 'Sivasagar']
state='Assam', city='Byrnihat'
state='Assam', city='Guwahati'
state='Assam', city='Nagaon'
state='Assam', city='Nalbari'
state='Assam', city='Silchar'
state='Assam', city='Sivasagar'
state='Bihar'
cities=['Araria', 'Arrah', 'Aurangabad', 'Begusarai', 'Bettiah', 'Bhagalpur', 'Bihar Sharif', 'Buxar', 'Chhapra', 'Darbhanga', 'Gaya', 'Haji

In [None]:
len(metadata_df)

560

In [None]:
metadata_df.head()

Unnamed: 0,State,City,Station
0,Andhra Pradesh,Amaravati,"Secretariat, Amaravati - APPCB"
1,Andhra Pradesh,Anantapur,"Gulzarpet, Anantapur - APPCB"
2,Andhra Pradesh,Chittoor,"Gangineni Cheruvu, Chittoor - APPCB"
3,Andhra Pradesh,Kadapa,"Yerramukkapalli, Kadapa - APPCB"
4,Andhra Pradesh,Rajamahendravaram,"Anand Kala Kshetram, Rajamahendravaram - APPCB"


In [None]:
metadata_df.tail()

Unnamed: 0,State,City,Station
555,West Bengal,Kolkata,"Jadavpur, Kolkata - WBPCB"
556,West Bengal,Kolkata,"Rabindra Bharati University, Kolkata - WBPCB"
557,West Bengal,Kolkata,"Rabindra Sarobar, Kolkata - WBPCB"
558,West Bengal,Kolkata,"Victoria, Kolkata - WBPCB"
559,West Bengal,Siliguri,"Ward-32 Bapupara, Siliguri - WBPCB"


In [None]:
metadata_df.to_csv("metadata.csv", index=False)

## Once metadata is saved, run from here

In [23]:
metadata_df = pd.read_csv('metadata.csv')
len(metadata_df)

560

In [24]:
# add download directory
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {
    "download.default_directory": "/Users/project561/blog/cpcb_downloads"
})

driver = webdriver.Chrome(options=options)
driver.get(HOME_URL)

Enter Captcha manually before moving ahead

In [15]:
driver.get(DOWNLOAD_PAGE_URL)

In [7]:
i = 0
entry = metadata_df.loc[i]
state = entry["State"]
city = entry["City"]
station = entry["Station"]
print(f"{state=}, {city=}, {station=}")

state='Andhra Pradesh', city='Amaravati', station='Secretariat, Amaravati - APPCB'


In [224]:
# State selection
dropdowns = driver.find_elements("css selector", ".select-box")
dropdowns[0].click() # Open
# select state
option = driver.find_element("xpath", f"//li[contains(text(), '{state}')]")
option.click() # Select and Close

sleep(0.1)

# City selection
dropdowns = driver.find_elements("css selector", ".select-box")
dropdowns[1].click() # Open

sleep(0.1)

# select city
option = driver.find_element("xpath", f"//li[contains(text(), '{city}')]")
option.click() # Select and Close

sleep(0.1)

# Station selection
dropdowns = driver.find_elements("css selector", ".select-box")
dropdowns[2].click() # Open

sleep(0.1)

# select station
option = driver.find_element("xpath", f"//li[contains(text(), '{station}')]")
option.click() # Select and Close

In [225]:
multi_select = driver.find_element(By.XPATH, "//angular2-multiselect//div[@class='c-btn']")
multi_select.click() # Open
sleep(0.1)

pm25_checkbox = driver.find_element(By.XPATH, "//label[text()='PM2.5']/preceding-sibling::input")
pm10_checkbox = driver.find_element(By.XPATH, "//label[text()='PM10']/preceding-sibling::input")
actions = ActionChains(driver)
if not pm25_checkbox.is_selected():
    actions.move_to_element(pm25_checkbox).click().perform()
    sleep(0.1)
if not pm10_checkbox.is_selected():
    actions.move_to_element(pm10_checkbox).click().perform()
    sleep(0.1)

multi_select.click() # Close

In [226]:
dropdowns = driver.find_elements("css selector", ".select-box")
len(dropdowns)

5

In [227]:
dropdowns[4].click() # Open
option = driver.find_element("xpath", "//li[contains(text(), '15 Minute')]")
option.click() # Select and Close

In [228]:
date_pickers = driver.find_elements(By.CSS_SELECTOR, ".wc-date-container")
assert len(date_pickers) == 2, len(date_pickers)

# Select start date
date_pickers[0].click()
sleep(0.5)
desired_month = driver.find_element(By.CLASS_NAME, "month-year")
desired_month.click()
sleep(0.5)
option = driver.find_element(By.ID, "JAN")
option.click()
sleep(0.5)
try:
    desired_date = driver.find_element(By.XPATH, "//td[@class='calendar-day']/span[text()='1']")
    driver.execute_script("arguments[0].click();", desired_date)
except:
    print("Already selected")

date_pickers = driver.find_elements(By.CSS_SELECTOR, ".wc-date-container")
assert len(date_pickers) == 2, len(date_pickers)

# Select end date
date_pickers[1].click()
sleep(0.5)
desired_months = driver.find_elements(By.CLASS_NAME, "month-year")
desired_months[1].click()
sleep(0.5)

try:
    option = driver.find_element(By.ID, "DEC")
    option.click()
except:
    print("Already in December")

try:
    desired_date = driver.find_element(By.XPATH, "//td[@class='calendar-day']/span[text()='1']")
    driver.execute_script("arguments[0].click();", desired_date)
except:
    print("Already selected")

Already in December


In [229]:
# click on submit
submit_button = driver.find_element(By.XPATH, "//button[text()='Submit']")
submit_button.click()

In [231]:
excel_button = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.CLASS_NAME, "fa-file-excel-o"))
)
excel_button.click()

## Full loop

In [29]:
def download(i):
    print("Downloading data for entry", i)
    driver.get(DOWNLOAD_PAGE_URL)
    entry = metadata_df.loc[i]
    state = entry["State"]
    city = entry["City"]
    station = entry["Station"]
    print(f"{state=}, {city=}, {station=}")

    # State selection
    WebDriverWait(driver, 60).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "select-box"))
    )
    dropdowns = driver.find_elements("css selector", ".select-box")
    dropdowns[0].click() # Open
    # select state
    option = driver.find_element("xpath", f"//li[contains(text(), '{state}')]")
    option.click() # Select and Close

    sleep(0.1)

    # City selection
    dropdowns = driver.find_elements("css selector", ".select-box")
    dropdowns[1].click() # Open

    sleep(0.1)

    # select city
    option = driver.find_element("xpath", f"//li[contains(text(), '{city}')]")
    option.click() # Select and Close

    sleep(0.1)

    # Station selection
    dropdowns = driver.find_elements("css selector", ".select-box")
    dropdowns[2].click() # Open

    sleep(0.1)

    # select station
    option = driver.find_element("xpath", f"//li[contains(text(), '{station}')]")
    option.click() # Select and Close

    multi_select = driver.find_element(By.XPATH, "//angular2-multiselect//div[@class='c-btn']")
    multi_select.click() # Open
    sleep(0.1)

    actions = ActionChains(driver)
    try:
        pm10_checkbox = driver.find_element(By.XPATH, "//label[text()='PM10']/preceding-sibling::input")
        if not pm10_checkbox.is_selected():
            actions.move_to_element(pm10_checkbox).click().perform()
        sleep(0.1)
    except:
        print("PM10 not available")
    
    try:
        pm25_checkbox = driver.find_element(By.XPATH, "//label[text()='PM2.5']/preceding-sibling::input")
        if not pm25_checkbox.is_selected():
            actions.move_to_element(pm25_checkbox).click().perform()
            sleep(0.1)
    except:
        print("PM2.5 not available")        

    multi_select.click() # Close

    dropdowns = driver.find_elements("css selector", ".select-box")
    len(dropdowns)

    dropdowns[4].click() # Open
    option = driver.find_element("xpath", "//li[contains(text(), '15 Minute')]")
    option.click() # Select and Close

    date_pickers = driver.find_elements(By.CSS_SELECTOR, ".wc-date-container")
    assert len(date_pickers) == 2, len(date_pickers)
    
    # Select start date
    date_pickers[0].click()
    sleep(0.1)
    desired_month = driver.find_element(By.CLASS_NAME, "month-year")
    desired_month.click()
    sleep(0.1)
    try:
        option = driver.find_element(By.ID, "JAN")
        option.click()
    except:
        print("Already in January")
    sleep(0.1)
    try:
        desired_date = driver.find_element(By.XPATH, "//td[@class='calendar-day']/span[text()='1']")
        driver.execute_script("arguments[0].click();", desired_date)
    except:
        print("Already 1st selected")
        
    date_pickers = driver.find_elements(By.CSS_SELECTOR, ".wc-date-container")
    assert len(date_pickers) == 2, len(date_pickers)
    
    # Select end date
    date_pickers[1].click()
    sleep(0.1)
    desired_months = driver.find_elements(By.CLASS_NAME, "month-year")
    desired_months[1].click()
    sleep(0.1)
    try:
        option = driver.find_element(By.ID, "DEC")
        option.click()
    except:
        print("Already in December")
        
    try:
        desired_date = driver.find_element(By.XPATH, "//td[@class='calendar-day']/span[text()='1']")
        driver.execute_script("arguments[0].click();", desired_date)
    except:
        print("Already 1st selected")
        
    # click on submit
    submit_button = driver.find_element(By.XPATH, "//button[text()='Submit']")
    submit_button.click()
    
    excel_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "fa-file-excel-o"))
    )
    excel_button.click()
    sleep(10)
    
for i in range(44, len(metadata_df)):
    # open a new tab
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    download(i)

Downloading data for entry 44
state='Bihar', city='Motihari', station='Gandak Colony, Motihari - BSPCB'
Already in December
Downloading data for entry 45
state='Bihar', city='Munger', station='Town Hall, Munger - BSPCB'
Already in December
Downloading data for entry 46
state='Bihar', city='Muzaffarpur', station='Buddha Colony, Muzaffarpur - BSPCB'
Already in December
Downloading data for entry 47
state='Bihar', city='Muzaffarpur', station='MIT-Daudpur Kothi, Muzaffarpur - BSPCB'
Already in December
Downloading data for entry 48
state='Bihar', city='Muzaffarpur', station='Muzaffarpur Collectorate, Muzaffarpur - BSPCB'
PM10 not available
Already in December
Downloading data for entry 49
state='Bihar', city='Patna', station='DRM Office Danapur, Patna - BSPCB'
Already in December
Downloading data for entry 50
state='Bihar', city='Patna', station='Govt. High School Shikarpur, Patna - BSPCB'
Already in December
Downloading data for entry 51
state='Bihar', city='Patna', station='IGSC Planetar

TimeoutException: Message: 
