In [1]:
import requests
import re
import pandas as pd
import psutil
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv("site_ids.csv")
df.head(10)

Unnamed: 0,state,city,station,site_id
0,Andhra Pradesh,Amaravati,"Secretariat, Amaravati - APPCB",site_1406
1,Andhra Pradesh,Anantapur,"Gulzarpet, Anantapur - APPCB",site_5632
2,Andhra Pradesh,Chittoor,"Gangineni Cheruvu, Chittoor - APPCB",site_5665
3,Andhra Pradesh,Kadapa,"Yerramukkapalli, Kadapa - APPCB",site_5693
4,Andhra Pradesh,Rajamahendravaram,"Anand Kala Kshetram, Rajamahendravaram - APPCB",site_1399
5,Andhra Pradesh,Tirupati,"Tirumala, Tirupati - APPCB",site_258
6,Andhra Pradesh,Tirupati,"Vaikuntapuram, Tirupati - APPCB",site_5666
7,Andhra Pradesh,Vijayawada,"HB Colony, Vijayawada - APPCB",site_5848
8,Andhra Pradesh,Vijayawada,"Kanuru, Vijayawada - APPCB",site_5685
9,Andhra Pradesh,Vijayawada,"Rajiv Gandhi Park, Vijayawada - APPCB",site_5849


Example URL: https://airquality.cpcb.gov.in/dataRepository/download_file?file_name=Raw_data/15Min/2023/site_1421_Dr._Karni_Singh_Shooting_Range_Delhi_DPCC_15Min.csv

In [3]:
def process_station_name(station):
    # replace "-", "," and " " (space) with underscore "_"
    return station.replace(", ", "_").replace(" - ", "_").replace(" ", "_").replace(",", "_")

def get_url(row, year):
    processed_station_name = process_station_name(row.station)
    return f"https://airquality.cpcb.gov.in/dataRepository/download_file?file_name=Raw_data/15Min/{year}/{row.site_id}_{processed_station_name}_15Min.csv"

URLs = []
for index, row in df.iterrows():
    for year in range(2017, 2024):
        URLs.append(get_url(row, year))

In [4]:
len(URLs)

3766

## Download

In [5]:
downloaded_urls = []
for url in tqdm(URLs):
    year_part = url.split('/')[-2]
    station_part = url.split('/')[-1]
    save_path = f"../files/{year_part}_{station_part}"
    try:
        df = pd.read_csv(save_path, nrows=1)
        downloaded_urls.append(url)
    except FileNotFoundError:
        continue

len(downloaded_urls)

  0%|          | 0/3766 [00:00<?, ?it/s]

3048

In [6]:
def download_fn(url):
    year_part = url.split('/')[-2]
    station_part = url.split('/')[-1]
    save_path = f"../files/{year_part}_{station_part}"
    # print("Downaloding URL:", url, "to", save_path)
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
            
    # some urls have multiple consecutive "_" in them. We try all combinations of "_" to find the correct url
    elif response.status_code == 500:
        # find index of all "_" in the url
        indices = [i for i, a in enumerate(url) if a == '_']
        # insert "_" for each of the indices one by one and try to download
        for i in indices:
            new_url = url[:i] + "_" + url[i:]
            response = requests.get(new_url)
            if response.status_code == 200:
                with open(save_path, 'wb') as f:
                    f.write(response.content)
                break
    
remaining_urls = sorted(set(URLs) - set(downloaded_urls))
print("Total URLs:", len(URLs))
print("Downloaded URLs:", len(downloaded_urls))
print("Remaining URLs:", len(remaining_urls))

n_cpus = psutil.cpu_count()
cpus_to_use = max(1, n_cpus//2)
print("Running download on", cpus_to_use, "CPUs")

if len(remaining_urls) == 0:
    print("All files are downloaded")
else:
    Parallel(n_jobs=cpus_to_use)(delayed(download_fn)(url) for url in tqdm(remaining_urls[:5]))
    pass

Total URLs: 3766
Downloaded URLs: 3048
Remaining URLs: 718
Running download on 32 CPUs


  0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
remaining_urls[12:15]

['https://airquality.cpcb.gov.in/dataRepository/download_file?file_name=Raw_data/15Min/2017/site_5938_Forest_Office_Barbil_OSPCB_15Min.csv',
 'https://airquality.cpcb.gov.in/dataRepository/download_file?file_name=Raw_data/15Min/2017/site_5939_Hakimapada_Angul_OSPCB_15Min.csv',
 'https://airquality.cpcb.gov.in/dataRepository/download_file?file_name=Raw_data/15Min/2017/site_5940_Lingraj_Mandir_Bhubaneswar_OSPCB_15Min.csv']