In [1]:
import os
from os.path import join
import pandas as pd
from glob import glob
from joblib import Parallel, delayed
import xarray as xr
from tqdm.notebook import tqdm

## Basic statistics

In [2]:
files = glob("../another_files/*/*/*/*.csv")
print("Total files:", len(files))
files[0]

Total files: 3080


'../another_files/Punjab/Khanna/Kalal Majra, Khanna - PPCB/Raw_data_15Min_2020_site_1450_Kalal_Majra_Khanna_PPCB_15Min.csv'

In [3]:
stations = list(set([file.split("/")[-2] for file in files]))
print("Total stations:", len(stations))
stations[0]

Total stations: 537


'Shivaji Nagar, Rishikesh - UKPCB'

In [4]:
cities = list(set([file.split("/")[-3] for file in files]))
print("Total cities:", len(cities))
cities[0]

Total cities: 279


'Khurja'

In [5]:
states = list(set([file.split("/")[-4] for file in files]))
print("Total states:", len(states))
states[0]

Total states: 31


'Uttarakhand'

## Load location info

In [6]:
station_data = pd.read_pickle("station_data.pkl")

In [7]:
set(station_data.keys()) - set(stations)

{'Charitra Van, Buxar - BSPCB (Formerly known as Central Jail)',
 'Rajiv Nagar, Vijayawada - APPCB',
 'Vasundhara Nagar_UIT, Bhiwadi - RSPCB'}

These 3 stations are not available in data because data for them is not available in the CPCB portal.

## Process all files

In [8]:
def process_file(file):
    try:
        df = pd.read_csv(file)
        state_name = file.split("/")[-4]
        city_name = file.split("/")[-3]
        station_name = file.split("/")[-2]
        if len(df) == 0:
            return None
        df["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%Y-%m-%d %H:%M:%S")
        lon = station_data[station_name]["longitude"]
        lat = station_data[station_name]["latitude"]
        df["station"] = station_name
        df.set_index(["Timestamp", "station"], inplace=True)
        ds = df.to_xarray()
        ds.coords["longitude"] = ("station", [lon])
        ds.coords["latitude"] = ("station", [lat])
        
        # preprocess variable names
        for var_name in ds.data_vars:
            # print(var_name)
            name, unit = var_name.split("(")
            unit = unit.replace(")", "")
            # add metadata
            ds[var_name].attrs["unit"] = unit.strip()
            # change variable name
            # print(name, unit)
            name = name.strip()
            ds = ds.rename({var_name: name})
        
        base_path = "/home/patel_zeel/cpcb_helper/nc_files"
        mid_path = f"{state_name}/{city_name}/{station_name}"
        save_dir = join(base_path, mid_path)
        os.makedirs(save_dir, exist_ok=True)
        save_path = join(save_dir, file.split('/')[-1].replace('.csv', '.nc'))
        ds.to_netcdf(save_path)
        return save_path

    except Exception as e:
        print(e)
        raise e

example_path = process_file(files[0])
print(example_path)
with xr.open_dataset(example_path) as ds:
    pass
ds

/home/patel_zeel/cpcb_helper/nc_files/Punjab/Khanna/Kalal Majra, Khanna - PPCB/Raw_data_15Min_2020_site_1450_Kalal_Majra_Khanna_PPCB_15Min.nc


In [9]:
nc_data_list = Parallel(n_jobs=32)(delayed(process_file)(file) for file in tqdm(files))

  0%|          | 0/3080 [00:00<?, ?it/s]

Filter None files because of 0 entries in the CSV.

In [13]:
len(nc_data_list)

3080

In [14]:
clean_nc_data_list = [nc_data for nc_data in nc_data_list if nc_data is not None]
len(clean_nc_data_list)

2954

## Export station data as csv

In [18]:
df = pd.DataFrame(station_data).T
df.index.name = "station"
df.reset_index(inplace=True)
df.head(2)

Unnamed: 0,station,address,latitude,longitude
0,"SIDCO Kurichi, Coimbatore - TNPCB","SIDCO Kurichi, Coimbatore, Tamil Nadu.",10.942451,76.978996
1,"Muradpur, Patna - BSPCB","S K Memorial Hall Premises, Near Gandhi Maidan...",25.619651,85.147382


In [19]:
df.to_csv("station_data.csv", index=None)

## Some postprocessing due to duplication of data

Som states have same district names and thus data from stations from both districts are present at both places. We need to delete irrelevant data from corresponding states.

In [3]:
!rm -r "../another_files/Bihar/Aurangabad/More Chowk Waluj, Aurangabad - MPCB/"
!rm -r "../nc_files/Bihar/Aurangabad/More Chowk Waluj, Aurangabad - MPCB/"

rm: cannot remove '../another_files/Bihar/Aurangabad/More Chowk Waluj, Aurangabad - MPCB/': No such file or directory
rm: cannot remove '../nc_files/Bihar/Aurangabad/More Chowk Waluj, Aurangabad - MPCB/': No such file or directory


In [5]:
!rm -r "../another_files/Bihar/Aurangabad/MIDC Chilkalthana, Aurangabad - MPCB/"
!rm -r "../nc_files/Bihar/Aurangabad/MIDC Chilkalthana, Aurangabad - MPCB/"

rm: cannot remove '../another_files/Bihar/Aurangabad/MIDC Chilkalthana, Aurangabad - MPCB/': No such file or directory
rm: cannot remove '../nc_files/Bihar/Aurangabad/MIDC Chilkalthana, Aurangabad - MPCB/': No such file or directory


In [7]:
!rm -r "../another_files/Maharashtra/Aurangabad/Gurdeo Nagar, Aurangabad - BSPCB/"
!rm -r "../nc_files/Maharashtra/Aurangabad/Gurdeo Nagar, Aurangabad - BSPCB/"

rm: cannot remove '../another_files/Maharashtra/Aurangabad/Gurdeo Nagar, Aurangabad - BSPCB/': No such file or directory
rm: cannot remove '../nc_files/Maharashtra/Aurangabad/Gurdeo Nagar, Aurangabad - BSPCB/': No such file or directory


In [9]:
!rm -r "../another_files/Bihar/Aurangabad/Rachnakar Colony, Aurangabad - MPCB/"
!rm -r "../nc_files/Bihar/Aurangabad/Rachnakar Colony, Aurangabad - MPCB/"

rm: cannot remove '../another_files/Bihar/Aurangabad/Rachnakar Colony, Aurangabad - MPCB/': No such file or directory
rm: cannot remove '../nc_files/Bihar/Aurangabad/Rachnakar Colony, Aurangabad - MPCB/': No such file or directory
