## conda environments

When accessing this notebook via the JASMIN Jupyter Notebook service, select the correct conda environment from the list of available kernels.

## Imports

In [70]:
from os import makedirs, path

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

## Code
The `LAQNData` class downloads and processes LAQN (London Air Quality Network) data. This code is adapted from my [COVID-19 repo](https://github.com/michellewl/COVID-19/blob/master/data/LAQN_class.py); note that the borough-averaging code has not been included here. See [here](https://github.com/michellewl/NO2-breast-cancer/tree/master/data/LAQN/download) for details on the source code, which was originally written for my MRes project.

In [71]:
class LAQNData():
    """Download per-site measurements for one species and aggregate them in time.

    On construction the site list is fetched from ``url`` (a JSON document
    containing ``Sites -> Site`` records with an ``@SiteCode`` field) and the
    output folder is created. ``download`` then pulls one CSV per site and
    writes a single wide CSV; ``resample_time`` writes quantile aggregates of
    such a table.

    Attributes:
        url: Endpoint returning the JSON list of monitoring sites.
        home_folder: Directory where all CSV output is written.
        species: Species code inserted into the per-site data URL (e.g. "NO2").
        start_date: Start date string inserted into the per-site data URL.
        end_date: End date string inserted into the per-site data URL.
        sites_df: DataFrame built from the ``Sites -> Site`` records.
        site_codes: List of the ``@SiteCode`` values from ``sites_df``.
    """

    # Maps a pandas resample rule to the label used in folder and file names.
    _RESAMPLE_KEYWORDS = {"D": "daily", "W": "weekly"}

    def __init__(self, url, home_folder, species, start_date, end_date):
        """Store the configuration, create ``home_folder`` and fetch the site list.

        Raises:
            requests.HTTPError: If the site-list request returns an error status.
        """
        self.url = url
        self.home_folder = home_folder
        self.species = species
        self.start_date = start_date
        self.end_date = end_date

        # exist_ok avoids the check-then-create race of path.exists + makedirs.
        makedirs(self.home_folder, exist_ok=True)

        london_sites = requests.get(self.url)
        # Fail with a clear HTTP error rather than an opaque JSON/KeyError below.
        london_sites.raise_for_status()
        self.sites_df = pd.DataFrame(london_sites.json()['Sites']['Site'])
        self.site_codes = self.sites_df["@SiteCode"].tolist()

    @staticmethod
    def _join_site(laqn_df, cur_df):
        """Return ``laqn_df`` outer-joined with ``cur_df``, or a copy of ``cur_df`` if ``laqn_df`` is empty."""
        if laqn_df.empty:
            return cur_df.copy()
        return laqn_df.join(cur_df.copy(), how="outer")

    def download(self, verbose=True):
        """Download every site's data and save it as one CSV in ``home_folder``.

        Each site's CSV is read from the API, its two columns are renamed to
        ``["date", <site_code>]``, and it is outer-joined onto the running
        table on the ``date`` index. Columns that contain no data at all are
        dropped before saving to
        ``{species}_hourly_{start_date}_{end_date}.csv``.

        Args:
            verbose: If True, show a tqdm progress bar and extra messages.
        """
        laqn_df = pd.DataFrame()
        progress_bar = tqdm(self.site_codes) if verbose else self.site_codes

        for site_code in progress_bar:
            if verbose:
                progress_bar.set_description(f'Working on site {site_code}')
            url_species = f"http://api.erg.kcl.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={site_code}/SpeciesCode={self.species}/StartDate={self.start_date}/EndDate={self.end_date}/csv"
            cur_df = pd.read_csv(url_species)
            cur_df.columns = ["date", site_code]
            cur_df = cur_df.set_index("date", drop=True)

            try:
                laqn_df = self._join_site(laqn_df, cur_df)

            except ValueError:  # Trying to join with duplicate column names
                # Suffix the already-stored column so the new one can be joined.
                rename_dict = {
                    x: f"{x}_"
                    for x in set(cur_df.columns).intersection(laqn_df.columns)
                }
                if rename_dict:
                    print(f"Renamed duplicated column:\n{rename_dict}")
                laqn_df = laqn_df.rename(columns=rename_dict)
                laqn_df = self._join_site(laqn_df, cur_df)
                if verbose:
                    print("Joined.")

            except KeyError:  # Trying to join along indexes that don't match
                print(f"Troubleshooting {site_code}...")
                # Presumably pads timestamps that lack a seconds field so they
                # line up with the other sites -- TODO confirm against the API.
                cur_df.index = cur_df.index + ":00"
                laqn_df = self._join_site(laqn_df, cur_df)
                print(f"{site_code} joined.")

        print("Data download complete. Removing sites with 0 data...")
        laqn_df = laqn_df.dropna(axis="columns", how="all")
        laqn_df.to_csv(path.join(self.home_folder, f"{self.species}_hourly_{self.start_date}_{self.end_date}.csv"))
        print("Data saved.")

    def resample_time(self, df, key, quantile_step):
        """Resample ``df`` in time and save one CSV per quantile.

        Quantiles run from 0 to 1 in increments of ``quantile_step``. Each
        result is written to
        ``home_folder/<daily|weekly>/{species}_{daily|weekly}_{NN}_quantile.csv``
        where ``NN`` is the quantile as a whole-number percentage. The shape
        of every aggregated table is printed.

        Args:
            df: Table with a ``date`` column (or already indexed by date) and
                one column per site. It is not modified.
            key: Resample rule, "D" (daily) or "W" (weekly).
            quantile_step: Spacing between quantiles; must be positive.

        Raises:
            ValueError: If ``key`` is not supported or ``quantile_step`` <= 0.
        """
        if key not in self._RESAMPLE_KEYWORDS:
            raise ValueError(
                f"Unsupported resample key {key!r}; "
                f"expected one of {sorted(self._RESAMPLE_KEYWORDS)}."
            )
        if quantile_step <= 0:
            raise ValueError("quantile_step must be a positive number.")
        keyword = self._RESAMPLE_KEYWORDS[key]

        # Work on a new frame so the caller's DataFrame keeps its "date" column
        # and the method can be called repeatedly with the same input.
        if "date" in df.columns:
            frame = df.set_index("date", drop=True)
        else:
            frame = df.copy()
        frame.index = pd.to_datetime(frame.index)

        save_folder = path.join(self.home_folder, keyword)
        makedirs(save_folder, exist_ok=True)

        steps = np.round(np.arange(0, 1 + quantile_step, quantile_step), 2).tolist()
        # arange can step past 1 when quantile_step does not divide 1 evenly;
        # such values are not valid quantiles.
        aggregation = [q for q in steps if q <= 1.0]

        for quantile in aggregation:
            aggregated_df = frame.resample(key).quantile(quantile)
            # round() before int(): e.g. 0.29 * 100 == 28.999999999999996.
            method = f"{int(round(quantile * 100))}_quantile"
            aggregated_df.to_csv(path.join(save_folder, f"{self.species}_{keyword}_{method}.csv"), index=True)
            print(aggregated_df.shape)

In [72]:
# --- Download configuration ---------------------------------------------
# Species code and date range that get substituted into the per-site data URL.
species = "NO2"
start_date = "2000-01-01"
end_date = "2021-01-01"

# Endpoint that returns the JSON list of monitoring sites.
url = "http://api.erg.kcl.ac.uk/AirQuality/Information/MonitoringSites/GroupName=London/Json"

# All output is written below the current working directory.
home_folder = path.join(
    path.abspath(""),
    "LAQN_data",
)

In [73]:
# Constructing the object creates the output folder and requests the site list.
NO2_hourly = LAQNData(
    url=url,
    home_folder=home_folder,
    species=species,
    start_date=start_date,
    end_date=end_date,
)

In [74]:
# Number of site codes returned by the site-list request.
site_count = len(NO2_hourly.site_codes)
print(site_count)

236


In [None]:
# Long-running: requests one CSV per site, then writes the combined table
# to home_folder. The progress bar is shown because verbose is on.
NO2_hourly.download(verbose=True)

Working on site BN1:   3%|▎         | 6/236 [00:53<32:33,  8.49s/it]