# Imports

In [1]:
from data_classes import LAQNData, HealthData
from os import path
import pandas as pd

# Load data

### Load the hourly NO$_2$ data:

In [2]:
# Pollutant code and the date window for the hourly series.
species = "NO2"
start_date = "2000-01-01"
end_date = "2021-01-01"

# LAQN API address handed to LAQNData below.
url = "http://api.erg.kcl.ac.uk/AirQuality/Information/MonitoringSites/GroupName=London/Json"

# Local folder for the LAQN files, resolved against the current working directory.
working_dir = path.abspath("")
data_folder = path.join(working_dir, "LAQN_data")

In [3]:
# Handle for the hourly NO2 data, configured from the settings cell above.
# Arguments stay positional because the LAQNData signature is defined in data_classes.
NO2_hourly = LAQNData(
    url,
    data_folder,
    species,
    start_date,
    end_date,
)

In [4]:
# Read the hourly NO2 table via the LAQNData helper, with the "date" column as the index.
hourly_df = NO2_hourly.read_csv(index_col="date")
# Sanity check on the table size: prints (rows, columns).
print(hourly_df.shape)

Reading NO2_hourly_2000-01-01_2021-01-01.csv...
(184105, 192)


In [5]:
# Preview the first five hourly rows (five is the default for head()).
hourly_df.head(n=5)

Unnamed: 0_level_0,TD0,BG3,BG1,BG2,BN2,BN3,BN1,BX5,BX2,BQ7,...,WMD,WM0,MY1,WM6,WMZ,WMB,NB1,WM8,WM9,VS1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01 00:00:00,,,,,,,34.4,,,,...,,,78.2,,,,,,,
2000-01-01 01:00:00,20.3,,,,,,46.9,,34.3,,...,,,84.9,,,,,,,
2000-01-01 02:00:00,21.2,,,,,,48.5,33.4,27.3,,...,,,115.9,,,,,,,
2000-01-01 03:00:00,21.2,,,,,,34.0,20.3,18.6,,...,,,97.6,,,,,,,
2000-01-01 04:00:00,20.5,,,,,,25.2,18.7,19.7,,...,,,74.0,,,,,,,


### Load the weekly mortality per capita data:

In [9]:
# Name of the weekly mortality file.
filename = "weekly_mortality_London.csv"

# NOTE: this rebinds `data_folder`, which previously pointed at the LAQN data
# folder; re-running the LAQNData cell after this one would pick up this path.
working_dir = path.abspath("")
data_folder = path.join(working_dir, "mortality_data")

In [11]:
# Handle for the weekly mortality file inside the mortality data folder.
mortality_weekly = HealthData(
    data_folder,
    filename=filename,
)

In [12]:
# Read the weekly mortality table via the HealthData helper, with the "date" column as the index.
mortality_df = mortality_weekly.read_csv(index_col="date")
# Sanity check on the table size: prints (rows, columns).
print(mortality_df.shape)

Reading weekly_mortality_London.csv...
(581, 1)


In [13]:
# Preview the first five weekly rows (five is the default for head()).
mortality_df.head(n=5)

Unnamed: 0_level_0,weekly_deaths
date,Unnamed: 1_level_1
2010-01-08,1226
2010-01-15,1262
2010-01-22,1186
2010-01-29,1093
2010-02-05,1035


# Process data

In [18]:
# The mortality table's date index supplies the weekly dates used to aggregate the hourly data.
weekly_dates = mortality_df.index

In [19]:
# Display the weekly dates to check their range and spacing.
weekly_dates

DatetimeIndex(['2010-01-08', '2010-01-15', '2010-01-22', '2010-01-29',
               '2010-02-05', '2010-02-12', '2010-02-19', '2010-02-26',
               '2010-03-05', '2010-03-12',
               ...
               '2020-12-18', '2020-12-25', '2021-01-01', '2021-01-08',
               '2021-01-15', '2021-01-22', '2021-01-29', '2021-02-05',
               '2021-02-12', '2021-02-19'],
              dtype='datetime64[ns]', name='date', length=581, freq=None)

In [24]:
# Week-ending convention: searchsorted() (default side="left") labels each
# hourly timestamp with the first weekly date on or after it, so the group
# labelled W covers the interval (W - 1 week, W].
hourly_times = pd.DatetimeIndex(hourly_df.index)

# Keep only the hours that fall inside one of the weekly windows. Without this
# filter every hour earlier than the first window (the NO2 record starts in
# 2000, the mortality record in 2010) gets position 0 and is averaged into the
# first week, and any hour later than the last weekly date would get position
# len(weekly_dates), which raises an IndexError when used to index weekly_dates.
first_window_start = weekly_dates[0] - pd.Timedelta(weeks=1)  # weekly dates are 7 days apart
in_window = (hourly_times > first_window_start) & (hourly_times <= weekly_dates[-1])

# One weekly label per retained hourly row, then the mean of each week.
week_labels = weekly_dates[weekly_dates.searchsorted(hourly_times[in_window])]
weekly_NO2_df = hourly_df.loc[in_window].groupby(week_labels).mean()

We've now generated a dataframe of weekly mean NO$_2$ concentrations (from the hourly data) using the weekly dates given by the mortality dataset.
- The `searchsorted()` function takes each timestamp in the `hourly_df` index and returns the position in `weekly_dates` at which that timestamp would have to be inserted to keep the weekly dates in order. See docs [here](https://pandas.pydata.org/pandas-docs/version/0.21.1/generated/pandas.DatetimeIndex.searchsorted.html).
- Placing these positions inside `weekly_dates[]` produces one "weekly" date per hourly timestamp: the first weekly date on or after that timestamp.
- The `groupby()` operation then applies the `mean()` function to the hourly data, grouped by the weekly dates assigned in the previous step.

In [27]:
# Show the weekly label assigned to every hourly timestamp.
# Note: timestamps earlier than the first weekly date get position 0 from
# searchsorted(), so they are all labelled with the first weekly date.
weekly_dates[weekly_dates.searchsorted(hourly_df.index)]

DatetimeIndex(['2010-01-08', '2010-01-08', '2010-01-08', '2010-01-08',
               '2010-01-08', '2010-01-08', '2010-01-08', '2010-01-08',
               '2010-01-08', '2010-01-08',
               ...
               '2021-01-01', '2021-01-01', '2021-01-01', '2021-01-01',
               '2021-01-01', '2021-01-01', '2021-01-01', '2021-01-01',
               '2021-01-01', '2021-01-01'],
              dtype='datetime64[ns]', name='date', length=184105, freq=None)