# Imports

In [1]:
from data_classes import LAQNData, HealthData, MetData
from os import path
import pandas as pd
import matplotlib.pyplot as plt

# Load data

### Load the hourly NO$_2$ data

In [None]:
# Settings for the hourly NO2 download: pollutant code, date window,
# the London monitoring-site listing endpoint, and the local cache folder.
species = "NO2"
start_date, end_date = "2000-01-01", "2021-01-01"
url = "http://api.erg.kcl.ac.uk/AirQuality/Information/MonitoringSites/GroupName=London/Json"
# path.abspath("") resolves to the current working directory.
data_folder = path.join(path.abspath(""), "LAQN_data")

In [None]:
# Build the LAQN accessor from the settings defined in the previous cell.
# NOTE(review): whether the constructor itself downloads data is not visible
# here — check data_classes.LAQNData.
NO2_hourly = LAQNData(url, data_folder, species, start_date, end_date)

In [None]:
# Read the hourly NO2 table with the "date" column as its index and report
# its (rows, columns) shape.
hourly_NO2_df = NO2_hourly.read_csv(index_col="date")
print(hourly_NO2_df.shape)

### Load the hourly meteorology data

In [None]:
# Settings for the hourly meteorology data: the Meteostat bulk hourly file
# for station 03772 and the local cache folder.
# NOTE: this rebinds `url` and `data_folder`, which previously held the NO2 settings.
url = "https://bulk.meteostat.net/hourly/03772.csv.gz"
data_folder = path.join(path.abspath(""), "met_data")

In [None]:
# Build the meteorology accessor; `data_folder` and `url` are the values
# assigned in the previous cell (the meteorology source, not the NO2 one).
heathrow = MetData(data_folder, url=url)

In [None]:
# Read the hourly meteorology table and report its (rows, columns) shape.
# NOTE(review): no index_col is passed here, unlike the NO2 and mortality
# reads. Later cells compare hourly_df.index against the mortality dates, so
# MetData.read_csv presumably sets a datetime index itself — confirm.
hourly_df = heathrow.read_csv()
print(hourly_df.shape)

### Load the weekly mortality per capita data

In [None]:
# Location of the weekly per-capita mortality CSV.
# NOTE: this rebinds `data_folder` again, now for the mortality data.
filename = "weekly_mortality_percapita_London.csv"
data_folder = path.join(path.abspath(""), "mortality_data")

In [None]:
# Accessor for the weekly mortality file configured in the previous cell.
mortality_weekly = HealthData(data_folder, filename=filename)

In [None]:
# Read the weekly table indexed by its "date" column and report its shape.
# NOTE: the daily load cell further down assigns to `mortality_df` again, so
# after running the notebook top to bottom this name holds the daily table,
# not the weekly one.
mortality_df = pd.DataFrame(mortality_weekly.read_csv(index_col="date"))
print(mortality_df.shape)

### Load the daily mortality per capita data

In [None]:
# Location of the daily per-capita mortality CSV.
# Each directory level is passed to path.join() as its own component so the
# platform separator is used throughout instead of a hard-coded "/".
data_folder = path.join(path.abspath(""), "mortality_data", "daily")
filename = "daily_mortality_percapita_London.csv"

In [None]:
# Accessor for the daily mortality file configured in the previous cell.
mortality_daily = HealthData(data_folder, filename=filename)

In [None]:
# Read the daily table indexed by its "date" column and report its shape.
# NOTE: this rebinds `mortality_df`, replacing the weekly table loaded above;
# every later cell that reads `mortality_df` therefore sees the daily data.
mortality_df = pd.DataFrame(mortality_daily.read_csv(index_col="date"))
print(mortality_df.shape)

# Process data

## Weekly data

### Resample the hourly NO$_2$ data using a custom date list

In [None]:
weekly_dates = mortality_df.index

Only retain the NO$_2$ data up to the final mortality datapoint:

In [None]:
hourly_df = hourly_df.loc[hourly_df.index < weekly_dates.max()]

In [None]:
weekly_NO2_df = hourly_df.groupby(weekly_dates[weekly_dates.searchsorted(hourly_df.index)]).mean()

We've now generated a dataframe of weekly mean NO$_2$ concentrations (from the hourly data) using the weekly dates given by the mortality dataset. 
- The `searchsorted()` function takes the `hourly_df` index and inserts it into `weekly_dates` to return an array of indices where the hourly elements should be inserted to maintain order in the weekly dates. See docs [here](https://pandas.pydata.org/pandas-docs/version/0.21.1/generated/pandas.DatetimeIndex.searchsorted.html).
- Placing these indices inside `weekly_dates[]` produces an array the same length as the hourly index, where each entry is the corresponding "weekly" date.
- The `groupby()` operation then performs the `mean()` function on the hourly data, grouping by the assigned weekly dates in the previous step. See docs [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html).

### Spatially average the NO$_2$ data for all of London

In [None]:
NO2_df = pd.DataFrame(weekly_NO2_df.mean(axis=1), columns=["mean_NO2"]).join(mortality_df)
NO2_df["deaths_per_100k"] = NO2_df["deaths_per_capita"]*100000

In [None]:
NO2_df.head()

In [None]:
NO2_df.plot.scatter(x="mean_NO2", y="deaths_per_100k")

## Daily data

### Resample the hourly NO$_2$ data using a custom date list

In [None]:
daily_dates = mortality_df.index

Only retain the NO$_2$ data up to the final mortality datapoint:

In [None]:
hourly_df = hourly_df.loc[hourly_df.index < daily_dates.max()]

In [None]:
daily_NO2_df = hourly_df.groupby(daily_dates[daily_dates.searchsorted(hourly_df.index)]).mean()

In [None]:
# (rows, columns) of the daily-mean table produced by the groupby above.
daily_NO2_df.shape

### Spatially average the NO$_2$ data for all of London

In [None]:
NO2_df = pd.DataFrame(daily_NO2_df.mean(axis=1), columns=["mean_NO2"]).join(mortality_df)
NO2_df["deaths_per_100k"] = NO2_df["deaths_per_capita"]*100000

In [None]:
NO2_df.head()

In [None]:
NO2_df.plot.scatter(x="mean_NO2", y="deaths_per_100k", legend=False, title="London mortality").set_ylabel("daily deaths per 100,000")
plt.xlabel("mean daily NO$_2$ (µg m$^{-3}$)")
plt.show()

In [None]:
NO2_df.plot(y="mean_NO2", legend=False, title="London NO$_2$").set_ylabel("mean daily NO$_2$ (µg m$^{-3}$)")
plt.show()

### Resample the hourly meteorology data using a custom date list

In [None]:
daily_dates = mortality_df.index

Only retain the met data up to the final mortality datapoint:

In [None]:
hourly_df = hourly_df.loc[hourly_df.index < daily_dates.max()]

In [None]:
daily_met_df = hourly_df.groupby(daily_dates[daily_dates.searchsorted(hourly_df.index)]).mean()

In [None]:
# (rows, columns) of the daily-mean meteorology table.
daily_met_df.shape

In [None]:
# Preview the first rows of the daily-mean meteorology table.
daily_met_df.head()

In [None]:
temperature_df = pd.DataFrame(daily_met_df["temperature"]).join(mortality_df)
temperature_df["deaths_per_100k"] = temperature_df["deaths_per_capita"]*100000

In [None]:
temperature_df.head()

In [None]:
# Scatter of daily mortality against the daily mean temperature.
temperature_ax = temperature_df.plot.scatter(x="temperature", y="deaths_per_100k", legend=False, title="London mortality")
temperature_ax.set_ylabel("daily deaths per 100,000")
temperature_ax.set_xlabel("mean daily temperature (℃)")
plt.show()