In [None]:
%run ../notebook_preamble.ipy

In [None]:
import pyproj
import geopandas as gpd
import shapely
import os

# Switch on shapely's optional speedups before any geometry work is done.
# NOTE(review): on Shapely >= 2.0 the `speedups` module is reported as deprecated
# and this call is believed to be a no-op — confirm against the installed version.
shapely.speedups.enable()

## DEFRA Data Collection

Modelled air pollution data for the UK is compiled by DEFRA. The values are obtained by combining measurements from monitoring stations with atmospheric modelling to interpolate onto a 1 km by 1 km grid covering the whole country.

## PM10

In [None]:
# Inclusive span of years to process; `years` itself is a half-open range.
first_year, last_year = 2007, 2018
years = range(first_year, last_year + 1)

# URL template for the yearly PM10 CSV; `{}` is replaced by the four-digit year.
base_url = 'https://uk-air.defra.gov.uk/datastore/pcm/mappm10{}g.csv'

In [None]:
# Build one wide table: the columns of the first yearly file plus one PM10
# column per subsequent year (each value column is renamed to the int year).
df = None
for year in years:
    # header=5: the column names sit on the sixth line of each file;
    # the string 'MISSING' is parsed as NaN.
    df_year = (
        pd.read_csv(base_url.format(year), header=5, na_values='MISSING')
        .rename(columns={f'pm10{year}g': year})
    )
    if df is None:
        df = df_year
    else:
        # Align on the grid coordinates instead of on row position, so a file
        # with a different row order or row count cannot silently shift values
        # onto the wrong cell. `validate` fails loudly if (x, y) is not unique.
        df = df.merge(
            df_year[['x', 'y', year]],
            on=['x', 'y'],
            how='left',
            validate='one_to_one',
        )

# Keep only the rows that have a value in every column (i.e. for every year).
df = df.dropna()

In [None]:
# Preview the first rows of the combined yearly table.
df.head(n=5)

To match up with the shapefiles from Eurostat, we need to convert the British National Grid (BNG, EPSG:27700) coordinates to decimal latitude/longitude coordinates (WGS84, EPSG:4326).

In [None]:
# Source projection: British National Grid; target: WGS84 latitude/longitude.
bng = pyproj.Proj('epsg:27700')
wgs84 = pyproj.Proj('epsg:4326')

# `pyproj.transform` is deprecated and rebuilds the transformation on every
# call; a Transformer is built once and can be reused. `always_xy=True` pins
# the axis order to (easting, northing) -> (lon, lat) regardless of how the
# CRS definition orders its axes.
bng_to_wgs84 = pyproj.Transformer.from_proj(bng, wgs84, always_xy=True)
lon_values, lat_values = bng_to_wgs84.transform(df['x'].values, df['y'].values)

# Assign 'lat' before 'lon' so the column order matches the previous output.
df['lat'], df['lon'] = lat_values, lon_values

In [None]:
# Turn each grid cell into a point geometry (x = longitude, y = latitude).
# The coordinates are WGS84 lat/lon, so tag the frame with that CRS; without it
# the spatial join against the (CRS-tagged) region polygons compares a frame
# with an unknown CRS against one with a known CRS.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['lon'], df['lat']),
    crs='EPSG:4326',
)

In [None]:
# Preview the first rows, now including the point geometry column.
gdf.head(n=5)

We are going to assign each point in the grid to a NUTS 2 2016 region.

In [None]:
# NUTS vintage whose level-2 boundaries are loaded below.
nuts_year = 2016

# The shapefile lives inside a directory that carries the same name as the file.
file = f'{data_path}/raw/gis/eurostat/NUTS_RG_01M_{nuts_year}_4326_LEVL_2.shp/NUTS_RG_01M_{nuts_year}_4326_LEVL_2.shp'

# Both names refer to the same GeoDataFrame holding every region in the file.
region = gpd.read_file(file)
eu_regions = region

# Restrict to the rows whose country code is 'UK'.
is_uk = eu_regions['CNTR_CODE'] == 'UK'
uk_regions = eu_regions[is_uk]

In [None]:
# Inner spatial join: keep each grid point that lies within a UK region and
# attach that region's attributes to it.
# NOTE(review): newer geopandas releases rename `op` to `predicate` — confirm
# against the installed version before upgrading.
points_in_poly = gpd.sjoin(
    left_df=gdf,
    right_df=uk_regions,
    how='inner',
    op='within',
)

And finally we aggregate by NUTS region.

In [None]:
# Average the grid-point values per NUTS region and year, then reshape to long
# format with one row per (year, region).
# The aggregate is the mean (not the max): the variable, the output column and
# the output file are all named "mean".
mean_pm10 = (
    points_in_poly
    .groupby('NUTS_ID')[list(years)]
    .mean()
    # Unstacking a frame with a single-level index yields a Series indexed by
    # (column label, NUTS_ID); the unnamed column level becomes 'level_0' and
    # the values become column 0 after reset_index().
    .unstack(level=0)
    .reset_index()
    .rename(columns={'level_0': 'year', 'NUTS_ID': 'nuts_id', 0: 'air_pollution_mean_pm10'})
)

In [None]:
# Record which NUTS vintage the region ids refer to. Reuse `nuts_year` (the
# vintage of the boundaries that were actually loaded) instead of repeating the
# literal, so the two cannot drift apart.
mean_pm10['nuts_year_spec'] = nuts_year

In [None]:
# Fix the column order used for the exported table.
output_columns = ['nuts_id', 'nuts_year_spec', 'air_pollution_mean_pm10', 'year']
mean_pm10 = mean_pm10.loc[:, output_columns]

In [None]:
# Output directory for the processed DEFRA tables.
defra_dir = f'{data_path}/processed/defra'

# makedirs also creates any missing parent directory (e.g. `processed`), and
# exist_ok=True makes the cell safe to re-run.
os.makedirs(defra_dir, exist_ok=True)

In [None]:
# Write the long-format table without the index column. The target is built
# from `defra_dir` so it always matches the directory created above.
mean_pm10.to_csv(f'{defra_dir}/air_pollution_mean_pm10.csv', index=False)

## Open Geography Portal Boundaries

In [None]:
# Year of the PM10 file used in this section. Stated explicitly rather than
# relying on the `year` variable left over from the earlier download loop.
pm10_year = 2018

# NUTS 2 boundaries served as GeoJSON.
uk_nuts2 = gpd.read_file('https://opendata.arcgis.com/datasets/48b6b85bb7ea43699ee85f4ecd12fd36_4.geojson')

# header=5: the column names sit on the sixth line; 'MISSING' is parsed as NaN
# and rows containing NaN are dropped.
pm10_df = pd.read_csv(base_url.format(pm10_year), header=5, na_values='MISSING').dropna()

# British National Grid -> WGS84. `always_xy=True` pins the axis order to
# (easting, northing) -> (lon, lat); this replaces the deprecated
# `pyproj.transform` helper.
pm10_to_wgs84 = pyproj.Transformer.from_crs('epsg:27700', 'epsg:4326', always_xy=True)
pm10_lon, pm10_lat = pm10_to_wgs84.transform(pm10_df['x'].values, pm10_df['y'].values)

# Assign 'lat' before 'lon' so the column order matches the previous output.
pm10_df['lat'], pm10_df['lon'] = pm10_lat, pm10_lon

# Points are WGS84 lat/lon, so tag the frame with that CRS for the spatial join.
pm10_gdf = gpd.GeoDataFrame(
    pm10_df,
    geometry=gpd.points_from_xy(pm10_df['lon'], pm10_df['lat']),
    crs='EPSG:4326',
)

In [None]:
# Inner spatial join: keep each PM10 point that lies within a boundary polygon
# and attach that polygon's attributes to it.
data = gpd.sjoin(
    left_df=pm10_gdf,
    right_df=uk_nuts2,
    how='inner',
    op='within',
)

In [None]:
# Keep only the region-name column and the PM10 value column.
kept_columns = ['nuts218nm', 'pm102018g']
data = data[kept_columns]

In [None]:
# Mean of the remaining value column per region name, with the region name
# restored as an ordinary column.
grouped_by_region = data.groupby('nuts218nm')
agg = grouped_by_region.mean().reset_index()