#### Crop the simulator data to China only
#### Create dataframe of inputs and outputs

In [5]:
import pandas as pd
import xarray as xr
import geopandas as gpd
import numpy as np
from rasterio import features
from affine import Affine
import pickle
import re

In [6]:
# Load the Latin hypercube design tables (one file for the training
# simulations, one for the test simulations).
inputs_train = pd.read_csv(
    filepath_or_buffer='/nfs/see-fs-02_users/earlacoa/emulator/latin_hypercube_inputs_train.csv',
)
inputs_test = pd.read_csv(
    filepath_or_buffer='/nfs/see-fs-02_users/earlacoa/emulator/latin_hypercube_inputs_test.csv',
)

In [7]:
# Root directory of the simulator output. It is concatenated directly with
# the simulation name when file names are built, so it must keep its
# trailing slash.
path = '/nfs/b0122/Users/earlacoa/paper_aia_china/'

# Sector labels (presumably residential, industry, transport, agriculture and
# power generation -- TODO confirm against the input tables).
sectors = ['RES', 'IND', 'TRA', 'AGR', 'POW']

# Output variables whose file names carry the `_ugm-3` suffix (see crop_china).
aerosols = ['asoaX_2p5', 'bc_2p5', 'bsoaX_2p5', 'nh4_2p5', 'no3_2p5', 'oc_2p5', 'oin_2p5', 'so4_2p5']

# Every output variable that is extracted; the aerosol components are reused
# from `aerosols` so the two lists cannot drift apart.
outputs = ['PM2_5_DRY', 'o3', 'AOD550_sfc', *aerosols]

# Simulation directory names: t1..t50 for training, t51..t55 for testing.
sims_train = [f't{sim_number}' for sim_number in range(1, 51)]
sims_test = [f't{sim_number}' for sim_number in range(51, 56)]

In [8]:
def crop_china(path, sim, time, output, shapes_china):
    """Read one simulator output field and set every cell outside the shapes to NaN.

    Parameters
    ----------
    path : str
        Root directory of the simulator output. It is concatenated directly
        with ``sim``, so it must end with a path separator.
    sim : str
        Name of the simulation sub-directory (e.g. ``'t1'``).
    time : str
        Time-aggregation label embedded in the file name
        (e.g. ``'annual-mean'``).
    output : str
        Name of the variable to read. Variables listed in the module-level
        ``aerosols`` are read from files with the ``_ugm-3`` suffix.
    shapes_china : iterable of (geometry, value) tuples
        Shapes handed to ``rasterize`` to build the mask. A cell covered by
        any of the shapes is kept, whatever value the shape was paired with.

    Returns
    -------
    xr.DataArray
        The field named ``output`` with NaN outside the shapes, plus a
        ``china`` coordinate that is 1 inside the shapes and NaN outside.
    """
    if output in aerosols:
        filename = f'{path}{sim}/wrfout_d01_global_0.25deg_2015-{time}_{output}_ugm-3.nc'
    else:
        filename = f'{path}{sim}/wrfout_d01_global_0.25deg_2015-{time}_{output}.nc'

    with xr.open_dataset(filename) as ds:
        try:
            conc = ds[output]
        except KeyError:
            # Files written from an unnamed DataArray store the field under
            # xarray's placeholder variable name.
            conc = ds['__xarray_dataarray_variable__']
            conc.name = output

        # Pull the values into memory while the file is still open, so the
        # masking below does not depend on a closed dataset.
        conc = conc.load()

    # rasterize() burns each shape's paired value into the grid and leaves NaN
    # elsewhere. Collapse every burned value to 1 so that all shapes count as
    # "inside", not only the ones paired with 0 or 1.
    burned = rasterize(shapes_china, conc.coords, longitude='lon', latitude='lat')
    conc['china'] = burned.where(cond=burned.isnull(), other=1)

    # Keep the field where the mask is 1, NaN everywhere else.
    return conc.where(cond=conc.china == 1, other=np.nan)

In [9]:
def transform_from_latlon(lat, lon):
    """Build an Affine transform from 1D latitude and longitude arrays.

    The translation uses the first element of each array and the scale uses
    the difference between the first two elements, so each array needs at
    least two entries. The spacing of the remaining elements is not
    inspected.

    Returns the product ``translation * scale`` as an ``affine.Affine``.
    """
    lat_values, lon_values = np.asarray(lat), np.asarray(lon)

    x_origin, y_origin = lon_values[0], lat_values[0]
    x_step = lon_values[1] - lon_values[0]
    y_step = lat_values[1] - lat_values[0]

    return Affine.translation(x_origin, y_origin) * Affine.scale(x_step, y_step)


def rasterize(shapes, coords, latitude='latitude', longitude='longitude',
              fill=np.nan, **kwargs):
    """Burn (geometry, value) pairs onto the grid described by ``coords``.

    Only 1D latitude and longitude coordinates are supported.

    usage:
    -----
    1. read the shapefile into a geopandas.GeoDataFrame
          `states = gpd.read_file(shp_dir+shp_file)`
    2. pair every geometry with the number that should mark its cells
          `shapes = list(zip(states.geometry, range(len(states))))`
    3. attach the result to the DataArray the coordinates came from
          `ds['states'] = rasterize(shapes, ds.coords, longitude='X', latitude='Y')`

    arguments:
    ---------
    :shapes: iterable of (geometry, value) tuples
    :coords: mapping that holds the 1D coordinates, looked up by name
    :latitude (str): key of the latitude coordinate in `coords`
    :longitude (str): key of the longitude coordinate in `coords`
    :fill: value given to cells not covered by any geometry (default np.nan)
    :**kwargs (dict): forwarded to `rasterio.features.rasterize`

    returns:
    -------
    :(xr.DataArray): float array with dims (latitude, longitude) holding the
      burned values, and `fill` for cells outside every geometry.
    """
    lat_coord = coords[latitude]
    lon_coord = coords[longitude]

    burned = features.rasterize(
        shapes,
        out_shape=(len(lat_coord), len(lon_coord)),
        fill=fill,
        transform=transform_from_latlon(lat_coord, lon_coord),
        dtype=float,
        **kwargs,
    )

    return xr.DataArray(
        burned,
        coords={latitude: lat_coord, longitude: lon_coord},
        dims=(latitude, longitude),
    )

In [10]:
# Read the boundary shapefile and pair each geometry with its row position;
# rasterize() burns that number into the cells the geometry covers.
shp_china = gpd.read_file('/nfs/a68/earlacoa/shapefiles/china/china_taiwan_hongkong_macao.shp')
shapes_china = list(zip(shp_china.geometry, range(len(shp_china.geometry))))

In [14]:
# Build one DataFrame per training simulation: a row per grid cell kept by
# crop_china, with `lat`, `lon` and one column per output variable.
dict_train = {}

for sim in sims_train:
    df_sim = None

    for output in outputs:
        # The ozone files carry a different time-aggregation label in their
        # file name than the other outputs.
        time = '6mDM8h' if output == 'o3' else 'annual-mean'

        conc = crop_china(path, sim, time, output, shapes_china)

        # Convert once per output. dropna() removes the rows that contain
        # NaN, i.e. the cells crop_china masked out.
        df_output = conc.to_dataframe().dropna().reset_index()

        if df_sim is None:
            # The first output defines the grid cells; copy so that the
            # columns added below go into an independent frame.
            df_sim = df_output[['lat', 'lon']].copy()

        # NOTE(review): columns are matched by row position, which assumes
        # every output keeps the same set of cells after dropna() -- confirm.
        values = df_output[output]

        if output == 'o3':
            # presumably a unit conversion (e.g. ppmv -> ppbv) -- TODO confirm
            values = values * 1000

        df_sim[output] = values

    # Store the finished frame once per simulation.
    dict_train[sim] = df_sim

In [15]:
# Build one DataFrame per test simulation: a row per grid cell kept by
# crop_china, with `lat`, `lon` and one column per output variable.
dict_test = {}

for sim in sims_test:
    df_sim = None

    for output in outputs:
        # The ozone files carry a different time-aggregation label in their
        # file name than the other outputs.
        time = '6mDM8h' if output == 'o3' else 'annual-mean'

        conc = crop_china(path, sim, time, output, shapes_china)

        # Convert once per output. dropna() removes the rows that contain
        # NaN, i.e. the cells crop_china masked out.
        df_output = conc.to_dataframe().dropna().reset_index()

        if df_sim is None:
            # The first output defines the grid cells; copy so that the
            # columns added below go into an independent frame.
            df_sim = df_output[['lat', 'lon']].copy()

        # NOTE(review): columns are matched by row position, which assumes
        # every output keeps the same set of cells after dropna() -- confirm.
        values = df_output[output]

        if output == 'o3':
            # presumably a unit conversion (e.g. ppmv -> ppbv) -- TODO confirm
            values = values * 1000

        df_sim[output] = values

    # Store the finished frame once per simulation.
    dict_test[sim] = df_sim

In [16]:
# Persist the per-simulation training frames with the newest pickle protocol.
with open('/nfs/a336/earlacoa/paper_aia_china/emulator_annual/dict_train.pickle', mode='wb') as pickle_file:
    pickle.dump(dict_train, pickle_file, pickle.HIGHEST_PROTOCOL)

In [17]:
# Persist the per-simulation test frames with the newest pickle protocol.
with open('/nfs/a336/earlacoa/paper_aia_china/emulator_annual/dict_test.pickle', mode='wb') as pickle_file:
    pickle.dump(dict_test, pickle_file, pickle.HIGHEST_PROTOCOL)

In [18]:
# Stack the per-simulation frames in insertion order into one long table.
# The simulation names are not kept: the result gets a fresh 0..n-1 index.
df_train = pd.concat(list(dict_train.values()), ignore_index=True)
df_test = pd.concat(list(dict_test.values()), ignore_index=True)

In [26]:
# Write the stacked tables to CSV.
# NOTE(review): the row index is written as an unnamed first column, which
# comes back as `Unnamed: 0` from a plain pd.read_csv -- confirm it is wanted.
df_train.to_csv(
    path_or_buf='/nfs/a336/earlacoa/paper_aia_china/emulator_annual/dict_train.csv',
    index=True,
)
df_test.to_csv(
    path_or_buf='/nfs/a336/earlacoa/paper_aia_china/emulator_annual/dict_test.csv',
    index=True,
)

In [4]:
# Reload the stacked training table from disk and display it.
df_train = pd.read_csv(
    filepath_or_buffer='/nfs/a336/earlacoa/paper_aia_china/emulator_annual/dict_train.csv',
)
df_train

Unnamed: 0.1,Unnamed: 0,lat,lon,PM2_5_DRY,o3_6mDM8h,AOD550_sfc,asoaX_2p5,bc_2p5,bsoaX_2p5,nh4_2p5,no3_2p5,oc_2p5,oin_2p5,so4_2p5
0,0,18.25,109.00,16.059700,59.835756,0.551701,0.573486,0.275420,0.237946,0.499406,2.186306,1.155026,4.622030,1.109776
1,1,18.25,109.25,15.499481,58.893731,0.532734,0.554965,0.258137,0.229260,0.489017,2.135247,1.071643,4.484922,1.082499
2,2,18.25,109.50,15.311710,57.958811,0.521876,0.539000,0.251098,0.220787,0.486980,2.142926,1.043744,4.407179,1.064815
3,3,18.50,108.75,16.856476,58.548565,0.578438,0.637844,0.292437,0.282871,0.560916,2.220532,1.268883,5.151252,1.200744
4,4,18.50,109.00,15.696284,56.679188,0.553119,0.626654,0.270974,0.284494,0.511375,1.928000,1.189042,5.054855,1.169836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763895,763895,53.25,122.75,6.730077,45.505920,0.249792,0.073990,0.124858,0.068830,0.364789,0.350767,1.088245,2.525977,0.755509
763896,763896,53.25,123.00,6.795813,45.452459,0.252916,0.075352,0.127491,0.069991,0.369815,0.362826,1.108557,2.522508,0.759685
763897,763897,53.25,123.25,6.911898,45.417101,0.255855,0.077106,0.132399,0.071484,0.376979,0.379608,1.142596,2.535748,0.765747
763898,763898,53.25,123.50,7.001504,45.371421,0.257614,0.078651,0.134714,0.073001,0.383684,0.398539,1.176962,2.531012,0.769508
