In [3]:
%config Completer.use_jedi = False

In [4]:
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import matplotlib
import tensorflow as tf

# Predict whole history

- Given $t_{-n}, ..., t_{-1}$ inputs, predict outputs at $t_{-n}, ..., t_{-1}$
- concatanates (put pictures side by side) two features
- For a given month output, 16th of that month until 16th of the month before inputs correspond

In [None]:
batch_size = 32
models = ['GFDL-ESM4','IPSL-CM6A-LR','MPI-ESM1-2-HR']  # models for temp, prec, LAI
time_len = 10  # for gen_data_card_history(), how long each training sample should be, in months

def gen_data_card_history():  
    model = np.random.choice(np.array(models))  # which of 3 models to choose from
    
    # MONTHLY PICK
    start_year = np.random.randint(1850,2014+1) # randomly select a start year of a time slice
    start_month = np.random.randint(1,12+1)
    end_year = start_year + ((start_month+time_len-1) // 12)
    end_month = (start_month+time_len) % 12
    if end_month == 0:
        end_month = 12
    month_index_start = (start_year-1850)*12 + start_month  # convert date into index with 01-1850 as 0
    month_index_end = month_index_start + time_len
#     print("index:", month_index_start, month_index_end)
#     print(model,start_year, start_month, end_year, end_month)
    
    # select appropriate time slices
    temp = xr.open_mfdataset('data/near_surface_air_temperature/historical/{}/*.nc'.format(model))
    temp = temp.tas.loc["{}-{}-16".format(start_year, start_month):"{}-{}-16".format(end_year, end_month)]  
    
    prec = xr.open_mfdataset('data/precipitation_flux/historical/{}/*.nc'.format(model))
    prec = prec.pr.loc["{}-{}-16".format(start_year, start_month):"{}-{}-16".format(end_year, end_month)]  
    
    # predict only single time step
    lai = xr.open_mfdataset('data/leaf_area_index/historical/{}/*.nc'.format(model))
    lai = np.array(lai.lai)[month_index_end]
    # lai = np.array(lai.lai)[month_index_start:month_index_end]  # predict whole history
    
    npp_files = glob.glob('data/net_primary_production_on_land/historical/**/*.nc', recursive=True) 
    npp = xr.open_mfdataset(np.random.choice(np.array(npp_files)))
    npp = np.array(npp.npp)[month_index_end]
    # npp = np.array(npp.npp)[month_index_start:month_index_end]
                
    # concatanate data
    inputs = np.array(xr.concat((temp,prec), dim='lat'))  # two maps next to each other
    outputs = np.concatenate((lai,npp), axis=-2)
    
    yield(inputs, outputs)
    
    
    ### snippets to convert cftime.NoLeap to pandas.datetime
    #     try:  # for GFDL, index are in cftime.timenoleap
    #         datetimeindex = lai.indexes['time'].to_datetimeindex()
    #         lai['time'] = datetimeindex  
    #     except:  # else in pandas datetime index
    #         pass
    
    ### snippets to handle different sampling date for februrary
     #     try:  # TODO: feburary is 15th, try-catch still doesn't work sometimes?
    #         npp = npp.npp.loc["{}-{}-16".format(start_year, start_month):"{}-{}-16".format(end_year, end_month)]
    #         print("16,16")
    #     except ValueError:
    #         try:
    #             npp = npp.npp.loc["{}-{}-16".format(start_year, start_month):"{}-{}-15".format(end_year, end_month)]
    #             print('16,15')
    #         except ValueError:
    #             try:
    #                 npp = npp.npp.loc["{}-{}-15".format(start_year, start_month):"{}-{}-16".format(end_year, end_month)]
    #                 print('15,16')
    #             except ValueError:
    #                 npp = npp.npp.loc["{}-{}-15".format(start_year, start_month):"{}-{}-15".format(end_year, end_month)]
    #                 print('15,15')
    
    
    # DAILY PICK
#     start_time = np.random.choice(np.arange(0, (2014-1850+1)*365-time_slice, ))
#     start_time = np.random.randint(0, (2014-1850+1)*365-time_slice)  # choose a random slice start point
# #     file_i_start = (start_time-365) // (365*10)  # figure out which file should be opened
# #     file_i_end = (start_time+time_len-365) // (365*10)  # eventually, two or multiple should be opened
#     temp = xr.open_mfdataset('data/near_surface_air_temperature/historical/{}/*.nc'.format(model))  # load all in one xarray
#     temp = temp.tas[start_time : start_time+time_len]  # select the right slice
#     prec = xr.open_mfdataset('data/precipitation_flux/historical/{}/*.nc'.format(model))
#     prec = prec.pr[start_time : start_time+time_len]
#     lai = xr.open_mfdataset('data/leaf_area_index/historical/{}/*.nc'.format(model))
#     lai = lai.lai[start_time//31 : ]  # TODO: daily slice and monthly slice don't match (17.03-25.04 /= 01.03-01.05)
    
    # LOAD SEPARATE DATA
#     files = sorted(glob.glob('./data/near_surface_air_temperature/historical/{}/*'.format(model)))  # all files in dir
#     years_i = np.random.randint(0, len(files))  # which years data should be used?
#     temp = xr.open_dataset(files[years_i]).tas  # choose a random dataset
#     day = np.random.randint(0, temp.shape[0]-time_len)  # choose a day where slice start # TODO: doesn't work with 1850
#     temp = temp[day:day+time_len]  # select a slice
    
#     # precipitation
#     files = glob.glob('./data/precipitation_flux/historical/{}/*'.format(model))  # all files in dir
#     temp = xr.open_dataset(np.random.choice(files)).tas  # choose a random dataset
#     day = np.random.randint(0, temp.shape[0]-time_slice)  # choose a day where slice start # TODO: doesn't work with 1850
#     temp = temp[day:day+time_slice]  # select a slice

# #     lai = 
# #     npp = 
#     inputs = xr.concat((temp,prec), dim='lat')

In [31]:
data_gen = gen_data_card_history()
din, dout = next(data_gen)

In [32]:
ds = tf.data.Dataset.from_generator(generator=gen_data_card, output_types=(tf.float32, tf.float32))
for inputs, outputs in ds.take(1):
    print(inputs.shape, outputs.shape)

(305, 72, 72) (72, 72)


# Predict single timeslice

- Given $t_{-n}, ..., t_{-1}$ inputs, predict outputs at $t_{-1}$
- Give two features as an extra dimension
- Given an output month, use fixed length of days input

Demonstration of how conversion from cftime to timestamp and computing difference works

In [173]:
dx_in = xr.open_mfdataset('data/precipitation_flux/historical/GFDL-ESM4/*.nc')
dx_out = xr.open_mfdataset('data/leaf_area_index/historical/GFDL-ESM4/lai_Lmon_GFDL-ESM4_historical_r1i1p1f1_185001-201412.nc')

day_len = 20
output_month_i = np.random.randint(0+day_len//30, (2014-1850+1)*12) # month index
outs = dx_out.lai[output_month_i]  # actual output
print(outs.time)
print(dx_out.indexes['time'].to_datetimeindex()[output_month_i])
endstamp = dx_out.indexes['time'].to_datetimeindex()[output_month_i]
startstamp = pd.Timestamp('1850-01-01T12')
(endstamp - startstamp).days

<xarray.DataArray 'time' ()>
array(cftime.DatetimeNoLeap(1914, 8, 16, 12, 0, 0, 0), dtype=object)
Coordinates:
    time     object 1914-08-16 12:00:00
Attributes:
    standard_name:  time
    long_name:      time
    bounds:         time_bnds
    axis:           T
1914-08-16 12:00:00


  print(dx_out.indexes['time'].to_datetimeindex()[output_month_i])
  endstamp = dx_out.indexes['time'].to_datetimeindex()[output_month_i]


23602

In [8]:
batch_size = 32
models = ['GFDL-ESM4','IPSL-CM6A-LR','MPI-ESM1-2-HR']  # models for temp, prec, LAI
day_len = 300  # for gen_data_card()


def gen_data_card():
    model = np.random.choice(np.array(models))
    
    output_month_i = np.random.randint(0+day_len//30, (2014-1850+1)*12)  # y_pred timepoint in int

    # month-based metrics
    lai = xr.open_mfdataset('data/leaf_area_index/historical/{}/*.nc'.format(model))

    # compute day index
    try:
        endstamp = lai.indexes['time'].to_datetimeindex()[output_month_i]  # cfttimeindex to datetime
    except:
        endstamp = lai.indexes['time'][output_month_i]
    output_day_i = (endstamp - pd.Timestamp('1850-01-01T12')).days  # output is i-th day in int

    # continue with month-based metrics
    lai = np.array(lai.lai)[output_month_i]
    npp_files = glob.glob('data/net_primary_production_on_land/historical/**/*.nc', recursive=True) 
    npp = xr.open_mfdataset(np.random.choice(np.array(npp_files)))
    npp = np.array(npp.npp)[output_month_i]
    
    # day-based metrics
    temp = xr.open_mfdataset('data/near_surface_air_temperature/historical/{}/*.nc'.format(model))
    temp = np.array(temp.tas)[output_day_i-day_len:output_day_i]
    prec = xr.open_mfdataset('data/precipitation_flux/historical/{}/*.nc'.format(model))
    prec = np.array(prec.pr)[output_day_i-day_len:output_day_i]

    inputs = np.stack((temp,prec), axis=-1)  # two features
    outputs = np.stack((lai,npp), axis=-1)

    yield (inputs, outputs)

In [9]:
mygen = gen_data_card()
next(mygen)

0
1
2


  endstamp = lai.indexes['time'].to_datetimeindex()[output_month_i]  # cfttimeindex to datetime


3
4


(array([[[[2.2659537e+02, 0.0000000e+00],
          [2.2767284e+02, 0.0000000e+00],
          [2.2861615e+02, 0.0000000e+00],
          ...,
          [2.2215324e+02, 0.0000000e+00],
          [2.2451089e+02, 0.0000000e+00],
          [2.2558388e+02, 0.0000000e+00]],
 
         [[2.5098602e+02, 0.0000000e+00],
          [2.4574818e+02, 0.0000000e+00],
          [2.4295480e+02, 0.0000000e+00],
          ...,
          [2.4758501e+02, 0.0000000e+00],
          [2.4894252e+02, 1.4415543e-06],
          [2.5050822e+02, 0.0000000e+00]],
 
         [[2.5336284e+02, 9.2184555e-06],
          [2.5415399e+02, 5.7816605e-06],
          [2.5363763e+02, 3.9785091e-06],
          ...,
          [2.5807098e+02, 1.8015415e-05],
          [2.4921672e+02, 7.7423419e-06],
          [2.4815082e+02, 5.5783867e-06]],
 
         ...,
 
         [[2.6764053e+02, 5.6493632e-06],
          [2.6815396e+02, 3.9198017e-06],
          [2.6913226e+02, 3.7880461e-06],
          ...,
          [2.6918216e+02, 1.45521