In [56]:
'''
Prepare prior for DeepDA

OUTPUT:
    example:
    prior2proxyunit hdf5 file saved: /mnt/c/Users/mul450/Dropbox/git/deepDA/mlwrk/proxy/petmproxy3slices_v0.0.10gt1.csv.hdf5

Mingsong Li
1/15/2020
'''
from DeepDA_lib import modules_nc
from DeepDA_lib import modules_psm_linear
import h5py
import time
import yaml
import numpy as np
import pandas
import os
from netCDF4 import Dataset

# Optional proxy-system-model (PSM) packages.
# Each import is attempted independently; a failure only prints a warning and
# leaves the corresponding module name undefined. Any later cell that calls the
# missing package (e.g. bayfox.predict_d18oc or bayspar.predict_tex_analog)
# will then raise NameError at the point of use.
try:
    import bayspline
except ImportError as e1:
    print('Warning:', e1)
try:
    import bayspar
except ImportError as e2:
    print('Warning:', e2)
try:
    import bayfox
except ImportError as e3:
    print('Warning:', e3)
try:
    import baymag
except ImportError as e4:
    print('Warning:', e4)

In [57]:
# --- Run configuration -------------------------------------------------------
# Longitude offset passed to modules_nc.cal_find_ij when proxy sites are mapped
# onto the model grid.
dum_lon_offset = -180

# Name of the netCDF file read from every prior member directory
nc_file ='fields_biogem_2d.nc'
#nc_field = 'ocn_sur_temp'
t = 12  # last time slice, cGENIE
k = 0   # first layer, SST

# Load the DeepDA configuration. The context manager guarantees the file handle
# is released even if the YAML parser raises.
with open("DeepDA_config.yml", 'r') as f:
    yml_dict = yaml.load(f, Loader=yaml.FullLoader)
#print(yml_dict)

In [58]:
# Build Xb, the prior state matrix:
#   rows    = all grid cells of variable 0, then all grid cells of variable 1, ...
#   columns = prior ensemble members (one sub-directory of prior_dir each)

dir_prior = yml_dict['core']['prior_dir']
# NOTE(review): os.listdir returns entries in arbitrary order and includes every
# entry in prior_dir; the column order of Xb follows this listing.
dir_prior_full = os.listdir(dir_prior)
prior_len = len(dir_prior_full)
#print('dir_prior: {}'.format(dir_prior))
print('>>  Prior member size: {}'.format(prior_len))

# prepare variable list for Xb
prior_variable_dict = yml_dict['prior']['state_variables_info']
prior_variable_len = len(prior_variable_dict)
print('>>  Number of prior variables is: {}. List:'.format(prior_variable_len))
print('      {}'.format(prior_variable_dict))

# Read the first variable of the first member (time index 0) only to learn the
# 2-D grid shape. The dataset is closed as soon as the slice is in memory.
with Dataset(dir_prior+'/'+dir_prior_full[0]+'/'+nc_file) as nc_first:
    x0 = nc_first.variables[prior_variable_dict[0]][0,:,:]
print('    Shape of prior 2d grid {}'.format(x0.shape))
# NOTE(review): the original comments label axis 0 as lon and axis 1 as lat;
# confirm against the dimension order of the netCDF variables.
dum_imax = x0.shape[0]  # lon
dum_jmax = x0.shape[1]  # lat
dum_ijmax = dum_imax*dum_jmax  # lonn * latn
Xb_shape = (dum_ijmax*prior_variable_len, prior_len)  # lonn * latn * varn

# Pre-fill with NaN so that anything not written stays recognisable
Xb = np.full(Xb_shape,np.nan)

# loop for each member of a prior
for i in range(prior_len):
    name_nc = dir_prior+'/'+dir_prior_full[i]+'/'+nc_file
    # Open each member file once and close it when all variables are read
    # (previously the file was re-opened for every variable and never closed).
    with Dataset(name_nc) as nc_member:
        # loop for each variable of each member
        for j in range(prior_variable_len):
            # row range of variable j inside Xb
            j0 = dum_ijmax * j
            j1 = dum_ijmax * (j+1)
            nc_field = prior_variable_dict[j]
            x = nc_member.variables[nc_field][t,:,:]

            # flatten the 2-D field into this member's column
            Xb[j0:j1,i] = x.reshape(dum_ijmax)
            # report only for the last member, to keep the log short
            if i == prior_len - 1:
                print('>>  Last member: x.shape {}'.format(x.shape))
                print('      {}: {}: {}'.format(i, dir_prior_full[i], prior_variable_dict[j]))
print('>>  OKAY. Xb ready, to be saved')

>>  Prior member size: 150
>>  Number of prior variables is: 5. List:
      ['ocn_sur_temp', 'atm_temp', 'atm_pCO2', 'ocn_sur_sal', 'ocn_ben_DIC_13C']
    Shape of prior 2d grid (36, 36)
>>  Last member: x.shape (36, 36)
      149: ML.petm008.ID.9: ocn_sur_temp
>>  Last member: x.shape (36, 36)
      149: ML.petm008.ID.9: atm_temp
>>  Last member: x.shape (36, 36)
      149: ML.petm008.ID.9: atm_pCO2
>>  Last member: x.shape (36, 36)
      149: ML.petm008.ID.9: ocn_sur_sal
>>  Last member: x.shape (36, 36)
      149: ML.petm008.ID.9: ocn_ben_DIC_13C
>>  OKAY. Xb ready, to be saved


In [59]:
# Prepare the containers for Ye (prior expressed in proxy units).

# Settings of the proxy database selected by the first 'use_from' entry
proxy_db_key = yml_dict['proxies']['use_from'][0]
proxy_cfg = yml_dict['proxies'][proxy_db_key]

# Path of the proxy CSV: <datadir_proxy>/<dbversion>
dir_proxies = proxy_cfg['datadir_proxy'] +'/'+ proxy_cfg['dbversion']
proxy_psm_type = proxy_cfg['proxy_psm_type']
proxy_assim2 = proxy_cfg['proxy_assim2']
psm_d18osw_adjust = yml_dict['psm']['bayesreg_d18o_pooled']['psm_d18osw_adjust']

# CSV column names holding, per reconstruction interval, the observed value
# and its uncertainty
data_period_id    = proxy_cfg['data_period_id']
data_period_idstd = proxy_cfg['data_period_idstd']

# Reconstruction intervals: recon_period[0] .. recon_period[1] inclusive
recon_period = yml_dict['core']['recon_period']
recon_timescale = yml_dict['core']['recon_timescale_interval']
recon_period_full = np.arange(recon_period[0], recon_period[1] + 1, recon_timescale)
recon_period_len = len(recon_period_full)
print('>>  recon_period {} - {}. List: '.format(recon_period[0], recon_period[1]))
print('      {}'.format(recon_period_full))

# Proxy database: one row per proxy record
proxies = pandas.read_csv(dir_proxies)
proxies_len = proxies.shape[0]

# NaN-filled output arrays
per_member_shape = (proxies_len, prior_len)           # proxy record x ensemble member
per_interval_shape = (proxies_len, recon_period_len)  # proxy record x recon interval
Ye       = np.full(per_member_shape, np.nan)
Yevar    = np.full(per_member_shape, np.nan)
obvalue  = np.full(per_interval_shape, np.nan)
ob_err   = np.full(per_interval_shape, np.nan)
print('>>  OKAY.')

>>  recon_period 0 - 2. List: 
      [0 1 2]
>>  OKAY.


In [61]:
# precal_Ye: for every proxy record, take the prior ensemble at the record's
# grid cell and push it through the matching proxy system model (PSM) to get
# the prior in proxy units (Ye) and its variance (Yevar).

# Loop-invariant PSM setting, read once instead of once per record
psm_d18osw_adjust = yml_dict['psm']['bayesreg_d18o_pooled']['psm_d18osw_adjust']

# proi is the next free row of Ye / Yevar / obvalue / ob_err. It only advances
# when a record was actually converted, so skipped records leave no gap and the
# unused trailing rows stay NaN.
proi = 0
for j in range(proxies_len):
    # read lon lat for each line of proxy
    dum_lat = proxies['Lat'][j]  # (paleo)latitude of this site
    dum_lon = proxies['Lon'][j]  # (paleo)longitude of this site
    lonlat = modules_nc.cal_find_ij(dum_lon,dum_lat,dum_lon_offset,dum_imax,dum_jmax)

    ######################## TO DO: including d13C or other proxies ##############
    # find 1d grid location. No per-variable offset is added, so this indexes
    # the first variable block of Xb (prior_variable_dict[0]).
    lonlati = lonlat[1] * dum_jmax + lonlat[0]
    # prior ensemble at this grid cell (one value per member)
    prior_1grid = np.copy(Xb[lonlati,:])
    print('')
    print('>>  Data row {}, grid [lon lat] {}, id {}'.format(j,lonlat,lonlati))
    print(prior_1grid.shape)
    ######################## TO DO: add  dum_ijmax * j etc. ##############

    # Read proxy type from the database
    data_psm_type = proxies['Proxy'][j]
    # Look the proxy type up in the allowed-proxy groups of the config; exactly
    # one group must list it.
    matching_keys = [key for key, value in proxy_assim2.items() if data_psm_type in value]
    data_psm_type_find = len(matching_keys)
    if data_psm_type_find == 0:
        print('Warning, this proxy type in database is not find in DTDA-config.yml dictionary')
        # Skip the record: previously execution fell through and reused the PSM
        # of the preceding record (or raised NameError on the first record).
        continue
    if data_psm_type_find > 1:
        print('Warning, this proxy type in database appears more than 1 time in DTDA-config.yml dictionary')
        continue
    data_psm_key = matching_keys[0]
    proxy_psm_type_i = proxy_psm_type[data_psm_key]
    print('PSM for {} is {}'.format(data_psm_type,proxy_psm_type_i))

    # Now PSM type has been found. Let's precal Ye
    ensemble = None  # stays None when no prediction could be made for this record

    if proxy_psm_type_i == 'bayesreg_d18o_pooled':
        # bayfox
        d18o_localsw = modules_psm_linear.d18o_localsw(abs(dum_lat))
        # total d18osw = d18o_localsw + d18o_adj + psm_d18osw_adjust
        # d18o_adj has been included in the bayfox model
        prediction_d18O = bayfox.predict_d18oc(prior_1grid,d18o_localsw+psm_d18osw_adjust) # pool model for bayfox
        print('>>  prediction_d18O.ensemble shape {}'.format(prediction_d18O.ensemble.shape))
        ensemble = prediction_d18O.ensemble

    elif proxy_psm_type_i == 'bayesreg_tex86':
        # bayspar
        search_tol_i = yml_dict['psm']['bayesreg_tex86']['search_tol']
        nens_i = yml_dict['psm']['bayesreg_tex86']['nens']
        try:
            prediction = bayspar.predict_tex_analog(prior_1grid, temptype = 'sst', search_tol = search_tol_i, nens=nens_i)
        except Exception as err:
            # Best effort: report and skip this record. The original handler
            # formatted undefined names (ii, sst) and therefore crashed itself.
            print('search_tol too small for {}: mean sst is {}'.format(j, np.mean(prior_1grid)))
            print('      bayspar error: {}'.format(err))
        else:
            ensemble = prediction.ensemble

    else:
        # bayesreg_uk37, bayesreg_mgca_pooled_red, bayesreg_mgca_pooled_bcp and
        # any other PSM are not implemented yet; say so instead of dropping the
        # record silently.
        print('Warning, PSM {} is not implemented yet; data row {} is skipped'.format(proxy_psm_type_i, j))

    if ensemble is not None:
        # Summarise the PSM ensemble per prior member
        Ye[proi,:] = np.mean(ensemble, axis = 1)
        Yevar[proi,:] = np.var(ensemble, axis = 1, ddof=1)
        # Observed value and error (stored squared) for every recon interval
        for reconi in range(recon_period_len):
            obvalue[proi,reconi] = proxies[data_period_id[reconi]][j]
            ob_err[proi,reconi] = proxies[data_period_idstd[reconi]][j] ** 2
        print('>>  {}'.format(proxy_psm_type_i))
        # Show the row that was just written (row proi, not data row j)
        print('>>  id {}, Ye, first example {}'.format(proi,Ye[proi,0:10]))
        print('>>  id {}, Yevar, first example {}'.format(proi,Yevar[proi,0:10]))
        proi = proi + 1  # increasement
    print(np.mean(Ye,axis=1))
print('obvalue {},  ob_err {}'.format(obvalue, ob_err))
print('>>  OKAY.')


>>  Data row 0, grid [lon lat] [18, 35], id 1278
(150,)
PSM for tex86 is bayesreg_tex86
>>  bayesreg_tex86
>>  id 0, Ye, first example [0.34961011 0.37440732 0.33591337 0.36470728 0.36358269 0.3612555
 0.34315449 0.33642042 0.34082318 0.36992774]
>>  id 0, Yevar, first example [0.00370087 0.00324385 0.00382364 0.00353545 0.00347256 0.00345201
 0.00370822 0.00378728 0.00377111 0.0033797 ]
[ 0.37318309 -2.93529643         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan]

>>  Data row 1, grid [lon lat] [16, 29], id 1060
(150,)
PSM for d18o_m.subb is bayesreg_d18o_pooled
>>  prediction_d18O

In [62]:
# Write the prior (Xb), the prior in proxy units (Ye, Yevar) and the
# observations (obvalue, ob_err) to an HDF5 file next to the proxy CSV.
hdf5name = dir_proxies + '.hdf5'
with h5py.File(hdf5name, 'w') as f:
    # Ye and Yevar are stored transposed relative to the in-memory arrays
    for dataset_name, dataset_values in (
        ('Xb', Xb),
        ('obvalue', obvalue),
        ('Ye', Ye.T),
        ('Yevar', Yevar.T),
        ('ob_err', ob_err),
    ):
        f.create_dataset(dataset_name, data=dataset_values)

    # File-level attributes describing how the file was produced.
    # NOTE(review): nc_field holds only the last variable name assigned in the
    # Xb loop, although Xb stacks every entry of the state-variable list.
    proxy_db_key = yml_dict['proxies']['use_from'][0]
    metadata = {
        'Date': time.time(),
        'proxy_dbversion': yml_dict['proxies'][proxy_db_key]['dbversion'],
        'exp_dir': yml_dict['core']['prior_dir'],
        'nc_file': nc_file,
        'nc_field': nc_field,
        'Nens': str(prior_len),
    }
    f.attrs.update(metadata)

for message in (
    '  prior2proxyunit hdf5 file saved: {}'.format(hdf5name),
    '  Step 1 finished. Run Step 2: DeepDA_main.ipynb now',
    '>>  Done!',
):
    print(message)

  prior2proxyunit hdf5 file saved: /mnt/c/Users/mul450/Dropbox/git/deepDA/mlwrk/proxy/petmproxy3slices_v0.0.10g.csv.hdf5
  Step 1 finished. Run Step 2: DeepDA_main.ipynb now
>>  Done!


In [13]:
# Sanity check: Xb is allocated as (dum_ijmax * prior_variable_len, prior_len),
# i.e. (grid cells x state variables, ensemble members).
print(Xb.shape)

(3888, 150)
