In [49]:
'''
Prepare prior for DeepDA

OUTPUT:
    example:
    prior2proxyunit hdf5 file saved: /mnt/c/Users/mul450/Dropbox/git/deepDA/mlwrk/proxy/petmproxy3slices_v0.0.10gt1.csv.hdf5

Mingsong Li
1/15/2020
'''
from DeepDA_lib import modules_nc
from DeepDA_lib import DeepDA_psm
import h5py
import time
import yaml
import numpy as np
import pandas
import os
from netCDF4 import Dataset

try:
    import bayspline
except ImportError as e1:
    print('Warning:', e1)
try:
    import bayspar
except ImportError as e2:
    print('Warning:', e2)
try:
    import bayfox
except ImportError as e3:
    print('Warning:', e3)
try:
    import baymag
except ImportError as e4:
    print('Warning:', e4)
    
print('>>  OKAY.')

>>  OKAY.


In [50]:
config_name = "DeepDA_config.yml"
#config_name = "petmproxy3slices_v0.0.10gt1.csvexp_petm78_og1_qc_obs_20200203_test2.yml"
f = open(config_name, 'r')
yml_dict = yaml.load(f, Loader=yaml.FullLoader)
f.close()

t = 0  # last time slice, cGENIE
k = 0   # surface layer, SST
# read config.yml settings
print(' ########## Load yml config file ######### ')
print('')
########## Proxy + PSM #########
dir_proxy         = yml_dict['core']['proxy_dir']
dir_proxy_data    = dir_proxy +'/'+ yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['dbversion']
dir_proxy_save    = yml_dict['core']['wrkdir'] + '/'+ yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['dbversion']
proxy_psm_type    = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['proxy_psm_type']
proxy_assim2      = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['proxy_assim2']
proxy_order       = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['proxy_order']
proxy_err_eval   = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['proxy_err_eval']
proxy_blacklist   = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['proxy_blacklist']
proxy_list = [item for item in proxy_order if item not in proxy_blacklist]
psm_d18osw_adjust = yml_dict['psm']['bayesreg_d18o_pooled']['psm_d18osw_adjust']
proxy_qc          = yml_dict['proxies']['proxy_qc']
proxy_frac          = yml_dict['proxies']['proxy_frac']
prior_source = yml_dict['prior']['prior_source'] #
prior_state_variable = yml_dict['prior'][prior_source]['state_variable']  # note: ['2d': xxx; '3d': xxx]
dum_lon_offset = yml_dict['prior'][prior_source]['dum_lon_offset'] # longitude offset
dir_prior = yml_dict['core']['prior_dir']
dir_prior_full = os.listdir(dir_prior)
prior_len = len(dir_prior_full)

nexp = yml_dict['core']['nexp']
data_period_id    = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['data_period_id']
data_period_idstd = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['data_period_idstd']
recon_period = yml_dict['core']['recon_period']
recon_timescale = yml_dict['core']['recon_timescale_interval']
recon_period_full = np.arange(recon_period[0],recon_period[1]+1,recon_timescale)
recon_period_len = recon_period_full.shape[0]
geologic_age = yml_dict['core']['geologic_age']

#print('dir_prior: {}'.format(dir_prior))
print('>>  Prior member size: {}'.format(prior_len))
print('>>  Recon_period {} - {}. List: '.format(recon_period[0], recon_period[1]))
print('      {}'.format(recon_period_full))
print('>>  Proxy error evaluation: {}'.format(proxy_err_eval))
print('>>  Proxy full list:')
print('      {}'.format(proxy_order))
print('>>  Proxy blacklist:')
print('      {}'.format(proxy_blacklist))
print('>>  Proxy list to be assimilated: ')
print('      {}'.format(proxy_list))
print('>>  Proxy quality control selection: {}'.format(proxy_qc))
if 'Marine sediments_mgca_pooled_bcp' in proxy_list or 'Marine sediments_mgca_pooled_red' in proxy_list:
    data_psm_mgca_find = 1
    print('>>  Note: Mg/Ca proxy found')
else:
    data_psm_mgca_find = 0
print('')
print('########## Read prior ######### ')
print('')
########## Prior #########
# save prior variable list
prior_variable_dict = []  # variable list
prior_nc_file_list = []  # nc file list
prior_variable_dict_3d = []  # variable list
prior_nc_file_list_3d = []  # nc file list

for key, value in prior_state_variable.items():
    nc_keyvalue = prior_state_variable[key]['ncname']  # note: 2d dict
    print('      nc_keyvalue {}...'.format(nc_keyvalue))
    for key1, value1 in nc_keyvalue.items():
        print('      {}: {}'.format(key1,value1))
        for i in range(len(prior_state_variable[key][value1])):
            if key in ['2d']:
                prior_variable_dict.append(prior_state_variable[key][value1][i])
                prior_nc_file_list.append(key1+'/'+value1+'.nc')
            elif key in ['3d']:
                prior_variable_dict_3d.append(prior_state_variable[key][value1][i])
                prior_nc_file_list_3d.append(key1+'/'+value1+'.nc')

# prepare variable list for Xb
prior_variable2d_len = len(prior_variable_dict)
prior_variable3d_len = len(prior_variable_dict_3d)
print('>>  Number of 2d prior variables is: {}. List:'.format(prior_variable2d_len))
print('      {}'.format(prior_variable_dict))
print('>>  Prior nc file list {}'.format(prior_nc_file_list))
print('>>  Number of 3d prior variables is: {}. List:'.format(prior_variable3d_len))
print('      {}'.format(prior_variable_dict_3d))
print('>>  Prior nc file list {}'.format(prior_nc_file_list_3d))

# If there is no field in the model, convert model unit to proxy unit
print('>>  Reading prior state variables')
# read first variable data, first time slice, to get the shape of prior grid
try:
    #x0 = Dataset(dir_prior+'/'+dir_prior_full[0]+'/'+ nc_file_2d).variables[prior_variable_dict[0]][0,:,:]
    x1 = Dataset(dir_prior+'/'+dir_prior_full[0]+'/'+ prior_nc_file_list_3d[0]).variables[prior_variable_dict_3d[0]][0,:,:,:]
    #print('    Shape of prior 2d grid {}'.format(x0.shape))
    dum_dmax = x1.shape[0] # depth
    dum_imax = x1.shape[1]  # lon
    dum_jmax = x1.shape[2]  # lat
except:
    try:
        x0 = Dataset(dir_prior+'/'+dir_prior_full[0]+'/'+ prior_nc_file_list[0]).variables[prior_variable_dict[0]][0,:,:]
        dum_imax = 36 #x1.shape[0]  # lon
        dum_jmax = 36 #x1.shape[1]  # lat
        dum_dmax = 16
    except:
        dum_dmax = 16
        dum_imax = 36
        dum_jmax = 36
# prepare 2d Xb for lon-lat state 
dum_ijmax = dum_imax*dum_jmax  # lonn * latn
print('>>  Shape of dum_dmax {}, dum_imax {}, dum_jmax {}, dum_ijmax {}'.format(dum_dmax,dum_imax,dum_jmax,dum_ijmax))
# save units of each variable
prior_variable_units = list()
prior_variable_units_init = 0

# nan matrix for storing 2d and 3d variables
if prior_variable2d_len>0:
    Xb_shape = (prior_variable2d_len*dum_jmax*dum_imax, prior_len)  # lonn * latn * varn
    Xb   = np.full(Xb_shape,np.nan)
# prep 3d version of Xb
if prior_variable3d_len > 0:
    Xb3d_shape = (prior_variable3d_len*dum_dmax*dum_jmax*dum_imax, prior_len)  # lonn * latn * varn
    Xb3d = np.full(Xb3d_shape,np.nan)
    # read prior and save Xb
    #Xb = np.full((dum_ijmax, prior_len),np.nan)

if data_psm_mgca_find == 1:
    print('>>  Prepare Mg/Ca related state variable ...')
    # for Mg/Ca SST proxy salinity, ph, omega
    Xb_sal       = np.full(Xb_shape,np.nan)
    Xb_ph        = np.full(Xb_shape,np.nan)
    Xb_omega     = np.full(Xb_shape,np.nan)
    spp = 'all'
    # ``1`` for reductive, ``0`` for BCP (Barker).
    cleaningr = np.tile(np.array([1]),prior_len)
    cleaningb = np.tile(np.array([0]),prior_len)
# read units of each variable from prior and save as prior_variable_units
if prior_variable3d_len > 0:
    for j in range(prior_variable2d_len):
        name_nc_2d = dir_prior+'/'+dir_prior_full[0]+'/'+ prior_nc_file_list[j]
        nc_field = prior_variable_dict[j]
        unit_j = Dataset(name_nc_2d).variables[nc_field].units
        prior_variable_units.append((unit_j))
if prior_variable3d_len > 0:
    for j in range(prior_variable3d_len):
        name_nc_3d = dir_prior+'/'+dir_prior_full[0]+'/'+ prior_nc_file_list_3d[j]
        nc_field = prior_variable_dict_3d[j]
        try:
            unit_j = Dataset(name_nc_3d).variables[nc_field].units
            prior_variable_units.append((unit_j))
        except:
            prior_variable_units.append((''))
    
# loop for each member of an ensemble
for i in range(prior_len):
    # loop for each variable of each member
    if prior_variable2d_len>0:
        for j in range(prior_variable2d_len):
            # full directory of netcdf file
            name_nc_2d = dir_prior+'/'+dir_prior_full[i]+'/'+ prior_nc_file_list[j]
            j0 = dum_ijmax * j
            j1 = dum_ijmax * (j+1)
            nc_field = prior_variable_dict[j]
            x = Dataset(name_nc_2d).variables[nc_field][t,:,:]  # time-lat-lon
            
            Xb[j0:j1,i] = np.copy(x.reshape(dum_ijmax))  # var-lat-lon: Nx x 1
            
            if data_psm_mgca_find == 1:
                try:
                    name_nc_2d_mgca = dir_prior+'/'+dir_prior_full[i]+'/biogem/'+ 'fields_biogem_2d.nc'
                    x = Dataset(name_nc_2d_mgca).variables['ocn_sur_sal'][t,:,:] # time-lat-lon
                    Xb_sal[j0:j1,i] = np.copy(x.reshape(dum_ijmax)) # var-lat-lon: Nx x 1  | surface water salinity
                    name_nc_3d_mgca = dir_prior+'/'+dir_prior_full[i]+'/biogem/'+ 'fields_biogem_3d.nc'
                    x = Dataset(name_nc_3d_mgca).variables['misc_pH'][t,k,:,:] # time-lat-lon | core top pH
                    Xb_ph[j0:j1,i] = np.copy(x.reshape(dum_ijmax)) # var-lat-lon: Nx x 1
                    x = Dataset(name_nc_3d_mgca).variables['carb_ohm_cal'][t,k,:,:] # time-lat-lon  | bottom water omega
                    Xb_omega[j0:j1,i] = np.copy(x.reshape(dum_ijmax)) # var-lat-lon: Nx x 1
                except:
                    if i == 0:
                        # warning one time
                        print('>>  Warning: reading state variable error. ocn_sur_sal, misc_pH, carb_ohm_cal')
            # print the last one data
            if i > prior_len-2:
                print('    Last member: {}: {}: {}'.format(i, dir_prior_full[i], prior_variable_dict[j]))
        Xb = np.ma.MaskedArray(Xb, Xb >= 9.9692e+36)
    # if 3d variables are used
    if prior_variable3d_len > 0:
        for k in range(prior_variable3d_len):
            name_nc_3d = dir_prior+'/'+dir_prior_full[i]+'/'+ prior_nc_file_list_3d[k]
            nc_field = prior_variable_dict_3d[k]
            k0 = dum_ijmax*dum_dmax * k
            k1 = dum_ijmax*dum_dmax * (k+1)
            x = Dataset(name_nc_3d).variables[nc_field][t,:,:,:]  # time-depth-lat-lon
            Xb3d[k0:k1,i] = np.copy(x.reshape(dum_dmax*dum_ijmax)) # var-depth-lat-lon
        Xb3d = np.ma.MaskedArray(Xb3d, Xb3d >= 9.9692e+36)
print('>>  Units of state variables {}: {}'.format(prior_variable_dict+prior_variable_dict_3d,prior_variable_units))

print('>>  OKAY.')
print('')


 ########## Load yml config file ######### 

>>  Prior member size: 200
>>  Recon_period 0 - 2. List: 
      [0 1 2]
>>  Proxy error evaluation: proxy_err_psm_fixed
>>  Proxy full list:
      ['Marine sediments_uk37', 'Marine sediments_tex86', 'Marine sediments_d18o_pooled', 'Marine sediments_mgca_pooled_bcp', 'Marine sediments_mgca_pooled_red', 'Marine sediments_caco3', 'Marine sediments_caco3_13c']
>>  Proxy blacklist:
      []
>>  Proxy list to be assimilated: 
      ['Marine sediments_uk37', 'Marine sediments_tex86', 'Marine sediments_d18o_pooled', 'Marine sediments_mgca_pooled_bcp', 'Marine sediments_mgca_pooled_red', 'Marine sediments_caco3', 'Marine sediments_caco3_13c']
>>  Proxy quality control selection: None
>>  Note: Mg/Ca proxy found

########## Read prior ######### 

      nc_keyvalue {'biogem': 'fields_biogem_2d_13ccorr'}...
      biogem: fields_biogem_2d_13ccorr
      nc_keyvalue {'biogem': 'fields_biogem_3d'}...
      biogem: fields_biogem_3d
>>  Number of 2d prior var



    Last member: 199: ML.petm009.ID.99: ocn_sur_temp
    Last member: 199: ML.petm009.ID.99: sed_CaCO3
    Last member: 199: ML.petm009.ID.99: sed_CaCO3_13C_corr
    Last member: 199: ML.petm009.ID.99: atm_pCO2
    Last member: 199: ML.petm009.ID.99: ocn_sur_sal
    Last member: 199: ML.petm009.ID.99: misc_pH
    Last member: 199: ML.petm009.ID.99: carb_sur_ohm_cal
>>  Units of state variables ['ocn_sur_temp', 'sed_CaCO3', 'sed_CaCO3_13C_corr', 'atm_pCO2', 'ocn_sur_sal', 'misc_pH', 'carb_sur_ohm_cal']: []
>>  OKAY.



In [51]:
print(' ########## Read proxies database ######### ')
print('')
### read proxies database ###
proxies = pandas.read_csv(dir_proxy_data)
proxies_len0 = len(proxies)
#proxy_select = pandas.DataFrame()
#print(proxy_select)
proxy_select_0 = 0
proxy_d18o_glassy  = yml_dict['proxies']['proxy_d18o_glassy']
proxy_assim3 = yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['proxy_assim3']
data_glassy_label_blacklist = proxy_assim3['Marine sediments_d18o_pooled_glassy']
### check proxy data in the blacklist or not ###
for j in range(proxies_len0):
    # Read proxy type from the database
    data_psm_type = proxies['Proxy'][j]
    # initial default 0 : this proxy is not included
    data_assimilate_i = 0
    for jlist in range(len(proxy_list)):
        if data_psm_type in proxy_assim2[proxy_list[jlist]]:
            # find and save this proxy
            data_assimilate_i = 1
    if data_assimilate_i == 1:
        #print('>>    file {}, {} included'.format(proxies.loc[j,'File'], data_psm_type))
        if proxy_select_0 == 0:
            proxy_select0 = proxies.iloc[[j]]
            proxy_select0 = proxy_select0.reset_index(drop=True) # reset_index, avoid index error
            proxy_select_0 = 1
        else:
            #proxy_select.append(proxies.iloc[[j]])
            proxy_select0 = proxy_select0.append(proxies.iloc[[j]], ignore_index=True)
proxies_select_len0 = len(proxy_select0)
print('>>  Proxy: selected proxy dataset number {}: remove those in blacklist'.format(proxies_select_len0))

### check glassy only data or not
proxy_select_0 = 0
if proxy_d18o_glassy:
    for jj in range(proxies_select_len0):
        data_glassy_label = proxies['Glassy'][jj]
        if data_glassy_label not in data_glassy_label_blacklist:
            if proxy_select_0 == 0:
                proxy_select = proxy_select0.iloc[[jj]]
                proxy_select = proxy_select.reset_index(drop=True) # reset_index, avoid index error
                proxy_select_0 = 1
            else:
                proxy_select = proxy_select.append(proxy_select0.iloc[[jj]], ignore_index=True)

#print(proxy_select)
proxies_select_len0 = len(proxy_select)
print('>>  Proxy: selected proxy dataset number {}: remove those unknown/frosty'.format(proxies_select_len0))

### Select a fraction of proxy sites ###
if proxy_frac <= 1.0:
    print('>>  Proxy fraction is {}'.format(proxy_frac))
    sites_assim, sites_eval = DeepDA_psm.proxy_frac_4da_eval(proxy_select,proxy_frac)
else:
    sites_assim = proxy_select.copy()
    sites_eval = []
###
#print('>>  Randomly selected proxy sties: ')
#print(sites_assim)
#print('>>  Randomly un-selected proxy sties: ')
#print(sites_eval)
### sort proxy data using given order ###
proxies_frac_len = len(sites_assim)
proxy_select_1 = 0

#print(proxy_select)

for i in range(len(proxy_order)):
    proxy_order_i = proxy_assim2[proxy_order[i]]
    for j in range(proxies_frac_len):
#        print(j)
        # Read proxy type from the database
        #data_psm_type = proxy_select['Proxy'][j]
        data_psm_type = sites_assim['Proxy'][j]
        # initial default 0 : this proxy is not included
        data_assimilate_i = 0
        if data_psm_type in proxy_order_i:
            if proxy_select_1 == 0:
                #proxy_select_sort = proxy_select.iloc[[j]]
                proxy_select_sort = sites_assim.iloc[[j]]
                proxy_select_1 = 1
            else:
                #proxy_select_sort = proxy_select_sort.append(proxy_select.iloc[[j]], ignore_index=True)
                proxy_select_sort = proxy_select_sort.append(sites_assim.iloc[[j]], ignore_index=True)

### update proxies using sorted proxy order ###
proxies =   proxy_select_sort.copy()
proxies_len = len(proxies)

if proxies_len0 > proxies_len:
    print('>>  Selected proxy data length {}'.format(proxies_len))

######## Ye   ########
# for saving proxy unit data Ye
Ye       = np.full((proxies_len,prior_len),np.nan)
obvalue  = np.full((proxies_len,recon_period_len),np.nan)
ob_err   = np.full((proxies_len,recon_period_len),np.nan) # data obs error
ob_err0  = np.full((proxies_len,recon_period_len),np.nan) # PSM obs error
ob_err_comb  = np.full((proxies_len,recon_period_len),np.nan) # PSM obs error
yo_all = np.full((proxies_len,2),np.nan) # PSM obs error
print('>>  OKAY.')
print('')
print('###### Check the consistency of the config.yml file and proxy database ######')
print('')
# check the consistency of the config.yml file and proxy database
# AND get obs R
data_psm_mgca_find = 0
proxy_psm_type_dict = {}
for j in range(proxies_len):
    # Read proxy type from the database
    data_psm_type = proxies['Proxy'][j]
    # Read allowed proxy from the DTDA-config.yml
    data_psm_type_find = 0
    for key, value in proxy_assim2.items():
        #print(key,value)
        # find this proxy type exist or not, how many times it occurrs
        if data_psm_type in proxy_assim2[key]:
            data_psm_type_find = data_psm_type_find + 1
    if data_psm_type_find == 1:
        for key, value in proxy_psm_type.items():
            if data_psm_type in proxy_assim2[key]:
                data_psm_key = key
        proxy_psm_type_i = proxy_psm_type[data_psm_key]
        
        proxy_psm_type_dict[j] =proxy_psm_type_i
        
        print('>>  {}. PSM for {} is {}'.format(j, data_psm_type,proxy_psm_type_i))
        
    elif data_psm_type_find == 0:
        print('>>  Warning, {} in database is not find in DTDA-config.yml dictionary'.format(data_psm_type))
    else:
        print('>>  Warning, {} in database appears more than 1 time in DTDA-config.yml dictionary'.format(data_psm_type))
    
    # Now PSM type has been found. Let's precal Ye
    
    if proxy_psm_type_i in ['bayesreg_mgca_pooled_red','bayesreg_mgca_pooled_bcp']:
        data_psm_mgca_find = 1

#print('>>  Proxy_psm_type_dict: ')
#print(proxy_psm_type_dict)
print('>>  All looks good.')
print('')
##### Ye calculation ####
print('##### Ye calculation ####')
print('')
# precal_Ye
proi = 0
for j in range(proxies_len):
    # Read proxy type from the database
    data_psm_type = proxies['Proxy'][j]
    proxy_psm_type_i = proxy_psm_type_dict[j]
    psm_required_variable_key = list(yml_dict['psm'][proxy_psm_type_i]['psm_required_variables'].keys())[0]
    #print(psm_required_variable_key)
    if psm_required_variable_key in prior_variable_dict:
        psm_required_variable_key_index = prior_variable_dict.index(psm_required_variable_key)
        print('>>  FOUND: {} in prior_variable_dict 2d list, index = {}'.format(psm_required_variable_key, psm_required_variable_key_index))
    elif psm_required_variable_key in prior_variable_dict_3d:
        psm_required_variable_key_index = prior_variable_dict_3d.index(psm_required_variable_key)
        print('>>  FOUND: {} in prior_variable_dict_3d list, index = {}'.format(psm_required_variable_key, psm_required_variable_key_index))
    
######################## FOR 2D field ONLY TO DO: adjusted to include 3d proxies ##############
    # read lon lat for each line of proxy
    dum_lat = proxies['Lat'][j]  # (paleo)latitude of this site
    dum_lon = proxies['Lon'][j]  # (paleo)longitude of this site
    yo_all[proi,:] = np.array([dum_lon, dum_lat])  # save location of this site
    
    lonlat = modules_nc.cal_find_ij(dum_lon,dum_lat,dum_lon_offset,dum_imax,dum_jmax)
    # output [lon, lat], 
    # lon ranges from 0 (-180) to 35 (180), lat ranges from 0 (-90) to 35 (90)

    Filei = proxies['File'][j]
    # find 1d grid location
    lonlati = lonlat[1] * dum_jmax + lonlat[0] + psm_required_variable_key_index * dum_ijmax
    # read prior
    prior_1grid = np.copy(Xb[lonlati,:])   # prior
    #print(prior_1grid.shape)
    #print(prior_1grid)
######################## FOR 2D field ONLY TO DO: adjusted to include 3d proxies ##############

    print('')
    print('>>  {}. {}, grid [lon lat] {}, grid id {}, PSM for {} is {}'.format(j,Filei,lonlat,lonlati,data_psm_type,proxy_psm_type_i))
    print('>>      Mean of Prior is {}, variance is {}'.format(np.mean(prior_1grid), np.var(prior_1grid)))
    
    # Now PSM type has been found. Let's precal Ye
    
    if proxy_psm_type_i in ['bayesreg_d18o_pooled']:
        #try:
            # bayfox
        d18o_localsw = DeepDA_psm.d18o_localsw(abs(dum_lat))
        psm_d18osw_adjust = yml_dict['psm']['bayesreg_d18o_pooled']['psm_d18osw_adjust']
        # total d18osw = d18o_localsw + d18o_adj + psm_d18osw_adjust
        # d18o_adj has been included in the bayfox model
        #print('>>  Prior is {}'.format(prior_1grid))
        prediction_d18O = bayfox.predict_d18oc(prior_1grid,d18o_localsw + psm_d18osw_adjust) # pool model for bayfox
        #print('>>  prediction_d18O.ensemble shape {}'.format(prediction_d18O.ensemble.shape))
        Ye[proi,:] = np.mean(prediction_d18O.ensemble, axis = 1)
        #print('>>  Ye is {}'.format(Ye[proi,:]))
        print('>>      Mean of  Ye  is {:.6f}, variance is {:.6f} '.format(np.mean(Ye[proi,:]), np.var(Ye[proi,:],ddof=1)))
        for reconi in range(recon_period_len):
            obvalue[proi,reconi] = proxies[data_period_id[reconi]][j]
            ob_err[proi,reconi] = proxies[data_period_idstd[reconi]][j] ** 2
            if proxy_err_eval in ['proxy_err_psm']:
                ob_err0[proi,reconi]= DeepDA_psm.obs_estimate_r_d18o(obvalue[proi,reconi], d18o_localsw+psm_d18osw_adjust)
            else:
                ob_err0[proi,reconi]= DeepDA_psm.obs_estimate_r_fixed_d18o(15)
            ob_err_comb[proi,reconi] = np.nansum([ob_err[proi,reconi], ob_err0[proi,reconi]])
            if ob_err_comb[proi,reconi] == 0: ob_err_comb[proi,reconi] = np.nan
            print('>>   {}. Proxy variance from PSM is {:.6f}, from PSM and selected interval is {:.6f} '.format(reconi,ob_err0[proi,reconi], ob_err_comb[proi,reconi]))
            
            # Quality control
            if proxy_err_eval in ['proxy_err_psm']:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc)
            else:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err0[proi,reconi], proxy_qc)
            #print(qc_i)
            if qc_i:
                if proxy_qc is not None:
                    print('    Pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
            else:
                ob_err_comb[proi,reconi] = np.nan
                if proxy_qc is not None:                    
                    print('    Did not pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
        proi = proi + 1  # increasement
        #except:
        #    print('>>  Warning {}'.format(proxy_psm_type_i))
    elif proxy_psm_type_i in ['cgenie_caco3', 'cgenie_caco3_13c']:
        Ye[proi,:] = np.mean(prior_1grid)
        for reconi in range(recon_period_len):
            obvalue[proi,reconi] = proxies[data_period_id[reconi]][j]
            ob_err[proi,reconi] = proxies[data_period_idstd[reconi]][j] ** 2
            ob_err0[proi,reconi] = yml_dict['psm'][proxy_psm_type_i]['psm_error']
            ob_err_comb[proi,reconi] = np.nansum([ob_err[proi,reconi], ob_err0[proi,reconi]])
            # Quality control
            if proxy_err_eval in ['proxy_err_psm']:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc)
            else:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err0[proi,reconi], proxy_qc)
            if qc_i:
                if proxy_qc is not None:
                    print('    Pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
            else:
                ob_err_comb[proi,reconi] = np.nan
                if proxy_qc is not None:                    
                    print('    Did not pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
        proi = proi + 1  # increasement
    elif proxy_psm_type_i in ['bayesreg_tex86']:
        # bayspar
        #try:
        # bayspar
        search_tol_i = yml_dict['psm']['bayesreg_tex86']['search_tol']
        nens_i = yml_dict['psm']['bayesreg_tex86']['nens']
        prediction = bayspar.predict_tex_analog(prior_1grid, temptype = 'sst', search_tol = search_tol_i, nens=nens_i)
        Ye[proi,:] = np.mean(prediction.ensemble, axis = 1)
        print('>>      Mean of  Ye   is {:.6f}, variance is {:.6f} '.format(np.mean(Ye[proi,:]), np.var(Ye[proi,:],ddof=1)))
        for reconi in range(recon_period_len):
            obvalue[proi,reconi] = proxies[data_period_id[reconi]][j]
            ob_err[proi,reconi] = proxies[data_period_idstd[reconi]][j] ** 2
            if proxy_err_eval in ['proxy_err_psm']:
                ob_err0[proi,reconi]= DeepDA_psm.obs_estimate_r_tex86(np.array([31]), 'sst', 15)
            else:
                ob_err0[proi,reconi]= DeepDA_psm.obs_estimate_r_fixed_tex86(31)
            #obvalue[proi,] = proxies['Lat'][j]

            ob_err_comb[proi,reconi] = np.nansum([ob_err[proi,reconi], ob_err0[proi,reconi]])
            if ob_err_comb[proi,reconi] == 0: ob_err_comb[proi,reconi] = np.nan
            print('>>   {}. Proxy variance from PSM is {:.6f}, from PSM and selected interval is {:.6f} '.format(reconi,ob_err0[proi,reconi], ob_err_comb[proi,reconi]))
            # Quality control
            if proxy_err_eval in ['proxy_err_psm']:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc)
            else:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err0[proi,reconi], proxy_qc)
            if qc_i:
                if proxy_qc is not None:
                    print('    Pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
            else:
                ob_err_comb[proi,reconi] = np.nan
                if proxy_qc is not None:                    
                    print('    Did not pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
        proi = proi + 1  # increasement
        #except:
        #    print('>>  Warning {}'.format(proxy_psm_type_i))
        #    print('>>  search_tol too small for {}: mean sst is {}'.format(j, np.mean(prior_1grid)))
            
    elif proxy_psm_type_i in ['bayesreg_uk37']:
        # 
        print('... bayesreg_uk37: To be done ...')
        
    elif proxy_psm_type_i in ['bayesreg_mgca_pooled_red', 'bayesreg_mgca_pooled_bcp']:
        if proxy_psm_type_i in ['bayesreg_mgca_pooled_red']:
            clearning_one = cleaningr
            proxy_explain = 'reductive'
        elif proxy_psm_type_i in ['bayesreg_mgca_pooled_bcp']:
            clearning_one = cleaningb
            proxy_explain = 'barker'
        #try:
        # prior_1grid = np.copy(Xb[lonlati,:])   # prior
        salinity =  np.copy(Xb_sal[lonlati,:])
        ph       =  np.copy(Xb_ph[lonlati,:])
        omega    =  np.copy(Xb_omega[lonlati,:])
        
        Xb_sal1 = np.copy(Xb_sal)
        Xb_sal1[Xb_sal1> 3.0e+36] = np.nan
        Xb_sal_mean = np.nanmean(Xb_sal1)
        Xb_ph1 = np.copy(Xb_ph)
        Xb_ph1[Xb_ph1> 3.0e+36] = np.nan
        Xb_ph_mean = np.nanmean(Xb_ph1)
        Xb_omega1 = np.copy(Xb_omega)
        Xb_omega1[Xb_omega1> 3.0e+36] = np.nan
        Xb_omega_mean = np.nanmean(Xb_omega1)
        #print('>>    mean of Xb_sal {}, Xb_ph {}, Xb_omega {}, cleaning {}'.format(Xb_sal_mean, Xb_ph_mean, Xb_omega_mean, clearning_one[0]))
        prediction_mgca = baymag.predict_mgca(prior_1grid, clearning_one, salinity, ph, omega, spp) # pool model for baymag reductive
        #prediction_mgca = baymag.predict_mgca(prior_1grid, cleaningr, salinity, ph, omega, spp) # pool model for baymag reductive
        pred_mgca_adj = baymag.sw_correction(prediction_mgca, np.array([geologic_age]))
        Ye[proi,:] = np.mean(pred_mgca_adj.ensemble, axis = 1)
        print('>>      Mean of  Ye   is {:.6f}, variance is {:.6f} '.format(np.mean(Ye[proi,:]), np.var(Ye[proi,:],ddof=1)))
        
        for reconi in range(recon_period_len):
            obvalue[proi,reconi] = proxies[data_period_id[reconi]][j]
            ob_err[proi,reconi]  = proxies[data_period_idstd[reconi]][j] ** 2
            #obs_estimate_r_mgca_pooled(obs, cleaning, salinity, ph, omega, spp, age):
            if proxy_err_eval in ['proxy_err_psm']:
                ob_err0[proi,reconi] = DeepDA_psm.obs_estimate_r_mgca_pooled(obvalue[proi,reconi], clearning_one[0], np.mean(salinity), np.mean(ph), np.mean(omega), spp, geologic_age)
            else:
                #ob_err0[proi,reconi] = DeepDA_psm.obs_estimate_r_fixed_mgca_pooled((15, 16), clearning_one[0], np.mean(salinity), np.mean(ph), np.mean(omega), spp, geologic_age)
                ob_err0[proi,reconi] = DeepDA_psm.obs_estimate_r_fixed_mgca_pooled((15, 16), clearning_one[0], Xb_sal_mean, Xb_ph_mean, Xb_omega_mean, spp, geologic_age)
            ob_err_comb[proi,reconi] = np.nansum([ob_err[proi,reconi], ob_err0[proi,reconi]])
            if ob_err_comb[proi,reconi] == 0: ob_err_comb[proi,reconi] = np.nan
            print('>>   {}. Proxy variance from PSM is {:.6f}, from PSM and selected interval is {:.6f} '.format(reconi,ob_err0[proi,reconi], ob_err_comb[proi,reconi]))
            # Quality control
            if proxy_err_eval in ['proxy_err_psm']:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc)
            else:
                qc_i = DeepDA_psm.obs_qc(Ye[proi,:], obvalue[proi,reconi], ob_err0[proi,reconi], proxy_qc)
            if qc_i:
                if proxy_qc is not None:
                    print('      Pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
            else:
                ob_err_comb[proi,reconi] = np.nan
                if proxy_qc is not None:                    
                    print('      Warning! Did not pass QC. ye {}, obs {}, obs_var {}, qc {}'.format(np.mean(Ye[proi,:]), obvalue[proi,reconi], ob_err_comb[proi,reconi], proxy_qc))
            print('        {}: mean salinity {}, ph {}, omega {}'.format(proxy_explain,np.mean(salinity), np.mean(ph), np.mean(omega)))
        proi = proi + 1  # increasement
        
    else:
        a = 1
    
print('')
print('>>  Ye mean')
print('>>  {}'.format(np.mean(Ye,axis=1)))
print('>>  obvalue'.format())
print('>>  {}'.format(obvalue))
print('>>  ob_err0'.format())
print('>>  {}'.format(ob_err0))
print('>>  from psm and interval:  ob_err_comb'.format( ob_err_comb))
print('>>  {}'.format( ob_err_comb))

print('>>  OKAY.')
print('')
#print(Ye)


 ########## Read proxies database ######### 

>>  Proxy: selected proxy dataset number 55: remove those in blacklist
>>  Proxy: selected proxy dataset number 41: remove those unknown/frosty
>>  Proxy fraction is 1.0
>>  Selected index: [7, 31, 29, 1, 0, 30, 4, 25, 9, 20, 22, 17, 19, 32, 10, 26, 15, 8, 28, 33, 12, 36, 37, 14, 16, 2, 3, 40, 18, 27, 5, 21, 6, 24, 35, 23, 13, 11, 39, 34, 38]
>>  Unselected index: []
>>  Selected proxy data length 41
>>  OKAY.

###### Check the consistency of the config.yml file and proxy database ######

>>  0. PSM for tex86 is bayesreg_tex86
>>  1. PSM for tex86 is bayesreg_tex86
>>  2. PSM for tex86 is bayesreg_tex86
>>  3. PSM for tex86 is bayesreg_tex86
>>  4. PSM for tex86 is bayesreg_tex86
>>  5. PSM for tex86 is bayesreg_tex86
>>  6. PSM for tex86 is bayesreg_tex86
>>  7. PSM for tex86 is bayesreg_tex86
>>  8. PSM for tex86 is bayesreg_tex86
>>  9. PSM for tex86 is bayesreg_tex86
>>  10. PSM for tex86 is bayesreg_tex86
>>  11. PSM for tex86 is bayes

In [38]:

hdf5name = dir_proxy_save +'.' + nexp + '_precal_ye.hdf5'
proxy_psm_type_dict_df = pandas.DataFrame.from_dict(proxy_psm_type_dict, orient='index')

with h5py.File(hdf5name, 'w') as f:
    # if any 2d field selected
    if prior_variable2d_len>0:
        f.create_dataset('Xb', data=Xb)
    f.create_dataset('obvalue', data=obvalue)
    f.create_dataset('Ye', data=np.transpose(Ye))
    f.create_dataset('ob_err', data=ob_err)
    f.create_dataset('ob_err0', data=ob_err0)
    f.create_dataset('ob_err_comb', data=ob_err_comb)
    f.create_dataset('yo_all', data=yo_all)
    # If any 3d field saved
    if prior_variable3d_len>0:
        f.create_dataset('Xb3d', data=Xb3d)
    # if Mg/Ca proxy are used
    if data_psm_mgca_find == 1:
        f.create_dataset('Xb_sal', data=Xb_sal)
        f.create_dataset('Xb_ph', data=Xb_ph)
        f.create_dataset('Xb_omega', data=Xb_omega)
    
    metadata = {'Date': time.time(),
                'proxy_dbversion':yml_dict['proxies'][yml_dict['proxies']['use_from'][0]]['dbversion'],
                'exp_dir':yml_dict['core']['prior_dir'],
                'Nens':str(prior_len)}
    
    f.attrs.update(metadata)
    
# append proxy to hdf5 file
proxies.to_hdf(hdf5name, key='proxies')
proxy_psm_type_dict_df.to_hdf(hdf5name, key='proxy_psm_type_dict_df')
if proxy_frac < 1.0:
    sites_eval.to_hdf(hdf5name, key='sites_eval')
pandas.DataFrame(prior_variable_dict).to_hdf(hdf5name, key='prior_variable_dict')
pandas.DataFrame(prior_variable_dict_3d).to_hdf(hdf5name, key='prior_variable_dict_3d')
print('>>  prior2proxyunit hdf5 file saved: {}'.format(hdf5name))
print('>>  Step 1 finished. You may want to run Step 2: DeepDA_main.ipynb now')
print('>>  Done!')

>>  prior2proxyunit hdf5 file saved: /mnt/d/DeepDA/wrk/20200220_PETM_site_CaCO3_13C_test.csv.exp_petm9_caco3c13_20200221_test0_precal_ye.hdf5
>>  Step 1 finished. You may want to run Step 2: DeepDA_main.ipynb now
>>  Done!


In [57]:
print(i in range(0,2))

False
