<span style="color:hotpink; font-size:40px; font-weight:bold;">Packages and imports</span>

In [1]:
# Standard imports

import pandas as pd
import xarray as xr
import numpy as np
import numpy.ma as ma
from matplotlib.ticker import AutoMinorLocator
import matplotlib.pyplot as plt
import scipy
import sklearn.linear_model 
import pickle
import cmocean as cm
import os

import gcsfs
# Google Cloud Storage file-system handle; used further down to list,
# check for, and remove paths in the bucket.
fs = gcsfs.GCSFileSystem()

%matplotlib inline

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting date range</span>

In [2]:
# Bounds of the reconstruction period
date_range_start = '1982-02-01T00:00:00.000000000'
date_range_end = '2023-12-31T00:00:00.000000000'

# Monthly time axis: one timestamp per month, on the first day of the month
# ('MS' = month start). No offset is added to the start or end.
dates = pd.date_range(start=date_range_start, end=date_range_end, freq='MS')

# YYYYMM tags for the first and last month on the axis
init_date = f"{dates[0].year}{dates[0].month:02d}"
fin_date = f"{dates[-1].year}{dates[-1].month:02d}"

<span style="color:hotpink; font-size:40px; font-weight:bold;">Setting paths</span>

In [None]:
### set paths ###

# Fill in a value (a string) on the next line before running this cell:
# as written, the assignment has no right-hand side and the cell will not run.
your_username = # leap pangeo username, for bucket. should be your github username

### paths for loading: ###

# NOTE(review): the bucket owner in regridded_members_dir is hard-coded ('abbysh')
# rather than built from your_username — confirm this is the intended source.
# pco2_recon_dir is read from AND written to: the total-pCO2 stores produced
# later in this notebook are saved under the same directory tree.
regridded_members_dir = 'gs://leap-persistent/abbysh/pco2_residual_1982-2023/00_regridded_members' # path to directory of output from notebook 00
pco2_recon_dir = f'gs://leap-persistent/{your_username}/pco2_residual/post02_xgb/reconstructions' # path to directory of ML output from notebook 02

<span style="color:hotpink; font-size:40px; font-weight:bold;">Loading list of ESMs and members in testbed</span>

In [None]:
### loads list of Earth System Models ("ensembles") and members for the full testbed ###

# List the reconstruction directory once and reuse the listing for both
# loops below instead of listing the same directory twice.
ens_paths = fs.ls(pco2_recon_dir)

# Unique ESM names, in listing order. The name is the last path component
# with anything after its first '.' dropped.
ensembles = []
for ens_path in ens_paths:
    ens_name = ens_path.split('/')[-1].split('.')[0]
    if ens_name not in ensembles:
        ensembles.append(ens_name)

# Map each entry under pco2_recon_dir to the names of the entries inside it:
# {ens: [member, ...]}. An entry with nothing inside it gets no key.
mems_dict = dict()
for ens_path in ens_paths:
    ens = ens_path.split('/')[-1]
    for mem_path in fs.ls(pco2_recon_dir + '/' + ens):
        # setdefault creates the list on the first member and appends afterwards
        mems_dict.setdefault(ens, []).append(mem_path.split('/')[-1])

<span style="color:hotpink; font-size:40px; font-weight:bold;">Adding temperature to reconstructed pCO2 residual to get total pCO2</span>

We broke pco2 into its temperature and non-temperature components (pco2-T and pco2-residual, respectively) and saved them in notebook 00.
The ML model reconstructed pco2-residual (the non-temperature component of pco2).
Here, we add the temperature component (pco2-T) back to get total pco2.

 <center><span style="font-weight:bold">pCO2 = pCO2-residual + pCO2-temperature</span></center>

<span style="color:lightblue; font-size:30px; font-weight:bold;">Function to add pco2-T back:</span>

In [None]:
def calc_recon_pco2(regridded_members_dir, pco2_recon_dir):
    """
    Build total pCO2 for every member: reconstructed pCO2-residual plus pCO2-T.

    For each ESM/member pair in the notebook-level ``mems_dict``, the pCO2-T
    field from notebook 00 is added to each reconstructed pCO2-residual field
    from notebook 02, and the sums are written to a new zarr store next to
    the reconstruction.

    Parameters
    ----------
    regridded_members_dir : str
        Path to regridded data from notebook 00; the ``pco2_T`` variable is
        read from here.

    pco2_recon_dir : str
        Path to the directory of ML reconstructions from notebook 02. The
        residuals are read from here and the total-pCO2 store is written
        into the same member directory.

    Returns
    -------
    None
        Output is written to zarr; nothing is returned.

    Notes
    -----
    Relies on the notebook-level names ``mems_dict``, ``init_date``,
    ``fin_date`` and ``fs``. An output store that already exists is removed
    before the new one is written.
    """

    # Dimension order every field is put into before adding / saving
    dims = ["time", "ylat", "xlon"]

    # Reconstructed pCO2-residual variables from XGB, in output order:
    # unseen points, full (seen and unseen), training set, testing set
    recon_vars = ["pCO2_recon_unseen", "pCO2_recon_full", "pCO2_recon_train", "pCO2_recon_test"]

    for ens, mem_list in mems_dict.items():
        print(f"Current ESM: {ens}")

        for member in mem_list:
            print(f"On member {member}")

            ### Where pCO2-T lives (regridded data from notebook 00)
            pco2T_path = f'{regridded_members_dir}/{ens}/{member}/{ens}.{member.split("_")[-1]}.Omon.zarr'
            print('pco2T path:',pco2T_path)

            ### Where the reconstructed pCO2-residual lives (ML output from notebook 02)
            pco2D_path = f"{pco2_recon_dir}/{ens}/{member}/recon_pC02residual_{ens}_{member}_mon_1x1_{init_date}_{fin_date}.zarr"
            print('pco2D path:',pco2D_path)

            ### Where total pCO2 (pCO2-residual + pCO2-T) is saved
            file_out = f"{pco2_recon_dir}/{ens}/{member}/recon_pCO2_{ens}_{member}_mon_1x1_{init_date}_{fin_date}.zarr" # change this to just pco2
            print('save path:',file_out)

            ### Open both inputs
            temperature_part = xr.open_mfdataset(pco2T_path,engine='zarr').pco2_T.transpose(*dims)
            recon_ds = xr.open_mfdataset(pco2D_path,engine='zarr')

            ### Pull every residual field out in the common dimension order
            residual_parts = {
                name: getattr(recon_ds, name).transpose(*dims)
                for name in recon_vars
            }

            ### Put pCO2-T on the reconstruction's time coordinate so the two add cleanly
            recon_time = residual_parts["pCO2_recon_unseen"].time.data
            temperature_part = temperature_part.assign_coords({"time":("time",recon_time)})

            ### Total pCO2 = pCO2-residual + pCO2-T, for each field
            totals = {
                name: (dims, (temperature_part + residual).data)
                for name, residual in residual_parts.items()
            }

            ### Same variable names as the ML output, now holding total pCO2
            total_ds = xr.Dataset(
                totals,
                coords={
                    'time': (['time'], temperature_part.time.values),
                    'ylat': (['ylat'], temperature_part.ylat.values),
                    'xlon': (['xlon'], temperature_part.xlon.values),
                },
            )

            ### Remove a previous copy of the output, if any, so it is overwritten
            if fs.exists(file_out):
                fs.rm(file_out,recursive=True)

            ### Re-chunk and save
            total_ds = total_ds.chunk({'time':100,'ylat':45,'xlon':90})
            total_ds.to_zarr(file_out)

            print(f'finished with {member}')

<span style="color:lightblue; font-size:30px; font-weight:bold;">Adding pco2-T back to get total pco2 for all members</span>

In [None]:
# Write total pCO2 (pCO2-residual + pCO2-T) for every ESM member listed in mems_dict
calc_recon_pco2(regridded_members_dir, pco2_recon_dir)