In [1]:
import importlib
import pandas as pd
import xarray as xr
import numpy as np
import datetime as dt
from numpy import nan
from constants import *
import sys
import warnings
import math
import os
import cftime
from glob import glob
from timeit import default_timer as timer # try to measure time
from CASutils import readdata_utils as read
from CASutils import calendar_utils as cal

In [39]:
# define functions
def read_cmip6(filepath,datestart,dateend):
    """Open CMIP6 files and return them on a daily standard-calendar time axis.

    Args:
        filepath (string) = path or glob pattern handed to xr.open_mfdataset
        datestart (string) = first date of the daily target axis
        dateend (string) = last date of the daily target axis

    Returns:
        The dataset after calendar conversion, nearest-neighbour interpolation
        onto the daily axis, time slicing and a final CF decode with cftime.
    """
    # open all files as a single dataset, decoding times to cftime objects
    ds = xr.open_mfdataset(filepath, coords="minimal", join="override",
                           decode_times=True, use_cftime=True)
    # move to the "standard" calendar, passing NaN as the `missing` fill value
    ds = ds.convert_calendar("standard", use_cftime=True, align_on="date", missing=np.nan)
    # daily cftime axis spanning the requested period
    daily_axis = xr.cftime_range(datestart, dateend, freq='D', calendar="standard")
    # put the data on the daily axis (nearest neighbour), then cut to the period
    ds = ds.interp(time=daily_axis, method="nearest")
    ds = ds.sel(time=slice(datestart, dateend))
    return xr.decode_cf(ds, use_cftime=True)

#def read_field(filepath, datestart, dateend,latmin,latmax,plev):
#    """Read in a time slice from datestart to dateend and calculate the zonal mean.
#    Try using datetime64 and if that doesn't work decode times manually.
#    Args:
#        filepath (string) = path to files e.g., "/path/to/files/*.nc"
#        datestart (string) = start date for time slice
#        dateend (string) = end date for time slice
#    """
#
#    try:
#        dat = xr.open_mfdataset(filepath, coords="minimal", join="override",
#                 decode_times=True, use_cftime=True).\
#                 sel(time=slice(datestart, dateend),lat=slice(latmin,latmax),plev=plev)
#
#    except:
#                
#        print("Something's wierd about the time axis, decoding manually")
#        dat = xr.open_mfdataset(filepath, coords="minimal", join="override",
#                   decode_times=False)
#
#        dat=xr.decode_cf(dat, use_cftime=True)
#        dat=dat.sel(time=slice(datestart, dateend),lat=slice(latmin,latmax),plev=plev)
#        datetimeindex=dat.indexes['time'].to_datetimeindex()
#        dat['time'] = datetimeindex
#
#    return dat

#def read_field(filepath, datestart, dateend,latmin,latmax,plev):
#    """Read in a time slice from datestart to dateend and calculate the zonal mean.
#    Try using datetime64 and if that doesn't work decode times manually.
#    Args:
#        filepath (string) = path to files e.g., "/path/to/files/*.nc"
#        datestart (string) = start date for time slice
#        dateend (string) = end date for time slice
#    """
#
#    try:
#        dat = xr.open_mfdataset(filepath, coords="minimal", join="override",
#                 decode_times=True, use_cftime=True)
#
#    except:
#                
#        print("Something's wierd about the time axis, decoding manually")
#        dat = xr.open_mfdataset(filepath, coords="minimal", join="override",
#                   decode_times=False)
#
#        dat=xr.decode_cf(dat, use_cftime=True)
#        datetimeindex=dat.indexes['time'].to_datetimeindex()
#        dat['time'] = datetimeindex
#        
#    dat=dat.sel(time=slice(datestart, dateend),lat=slice(latmin,latmax),plev=plev)
#    return dat

def read_field(filepath, datestart, dateend,latmin,latmax,lonmin,lonmax,plev):
    """Read a time/lat/lon/pressure-level subset of the files matched by filepath.

    The files are first opened with cftime-decoded times. If anything in that
    attempt (opening or subsetting) raises, the files are re-opened without
    time decoding, decoded manually with cftime, subset the same way, and the
    time index is finally converted to a pandas DatetimeIndex. Note that the
    two paths therefore return different time index types.

    Args:
        filepath (string) = path to files e.g., "/path/to/files/*.nc"
        datestart (string) = start date for time slice
        dateend (string) = end date for time slice
        latmin, latmax = bounds of the latitude slice
        lonmin, lonmax = bounds of the longitude slice (None leaves that side open)
        plev (list) = one level (nearest match within a tolerance of 1) or
            two levels [first, last] selected as slice(first+1, last-1)

    Returns:
        The subset dataset.
    """

    def _subset(ds):
        # time / lat / lon are label-based slices
        ds = ds.sel(time=slice(datestart, dateend),
                    lat=slice(latmin, latmax),
                    lon=slice(lonmin, lonmax))
        if len(plev) == 1:
            # avoid issue for models with inaccurate plevs
            return ds.sel(plev=plev, method="nearest", tolerance=1)
        # manual tolerance of 1 because method="nearest" is not implemented for slices
        return ds.sel(plev=slice(plev[0]+1, plev[1]-1))

    try:
        dat = _subset(xr.open_mfdataset(filepath, coords="minimal", join="override",
                                        decode_times=True, use_cftime=True))

    # `except Exception` (rather than a bare except) keeps the best-effort
    # fallback but lets KeyboardInterrupt / SystemExit propagate
    except Exception:
        print("Something's wierd about the time axis, decoding manually")
        dat = xr.open_mfdataset(filepath, coords="minimal", join="override",
                   decode_times=False)

        dat = xr.decode_cf(dat, use_cftime=True)
        dat = _subset(dat)

        # replace the cftime index by a pandas DatetimeIndex
        dat['time'] = dat.indexes['time'].to_datetimeindex()

    return dat


def get_lat_lon_res(ds):
    '''Return the mean spacing of the 'lat' and 'lon' coordinates of ds as (latres, lonres).

    The spacing is the mean of the differences between each coordinate value
    and its predecessor, which makes it usable for non regular model grids.
    '''
    def _mean_step(axis):
        # difference between every grid point and the one before it, averaged
        coord = ds.coords[axis]
        step = coord - coord.shift(**{axis: 1})
        return step.mean().to_numpy()

    latres = _mean_step('lat')
    lonres = _mean_step('lon')
    return latres, lonres

def def_domain(ncdf,min_lat,max_lat,min_lon,max_lon):
    """Crop ncdf to a lat/lon box using label-based slices on 'lat' and 'lon'."""
    box = {
        'lat': slice(min_lat, max_lat),
        'lon': slice(min_lon, max_lon),
    }
    return ncdf.loc[box]

def norm_lon(ncdf):
    """Return a copy of ncdf with 'lon' wrapped to [-180, 180) and sorted by longitude.

    The wrapped coordinate is attached with assign_coords, so the object passed
    in keeps its original longitudes (the previous version overwrote them in place).
    """
    wrapped = (ncdf.coords['lon'] + 180) % 360 - 180
    return ncdf.assign_coords(lon=wrapped).sortby('lon')

def get_ONDJFM_day(ncdf, months=[1,2,3,10,11,12],timedim="day"):
    """Keep only the entries along timedim whose calendar month is in months.

    The default month list is October to March.
    """
    # boolean mask along timedim: True where the month is one of `months`
    in_season = ncdf[timedim].dt.month.isin(months)
    return ncdf.isel({timedim: in_season})

In [3]:
# Work from the project scripts directory; the relative paths defined later
# (e.g. '../cmip6/...') resolve against it.
os.chdir('/home/lseverino/MT/scripts')
# Re-import the CASutils helper modules that were imported as `read` and `cal`.
importlib.reload(read)
importlib.reload(cal)
# Silence every warning from here on.
warnings.filterwarnings('ignore')


In [4]:
## constants
# root directory of each CMIP6 experiment
histpath = "/net/atmos/data/cmip6/historical/"
ssp119path = "/net/atmos/data/cmip6/ssp119/"
ssp126path = "/net/atmos/data/cmip6/ssp126/"
ssp245path = "/net/atmos/data/cmip6/ssp245/"
ssp370path = "/net/atmos/data/cmip6/ssp370/"
ssp585path = "/net/atmos/data/cmip6/ssp585/"

# scenarios in use (ssp119 is defined above but not part of the list)
scenlist = ["historical","ssp126","ssp245","ssp370","ssp585"]
pathdic = {
    "historical": histpath,
    "ssp126": ssp126path,
    "ssp245": ssp245path,
    "ssp370": ssp370path,
    "ssp585": ssp585path,
}
# same directories as a list, in scenlist order
pathlist = [pathdic[name] for name in scenlist]

# time-resolution sub-directory, variable name and output directory
tres = "Amon/"
var = "ua"
pathout = f"../cmip6/{var}/"

# model overview table
cmip6models = pd.read_csv('../cmip6csvinfo/cmip6csvinfo_timeseries_ssp585_luca_daily.csv')




In [5]:
### constants
# selected variable (cmip6 naming)
# NOTE(review): this is 'psl' while the paths cell sets var="ua", so file names built
# from basenamemet carry the 'SLP' tag -- confirm this is intended.
selvar = 'psl'
# abbreviations of the cmip6 variable names
cmip6vars = dict(sfcWindmax='SWM', sfcWind='SW', psl='SLP', ua='UA')

## preprocessing and objects constants
gst_fact = 1.67
qt = 0.98
min_lat, max_lat = 0, 90
min_lon, max_lon = 0, 357.5

## climada constants
haz_type = 'WS'
haz_id = 1

## naming
# each name component is a one-element list so components can be concatenated
metvar = [cmip6vars[selvar]]  # base (meteo) variable
spaceres = ["br"]  # base resolution regridded
timeres = ["mon"]
domain = ["NH"]
season = ["winE"]
processings = ["qt98","gst1-67","cutarea0","cal1"]
sep = "_"

# base name: variable_space_time_domain_season
lst_bn = [*metvar, *spaceres, *timeres, *domain, *season]
basenamemet = sep.join(lst_bn)
# processed name: processing tags followed by the base-name components
lst_bn_proc = [*processings, *metvar, *spaceres, *timeres, *domain, *season]
basenamemet_proc = sep.join(lst_bn_proc)





In [6]:
# count the entries found under every model directory, per scenario
nmems_hist = {}
#nmems_ssp119 = dict()
nmems_ssp126 = {}
nmems_ssp245 = {}
nmems_ssp370 = {}
nmems_ssp585 = {}

# rows are added per model directory name, one column per scenario
models_df = pd.DataFrame(columns=scenlist)
for ind,path in enumerate(pathlist):
    scen_col = scenlist[ind]
    vardir = path+tres+var
    for subdir in os.scandir(vardir):
        n_entries = len(os.listdir(subdir))
        models_df.loc[subdir.name,scen_col] = n_entries

# last row: number of models with a (non-missing) count in each scenario
models_df.loc["total",:] = models_df.count(axis=0)

In [7]:
# display the per-model counts (rows: model directories plus "total"; columns: scenarios)
models_df

Unnamed: 0,historical,ssp126,ssp245,ssp370,ssp585
AWI-CM-1-1-MR,5,1,1,5,1
BCC-CSM2-MR,3,1,1,1,1
BCC-ESM1,3,,,3,
CESM2-WACCM,3,1,5,3,5
CESM2,11,5,6,8,5
...,...,...,...,...,...
GISS-E2-2-H,5,,,,
ICON-ESM-LR,5,,,,
GISS-E2-2-G,11,5,5,5,5
UKESM1-1-LL,1,1,,1,


In [8]:
#select all models with a minimum number of members:
#  mods_3mem_allscen     -> at least 3 in every scenario of models_df
#  mods_1mem_allscen     -> at least 1 in every scenario of models_df
#  mods_1mem_hist_ssp585 -> at least 1 in both historical and ssp585
#The "total" summary row is removed by label instead of with .iloc[:-1,:]:
#if "total" itself fails the threshold it is already gone after dropna, and
#dropping "the last row" would then silently discard a real model.
mods_3mem_allscen = models_df.where(models_df>=3).dropna(how='any').drop(index="total", errors="ignore")
mods_1mem_allscen = models_df.where(models_df>=1).dropna(how='any').drop(index="total", errors="ignore")
mods_1mem_hist_ssp585= models_df[["historical","ssp585"]].where(models_df>=1).dropna(how='any').drop(index="total", errors="ignore")


In [9]:
# models kept by the historical+ssp585 selection that are not in the >=3-members-in-all-scenarios selection
mods_1mem_hist_ssp585.drop(index=mods_3mem_allscen.index)

Unnamed: 0,historical,ssp585
AWI-CM-1-1-MR,5,1
BCC-CSM2-MR,3,1
CESM2-WACCM,3,5
HadGEM3-GC31-LL,5,4
GFDL-CM4,1,1
CAMS-CSM1-0,3,2
E3SM-1-0,5,5
NESM3,5,2
GFDL-ESM4,3,1
NorESM2-LM,3,1


In [10]:
# absolute difference between the counts of each future scenario and the
# historical counts, with the rows taken in reversed models_df order
models_diff = pd.DataFrame()
for scen in scenlist[1:]:
    colname = f"{scen} - hist"
    member_gap = models_df.loc[::-1,scen] - models_df.loc[::-1,"historical"]
    models_diff.loc[:,colname] = member_gap.abs()

In [11]:
# populate a dict with model names and member names
# force to get the same members as for sfcWindmax
varSWM = 'sfcWindmax'
tresSWM = 'day/'
# dicscen[scenario][model] = [number of entries, list of entry names]
dicscen = dict()
for ind,scen in enumerate(scenlist):
    path = pathlist[ind]
    dicscen[scen] = dict()
    # the context manager closes the scandir iterator once the loop is done
    with os.scandir(path+tresSWM+varSWM) as model_dirs:
        for subdir in model_dirs:
            # list the directory once so the count and the names always agree
            memnames = os.listdir(subdir)
            dicscen[scen][subdir.name] = [len(memnames), memnames]
nmems_hist = dicscen['historical']
nmems_ssp585 = dicscen['ssp585']

In [12]:
# DataFrame view of the per-model dicts: one column per model,
# first row = number of members, second row ("memnames") = list of member names
nmems_hist_df = pd.DataFrame(data=nmems_hist, index=["hist", "memnames"])
nmems_ssp585_df = pd.DataFrame(data=nmems_ssp585, index=["ssp585", "memnames"])

In [13]:
# keep only the models present in both tables: align each table on the other
# one's columns, then drop the columns that contain missing values
nmems_hist_com = nmems_hist_df.reindex(columns=nmems_ssp585_df.columns).dropna(axis="columns")
nmems_ssp585_com = nmems_ssp585_df.reindex(columns=nmems_hist_df.columns).dropna(axis="columns")

In [14]:
#consider ONDJFM: both periods start on 1 October and end on 30 March, so they have a comparable number of days
# dates for Past period, only takes 30th
ybegp, monbegp, daybegp = 1980, 10, 1
yendp, monendp, dayendp = 2010, 3, 30
#ybegf = 2070 ; monbegf = 1 ; yendf = 2099 ; monendf = 12 ; daybegf = 1 ; dayendf = 31# dates for Future period
# dates for Future period (otherwise dont have the same length)
ybegf, monbegf, daybegf = 2070, 10, 1
yendf, monendf, dayendf = 2100, 3, 30

# total number of months (used for checking):
# full years in between + months of the first year + months of the last year
nmonthsp = (yendp-ybegp-1)*12 + (12-monbegp+1) + monendp
nmonthsf = (yendf-ybegf-1)*12 + (12-monbegf+1) + monendf

# set up date names
dateformat = '%Y-%m-%d'

datebegp = f"{ybegp}-{monbegp:02d}-{daybegp:02d}"
dateendp = f"{yendp}-{monendp:02d}-{dayendp:02d}"
datebegf = f"{ybegf}-{monbegf:02d}-{daybegf:02d}"
dateendf = f"{yendf}-{monendf:02d}-{dayendf:02d}"

#set up daterange indexes (daily, standard calendar, as cftime objects)
#daysidp = pd.date_range(datebegp,dateendp,freq='D')
#daysidf = pd.date_range(datebegf,dateendf,freq='D')
daysidp = xr.cftime_range(datebegp, dateendp, freq='D', calendar='standard')
daysidf = xr.cftime_range(datebegf, dateendf, freq='D', calendar='standard')

#nb of days
ndaysp = len(daysidp)
ndaysf = len(daysidf)

# 1-based day counters for both periods
dayrangep = np.arange(1, ndaysp+1)
dayrangef = np.arange(1, ndaysf+1)

# output grid: 73 latitudes from -90 to 90 and 144 longitudes from 0 to 357.5
latout = np.linspace(-90, 90, 73)
lonout = np.linspace(0, 357.5, 144)
#lonout=np.linspace(0,360,144) # try to remove issues at border

# plevuse=[100000,92500,85000,70000,60000,50000,40000,30000,25000,20000,15000,10000,
#        7000,5000,3000,2000,1000]
# plevuse=[1000]



In [15]:
## select models
# candidate pools derived from the member-count tables
modlist_3mem = mods_3mem_allscen.index.tolist()
modlist_1mem_sel = mods_1mem_hist_ssp585.drop(index=mods_3mem_allscen.index)
#modlist_1mem_sel = modlist_1mem_sel.drop(index=['HadGEM3-GC31-MM','HadGEM3-GC31-LL'])

# models listed in model_dl are excluded from the run list
model_dl = ['AWI-CM-1-1-MR']
modlist = [candidate for candidate in ['ACCESS-ESM1-5'] if candidate not in model_dl]

# report how many members each selected model has
for model in modlist:
    n_hist = nmems_hist[model][0]
    n_ssp585 = dicscen['ssp585'][model][0]
    print(f"Members, historical: {n_hist}\nMembers, rcp85: ", str(n_ssp585))
models = pd.Series(modlist)

Members, historical: 40
Members, rcp85:  40


In [16]:
##select scenarios, variable and time-resolution directory
selscen = ['historical', 'ssp585']
var = "ua"
tres = "Amon/"

# regridding switch, pressure level(s) passed to read_field, target grid spacing
regrid = True
plev = [85000]
latrg = 2.5
lonrg = 2.5

#define domain
latmin = 0
latmax = 90
lonmin = 0
lonmax = 357.5
#nmems_hist = dicscen['historical']
#nmems_fut = dicscen['ssp585']

#select maximum of members to use
nmem_max = 1
# column index over (scenario, member number)
itercols = [selscen, range(nmem_max)]
col_idx = pd.MultiIndex.from_product(itercols, names=["scen", "imem"])

#get member names (two header rows, first column used as the index)
memname_df = pd.read_csv('/home/lseverino/MT/metadata/memnames_ssp585_hist_SWM.csv',
                         header=[0, 1], index_col=0)

In [17]:
# rebuild the (scenario, member) column index with the member numbers as strings
nmem_max = 1
rangestr = list(map(str, range(nmem_max)))
itercols = [selscen, rangestr]
col_idx = pd.MultiIndex.from_product(itercols, names=["scen", "imem"])

In [33]:
##load ua at 850hPa, regrid to 2.5x2.5 
# For every model and scenario: read each member, average over 'plev', interpolate onto
# the output grid, stack the members, wrap the longitudes, keep the ONDJFM months and
# write the result to one netCDF file per model.

nmods = len(models)
modout=np.arange(0,nmods)


def _read_member_field(scenpath, modname, memname, datebeg, dateend):
    """Read `var` for one member via read_field.

    Uses the first directory matching <scenpath><tres><var>/<modname>/<memname>/*/ and
    all its *.nc files; if read_field raises ValueError, only the first file is read.
    """
    memdir = glob(scenpath+tres+var+"/"+modname+"/"+memname+"/*/")[0]
    try:
        return read_field(memdir+"*.nc", datebeg, dateend, latmin, latmax, None, None, plev)
    except ValueError:
        #assume first file contains what we need
        fpath = glob(scenpath+tres+var+"/"+modname+"/"+memname+"/*//*.nc")[0]
        print("Look into: "+fpath)
        return read_field(fpath, datebeg, dateend, latmin, latmax, None, None, plev)


# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for index, modname in models.items():
    start_time = timer()

    for scen in selscen:
        scenpath = pathdic[scen]
        memnames_mod = memname_df.loc[modname,scen]
        nmems = len(memnames_mod)
        memout=np.arange(0,nmems)

        # historical uses the past period, every other scenario the future period
        if scen == 'historical':
            datebeg = datebegp
            dateend = dateendp
            timename = 'timep'
        else:
            datebeg = datebegf
            dateend = dateendf
            timename = 'timef'

        for imem in range(nmems):
            memname = memname_df.loc[modname,(scen,str(imem))]
            print("Processing "+scen+" for "+modname+" "+memname+"...")

            field = _read_member_field(scenpath, modname, memname, datebeg, dateend)

            #take mean over pressure
            utemp = field[var].mean(dim='plev')

            #initiate array if first member
            if imem==0:
                #get lat, lon, latres and lonres
                lat = utemp.lat
                lon = utemp.lon
                min_lat=lat[0].to_numpy()
                max_lat=lat[-1].to_numpy()
                min_lon=lon[0].to_numpy()
                max_lon=lon[-1].to_numpy()
                latres, lonres = get_lat_lon_res(utemp)
                if regrid:
                    latres = latrg
                    lonres = lonrg
                else:
                    latres = latres.round(4)
                    lonres = lonres.round(4)

                # output grid starts at the first data point; np.arange excludes the upper bound
                latout = np.arange(min_lat,max_lat,latres)
                lonout = np.arange(min_lon,max_lon,lonres)
                daysid = utemp.time
                ndays = len(daysid)
                uzmem = xr.DataArray(np.zeros([ ndays, latout.size, lonout.size, nmems]), 
                                      coords=[daysid, latout, lonout, memout],dims=['time','lat','lon','member'], name=scen)

            # check the size: while the member's time axis does not match the first
            # member's, try the replacement members numbered imem+nmem_max, +1, ...
            # The previous version tested an undefined `u`, never refreshed the array
            # being tested and reset `inc` on every pass, so it could not terminate.
            # When no replacement column exists the lookup raises KeyError and stops the run.
            inc = imem+nmem_max
            while (utemp.time.size !=  ndays):
                print("something's wrong, ndaysf="+str(ndays)+" but u has size "+str(utemp.time.size))
                memname = memname_df.loc[modname,(scen,str(inc))]#try next member
                print("Processing "+scen+" for "+modname+" "+memname+"...")

                field = _read_member_field(scenpath, modname, memname, datebeg, dateend)
                utemp = field[var].mean(dim='plev')
                inc = inc+1

            uinterp = utemp.interp(lat=latout,lon=lonout, method='linear',kwargs={"fill_value": None})
            uinterp = uinterp.rename({'time': 'time_o'}) #rename to avoid conflicts between indexing and indexed objects

            uzmem.loc[dict(member=imem)]=uinterp

        # normalize longitude
        uzmemn = norm_lon(uzmem)

        #interpolate nans at 0deg longitude
        if np.any(np.isnan(uzmemn.dropna(dim="time",how='all'))):
            print('interpolating nans...')
            uzmemn = uzmemn.interpolate_na(dim="lon", method="linear")

        #set time indexes (1-based counter stored as coordinate "day" along time)
        dayrange = np.arange(1,ndays+1,1)
        uzmemn = uzmemn.assign_coords({"day":("time",dayrange)})

        #select winter months
        uzmemn_winE = get_ONDJFM_day(uzmemn,timedim="time")

        #swap dims to assemble files in the same dataset
        uzmemn_winE = uzmemn_winE.swap_dims({"time":"day"})

        #set time index as a data variable, named after the period
        uzmemn_winE = uzmemn_winE.reset_coords()
        uzmemn_winE = uzmemn_winE.rename(time=timename)

        #save to netcdf: append if the file exists, otherwise create it.
        #An explicit existence check replaces the bare try/except, which rewrote the
        #whole file (discarding what it already held) whenever appending failed.
        outfile = pathout+modname+'_'+basenamemet+".nc"
        write_mode = "a" if os.path.exists(outfile) else "w"
        uzmemn_winE.to_netcdf(path=outfile, mode=write_mode)

        # seconds elapsed since this model was started
        time_delta_fut = timer() - start_time
        print(time_delta_fut)


Processing historical for ACCESS-ESM1-5 r1i1p1f1...
Processing historical for ACCESS-ESM1-5 r3i1p1f1...
Processing historical for ACCESS-ESM1-5 r2i1p1f1...
interpolating nans...
203.69671250879765
Processing ssp585 for ACCESS-ESM1-5 r1i1p1f1...
Something's wierd about the time axis, decoding manually
Look into: /net/atmos/data/cmip6/ssp585/Amon/ua/ACCESS-ESM1-5/r1i1p1f1/gn/ua_Amon_ACCESS-ESM1-5_ssp585_r1i1p1f1_gn_201501-210012.nc
Processing ssp585 for ACCESS-ESM1-5 r3i1p1f1...
Something's wierd about the time axis, decoding manually
Look into: /net/atmos/data/cmip6/ssp585/Amon/ua/ACCESS-ESM1-5/r3i1p1f1/gn/ua_Amon_ACCESS-ESM1-5_ssp585_r3i1p1f1_gn_201501-210012.nc
Processing ssp585 for ACCESS-ESM1-5 r2i1p1f1...
Something's wierd about the time axis, decoding manually
Look into: /net/atmos/data/cmip6/ssp585/Amon/ua/ACCESS-ESM1-5/r2i1p1f1/gn/ua_Amon_ACCESS-ESM1-5_ssp585_r2i1p1f1_gn_201501-210012.nc
interpolating nans...
507.1749307997525


# Keep references to earlier results under "_mri" names.
# NOTE(review): `uzmemf` is not assigned in any of the visible cells -- presumably left over
# from an earlier kernel session; confirm this survives Restart Kernel -> Run All.
uzmemf_mri = uzmemf
utemp_mri = utemp
uinterp_mri = uinterp