# Get cmip6 data
## Utilities to get the pathnames from the CMIP analysis platform
### Approach: For a given experiment and variable, find every model that provides it, together with all of its path names.
### Then compare the resulting model lists across experiments.
Naming is consistent with the [CMIP6 archive](https://esgf-node.llnl.gov/search/cmip6/).

In [30]:
import sys
import os
import datetime

In [2]:
def _has_netcdf(directory):
    """Return True if `directory` can be listed and directly contains a '.nc' file."""
    try:
        entries = os.listdir(directory)
    except OSError:
        # missing folder, plain file or unreadable directory: not a usable version
        return False
    return any(entry.endswith('.nc') for entry in entries)


def _version_date(name):
    """Parse a 'vYYYYMMDD' folder name; return None if it is not of that form."""
    try:
        return datetime.datetime.strptime(name[1:], '%Y%m%d')
    except ValueError:
        return None


def getendpath(pathtovar,var):
    """Find the "/gridlabel/version/" suffix that leads to the netcdf files.

    Expected layout:
    basedir/institution/model/experiment/variant/Amon/variable/gridlabel/version[/variable]
    where version is 'latest' or 'vYYYYMMDD' (other folders, e.g. 'files', may exist too).

    Parameters
    ----------
    pathtovar : str
        Path of the variable folder (.../variant/Amon/variable).
    var : str
        Variable name; some models keep the files in one more folder with this name.

    Returns
    -------
    (theendpath, extrabit)
        theendpath is "/gridlabel/version/", or an empty string when no version
        folder with netcdf files could be selected.
        extrabit is True when the netcdf files were found in version/var rather
        than directly in version.

    Raises
    ------
    OSError
        If pathtovar or the selected grid label folder cannot be listed.
    """
    # the first level is the grid label; os.listdir order is arbitrary, so sort to
    # make the choice reproducible, and ignore stray files
    gridlabels = [name for name in sorted(os.listdir(pathtovar))
                  if os.path.isdir(pathtovar+"/"+name)]
    if not gridlabels:
        return "", False
    gridlabel = gridlabels[0]
    gridpath = pathtovar+"/"+gridlabel

    # the next level is the version; keep only the folders that hold netcdf files
    possible_versions = os.listdir(gridpath)
    versions = [aversion for aversion in possible_versions
                if _has_netcdf(gridpath+"/"+aversion)]
    extrabit = False
    if not versions:
        # try again with the variable name appended. models other than CESM have this.
        versions = [aversion for aversion in possible_versions
                    if _has_netcdf(gridpath+"/"+aversion+"/"+var)]
        extrabit = bool(versions)

    if not versions:
        # nothing made the cut
        return "", extrabit

    if len(versions)==1:
        # if there's only one folder, this is easy
        chosen = versions[0]
    elif 'latest' in versions:
        # several candidates: prefer 'latest' when it exists
        chosen = 'latest'
    else:
        # if there's no latest, there should be folders of the form 'vYYYYMMDD'
        datevs = [item for item in versions if item.startswith('v')]
        if len(datevs)==1:
            # there was only one properly formatted folder - get it
            chosen = datevs[0]
        else:
            # several dated versions: take the most recent one. only the 'v' folders
            # are parsed, and the real folder name is kept rather than re-formatted
            dated = []
            for item in datevs:
                parsed = _version_date(item)
                if parsed is not None:
                    dated.append((parsed, item))
            if not dated:
                # something went wrong; give up
                return "", extrabit
            chosen = max(dated)[1]
    return "/"+gridlabel+"/"+chosen+"/", extrabit

# Demo: resolve the grid label / version suffix for one CESM2 variable folder
institution='NCAR'
model='CESM2'
basedir = '/glade/collections/cmip/CMIP6/CMIP/'
thispath = basedir+"/".join([institution,model,"abrupt-4xCO2","r1i1p1f1","Amon","ts"])
thisendpath,extrabit = getendpath(thispath,"ts")
# last expression of the cell: shows the full path to the data folder
thispath+thisendpath


'/glade/collections/cmip/CMIP6/CMIP/NCAR/CESM2/abrupt-4xCO2/r1i1p1f1/Amon/ts/gn/latest/'

In [3]:
def getvariants(pathtoexp,var):
    """List the variants (ensemble members) of one experiment that have data for var.

    Parameters
    ----------
    pathtoexp : str
        Path of the experiment folder (basedir/institution/model/experiment/).
    var : str
        Variable name, looked up under <variant>/Amon/<var>.

    Returns
    -------
    (variants, endpaths, extrabit)
        variants : names of the variant folders for which getendpath found data.
        endpaths : the matching end paths, in the same order as variants.
        extrabit : flag returned by getendpath for the last variant that was kept
            (False when none was kept).

    Variants whose folders cannot be read are reported with a printed message
    and skipped.
    """
    variants = []
    endpaths = []
    extrabit = False
    for variant in os.listdir(pathtoexp):
        thispath = os.path.join(pathtoexp, variant, "Amon", var)
        try:
            # check that there is actually a path for this particular variable in this experiment
            os.listdir(thispath)
            # getting the end path is not trivial; it includes the grid label and finding the latest version
            thisendpath,thisextrabit = getendpath(thispath,var)
        except (OSError, IndexError, ValueError):
            # in case there is not actually any data for this model + variant / ensemble member
            print(pathtoexp+" "+variant+" is missing")
            continue
        # if this all worked, then add the variant / ensemble member, and its path info, to the list of existing ones
        if len(thisendpath)>0:
            endpaths.append(thisendpath)
            variants.append(variant)
            # only a kept variant may set the flag, so it always describes a returned path
            extrabit = thisextrabit
    return variants,endpaths,extrabit
    
# List the ensemble members that have data for one model + experiment + variable
institution='NCAR'
model='CESM2'
basedir = '/glade/collections/cmip/CMIP6/CMIP/'

thevariants,theendpaths,extrabit = getvariants("".join([basedir,institution,"/",model,"/abrupt-4xCO2/"]),"ts")

In [4]:
class ModelExpVar:
    """Paths and file names of one variable, for one model and one experiment.

    The constructor stores its arguments and then derives:
      nmem  - number of variants
      paths - one data folder per variant, built as
              startpath/experiment/variant/Amon/variable + endpath [+ variable/]
      files - the os.listdir() result for each of those folders
    """

    def __init__(self, model, experiment, variable,startpath,variants,endpaths,extraVarFolder):
        # values taken as given
        self.model = model
        self.experiment = experiment
        self.variable = variable
        self.startpath = startpath
        self.variants = variants
        self.endpaths = endpaths
        self.extraVarFolder = extraVarFolder    # extra folder with variable name
        # everything below is derived
        self.nmem = len(variants)
        member_dirs = []
        member_files = []
        for idx, variant in enumerate(variants):
            directory = startpath+"/"+experiment+"/"+variant+"/Amon/"+variable+endpaths[idx]
            if extraVarFolder:
                # the files sit one level deeper, in a folder named after the variable
                directory += variable+"/"
            member_dirs.append(directory)
            member_files.append(os.listdir(directory))
        self.paths = member_dirs
        self.files = member_files

In [5]:
def getModelsForExperimentVar(experiment,var,basedir='/glade/collections/cmip/CMIP6/CMIP/'):
    """Collect every model under basedir that has data for one experiment and variable.

    Parameters
    ----------
    experiment : str
        Experiment folder name, e.g. "abrupt-4xCO2" or "piControl".
    var : str
        Variable name, e.g. "ts".
    basedir : str, optional
        Root folder laid out as basedir/institution/model/experiment/...

    Returns
    -------
    dict
        Maps model name to a ModelExpVar. Keys are model names only, so a model
        name found under two institutions keeps the entry created last.

    A model that has the experiment folder but no usable variant is reported
    with a printed message and left out.
    """
    evd_ofmodels = {}
    for institution in os.listdir(basedir):
        instpath = os.path.join(basedir, institution)
        if not os.path.isdir(instpath):
            # stray file next to the institution folders
            continue
        for model in os.listdir(instpath):
            modelpath = os.path.join(instpath, model)
            exppath = os.path.join(modelpath, experiment)
            if not os.path.isdir(exppath):
                # this model did not run the experiment
                continue
            # get ensemble members with data and also their end paths
            # (the trailing separator is part of the path handed to getvariants)
            variants,endpaths,extrabit = getvariants(os.path.join(exppath, ""),var)
            if len(variants)==0:
                print("no "+experiment+" data for "+model)
            else:
                # use the requested variable, not a fixed one
                evd_ofmodels[model] = ModelExpVar(model,experiment,var,modelpath,variants,endpaths,extrabit)

    return evd_ofmodels

In [9]:
# Variable and experiment to look up
var = "ts"
experiment = "abrupt-4xCO2"
# One dictionary per experiment, filled by getModelsForExperimentVar
ts4x_models = getModelsForExperimentVar(experiment,var)
tsPi_models = getModelsForExperimentVar("piControl",var)
# Note: a few more models provide tas than ts

/glade/collections/cmip/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/abrupt-4xCO2/ r1i1p1f1 is missing
no abrupt-4xCO2 data for GFDL-CM4
/glade/collections/cmip/CMIP6/CMIP/CCCma/CanESM5/abrupt-4xCO2/ r1i1p1f1 is missing
no abrupt-4xCO2 data for CanESM5
/glade/collections/cmip/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/piControl/ r1i1p1f1 is missing
no piControl data for GFDL-CM4
/glade/collections/cmip/CMIP6/CMIP/CCCma/CanESM5/piControl/ r1i1p1f1 is missing
no piControl data for CanESM5
/glade/collections/cmip/CMIP6/CMIP/EC-Earth-Consortium/EC-Earth3-LR/piControl/ r1i1p1f1 is missing
no piControl data for EC-Earth3-LR


In [10]:
# For each abrupt-4xCO2 model: print its first variant, then the number of
# entries listed in that variant's data folder
for k,v in ts4x_models.items():
    print(k, v.variants[0])
    print(k, len(v.files[0]))

CNRM-CM6-1 r3i1p1f2
CNRM-CM6-1 1
CNRM-ESM2-1 r3i1p1f2
CNRM-ESM2-1 1
BCC-CSM2-MR r1i1p1f1
BCC-CSM2-MR 1
CESM2-WACCM r1i1p1f1
CESM2-WACCM 3
CESM2 r1i1p1f1
CESM2 1
UKESM1-0-LL r1i1p1f2
UKESM1-0-LL 2
GISS-E2-1-H r1i1p1f1
GISS-E2-1-H 3
GISS-E2-1-G r1i1p1f1
GISS-E2-1-G 6
MRI-ESM2-0 r2i1p1f1
MRI-ESM2-0 1
MIROC6 r1i1p1f1
MIROC6 2
IPSL-CM6A-LR r2i1p1f1
IPSL-CM6A-LR 1


In [11]:
# Model names (dictionary keys) found for abrupt-4xCO2
list(ts4x_models)

['CNRM-CM6-1',
 'CNRM-ESM2-1',
 'BCC-CSM2-MR',
 'CESM2-WACCM',
 'CESM2',
 'UKESM1-0-LL',
 'GISS-E2-1-H',
 'GISS-E2-1-G',
 'MRI-ESM2-0',
 'MIROC6',
 'IPSL-CM6A-LR']

In [12]:
# Model names (dictionary keys) found for piControl
list(tsPi_models)

['CNRM-CM6-1',
 'CNRM-ESM2-1',
 'BCC-ESM1',
 'BCC-CSM2-MR',
 'CESM2-WACCM',
 'CESM2',
 'UKESM1-0-LL',
 'GISS-E2-1-H',
 'GISS-E2-1-G',
 'E3SM-1-0',
 'MRI-ESM2-0',
 'MIROC6',
 'IPSL-CM6A-LR']

In [13]:
# Models that appear in both dictionaries
ts4xPi = set(ts4x_models) & set(tsPi_models)
ts4xPi

{'BCC-CSM2-MR',
 'CESM2',
 'CESM2-WACCM',
 'CNRM-CM6-1',
 'CNRM-ESM2-1',
 'GISS-E2-1-G',
 'GISS-E2-1-H',
 'IPSL-CM6A-LR',
 'MIROC6',
 'MRI-ESM2-0',
 'UKESM1-0-LL'}

In [14]:
# Models in ts4x_models that are not in tsPi_models
set(ts4x_models) - set(tsPi_models)

set()

In [18]:
# The symmetric difference of two sets holds the elements that are in exactly one of them:
# here, the models that are keys of only one of the two dictionaries.
missingmodel = set(list(tsPi_models)).symmetric_difference(list(ts4x_models))
missingmodel

{'BCC-ESM1', 'E3SM-1-0'}

In [28]:
# Remove every model listed in missingmodel from both dictionaries.
# A blank line is printed for the dictionary that did not contain the model.
for model in missingmodel:
    print(model)
    try:
        tsPi_models.pop(model)
    except KeyError:
        # only a missing key is expected here; anything else should surface
        print("")
    try:
        ts4x_models.pop(model)
    except KeyError:
        print("")

E3SM-1-0


BCC-ESM1



In [29]:
# True when both dictionaries have exactly the same keys (union equals intersection)
(set(ts4x_models) | set(tsPi_models)) == (set(ts4x_models) & set(tsPi_models))

True

In [32]:
def keepModelsWithAllData(modeldict1,modeldict2):
    """Remove from both dictionaries every key that is not present in both.

    The dictionaries are modified in place and also returned, so afterwards
    they have identical key sets.

    For each removed key the key is printed, followed by an empty line for
    each dictionary that did not contain it.

    Parameters
    ----------
    modeldict1, modeldict2 : dict
        Dictionaries keyed by model name.

    Returns
    -------
    (modeldict1, modeldict2)
        The same two objects that were passed in.
    """
    # keys that are in exactly one of the two dictionaries
    missingmodel = set(modeldict1) ^ set(modeldict2)
    for model in missingmodel:
        print(model)
        for modeldict in (modeldict1, modeldict2):
            if model in modeldict:
                del modeldict[model]
            else:
                # explicit membership test instead of a catch-all except around pop()
                print("")
    return modeldict1,modeldict2

# Rebuild both dictionaries, then keep only the models present in both
experiment = "abrupt-4xCO2"
ts4x_models = getModelsForExperimentVar(experiment,var)
tsPi_models = getModelsForExperimentVar("piControl",var)
ts4x_models,tsPi_models = keepModelsWithAllData(ts4x_models,tsPi_models)

/glade/collections/cmip/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/abrupt-4xCO2/ r1i1p1f1 is missing
no abrupt-4xCO2 data for GFDL-CM4
/glade/collections/cmip/CMIP6/CMIP/CCCma/CanESM5/abrupt-4xCO2/ r1i1p1f1 is missing
no abrupt-4xCO2 data for CanESM5
/glade/collections/cmip/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/piControl/ r1i1p1f1 is missing
no piControl data for GFDL-CM4
/glade/collections/cmip/CMIP6/CMIP/CCCma/CanESM5/piControl/ r1i1p1f1 is missing
no piControl data for CanESM5
/glade/collections/cmip/CMIP6/CMIP/EC-Earth-Consortium/EC-Earth3-LR/piControl/ r1i1p1f1 is missing
no piControl data for EC-Earth3-LR
E3SM-1-0

BCC-ESM1



In [33]:
# True when both dictionaries have exactly the same keys (union equals intersection)
(set(ts4x_models) | set(tsPi_models)) == (set(ts4x_models) & set(tsPi_models))

True