### dreq_content

In [1]:
# See https://github.com/WCRP-CMIP/CMIP7_DReq_Software/pull/1
import dreq_content as dc

In [2]:
# Retrieve the "first_export" version of the data request content.
# The call returns a dict mapping the version to the path of the local JSON file.
# Other version specifiers that were tried out (left here for reference):
# dc.retrieve("all")
# dc.retrieve("latest")
# dc.retrieve("latest_stable")
# dc.retrieve("1.0.0b1")
# dc.retrieve("dev")
dc.retrieve("first_export")

{'first_export': '/home/ms/GIT/c7dreq/cmip7_request_scripts/dreq_api/dreq_res/first_export/dreq_raw_export.json'}

In [3]:
# Load the content of "first_export" and list its top-level keys
# (presumably one key per exported base — TODO confirm).
dreq = dc.load("first_export")
dreq.keys()

dict_keys(['Data Request Opportunities (Public)', 'Data Request Physical Parameters (Public)', 'Data Request Variables (Public)'])

In [4]:
# List the available versions (presumably the ones published remotely — TODO confirm).
dc.get_versions()

['dev']

In [5]:
# List the versions for which a local copy exists.
dc.get_versions(local=True)

['1.0.0a', 'first_export']

In [6]:
# Delete the local copies of all versions, but keep the latest one.
dc.delete(version="all", keep_latest=True)

Deleting the following version(s):
first_export


In [7]:
# Check which versions are still present locally after the deletion.
dc.get_versions(local=True)

['1.0.0a']

### Adaptation of Matt's example

https://gist.github.com/matthew-mizielinski/c20200a76e38244953803197d4ad7f85

In [8]:
# imports
from collections import defaultdict
import json
from dataclasses import dataclass
import time

In [9]:
# Define data classes for each object type.
# Have only pulled out a very limited set of objects and their parameters as a demonstration

@dataclass
class Experiment:
    """
    One experiment record of the data request.
    """
    record_id: str           # key of the record in the raw export
    experiment_name: str     # experiment name, surrounding whitespace removed
    mips: list               # record ids of the MIPs linked to the experiment
    experiment_groups: list  # record ids of the experiment groups containing the experiment

@dataclass
class MIP:
    """
    One MIP (model intercomparison project) record of the data request.
    """
    record_id: str         # key of the record in the raw export
    title: str             # long name of the MIP
    label: str             # short name of the MIP
    website: str           # website as given in the export (a plain string, may be a placeholder such as "tba")
    variable_groups: list  # record ids of the variable groups linked to the MIP

@dataclass
class ExperimentGroup:
    """
    One experiment group record of the data request.
    """
    record_id: str       # key of the record in the raw export
    experiments: list    # content of the 'Experiments' field (presumably experiment record ids — TODO confirm)
    name: str            # content of the 'Name' field
    opportunities: list  # record ids of the opportunities linked to the group


@dataclass
class Opportunity:
    """
    One opportunity record of the data request.
    """
    record_id: str           # key of the record in the raw export
    description: str         # content of the 'Description' field
    experiment_groups: list  # content of the 'Experiment Groups' field (presumably record ids — TODO confirm)
    variable_groups: list    # record ids of the variable groups linked to the opportunity


@dataclass
class VariableGroup:
    """
    One variable group record of the data request.
    """
    record_id: str   # key of the record in the raw export
    title: str       # content of the 'Title' field
    variables: list  # record ids of the variables contained in the group
    priority: str    # content of the 'Priority Level' field


@dataclass
class Variable:
    """
    One variable record of the data request.
    """
    record_id: str      # id of the matching record in the 'Variables' table of the opportunities base
    variable_id: str    # key of the record in the 'Variable' table of the variables base
    standard_name: str  # content of the 'CF Standard Name (from MIP Variables)' field
    frequency: str      # first element of the 'Frequency' field
    name: str           # content of the 'Compound Name' field

In [10]:
# build dictionaries allowing Data request network to be navigated

def map_record_id(record, records, key):
    """
    Identify the id of the one entry in `records` that has the same value
    under `key` as `record`.

    record: dict holding the value to look for under `key`
    records: dict mapping record ids to record dicts
    key: name of the field used for the comparison

    Returns the id of the single matching entry.

    Raises KeyError if `record` has no `key`, if no entry matches or if
    more than one entry matches.
    """
    recval = record[key]
    # Entries without `key` simply cannot match; they must not abort the
    # lookup for all other entries.
    matches = [rid for rid, candidate in records.items()
               if key in candidate and candidate[key] == recval]
    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise KeyError(f"No match for {key}={recval!r}.")
    raise KeyError(f"Multiple matches for {key}={recval!r}: {matches}")

def _load_table(records, make_object, failure_message, kind, object_dict, structured_dict):
    """
    Convert every raw record of one table into an object and register it.

    records: dict mapping record ids to raw entries of one table
    make_object: callable(record_id, entry) returning the object for an entry
    failure_message: text printed in front of the JSON dump of an entry that
        could not be converted
    kind: key under which the objects are stored in structured_dict
    object_dict, structured_dict: dictionaries that are filled in place

    Entries that cannot be converted are printed and skipped.
    """
    for record_id, entry in records.items():
        try:
            obj = make_object(record_id, entry)
        except Exception:
            # Best effort: incomplete entries are reported and skipped.
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the loop.
            print(failure_message, json.dumps(entry, indent=4))
            # NOTE(review): short pause kept from the original code; presumably
            # it keeps the printed reports readable — confirm if still needed.
            time.sleep(0.1)
            continue
        # Every object carries the id it is registered under as `record_id`.
        structured_dict[kind][obj.record_id] = obj
        object_dict[obj.record_id] = obj


def build_dictionaries(version):
    """
    Load the data request content of the given version and build dictionaries
    of data request objects.

    version: version identifier passed on to dc.load

    Returns a tuple (object_dict, structured_dict):
      object_dict maps record id -> object for all object types together
      structured_dict maps the object type ('experiment', 'experiment_group',
        'mip', 'opportunity', 'variable_group', 'variable') to a dict
        record id -> object

    Records that cannot be converted are printed and skipped.
    """
    # Dictionaries to be returned
    object_dict = {}
    structured_dict = defaultdict(dict)

    raw_data = dc.load(version)
    opportunities_base = raw_data['Data Request Opportunities (Public)']

    def make_variable(record_id, entry):
        # The variable groups refer to variables by the record ids of the
        # opportunities base, so map the entry of the variables base onto
        # the record of the opportunities base with the same compound name.
        record_id_match = map_record_id(entry,
                                        opportunities_base['Variables']['records'],
                                        key="Compound Name")
        return Variable(
            record_id=record_id_match,
            variable_id=record_id,
            standard_name=entry['CF Standard Name (from MIP Variables)'],
            frequency=entry['Frequency'][0],
            name=entry['Compound Name']
        )

    # load experiments
    _load_table(
        opportunities_base['Experiment']['records'],
        lambda record_id, entry: Experiment(
            record_id=record_id,
            experiment_name=entry[' Experiment'].strip(),
            mips=entry['MIP'],
            experiment_groups=entry['Experiment Group']),
        "Could not interpret experiment: ",
        'experiment', object_dict, structured_dict)

    # load experiment groups
    _load_table(
        opportunities_base['Experiment Group']['records'],
        lambda record_id, entry: ExperimentGroup(
            record_id=record_id,
            experiments=entry['Experiments'],
            name=entry['Name'],
            opportunities=entry['Opportunities']),
        "Could not interpret experiment_group:",
        'experiment_group', object_dict, structured_dict)

    # load MIPs
    _load_table(
        opportunities_base['MIP']['records'],
        lambda record_id, entry: MIP(
            record_id=record_id,
            title=entry['MIP Long Name'],
            label=entry['MIP Short Name'],
            variable_groups=entry['Variable Group'],
            website=entry['MIP website']),
        "Could not interpret MIP: ",
        'mip', object_dict, structured_dict)

    # load opportunities
    _load_table(
        opportunities_base['Opportunity']['records'],
        lambda record_id, entry: Opportunity(
            record_id=record_id,
            description=entry['Description'],
            experiment_groups=entry['Experiment Groups'],
            variable_groups=entry['Variable Groups']),
        "could not interpret opportunity",
        'opportunity', object_dict, structured_dict)

    # load variable groups
    _load_table(
        opportunities_base['Variable Group']['records'],
        lambda record_id, entry: VariableGroup(
            record_id=record_id,
            title=entry['Title'],
            variables=entry['Variables'],
            priority=entry['Priority Level']),
        "Could not interpret variable group",
        'variable_group', object_dict, structured_dict)

    # load variables
    _load_table(
        raw_data['Data Request Variables (Public)']['Variable']['records'],
        make_variable,
        "Could not interpret variable:",
        'variable', object_dict, structured_dict)

    # return dictionaries
    return object_dict, structured_dict

In [11]:
# Build the lookup dictionaries for "first_export".
# Records that cannot be interpreted are printed and left out of the dictionaries.
object_dict, structured_dict = build_dictionaries("first_export")

Downloading data from 'https://raw.githubusercontent.com/WCRP-CMIP/CMIP7_DReq_Content/first_export/airtable_export/dreq_raw_export.json' to file '/home/ms/GIT/c7dreq/cmip7_request_scripts/dreq_api/dreq_res/first_export/dreq_raw_export.json'.
SHA256 hash of downloaded file: feb809bb72007964bbdf287e5f0d62ab96cd373cd663baee1769f3066b74f9a3
Use this value as the 'known_hash' argument of 'pooch.retrieve' to ensure that the file hasn't changed if it is downloaded again in the future.


Could not interpret experiment:  {
    " Experiment": "spin-up",
    "MIP": [
        "recdznPjoiemHR2o1"
    ]
}
Could not interpret MIP:  {
    "MIP Short Name": "Other",
    "Variable Group": [
        "recRDRk1w22DH1ccr"
    ]
}
Could not interpret MIP:  {
    "MIP Long Name": "Ice Sheet Model Intercomparison Project",
    "MIP Short Name": "ISMIP6",
    "MIP website": "https://climate-cryosphere.org/about-ismip6/"
}
Could not interpret MIP:  {
    "MIP Long Name": "Irrigation Model Intercomparison Project",
    "MIP Short Name": "IRRMIP",
    "MIP website": "https://hydr.vub.be/projects/irrmip"
}
Could not interpret MIP:  {
    "MIP Long Name": "MethaneMIP: Investigating the near-term climate benefits of methane mitigation",
    "MIP Short Name": "MethaneMIP",
    "MIP website": "tba"
}
Could not interpret MIP:  {
    "MIP Long Name": "Perturbed Parameter Ensemble Model Inter-comparaison Project",
    "MIP Short Name": "PPEMIP",
    "MIP website": "To be determined"
}
Could not in

In [12]:
# dictionary to allow lookup of experiment record ids by experiment name
experiment_name_to_record_id = {
    expt.experiment_name: expt.record_id
    for expt in structured_dict['experiment'].values()
}

In [13]:
# Record id of the 'historical' experiment
experiment_name_to_record_id['historical']

'rec7mTVv08z3iBObw'

In [14]:
# Record id of the 'amip' experiment
experiment_name_to_record_id['amip']

'recR7YVoWYdbQp3wk'

In [15]:
# Walk data request network from experiment to variable list:

def walk_experiment_to_variables(experiment_id, object_dict):
    """
    Navigate the data request from an experiment to the record ids of the
    variables connected to it.

    The walk follows experiment -> experiment groups -> opportunities ->
    variable groups -> variables and prints the number of objects found at
    every step.

    experiment_id: record id of the experiment to start from
    object_dict: dict mapping record ids to data request objects

    Returns a list of variable record ids without duplicates (in no
    particular order).

    Raises KeyError if the experiment, one of its experiment groups or one
    of the connected variable groups is not in object_dict. Opportunities
    missing from object_dict are reported and skipped.
    """
    expt_groups = set(object_dict[experiment_id].experiment_groups)
    print("found {} connected experiment groups".format(len(expt_groups)))

    opportunities = set()
    for expt_group_id in expt_groups:
        opportunities.update(object_dict[expt_group_id].opportunities)

    print("found {} connected opportunities".format(len(opportunities)))

    variable_groups = set()
    for opportunity_id in opportunities:
        try:
            opportunity_object = object_dict[opportunity_id]
        except KeyError:
            print(f"Opportunity '{opportunity_id}' could not be interpreted or does not exist")
            # Skip it: without this the loop would go on with the object of
            # the previous iteration (or fail if there was none).
            continue
        variable_groups.update(opportunity_object.variable_groups)

    print("found {} connected variable groups".format(len(variable_groups)))

    variables = set()
    for variable_group_id in variable_groups:
        variables.update(object_dict[variable_group_id].variables)

    print("found {} connected variables".format(len(variables)))

    return list(variables)

In [16]:
# Collect the record ids of all variables connected to the 'historical' experiment
historical_variables = walk_experiment_to_variables(experiment_name_to_record_id['historical'],
                                                    object_dict)

found 6 connected experiment groups
found 71 connected opportunities
Opportunity 'recPEd5VYE2273wr4' could not be interpreted or does not exist
found 150 connected variable groups
found 1501 connected variables


In [17]:
# Record ids of the opportunities that have no variable groups assigned (there is one)
[
    record_id
    for record_id, record in dreq["Data Request Opportunities (Public)"]["Opportunity"]["records"].items()
    if "Variable Groups" not in record
]

['recPEd5VYE2273wr4']

In [18]:
# Inspect the raw record of the opportunity without variable groups
dreq["Data Request Opportunities (Public)"]["Opportunity"]["records"]['recPEd5VYE2273wr4']

{'Comments': ['rec65jWSjBcjny3D0'],
 'Cross-thematic group review': 'Accept (pending actions)',
 'Cross-thematic group review comments': 'Scientific proposition is fine.\xa0\nIncomplete -no variables provided and daily is high demand, not clear if necessary. But noted why would require daily.\xa0\n',
 'Description': "This is a set of daily variables that would be useful to assess the ability of Earth system models to reproduce the observed changes in plant phenology. The proper representation of the plant's active season impacts the accuracy of the Earth system models in representing energy, carbon, and water exchanges between land and atmosphere.\nFor this reason, the availability of these variables at the vegetation type level can increase our understanding of model limitations (Li et al., 2024, <https://doi.org/10.1175/JCLI-D-23-0179.1>) and their expected future changes.\n",
 'Earth system author team review': 'Accept',
 'Earth system review comments': "The variable group is provid

In [19]:
# Show first few variable record_ids.
# The list was built from a set, so its order carries no meaning.
historical_variables[:10]

['reccYHboFxdQGnneY',
 'recdhyCim2B2oM1uI',
 'rec2r2N1ePMePtGNE',
 'recavF7FRGoU3GyKy',
 'rec8dVXIVqkB6uA0W',
 'recxsAs5fxvzjrcbB',
 'reccOzAa5W2d8Obfe',
 'rec9sjfRKkI9J8jly',
 'recbIQTx79eAJnor5',
 'reciWYwS6jELgSnpP']

In [20]:
# Earlier problem: the record_ids of the variables did not match up in the json file used here.
# Update: mapping the record_ids between the bases seems to fix this,
# at least one of the variables is found in object_dict.
any(record_id in object_dict for record_id in historical_variables)

True

In [21]:
# The mismatch is no longer a problem, but still not every variable could be interpreted
all(record_id in object_dict for record_id in historical_variables)

False