In [1]:
# imports
from collections import defaultdict
import json
from dataclasses import dataclass

In [2]:
# Define data classes for each object type.
# Only a very limited set of objects and their parameters has been pulled out, as a demonstration

@dataclass
class Experiment:
    """One record of the 'Experiments' table, as populated by build_dictionaries."""
    # Key of the record in the table's 'records' mapping
    record_id: str
    # Value of the ' Experiment' field with surrounding whitespace stripped
    experiment_name: str
    # Value of the 'MIP' field (presumably record ids of MIPs; not used in this notebook -- TODO confirm)
    mips: list
    # Value of the 'Experiment Group' field; used as record ids of ExperimentGroup objects
    experiment_groups: list


@dataclass
class ExperimentGroup:
    """One record of the 'Experiment Group' table, as populated by build_dictionaries."""
    # Key of the record in the table's 'records' mapping
    record_id: str
    # Value of the 'Experiments' field (presumably record ids of Experiment objects -- TODO confirm)
    experiments: list
    # Value of the 'Name' field
    name: str
    # Value of the 'Opportunities' field; used as record ids of Opportunity objects
    opportunities: list


@dataclass
class Opportunity:
    """One record of the 'Opportunity' table, as populated by build_dictionaries."""
    # Key of the record in the table's 'records' mapping
    record_id: str
    # Value of the 'Description' field
    description: str
    # Value of the 'Experiment Groups' field (presumably record ids of ExperimentGroup objects -- TODO confirm)
    experiment_groups: list
    # Value of the 'Variable Groups' field; used as record ids of VariableGroup objects
    variable_groups: list


@dataclass
class VariableGroup:
    """One record of the 'Variable Group' table, as populated by build_dictionaries."""
    # Key of the record in the table's 'records' mapping
    record_id: str
    # Value of the 'Title' field
    title: str
    # Value of the 'Variables' field; collected as variable record ids when walking the network
    variables: list
    # Value of the 'Priority Level' field
    priority: str


@dataclass
class Variable:
    """One record of the 'Variables' table, as populated by build_dictionaries."""
    # Key of the record in the table's 'records' mapping
    record_id: str
    # Value of the 'CF Standard Name (from MIP Variables)' field
    standard_name: str
    # First element of the 'Frequency' field
    frequency: str
    # Value of the 'Compound Name' field
    name: str

In [3]:
# build dictionaries allowing Data request network to be navigated

def build_dictionaries(filename):
    """
    Load the JSON file `filename` and build dictionaries of data request objects.

    Five tables are read from the top-level JSON object -- 'Experiments',
    'Experiment Group', 'Opportunity', 'Variable Group' and 'Variables' --
    each of which must provide a 'records' mapping of record id -> entry.

    A record whose entry lacks an expected field (or holds one of an
    unexpected shape) is reported with a printed message and skipped: it is
    stored in neither returned dictionary, so a failed record can never be
    represented by the object built for a different record.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON dump to read.

    Returns
    -------
    object_dict : dict
        Every successfully built object keyed by record id, whatever its type.
    structured_dict : defaultdict(dict)
        The same objects grouped by kind ('experiment', 'experiment_group',
        'opportunity', 'variable_group', 'variable'); each kind maps
        record id -> object.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If one of the five tables, or its 'records' mapping, is absent.
    """
    # Dictionaries to be returned
    object_dict = {}
    structured_dict = defaultdict(dict)

    # JSON interchange text is UTF-8; do not depend on the platform default encoding
    with open(filename, encoding="utf-8") as fh:
        raw_data = json.load(fh)

    # Errors that mean "this entry does not have the expected fields/shape".
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    malformed_entry_errors = (KeyError, IndexError, TypeError, AttributeError)

    def load_table(table, kind, build, failure_message):
        """Build one object per record of `table`, skipping malformed entries."""
        for record_id, entry in raw_data[table]['records'].items():
            try:
                obj = build(record_id, entry)
            except malformed_entry_errors:
                print(failure_message, entry)
                # Skip the record: storing anything here would register a
                # stale (or undefined) object under this record id.
                continue
            structured_dict[kind][record_id] = obj
            object_dict[record_id] = obj

    # load experiments
    load_table(
        'Experiments', 'experiment',
        lambda record_id, entry: Experiment(
            record_id=record_id,
            # the field name in the dump carries a leading space
            experiment_name=entry[' Experiment'].strip(),
            mips=entry['MIP'],
            experiment_groups=entry['Experiment Group']),
        "Could not interpret experiment: ")

    # load experiment groups
    load_table(
        'Experiment Group', 'experiment_group',
        lambda record_id, entry: ExperimentGroup(
            record_id=record_id,
            experiments=entry['Experiments'],
            name=entry['Name'],
            opportunities=entry['Opportunities']),
        "Could not interpret experiment_group:")

    # load opportunities
    load_table(
        'Opportunity', 'opportunity',
        lambda record_id, entry: Opportunity(
            record_id=record_id,
            description=entry['Description'],
            experiment_groups=entry['Experiment Groups'],
            variable_groups=entry['Variable Groups']),
        "could not interpret opportunity")

    # load variable groups
    load_table(
        'Variable Group', 'variable_group',
        lambda record_id, entry: VariableGroup(
            record_id=record_id,
            title=entry['Title'],
            variables=entry['Variables'],
            priority=entry['Priority Level']),
        "Could not interpret variable group")

    # load variables
    load_table(
        'Variables', 'variable',
        lambda record_id, entry: Variable(
            record_id=record_id,
            standard_name=entry['CF Standard Name (from MIP Variables)'],
            # only the first listed frequency is kept
            frequency=entry['Frequency'][0],
            name=entry['Compound Name']),
        "Could not interpret as variable:")

    # return dictionaries
    return object_dict, structured_dict

In [4]:
# Parse the data request dump into the flat and the per-kind lookup dictionaries
object_dict, structured_dict = build_dictionaries(
    filename='request_basic_dump2.json',
)

Could not interpret experiment_group: {'Experiments': ['recFIIkEdefZynPhh'], 'Name': 'spin-up', 'Title': 'The spin-up'}
could not interpret opportunity {'Experiment Groups': ['rec0J4qnPn2vktlhO'], 'References': ['recw7gRoApscR5fsW'], 'Status': 'New', 'Title of Opportunity': 'Benchmarking'}
Could not interpret as variable: {'Atmosphere author team review': 'In progress', 'CF Standard Name (from MIP Variables)': 'air_temperature', 'Cell Measures': ['recb1V00ayWZnmm79'], 'Cell Methods': ['recfYDayRm62sFUsp'], 'Coordinates': 'height2m', 'Description': 'For models with fractional land areas (SFTLF) we are not currently able to diagnose near-surface air temperatures over just the land part of coastal gridboxes, with the current CMIP diagnostic set. The TAS diagnostic will be a weighted mean of the air temperatures over land and sea. If the diagnostic (lets call it LTAS) is available, we can then diagnose the air temperatures just over land, and in conjunction with TAS and SFTLF diagnose the 

In [5]:
# Lookup table from experiment name to record id, so experiments can be found by name
experiment_name_to_record_id = dict(
    (experiment.experiment_name, experiment.record_id)
    for experiment in structured_dict['experiment'].values()
)

In [6]:
# Display the record id stored for the experiment named 'historical'
experiment_name_to_record_id['historical']

'rec7mTVv08z3iBObw'

In [7]:
# Display the record id stored for the experiment named 'amip'
experiment_name_to_record_id['amip']


'recR7YVoWYdbQp3wk'

In [8]:
# Walk data request network from experiment to variable list:

def walk_experiment_to_variables(experiment_id, object_dict):
    """
    Navigate the data request from one experiment to the record ids of its variables.

    The walk follows experiment -> experiment groups -> opportunities ->
    variable groups -> variables, de-duplicating ids at every level and
    printing how many distinct ids were found at each step.

    Linked ids that have no entry in `object_dict` (for example records that
    could not be loaded) are skipped and reported rather than raising, so one
    dangling link does not abort the whole walk.

    Parameters
    ----------
    experiment_id : str
        Record id of the experiment to start from; must be a key of `object_dict`.
    object_dict : dict
        Mapping of record id -> data request object, as returned by
        build_dictionaries.

    Returns
    -------
    list
        Distinct variable record ids, in arbitrary order (built from a set).
        The ids themselves are not looked up in `object_dict`.

    Raises
    ------
    KeyError
        If `experiment_id` itself is not in `object_dict`.
    """
    def follow_links(source_ids, source_label, link_attribute):
        """Union of `link_attribute` over the resolvable objects in `source_ids`."""
        linked_ids = set()
        unresolved = []
        for source_id in source_ids:
            source = object_dict.get(source_id)
            if source is None:
                unresolved.append(source_id)
                continue
            linked_ids.update(getattr(source, link_attribute))
        if unresolved:
            print("skipped {} {} missing from object_dict".format(len(unresolved), source_label))
        return linked_ids

    # The starting experiment must exist: a missing id here is a caller error.
    expt_groups = set(object_dict[experiment_id].experiment_groups)
    print("found {} connected experiment groups".format(len(expt_groups)))

    opportunities = follow_links(expt_groups, "experiment groups", "opportunities")
    print("found {} connected opportunities".format(len(opportunities)))

    variable_groups = follow_links(opportunities, "opportunities", "variable_groups")
    print("found {} connected variable groups".format(len(variable_groups)))

    variables = follow_links(variable_groups, "variable groups", "variables")
    print("found {} connected variables".format(len(variables)))

    return list(variables)

In [9]:
# Collect the record ids of every variable reachable from the 'historical' experiment
historical_variables = walk_experiment_to_variables(
    experiment_id=experiment_name_to_record_id['historical'],
    object_dict=object_dict,
)

found 2 connected experiment groups
found 4 connected opportunities
found 9 connected variable groups
found 239 connected variables


In [10]:
# Show the first ten variable record ids.
# The list is built from a set, so which ids come first is arbitrary.
historical_variables[:10]

['recGfiAQ8b62b8D0H',
 'recc29R75UVqSGpFC',
 'reck2Ofq8KEbUw4Yz',
 'recH0aJ4cZJt7up4d',
 'recQMtSbbOQwkDBcL',
 'recyHFBDBY1oy7jib',
 'recIWQRAqz9K78QhF',
 'recS5sYw7lg0By4Dn',
 'reczhZLOX2BxzRT93',
 'rec57DB6wHX5cC9uo']

In [11]:
# Problem: the variable record ids reached by the walk do not appear to match
# any record id loaded from the JSON file used here.
# True if at least one of the walked variable ids is a key of object_dict.
not object_dict.keys().isdisjoint(historical_variables)

False