# Loading and accessing data

In [1]:
import sys
# NOTE(review): the entry inserted below looks like the path of a virtualenv
# `activate` script rather than a directory. sys.path entries are normally
# directories (or zip archives), so this may have no effect on imports -
# confirm whether the environment's site-packages directory was intended.
sys.path.insert(0, "/work/rt17603/environments/openghg/bin/activate")

from openghg.modules import ObsSurface

In [2]:
import os

# Set the OPENGHG_PATH environment variable for this kernel session
# (presumably the object store location used by openghg - confirm).
os.environ.update({"OPENGHG_PATH": "/work/chxmr/objectStore"})

### Macehead data

Start by loading Macehead ("MHD" or "macehead") data from all of the different sources. This includes:
 - data_type="GCWERKS", network="AGAGE" (instrument is "GCMD")
 - data_type="GCWERKS", network="AGAGE" (instrument is "GCMS")
 - data_type="GCWERKS", network="AGAGE" (instrument is "medusa")
 - data_type="ICOS", network="ICOS"

In [3]:
import os
import glob
from file_search import find_gc_files

# Find every MHD data file for the GCMD instrument (GC)
mhd_gcmd_tuples = find_gc_files("MHD", "GCMD")

# Read the GCMD files in as GCWERKS data on the AGAGE network
mhd_results_1 = ObsSurface.read_file(
    filepath=mhd_gcmd_tuples,
    data_type="GCWERKS",
    site="MHD",
    network="AGAGE",
)

Processing: macehead.99.C: 100%|██████████| 28/28 [06:25<00:00, 13.76s/it]


In [4]:
# Find every MHD data file for the GCMS instrument (GC)
mhd_gcms_tuples = find_gc_files("MHD", "GCMS")

# Read the GCMS files in as GCWERKS data on the AGAGE network
mhd_results_2 = ObsSurface.read_file(
    filepath=mhd_gcms_tuples,
    data_type="GCWERKS",
    site="MHD",
    network="AGAGE",
)

Processing: macehead-gcms.99.C: 100%|██████████| 8/8 [00:54<00:00,  6.81s/it]


In [5]:
# Find every MHD data file for the medusa instrument (GC)
mhd_medusa_tuples = find_gc_files("MHD", "medusa")

# Read the medusa files in as GCWERKS data on the AGAGE network
mhd_results_3 = ObsSurface.read_file(
    filepath=mhd_medusa_tuples,
    data_type="GCWERKS",
    site="MHD",
    network="AGAGE",
)

Processing: macehead-medusa.21.C: 100%|██████████| 19/19 [18:27<00:00, 58.31s/it]


In [6]:
# Find every MHD data file for ICOS.
# NOTE: at the moment this data doesn't appear within the object store
# after being read in - see Issue #57.
from file_search import find_icos_files

icos_files = find_icos_files("MHD")

# Read the ICOS files in as ICOS data on the ICOS network
mhd_results_4 = ObsSurface.read_file(
    filepath=icos_files,
    data_type="ICOS",
    site="MHD",
    network="ICOS",
)

Processing: mhd.co2.1minute.g2401.15m.dat: 100%|██████████| 3/3 [00:00<00:00,  3.04it/s]


In [7]:
from openghg.objectstore import visualise_store
# Visualise the object store after reading in the MHD files
# (presumably shows what has been stored so far - confirm).
visualise_store()

In [8]:
#from openghg.localclient import get_obs_surface
## This should retrieve the ICOS data read in above, but it does not currently seem to work.
#data = get_obs_surface(site="mhd", species="co2", network="ICOS")
#data

In [9]:
from openghg.localclient import get_obs_surface
# Retrieve MHD observations for a single species ("sf6"); the network is
# deliberately left unspecified for now.
data = get_obs_surface(site="mhd", species="sf6")
# Show the returned object as the cell output
data

ObsData(data=<xarray.Dataset>
Dimensions:           (time: 53697)
Coordinates:
  * time              (time) datetime64[ns] 2003-11-15T14:43:00 ... 2021-02-0...
Data variables:
    mf                (time) float64 5.432 5.307 5.309 ... 10.77 10.72 10.58
    mf_repeatability  (time) float64 0.09644 0.09644 0.09644 ... 0.04115 0.04115
    status_flag       (time) int64 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
    integration_flag  (time) int64 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Attributes: (12/20)
    data_owner:           Simon O'Doherty
    data_owner_email:     s.odoherty@bristol.ac.uk
    inlet_height_magl:    10m
    comment:              Medusa measurements. Output from GCWerks. See Mille...
    Conditions of use:    Ensure that you contact the data owner at the outse...
    Source:               In situ measurements of air
    ...                   ...
    instrument:           medusa
    site:                 mhd
    network:              agage
    units:          

In [10]:
# When start_date and end_date are supplied, the returned data appears to
# run from the start of the first year to the end of the last year rather
# than matching the requested dates. Print the first and last timestamps
# to check the range actually returned.
data = get_obs_surface(
    site="mhd",
    species="ch4",
    start_date="2005-03-01",
    end_date="2008-02-01",
)
print(data.data["time"].values[0], data.data["time"].values[-1], sep="\n")

2005-01-01T00:44:22.500000000
2008-12-31T23:14:22.500000000


Try searching for data that we know is present (it was loaded and accessed above).

In [11]:
from openghg.processing import search

In [12]:
# The search output is a dictionary keyed by uuid.
# Optional extra filter, currently disabled: instrument="gcmd"
search_output = search(
    species="ch4",
    site="mhd",
    inlet="10m",
)
search_output

defaultdict(dict,
            {'d070e2e9-cd88-462f-b5be-aa4ab78cb0c7': {'keys': ['data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2000-01-01-00:48:22.500000+00:00_2000-12-31-22:56:22.500000+00:00',
               'data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2000-06-01-00:05:22.500000+00:00_2000-08-31-23:22:22.500000+00:00',
               'data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2000-03-01-01:42:22.500000+00:00_2000-05-31-23:25:22.500000+00:00',
               'data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2000-09-01-00:02:22.500000+00:00_2000-11-30-23:35:22.500000+00:00',
               'data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2001-01-04-10:57:22.500000+00:00_2001-12-31-23:33:22.500000+00:00',
               'data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2001-06-01-00:26:22.500000+00:00_2001-08-31-23:42:22.500000+00:00',
               'data/uuid/d070e2e9-cd88-462f-b5be-aa4ab78cb0c7/v28/2001-03-01-00:05:22.500000+00:00_2001-05-31-23:46:22.5000

In [13]:
## Extracting from code base 
#from openghg.modules import Datasource
#obs = ObsSurface.load()
#datasource_uuids = obs.datasources()
#datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

In [14]:
#d = next(datasources)
#d.metadata()

### Heathfield data

Try loading different data types for Heathfield ("HFD"):

- data_type = "GCWERKS", network = "DECC" (instrument is "GCMD")
- data_type = "CRDS", network = "DECC"

In [15]:
# Site code shared by the Heathfield cells below
sitecode = "HFD"

# Find every HFD data file for the GCMD instrument (GC)
hfd_gcmd_tuples = find_gc_files(sitecode, "GCMD")

# Read the GCMD files in as GCWERKS data on the DECC network
hfd_results_1 = ObsSurface.read_file(
    filepath=hfd_gcmd_tuples,
    data_type="GCWERKS",
    site=sitecode,
    network="DECC",
)

Processing: heathfield-md.21.C: 100%|██████████| 9/9 [00:11<00:00,  1.25s/it]


In [16]:
# Find every HFD data file for the CRDS instrument
from file_search import find_crds_files

crds_files = find_crds_files(sitecode)

# Read the CRDS files in as CRDS data on the DECC network
hfd_results_2 = ObsSurface.read_file(
    filepath=crds_files,
    data_type="CRDS",
    site=sitecode,
    network="DECC",
)

Processing: hfd.picarro.1minute.50m.dat: 100%|██████████| 2/2 [00:52<00:00, 26.32s/it] 


In [17]:
# Visualise the object store again, now that the HFD files have been read in
visualise_store()

In [18]:
# HFD data has been loaded for two instruments on the same network (DECC).
# With no ranking explicitly set and no inlet height specified, this
# returns the data from the "100m" inlet.
data = get_obs_surface(site="hfd", species="ch4")
# Show the returned object as the cell output
data

ObsData(data=<xarray.Dataset>
Dimensions:                    (time: 1897950)
Coordinates:
  * time                       (time) datetime64[ns] 2013-11-20T12:51:30 ... ...
Data variables:
    mf                         (time) float64 1.919e+03 1.918e+03 ... 1.965e+03
    mf_variability             (time) float64 1.614 0.549 0.407 ... 0.279 0.231
    mf_number_of_observations  (time) float64 19.0 19.0 20.0 ... 12.0 12.0 12.0
Attributes: (12/22)
    data_owner:           Simon O'Doherty
    data_owner_email:     s.odoherty@bristol.ac.uk
    inlet_height_magl:    100m
    comment:              Cavity ring-down measurements. Output from GCWerks
    Conditions of use:    Ensure that you contact the data owner at the outse...
    Source:               In situ measurements of air
    ...                   ...
    time_resolution:      1_minute
    inlet:                100m
    port:                 10
    type:                 air
    network:              decc
    scale:                WMO-X

In [19]:
# Data for each inlet height is correctly returned when the inlet is
# specified explicitly; print the inlet recorded in the metadata to check.
for inlet_height in ("50m", "100m"):
    data = get_obs_surface(site="hfd", species="ch4", inlet=inlet_height)
    print(data.metadata["inlet"])

50m
100m


In [20]:
# Data for a different species can also be extracted correctly
data = get_obs_surface(
    site="hfd",
    species="co2",
    inlet="50m",
)
print(*(data.metadata[key] for key in ("species", "inlet")))

co2 50m


In [21]:
# Same date issue as seen for the MHD data: print the first and last
# timestamps returned to compare them with the requested start and end dates.
data = get_obs_surface(
    site="hfd",
    species="ch4",
    instrument="picarro",
    start_date="2014-02-01",
    end_date="2016-02-01",
)
print(data.data["time"].values[0], data.data["time"].values[-1], sep="\n")

2014-01-01T00:00:30.000000000
2016-12-31T23:59:30.000000000
