In [9]:
# Adding HUGS and Acquire to the Python path
import sys
import os

# HUGS is expected two directories above the current working directory
sys.path.insert(0, os.path.abspath("../.."))

# Location of the Acquire checkout. Set the ACQUIRE_PATH environment variable to
# point at your own copy; when it is unset (or empty) the original developer
# path is used so existing behaviour is unchanged.
acquire_path = os.environ.get("ACQUIRE_PATH") or "/Users/wm19361/Documents/Devel/acquire"
sys.path.insert(0, os.path.abspath(acquire_path))

# Searching the object store

In this example we will load in data from three sites. The data will be separated into sections and stored in the object store. We can then use a search function to find data from those sites over a given date range, retrieve that data and visualize it.

In [12]:
from HUGS.Modules import CRDS, Datasource
from HUGS.ObjectStore import get_local_bucket, get_object_names
from HUGS.Processing import recombine_sections, search
from HUGS.Util import get_datetime
# Matplotlib for inline plotting
%matplotlib notebook
import matplotlib.pyplot as plt
# Suppress matplotlib and pandas warning on deprecation
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Pretty printer for nicer printing to console
import pprint
pp = pprint.PrettyPrinter(indent=2)


First we get an empty bucket to store the data we are going to process. We then use the CRDS static method `read_folder`. This reads all `*.dat` files in a folder and passes them to `CRDS.read_file` for processing.

In [13]:
# Get the local object store bucket; per the notebook text, empty=True gives us
# an empty bucket to hold the data processed below.
bucket = get_local_bucket(empty=True)
# Relative path to the folder holding the sample .dat files for this example
test_data = "../../test/data/search_data/"

# Show which files are in the folder
os.listdir(test_data)


['tac.picarro.1minute.100m.min.dat',
 'bsd.picarro.1minute.108m.min.dat',
 'hfd.picarro.1minute.100m.min.dat']

In [14]:
# Read and process the data files found in the test_data folder
CRDS.read_folder(folder_path=test_data)

In [15]:
# Ask the object store for the names of all objects stored under this key prefix
prefix = "datasource"
objects_in_store = get_object_names(bucket=bucket, prefix=prefix)

We can now list all the objects that have been created and stored in the object store

In [16]:
# Pretty-print the object store keys retrieved above
pp.pprint(objects_in_store)

[ 'datasource/uuid/072571bc-5adb-4a78-8dd9-8d77ef225944',
  'datasource/uuid/98a84145-c607-4dba-860b-e5318228b9cf',
  'datasource/uuid/645aebf5-e424-437d-bfb7-1b47394609ca',
  'datasource/uuid/b1ca576a-0dc8-4a63-bd12-e73d74e77ddf',
  'datasource/uuid/c392aa6f-fa9a-4f02-b734-0b4d465196cc',
  'datasource/uuid/bb575a55-3f7d-4c89-b6b3-a8c147b9ebcb',
  'datasource/uuid/075e8121-1b5e-428c-9731-346ee2bdbce6',
  'datasource/uuid/4ebfa188-ed39-48fc-b487-06382125e34d']


To search the data we search the label dictionary that is stored within each Datasource. To do this we load in the Datasources using the keys obtained from `get_object_names` and store these objects in a list. Note: we pass in `shallow=True` here to stop each Datasource loading in its data; we only require the JSON data that makes up the `name`, `uuid`, `labels` etc. of the object.

In [17]:
# Build one Datasource per stored key. shallow=True is passed so that only the
# object's metadata is needed for searching (see the note in the text above).
datasources = [
    Datasource.load(key=object_key, shallow=True)
    for object_key in objects_in_store
]
pp.pprint(datasources)

[ <HUGS.Modules._datasource.Datasource object at 0x12211bb38>,
  <HUGS.Modules._datasource.Datasource object at 0x12211ba20>,
  <HUGS.Modules._datasource.Datasource object at 0x120dc9d68>,
  <HUGS.Modules._datasource.Datasource object at 0x120dc92b0>,
  <HUGS.Modules._datasource.Datasource object at 0x120dcc2e8>,
  <HUGS.Modules._datasource.Datasource object at 0x120d170f0>,
  <HUGS.Modules._datasource.Datasource object at 0x1215bd748>,
  <HUGS.Modules._datasource.Datasource object at 0x1215bd438>]


We can now create a list of search terms to be passed to the search function.

In [18]:
# Site codes to search for; these match the prefixes of the three data files listed earlier
search_terms = ["bsd", "hfd", "tac"]

We must also provide the data type we are searching for and can optionally provide a start and end datetime for our search

In [24]:
# Type of data to search for
data_type = "CRDS"
# Optional date window for the search
# NOTE(review): whether the end datetime is inclusive is not shown here - confirm in search()
start = get_datetime(year=2016, month=1, day=1)
end = get_datetime(year=2019, month=1, day=1)

In [25]:
# Search the object store for any of the search terms (require_all=False)
# within the chosen date window.
results = search(
    search_terms=search_terms,
    data_type=data_type,
    require_all=False,
    start_datetime=start,
    end_datetime=end,
)

# Print each entry in key order, one blank line between entries
for key in sorted(results):
    value = results[key]
    print(key, ":", value, "\n")

bsd_ch4 : ['data/uuid/072571bc-5adb-4a78-8dd9-8d77ef225944/2017-01-01T02:00:30_2017-12-31T23:10:30', 'data/uuid/072571bc-5adb-4a78-8dd9-8d77ef225944/2018-01-01T04:10:30_2018-12-31T19:40:30', 'data/uuid/072571bc-5adb-4a78-8dd9-8d77ef225944/2016-01-19T17:17:30_2016-12-31T23:52:30'] 

bsd_co : ['data/uuid/b1ca576a-0dc8-4a63-bd12-e73d74e77ddf/2017-01-01T02:00:30_2017-12-31T23:10:30', 'data/uuid/b1ca576a-0dc8-4a63-bd12-e73d74e77ddf/2018-01-01T04:10:30_2018-12-31T19:40:30', 'data/uuid/b1ca576a-0dc8-4a63-bd12-e73d74e77ddf/2016-01-19T17:17:30_2016-12-31T23:52:30'] 

bsd_co2 : ['data/uuid/98a84145-c607-4dba-860b-e5318228b9cf/2017-01-01T02:00:30_2017-12-31T23:10:30', 'data/uuid/98a84145-c607-4dba-860b-e5318228b9cf/2018-01-01T04:10:30_2018-12-31T19:40:30', 'data/uuid/98a84145-c607-4dba-860b-e5318228b9cf/2016-01-19T17:17:30_2016-12-31T23:52:30'] 

hfd_ch4 : ['data/uuid/c392aa6f-fa9a-4f02-b734-0b4d465196cc/2016-01-01T00:38:30_2016-12-31T23:36:30', 'data/uuid/c392aa6f-fa9a-4f02-b734-0b4d465196cc/201

The search function returns a Python dictionary keyed by the search term and the species name, joined
as `{search_term}_{species_name}`. The value of each key:value pair is a list of object store keys that point to the raw data stored in the HDF5 format in the object store.

An entry in the dictionary looks like this:

`bsd_ch4 : ['data/uuid/db5cbe8b-a66e-45dc-bba4-8b79eb61e261/2015-01-01T00:33:30_2015-10-24T08:57:30', 'data/uuid/db5cbe8b-a66e-45dc-bba4-8b79eb61e261/2014-01-30T13:33:30_2014-12-31T22:23:30']`

where the key is formed of `bsd` (the Bilsdale site) and `ch4` (the name of the species), and the value is a list of object store keys of the form `data/uuid/{uuid}/{start_datetime}_{end_datetime}`.

We now pass this dictionary to the `recombine_sections` function that retrieves the data stored in the object store keys and joins the sections as per the keying in the dictionary.

In [26]:
# Fetch the data behind each object store key and join the sections, giving one
# combined dataset per key of `results` (e.g. "bsd_ch4")
recombined_sections = recombine_sections(data_keys=results)

We can now plot the methane readings from each site for comparison

In [27]:
# Methane data for each site; keys follow the "<search term>_<species>" pattern
hfd_ch4 = recombined_sections["hfd_ch4"]
bsd_ch4 = recombined_sections["bsd_ch4"]
tac_ch4 = recombined_sections["tac_ch4"]
# The site code is "hfd"; this cell originally misspelt it as "hdf". The old
# name is kept as an alias in case other cells still refer to it.
hdf_ch4 = hfd_ch4

# (site code, data, line colour) for each series, in plotting order
ch4_series = (
    ("hfd", hfd_ch4, "#4e79a7"),
    ("bsd", bsd_ch4, "#59a14f"),
    ("tac", tac_ch4, "#e15759"),
)

fig, ax = plt.subplots()
for site, site_data, colour in ch4_series:
    # One line per site, labelled with the correct site code for the legend
    ax.plot(site_data.index.values, site_data["ch4 count"], label=site + " ch4", linewidth=1, color=colour)

ax.set_title("Methane (ch4) readings by site")
# NOTE(review): the index is assumed to hold the measurement timestamps - confirm
ax.set_xlabel("Date")
ax.set_ylabel("ch4 count")
ax.legend(frameon=False)

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x1215b3160>