# Accessing OOI glider data from Glider DAC
9/24,6/2019

NANOOS, Emilio Mayorga  
https://github.com/nanoos-pnw/gliders

In [1]:
import warnings
warnings.simplefilter('ignore') # filter some warning messages

import datetime
import numpy as np
import pandas as pd
import xarray as xr
from erddapy import ERDDAP

import matplotlib.pyplot as plt
# import cartopy.crs as ccrs
# from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
%matplotlib inline

# Attempts with OPeNDAP

In [2]:
# OOI
tds_opendap_url = 'http://gliders.ioos.us/thredds/dodsC/deployments/OOI-CE/ce_319-20190125T2248-delayed/ce_319-20190125T2248-delayed.nc3.nc'
erddap_opendap_url = 'https://gliders.ioos.us/erddap/tabledap/ce_319-20190125T2248-delayed'

In [3]:
# NANOOS Trinidad Head
tds_opendap_url = 'http://gliders.ioos.us/thredds/dodsC/deployments/mbari/UW646-20190409T0000/UW646-20190409T0000.nc3.nc'
erddap_opendap_url = 'https://gliders.ioos.us/erddap/tabledap/UW646-20190409T0000'

## Getting errors with `xr.open_dataset`

- **OOI**
    - thredds: `OSError: [Errno -45] NetCDF: Not a valid data type or _FillValue type mismatch: b'http://gliders.ioos.us/thredds/dodsC/deployments/OOI-CE/ce_319-20190125T2248-delayed/ce_319-20190125T2248-delayed.nc3.nc'`
    - erddap: `IndexError: The indexing operation you are attempting to perform is not valid on netCDF4.Variable object. Try loading your data into memory first by calling .load().`
- **NANOOS Trinidad**
    - thredds: `OSError: [Errno -45] NetCDF: Not a valid data type or _FillValue type mismatch: b'http://gliders.ioos.us/thredds/dodsC/deployments/mbari/UW646-20190409T0000/UW646-20190409T0000.nc3.nc'`
    - erddap: Success! But note that, at least for salinity, coordinate attributes don't match up with declared variables: `coordinates: lon lat depth time` vs `longitude` and `latitude`

### pydap
- Also tried with `engine='pydap'`, but got errors
- https://github.com/pydap/pydap

In [4]:
# ds = xr.open_dataset(erddap_opendap_url)
#        tds_opendap_url, decode_cf=False)

# Tests with erddapy (uses REST API rather than OPeNDAP?)

In [5]:
# Glider DAC server endpoint
server = 'https://data.ioos.us/gliders/erddap'
protocol = 'tabledap'

In [6]:
dataset_id = 'ce_319-20190125T2248-delayed'

## Explore dataset metadata

In [7]:
e_md = ERDDAP(server=server, protocol=protocol)

In [8]:
info_url = e_md.get_info_url(dataset_id=dataset_id, response='csv')
info = pd.read_csv(info_url)

In [9]:
info.head(10)

Unnamed: 0,Row Type,Variable Name,Attribute Name,Data Type,Value
0,attribute,NC_GLOBAL,_NCProperties,String,version=1|netcdflibversion=4.6.1|hdf5libversio...
1,attribute,NC_GLOBAL,acknowledgment,String,Funding provided by the National Science Found...
2,attribute,NC_GLOBAL,cdm_data_type,String,TrajectoryProfile
3,attribute,NC_GLOBAL,cdm_profile_variables,String,"time_uv,lat_uv,lon_uv,u,v,profile_id,time,lati..."
4,attribute,NC_GLOBAL,cdm_trajectory_variables,String,"trajectory,wmo_id"
5,attribute,NC_GLOBAL,comment,String,2019-01-25 Deployed on the Newport Hydrographi...
6,attribute,NC_GLOBAL,contributor_name,String,"Edward Dever, Jonathan Fram, Stuart Pearce, Ch..."
7,attribute,NC_GLOBAL,contributor_role,String,Endurance Array Principal Investigator/Project...
8,attribute,NC_GLOBAL,Conventions,String,"Unidata Dataset Discovery v1.0, COARDS, CF-1.6"
9,attribute,NC_GLOBAL,creator_email,String,spearce@ceoas.oregonstate.edu


In [10]:
info.groupby(['Variable Name', 'Row Type']).size()

Variable Name         Row Type 
CDOM                  attribute    17
                      variable      1
NC_GLOBAL             attribute    70
PAR                   attribute    16
                      variable      1
backscatter           attribute    18
                      variable      1
chlorophyll           attribute    18
                      variable      1
conductivity          attribute    21
                      variable      1
conductivity_qc       attribute     7
                      variable      1
ctd_timestamp         attribute    15
                      variable      1
density               attribute    16
                      variable      1
density_qc            attribute     7
                      variable      1
depth                 attribute    25
                      variable      1
depth_qc              attribute     7
                      variable      1
dissolved_oxygen      attribute    17
                      variable      1
instrument_ctd    

## Extracting `cdm_profile_variables` and coordinate variables

In [11]:
cdm_profile_variables = info.loc[
    info['Attribute Name'] == 'cdm_profile_variables', 'Value'
]

In [12]:
profile_variables = cdm_profile_variables.to_list()[0].split(',')
profile_variables

['time_uv',
 'lat_uv',
 'lon_uv',
 'u',
 'v',
 'profile_id',
 'time',
 'latitude',
 'longitude']

In [13]:
obs_coords_variables = e_md.get_var_by_attr(
    dataset_id=dataset_id,
    axis=lambda v: v in ['X', 'Y', 'Z', 'T'],
)

In [14]:
obs_coords_variables

['precise_lat',
 'pressure',
 'precise_lon',
 'depth',
 'time',
 'longitude',
 'precise_time',
 'latitude',
 'ctd_timestamp']

Add `profile_id` to the list of coordinate variables. It's not a coordinate variable proper, but it's key to being able to segment the glider data into profiles.

In [15]:
obs_coords_variables.append('profile_id')

```python
# For searching across datasets
kw = {
    "min_time": "2017-01-05T00:00:00Z",
    "min_time": "2019-06-12T00:00:00Z",
    "min_lon": -136,
    "max_lon": -120,
    "min_lat": 41.78,
    "max_lat": 52.24,
    "standard_name": "sea_water_practical_salinity",
    "cdm_data_type": "trajectory",
}

# For constraining response from a single dataset
e.constraints = {
    "time>=": "2018-08-05T00:00:00Z",
    "time<=": "2019-06-12T00:00:00Z",
    "longitude>=": -133.75,
    "longitude<=": -123.29,
    "latitude>=": 41.78,
    "latitude<=": 52.24,
}
```

## Explore a dataset with a time constraint filter

In [16]:
e_data = ERDDAP(server=server, protocol=protocol)

In [17]:
# requested variables and constraints
e_data.dataset_id = dataset_id
# e_data.variables = obs_coords_variables
e_data.constraints = {"time>=": "2019-04-20T00:00:00Z"}

In [18]:
e_data.__dict__

{'server': 'https://data.ioos.us/gliders/erddap',
 'protocol': 'tabledap',
 'constraints': {'time>=': '2019-04-20T00:00:00Z'},
 'dataset_id': 'ce_319-20190125T2248-delayed',
 'params': None,
 'requests_kwargs': {},
 'response': 'html',
 'variables': '',
 '_dataset_id': None,
 '_variables': {}}

### First query and explore with xarray, to examine all the metadata
Take advantage of xarray access to the full structure of netcdf variables and attributes (both global and variable attributes)

In [19]:
ds = e_data.to_xarray(decode_times=True)

In [20]:
ds

<xarray.Dataset>
Dimensions:               (row: 49336)
Coordinates:
    time_uv               (row) datetime64[ns] ...
    lat_uv                (row) float64 ...
    lon_uv                (row) float64 ...
Dimensions without coordinates: row
Data variables:
    trajectory            (row) object ...
    wmo_id                (row) object ...
    profile_id            (row) float64 ...
    time                  (row) datetime64[ns] ...
    latitude              (row) float64 ...
    longitude             (row) float64 ...
    precise_time          (row) datetime64[ns] ...
    depth                 (row) float32 ...
    pressure              (row) float32 ...
    temperature           (row) float32 ...
    conductivity          (row) float32 ...
    salinity              (row) float32 ...
    density               (row) float32 ...
    precise_lat           (row) float64 ...
    precise_lon           (row) float64 ...
    u                     (row) float64 ...
    v                   

Pull out only "true" data variables (observational, like salinity, temperature), excluding `_qc` variables as well. `ds.data_vars` is not strict enough (or maybe some of the coordinate variables are not properly defined in this file).

In [21]:
len(ds.data_vars), len(ds.variables)

(44, 47)

Remove qc variables and these metadata variables: instrument_ctd, wmo_id, platform_meta. Maybe should remove trajectory, too. Note that `pressure` will be removed too b/c it's in `obs_coords_variables`

In [22]:
obs_variables = [v for v in set(ds.data_vars).difference(set(obs_coords_variables))
                 if not v.endswith('_qc') 
                 and v not in ('instrument_ctd', 'wmo_id', 'platform_meta')]

In [23]:
obs_variables

['u',
 'conductivity',
 'temperature',
 'trajectory',
 'CDOM',
 'salinity',
 'backscatter',
 'dissolved_oxygen',
 'radiation_wavelength',
 'density',
 'chlorophyll',
 'PAR',
 'v',
 'oxygen_saturation']

In [24]:
ds.salinity

<xarray.DataArray 'salinity' (row: 49336)>
array([      nan,       nan,       nan, ...,       nan, 33.966694, 33.96678 ],
      dtype=float32)
Coordinates:
    time_uv  (row) datetime64[ns] ...
    lat_uv   (row) float64 ...
    lon_uv   (row) float64 ...
Dimensions without coordinates: row
Attributes:
    _ChunkSizes:            478
    actual_range:           [31.854792 34.41189 ]
    colorBarMaximum:        37.0
    colorBarMinimum:        30.0
    instrument:             instrument_ctd
    ioos_category:          Salinity
    long_name:              Sea Water Practical Salinity
    observation_type:       calculated
    OOI_data_level:         L2a
    OOI_data_product_name:  PRACSAL
    platform:               platform
    standard_name:          sea_water_practical_salinity
    units:                  1
    valid_max:              40.0
    valid_min:              0.0

In [25]:
ds.salinity.shape

(49336,)

In [26]:
ds.chlorophyll

<xarray.DataArray 'chlorophyll' (row: 49336)>
array([  nan, 0.469, 0.483, ...,   nan, 0.056,   nan])
Coordinates:
    time_uv  (row) datetime64[ns] ...
    lat_uv   (row) float64 ...
    lon_uv   (row) float64 ...
Dimensions without coordinates: row
Attributes:
    _ChunkSizes:            478
    actual_range:           [0.035 2.947]
    bytes:                  4
    comment:                Chlorophyll recalculated from signal due to calib...
    instrument:             instrument_flbbcd
    ioos_category:          Other
    long_name:              Chlorophyll Concentration
    observation_type:       measured
    OOI_data_level:         L1a
    OOI_data_product_name:  CHLAFLO
    platform:               platform
    resolution:             0.012
    source_sensor:          sci_flbbcd_chlor_units
    standard_name:          mass_concentration_of_chlorophyll_a_in_sea_water
    units:                  ug l-1
    valid_max:              50.0
    valid_min:              0.0

In [27]:
ds.profile_id

<xarray.DataArray 'profile_id' (row: 49336)>
array([797., 797., 797., ..., 815., 815., 815.])
Coordinates:
    time_uv  (row) datetime64[ns] ...
    lat_uv   (row) float64 ...
    lon_uv   (row) float64 ...
Dimensions without coordinates: row
Attributes:
    actual_range:         [797 815]
    ancillary_variables:  profile_time
    cf_role:              profile_id
    comment:              Sequential profile number within the trajectory. Th...
    ioos_category:        Identifier
    long_name:            Profile ID
    valid_max:            2147483647
    valid_min:            1

In [28]:
set(ds.profile_id.values)

{797.0,
 798.0,
 799.0,
 800.0,
 801.0,
 802.0,
 803.0,
 804.0,
 805.0,
 806.0,
 807.0,
 808.0,
 809.0,
 810.0,
 811.0,
 812.0,
 813.0,
 814.0,
 815.0}

In [29]:
obs_coords_variables

['precise_lat',
 'pressure',
 'precise_lon',
 'depth',
 'time',
 'longitude',
 'precise_time',
 'latitude',
 'ctd_timestamp',
 'profile_id']

### Now we'll request the data transformed into a Pandas dataframe
Bring in only the variables we're interested in. For access to metadata use the xarray dataset we created, above.

In [30]:
e_data.variables = obs_coords_variables + ['salinity', 'temperature']

In [31]:
e_data.__dict__

{'server': 'https://data.ioos.us/gliders/erddap',
 'protocol': 'tabledap',
 'constraints': {'time>=': '2019-04-20T00:00:00Z'},
 'dataset_id': 'ce_319-20190125T2248-delayed',
 'params': None,
 'requests_kwargs': {},
 'response': 'html',
 'variables': ['precise_lat',
  'pressure',
  'precise_lon',
  'depth',
  'time',
  'longitude',
  'precise_time',
  'latitude',
  'ctd_timestamp',
  'profile_id',
  'salinity',
  'temperature'],
 '_dataset_id': None,
 '_variables': {}}

In [32]:
df = e_data.to_pandas(
    index_col='time (UTC)',  # or use precise_time?
    parse_dates=True,
).dropna()

In [33]:
df.head()

Unnamed: 0_level_0,precise_lat (degree_north),pressure (dbar),precise_lon (degree_east),depth (m),longitude (degrees_east),precise_time (UTC),latitude (degrees_north),ctd_timestamp (UTC),profile_id,salinity (1),temperature (Celsius)
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-04-20 01:17:21+00:00,44.71396,-1.76,-125.059905,-1.74577,-125.044933,2019-04-20T00:11:24Z,44.711822,2019-04-20T00:11:23Z,797,32.40767,11.329
2019-04-20 01:17:21+00:00,44.713955,-1.79,-125.059905,-1.775527,-125.044933,2019-04-20T00:11:26Z,44.711822,2019-04-20T00:11:25Z,797,32.408794,11.3236
2019-04-20 01:17:21+00:00,44.713949,-1.86,-125.059917,-1.844962,-125.044933,2019-04-20T00:11:28Z,44.711822,2019-04-20T00:11:27Z,797,32.410355,11.3213
2019-04-20 01:17:21+00:00,44.713942,-1.83,-125.059918,-1.815204,-125.044933,2019-04-20T00:11:30Z,44.711822,2019-04-20T00:11:29Z,797,32.4095,11.3217
2019-04-20 01:17:21+00:00,44.713939,-1.74,-125.059915,-1.725931,-125.044933,2019-04-20T00:11:31Z,44.711822,2019-04-20T00:11:31Z,797,32.409695,11.3211


In [34]:
df.tail()

Unnamed: 0_level_0,precise_lat (degree_north),pressure (dbar),precise_lon (degree_east),depth (m),longitude (degrees_east),precise_time (UTC),latitude (degrees_north),ctd_timestamp (UTC),profile_id,salinity (1),temperature (Celsius)
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-04-21 03:44:01+00:00,44.592754,234.27,-124.780057,232.24554,-124.781832,2019-04-21T03:58:35Z,44.594201,2019-04-21T03:58:34Z,815,33.96687,7.3416
2019-04-21 03:44:01+00:00,44.59275,234.55,-124.780053,232.52296,-124.781832,2019-04-21T03:58:37Z,44.594201,2019-04-21T03:58:36Z,815,33.96706,7.3406
2019-04-21 03:44:01+00:00,44.592746,234.81,-124.780048,232.78058,-124.781832,2019-04-21T03:58:39Z,44.594201,2019-04-21T03:58:38Z,815,33.967808,7.3396
2019-04-21 03:44:01+00:00,44.592742,235.05,-124.780044,233.01837,-124.781832,2019-04-21T03:58:41Z,44.594201,2019-04-21T03:58:41Z,815,33.966694,7.3394
2019-04-21 03:44:01+00:00,44.592739,235.31,-124.780039,233.27597,-124.781832,2019-04-21T03:58:43Z,44.594201,2019-04-21T03:58:43Z,815,33.96678,7.3353
