# Generate "NVS-style" OOI glider sections file from JSON files
6/15, 4/26,22-21,3-1, 3/27,20-19/2020. 12/20-18/2019

**4/21 issues:**
- deployment_index = -5, ce_311_00007_sections.json: Edit output section file manually to use only the first section.

Current sample draft JSON file, for Newport Deep ("NHD")
https://github.com/nanoos-pnw/OOI/blob/master/gliders/NHD.json. BUT, for reading it programmatically, use https://raw.githubusercontent.com/nanoos-pnw/OOI/master/gliders/NHD.json

In [1]:
import warnings
warnings.simplefilter('ignore') # filter some warning messages

import datetime
import requests
from pandas.core.tools.datetimes import parse_time_string
from erddapy import ERDDAP
from erddapy.utilities import urlopen

## Read and parse JSON files

### Set up -- Transect and Deployment to process

OOI Newport Hydrographic Deep (NHD) Line.

Switch to http://agate.coas.oregonstate.edu/nvs/ooi/gliders/NHD.json once it's fully operational.

**4/26/2020 note:** Implemented the following manual decisions:
- NHD_ce_311_00007: use the 3 sections, all NHD; for the last section, the last profile id generated by this automatic approach (`499`) resulted in an "index" error in `ooinhdeep_nvs_gliderapp.py`. Changing it manually to `498` fixed the problem.
- `NHD_ce_312_00007`: Skip the first section, the only one that overlaps with ce311_00007 (plus it's only a half section).

In [22]:
# Short code of the hydrographic line (transect) to process; it selects the
# transect JSON file below and prefixes the output section file name.
transect_code = 'GHS'

# ooi_transect_json_url = "https://raw.githubusercontent.com/nanoos-pnw/OOI/master/gliders/NHD.json"
ooi_transect_json_url = (
    "http://agate.coas.oregonstate.edu/nvs/ooi/gliders/" + transect_code + ".json"
)

In [3]:
# Index into the transect's list of deployments (regular Python list indexing):
# -1 is the last entry, -2 the one before it, and so on; 0 is the first entry.
# NOTE(review): assumes the JSON lists deployments oldest-first, so that -1 is
# the latest deployment and 0 the oldest -- confirm against the transect file.
deployment_index = -1

### Process transect file

In [4]:
# Download the transect JSON file. The timeout (seconds) keeps the notebook
# from hanging forever if the server accepts the connection but never answers;
# requests has no default timeout.
r = requests.get(ooi_transect_json_url, timeout=60)
# Display the HTTP status code as the cell output (200 means success)
r.status_code

200

In [5]:
# Decode the body of the transect response as JSON
ooi_transect = r.json()

In [6]:
# Name of the hydrographic line; used below as the key into the deployment
# file's 'sections' entry.
transect_name = ooi_transect['hydrographic_line']
# Lookup from a section's 'direction' value (e.g. 'W', 'NE') to the direction
# code written as the third column of each section-file row.
direction_codes = ooi_transect['direction_codes']

Select the deployment to process. Use `if dep['is_main_line'] == True` if limiting it to main deployments.

In [7]:
# Take every deployment entry listed for this transect, unfiltered.
# To keep only main-line deployments, use this instead:
#   [dep for dep in ooi_transect['deployments'] if dep['is_main_line'] == True]
deployments = list(ooi_transect['deployments'])

In [8]:
# Pick the deployment entry to process, then display it as the cell output
deployment = deployments[deployment_index]
deployment

{'is_main_line': True,
 'url': 'http://agate.coas.oregonstate.edu/nvs/ooi/gliders/ce_320_00005_sections.json',
 'datetime_start': '2019-10-12T04:48:00Z',
 'datetime_end': '2019-12-13T18:40:00Z'}

### Process selected deployment file

In [9]:
# Download the selected deployment's sections JSON file. The timeout (seconds)
# prevents an indefinite hang; requests has no default timeout.
rdep = requests.get(deployment['url'], timeout=60)
# Fail here with the HTTP error rather than later with a JSON decoding error
rdep.raise_for_status()

In [10]:
# Decode the body of the deployment response as JSON
deployment_json = rdep.json()

In [11]:
# Sections of this deployment along the selected transect, dropping any
# section whose 'is_valid' flag is falsy.
dep_sections = [
    sec
    for sec in deployment_json['sections'][transect_name]
    if sec['is_valid']
]

In [12]:
# Display the number of valid sections followed by the sections themselves
len(dep_sections), dep_sections

(24,
 [{'datetime_start': '2019-10-12T04:48:00Z',
   'datetime_end': '2019-10-15T15:47:00Z',
   'direction': 'W',
   'is_valid': True},
  {'datetime_start': '2019-10-15T15:47:00Z',
   'datetime_end': '2019-10-17T09:16:00Z',
   'direction': 'SE',
   'is_valid': True},
  {'datetime_start': '2019-10-17T09:16:00Z',
   'datetime_end': '2019-10-19T08:26:00Z',
   'direction': 'NE',
   'is_valid': True},
  {'datetime_start': '2019-10-19T08:26:00Z',
   'datetime_end': '2019-10-22T08:30:00Z',
   'direction': 'SW',
   'is_valid': True},
  {'datetime_start': '2019-10-22T08:30:00Z',
   'datetime_end': '2019-10-24T12:49:00Z',
   'direction': 'NE',
   'is_valid': True},
  {'datetime_start': '2019-10-24T12:49:00Z',
   'datetime_end': '2019-10-26T15:18:00Z',
   'direction': 'SW',
   'is_valid': True},
  {'datetime_start': '2019-10-26T15:18:00Z',
   'datetime_end': '2019-10-27T21:16:00Z',
   'direction': 'NE',
   'is_valid': True},
  {'datetime_start': '2019-10-27T21:16:00Z',
   'datetime_end': '2019-10

## Query ERDDAP

Sample dataset url: https://gliders.ioos.us/erddap/tabledap/ce_319-20190125T2248-delayed

In [13]:
# Glider DAC server endpoint
gliderdac_erddap_server = 'https://data.ioos.us/gliders/erddap'
# ERDDAP protocol passed to the ERDDAP client created below
protocol = 'tabledap'

In [14]:
# Module-level ERDDAP client. get_timestamp_profile_id() below reuses this
# object and overwrites its dataset_id, variables and constraints on each call.
e = ERDDAP(server=gliderdac_erddap_server, protocol=protocol)

In [15]:
# ERDDAP dataset id for this deployment, as given in the deployment JSON file;
# displayed as the cell output.
gliderdac_erddap_id = deployment_json['gliderdac_erddap_id']
gliderdac_erddap_id

'ce_320-20191012T0448-delayed'

## Get the `profile_id` corresponding to a timestamp
Use this scheme to create our "NVS section files" based on the section timestamps from OOI / Stuart.

In the function below, the test `is_last and b'is outside of the variable' in resp_read` tests for the case of the last section where the ending datetime passed to the function is well beyond the last datetime available in the dataset. In that case, ERDDAP returns the following error:
```
Error {
    code=404;
    message="Not Found: Your query produced no matching results. (precise_time>=2019-08-08T17:42:00Z is outside of the variable's actual_range: 2019-05-15T18:18:17Z to 2019-08-06T17:54:02Z)";
}
```

In [16]:
def get_timestamp_profile_id(erddatasetid,
                             datetime_iso8601,
                             interval_delta=20,
                             min_or_max='min',
                             is_last=False,
                             debug=False):
    """Return the ``profile_id`` found around a timestamp in an ERDDAP dataset.

    The module-level ERDDAP client ``e`` is reused and modified: its
    ``dataset_id``, ``variables`` and ``constraints`` are overwritten.

    Two queries are possible:

    1. Time-window query (when ``datetime_iso8601`` is given): request the
       distinct ``profile_id`` values whose ``precise_time`` lies within
       ``interval_delta`` minutes on either side of the timestamp, and return
       the smallest or largest of them according to ``min_or_max``.
    2. Last-profile query (only when ``is_last`` is true): used when
       ``datetime_iso8601`` is empty/None, or when the time-window response
       contains ERDDAP's "is outside of the variable" error text. It requests
       ``profile_id,wmo_id`` with ``orderByMax("wmo_id")`` and returns the
       ``profile_id`` of the single row that comes back.

    Parameters
    ----------
    erddatasetid : str
        ERDDAP dataset id to query.
    datetime_iso8601 : str or None
        Timestamp to look up. May be None only when ``is_last`` is true.
    interval_delta : int
        Half-width of the time window, in minutes.
    min_or_max : {'min', 'max'}
        Which ``profile_id`` to keep when the window contains several.
    is_last : bool
        True when the timestamp is the end of the last section.
    debug : bool
        If true, return the request URL (a str) instead of a profile id.
        In the time-window case the request has already been issued by then,
        because its response decides which of the two queries applies.

    Returns
    -------
    int or str
        The selected ``profile_id``, or the request URL when ``debug`` is true.

    Raises
    ------
    ValueError
        If ``min_or_max`` is not 'min' or 'max'; if ``datetime_iso8601`` is
        empty while ``is_last`` is false (there is nothing to query); or if
        the time-window response contains no ``profile_id`` values.
    """
    # Validate up front: previously these cases surfaced only at the end of
    # the function as an UnboundLocalError on ``profile_id``.
    if min_or_max not in ('min', 'max'):
        raise ValueError(
            f"min_or_max must be 'min' or 'max', got {min_or_max!r}"
        )
    if not datetime_iso8601 and not is_last:
        raise ValueError(
            "datetime_iso8601 is required unless is_last is True"
        )

    e.dataset_id = erddatasetid
    e.variables = ["profile_id"]

    if datetime_iso8601:
        dt = parse_time_string(datetime_iso8601)[0]
        # interval_delta is in minutes
        half_window = datetime.timedelta(minutes=interval_delta)

        # requested variables and constraints
        e.constraints = {
            "precise_time>=": dt - half_window,
            "precise_time<=": dt + half_window,
        }

        url = e.get_download_url(response='csv0') + '&distinct()'
        response = urlopen(url)
        resp_read = response.read()
        # For the last section, a timestamp beyond the end of the dataset makes
        # ERDDAP answer with an "is outside of the variable's actual_range"
        # error; in that case fall through to the last-profile query below.
        if not (is_last and b'is outside of the variable' in resp_read):
            if debug:
                return url
            profile_id_lst = [int(pid) for pid in resp_read.split()]
            if not profile_id_lst:
                raise ValueError(
                    f"no profile_id returned within {interval_delta} minutes "
                    f"of {datetime_iso8601} for dataset {erddatasetid}"
                )

            # Making an assumption here, for simplicity: when several profiles
            # fall in the window, simply keep the smallest or largest id.
            pick = min if min_or_max == 'min' else max
            return pick(profile_id_lst)

    # Only reachable when is_last is true: either datetime_iso8601 is None
    # (the section is ongoing, so datetime_end is null) or the timestamp lies
    # beyond the data available in ERDDAP.
    # Issue an ERDDAP query that extracts the largest profile_id.
    # These request arguments do the trick, returning two columns,
    # profile_id & wmo_id:
    # ?profile_id,wmo_id&orderByMax("wmo_id")
    e.variables = ["profile_id", "wmo_id"]
    e.constraints = None

    url = e.get_download_url(response='csv0') + '&orderByMax("wmo_id")'
    if debug:
        return url

    response = urlopen(url)
    resp_read = response.read()
    # Only one row will be returned
    return int(resp_read.strip().split(b',')[0])

### Tests

Use `min_or_max='max'` for the `get_timestamp_profile_id` call at a section start, and `min_or_max='min'` for the section end.

Each row of the NVS-style section file will look roughly like this:
```python
f"{start_profile_id} {end_profile_id} {section_direction_code}"

1 369 1
```

#### Section 1

In [17]:
# section = dep_sections[0]
# section_direction_code = direction_codes[section['direction']]

**NOTES from initial tests:** With this test, I'm getting two kinds of errors
1. Two (or more?) profile_id's are returned by `get_timestamp_profile_id` for the section start, if `interval_delta >= 30`
2. The section-end request to ERDDAP returns an error, even when `interval_delta` is as high as 79 minutes! It only starts returning a valid response when it's set to 80 minutes.

#### Section 2

In [18]:
# section = dep_sections[1]
# section_direction_code = direction_codes[section['direction']]

**NOTES from initial tests:** With this test, I'm getting two kinds of errors
1. The section-start request to ERDDAP returns an error, even when `interval_delta` is as high as 79 minutes! It only starts returning a valid response when it's set to `40` minutes
2. Two profile_id's (or more) are returned by `get_timestamp_profile_id` for the section end, if `interval_delta >= 34`. I settled on `20`.

Need to increment the start profile id of all rows after the first one by 1, so it's not the same as the end profile_id of the previous section. (In that case, maybe there's no need to issue a new ERDDAP query at all: just use the previous section's `end_profile_id` plus 1?)

### Process all sections

In [19]:
# Build one "start_profile_id end_profile_id direction_code" row per section.
end_profile_id = -1
section_file_rows = []
last_section_index = len(dep_sections) - 1
for i, section in enumerate(dep_sections):
    section_direction_code = direction_codes[section['direction']]

    # **** Skip the last section if it's an active deployment
    # The active deployment file is not yet (as of 4/3/2020) being updated
    # quickly enough and is likely to include more than one actual section
    # (glider turns). An ongoing section has a null datetime_end.
    if section['datetime_end'] is None:
        continue

    # Section Start
    starts_at_previous_end = (
        i > 0
        and section['datetime_start'] == dep_sections[i-1]['datetime_end']
    )
    if not starts_at_previous_end:
        # No contiguous previous section: look the start profile up in ERDDAP,
        # with a wider time window for the first section of the deployment.
        interval_delta = 40 if i != 0 else 80
        start_profile_id = get_timestamp_profile_id(
            gliderdac_erddap_id,
            section['datetime_start'],
            interval_delta=interval_delta,
            min_or_max='max',
        )
    else:
        # Contiguous with the previous section: start at the id right after
        # the previous section's end_profile_id, with no ERDDAP query.
        start_profile_id = end_profile_id + 1

    # Section End
    # 3/19/2020: a wider window (80 minutes) is used for the last section, due
    # to failure to catch the situation in the last section of
    # ce_319-20190125T2248-delayed
    is_last = (i == last_section_index)
    interval_delta = 80 if is_last else 40
    end_profile_id = get_timestamp_profile_id(
        gliderdac_erddap_id,
        section['datetime_end'],
        interval_delta=interval_delta,
        min_or_max='min',
        is_last=is_last,
    )

    row = f"{start_profile_id} {end_profile_id} {section_direction_code}"
    section_file_rows.append(row)
    print(row)

1 203 1
204 259 5
260 381 4
382 582 6
583 732 4
733 858 6
859 920 4
921 1048 1
1049 1119 5
1120 1185 4
1186 1298 1
1299 1364 5
1365 1560 4
1561 1800 6
1801 2010 4
2011 2293 6
2294 2357 4
2358 2486 1
2487 2550 5
2551 2837 4
2838 3071 6
3072 3290 4
3291 3492 6
3493 3732 1


In [20]:
section_file_rows

['1 203 1',
 '204 259 5',
 '260 381 4',
 '382 582 6',
 '583 732 4',
 '733 858 6',
 '859 920 4',
 '921 1048 1',
 '1049 1119 5',
 '1120 1185 4',
 '1186 1298 1',
 '1299 1364 5',
 '1365 1560 4',
 '1561 1800 6',
 '1801 2010 4',
 '2011 2293 6',
 '2294 2357 4',
 '2358 2486 1',
 '2487 2550 5',
 '2551 2837 4',
 '2838 3071 6',
 '3072 3290 4',
 '3291 3492 6',
 '3493 3732 1']

### Notes
The last `precise_time` entry for deployment `ce_319-20190125T2248-delayed` in ERDDAP is "2019-04-21T03:58:43Z", for `profile_id` 815. That's 36 minutes before the `datetime_end` on that section, "2019-04-21T04:34:00Z". But b/c `interval_delta` is 20, the ERDDAP request returns no results.

## Save this section info to a file and upload it to data.nanoos.org (nile)

In [21]:
# Derive the deployment code from the deployment URL: take the file name (the
# text after the last '/') and drop everything from its last underscore on,
# e.g. 'ce_320_00005_sections.json' -> 'ce_320_00005'.
sections_file_name = deployment['url'].split('/')[-1]
deployment_code = sections_file_name.rsplit('_', 1)[0]
deployment_code

'ce_320_00005'

In [24]:
# Write the section rows to the output text file, one row per line and with
# no newline after the final row.
section_file_text = '\n'.join(section_file_rows)
with open(f"{transect_code}_{deployment_code}_NVSsectionfile.txt", 'w') as fo:
    fo.write(section_file_text)