*Verified by Leila Belabbassi (to work with Pangeo), July 12, 2018*

In [1]:
import xarray as xr
import pandas as pd
import pickle as pk
import re
import requests
import os
import gc

In [2]:
# Scrape the THREDDS catalog page and build the list of OPeNDAP URLs, one per
# NetCDF data file in the requested OOI data set.
url = 'https://opendap.oceanobservatories.org/thredds/catalog/ooi/friedrich.knuth@rutgers.edu/20171207T161702-RS03CCAL-MJ03F-05-BOTPTA301-streamed-botpt_nano_sample/catalog.html'
tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'

response = requests.get(url)
# Fail loudly on an HTTP error instead of silently ending up with no datasets.
response.raise_for_status()
catalog_html = response.text

# Every link target on the catalog page (not used below; kept for inspection).
urls = re.findall(r'href=[\'"]?([^\'" >]+)', catalog_html)

# Candidate NetCDF paths. The dot is escaped so the lazy match can only stop
# at a literal ".nc" and not at any character followed by "nc".
candidates = re.findall(r'(ooi/.*?\.nc)', catalog_html)

# Keep only paths that end in ".nc" and whose character right before the
# extension is a digit. The list is built fresh rather than removing items
# from the list being iterated, which skips the element after each removal.
x = [path for path in candidates
     if path.endswith('.nc') and path[-4].isdecimal()]

# URLs always use forward slashes, so join explicitly instead of os.path.join.
datasets = ['/'.join((tds_url, path)) for path in x]

In [3]:
# Make the output directory that will hold the per-file minute means.
new_dir = 'minute_mean_data/'
# exist_ok=True makes this a no-op when the directory is already there.
# It still raises if the path cannot be created or is occupied by a regular
# file.
os.makedirs(new_dir, exist_ok=True)

In [4]:
# Read each dataset directly off THREDDS, reduce bottom_pressure to one-minute
# means, and write the result out as a pickled pandas DataFrame (one per file).
# NOTE: It takes about one hour to subsample 69499.81 Mbytes of data and write
# it out to a dataframe.
num = 0  # number of datasets processed so far
for i in datasets:
    # The context manager closes the remote dataset handle once the values
    # have been pulled into pandas, so handles do not pile up across files.
    with xr.open_dataset(i) as raw:
        # Index by time instead of the positional 'obs' dimension.
        ds = raw.swap_dims({'obs': 'time'})

        pressure_min = pd.DataFrame()
        # 'min' is the one-minute frequency alias (the older 'T' spelling is
        # deprecated in recent pandas releases).
        pressure_min['bottom_pressure'] = ds['bottom_pressure'].to_pandas().resample('min').mean()

    # Clear the index name; assigning None works where `del` raises.
    pressure_min.index.name = None

    # Minutes with no samples come out of the resample as NaN; drop them.
    pressure_min = pressure_min.dropna()

    # <output dir>/<source file name without ".nc">_resampled.pd
    out = 'minute_mean_data/' + i.split('/')[-1][:-3] + '_resampled' + '.pd'
    num = num + 1

    with open(out, 'wb') as fh:
        pk.dump(pressure_min, fh)

    gc.collect()

In [5]:
# Create a single file with all the pickled data.
# NOTE: pickle is only safe here because the inputs are the files written by
# this notebook itself; do not point this at pickles from an untrusted source.
frames = []
for path, subdirs, files in os.walk('minute_mean_data/'):
    # os.walk yields names in arbitrary order; sort so the combined frame is
    # assembled in the same order on every run.
    for name in sorted(files):
        # Skip anything that is not one of the '.pd' pickles (stray files
        # such as editor or OS metadata would fail to unpickle).
        if not name.endswith('.pd'):
            continue
        file_name = os.path.join(path, name)
        with open(file_name, 'rb') as f:
            frames.append(pk.load(f))

# Concatenate once at the end: DataFrame.append no longer exists in current
# pandas and re-copied the accumulated data on every call. pd.concat rejects
# an empty list, so fall back to an empty frame when nothing was found.
pressure_min = pd.concat(frames) if frames else pd.DataFrame()

with open('bottom_pressure.pd', 'wb') as f:
    pk.dump(pressure_min, f)