In [1]:
import h5py
import zarr
import numpy as np
import time
import os
import glob
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

In [None]:
# Directory holding the raw HDF5 files; only the December 2022 files are selected.
rootpath = '/data/data3/seadasn_2022-10-07_2023-01-13/'
# os.path.join avoids the doubled slash that plain concatenation produces,
# because rootpath already ends in '/'.
flist = sorted(glob.glob(os.path.join(rootpath, "seadasn_2022-12-*")))

# S3-compatible endpoints the archive can be written to.
S3_ENDPOINTS = {
    "aws": "https://s3.us-west-2.amazonaws.com",              # AWS S3
    "pnwstore1": "http://pnwstore1.ess.washington.edu:9000",  # PNWstore1
}
# Open exactly one target. mode='a' creates the store when it is missing, so
# opening a target that is not going to be written to is not side-effect free.
STORE_TARGET = "pnwstore1"

# Credentials come from the environment so they never end up in the notebook
# or its saved outputs. A missing variable raises KeyError before any request
# is made.
z = zarr.open("s3://seadas-december-2022/",
              storage_options={
                  "key": os.environ["AWS_ACCESS_KEY_ID"],
                  "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
                  "client_kwargs": {
                      "endpoint_url": S3_ENDPOINTS[STORE_TARGET]
                  }
              }, mode='a')

In [None]:
# Define the RawData array and attach its descriptive metadata.
N_CHANNELS = 2089
SAMPLE_RATE_HZ = 100
# One sample slot per 1/SAMPLE_RATE_HZ seconds across 31 days.
N_SAMPLES = SAMPLE_RATE_HZ * 60 * 60 * 24 * 31

# The store is persistent, so the array may already exist when this cell is
# re-run; reuse it in that case instead of failing on creation.
if "RawData" in z:
    zrawdata = z["RawData"]
else:
    # A chunk covers 50 channels x 6000 samples (60 s at 100 Hz).
    zrawdata = z.create_dataset("RawData", shape=(N_CHANNELS, N_SAMPLES), chunks=(50, 6000),
                                dtype='float32', fill_value=np.nan)
assert zrawdata.shape == (N_CHANNELS, N_SAMPLES), "existing RawData array has an unexpected shape"

# Collect every attribute first and write them with a single update() call:
# item-by-item assignment writes the attribute document to the store each time.
metadata = {
    '_ARRAY_DIMENSIONS': ['channel', 'time'],

    'overview.location': "Seattle, WA",
    'overview.deployment_type': 'temporary',
    'overview.network': "N/A",
    'overview.site_name': "SeaDAS North",
    'overview.number_of_interrogators': 1,
    'overview.principle_investigators': "University of Washington",
    'overview.start_datetime': "2022-12-01",
    'overview.end_datetime': "2022-12-31",
    'overview.purpose_of_collection': "experiment",
    'overview.collection_mode': "continuous",
    'overview.comment': "N/A",

    'cable_and_fiber.cable_fiber_id': "N/A",
    'cable_and_fiber.cable_start_time': "N/A",
    'cable_and_fiber.cable_end_time': "N/A",
    'cable_and_fiber.cable_characteristics': "N/A",
    'cable_and_fiber.cable_environment': "N/A",
    'cable_and_fiber.cable_model': "N/A",
    'cable_and_fiber.cable_diameter': "N/A",
    'cable_and_fiber.cable_coordinates': "N/A",
    'cable_and_fiber.cable_connector_coordinates': "N/A",
    'cable_and_fiber.fiber_mode': "continuous",
    'cable_and_fiber.fiber_refraction_index': "N/A",
    'cable_and_fiber.attenuation': "N/A",
    'cable_and_fiber.fiber_geometry': "N/A",
    'cable_and_fiber.winding_angle': "N/A",
    'cable_and_fiber.fiber_start_location': "N/A",
    'cable_and_fiber.fiber_end_location': "N/A",
    'cable_and_fiber.fiber_length': "N/A",
    'cable_and_fiber.comment': "N/A",

    'interrogator.interrogator_id': "N/A",
    'interrogator.manufacturer': "N/A",
    'interrogator.model': "N/A",
    'interrogator.unit_of_measure': "N/A",
    'interrogator.comment': "N/A",

    'acquisition.acquisition_id': "N/A",
    # Start/end strings carry no UTC offset; downstream code parses the start
    # time to locate sample 0 of the time axis.
    'acquisition.acquisition_start_time': "2022-12-01T00:00:00.000000",
    'acquisition.acquisition_end_time': "2022-12-31T23:59:59.990000",
    'acquisition.acquisition_sample_rate': SAMPLE_RATE_HZ,
    'acquisition.pulse_repetition_rate': "N/A",
    'acquisition.interrogator_rate': "N/A",
    'acquisition.pulse_width': "N/A",
    'acquisition.gauge_length': "N/A",
    'acquisition.number_of_channels': "N/A",
    'acquisition.channel_spacing': "N/A",
    'acquisition.archived_sample_rate': "N/A",
    'acquisition.unit_of_measure': "N/A",
    'acquisition.decimation': "N/A",
    'acquisition.filtering': "N/A",
    'acquisition.comment': "N/A",
}
zrawdata.attrs.update(metadata)

In [None]:
# Copy each HDF5 file into its time slot of the Zarr array.
#
# t0 is the POSIX timestamp of sample 0. The attribute string has no UTC
# offset, so .timestamp() interprets it in this machine's local time zone.
# NOTE(review): together with the fixed 28800 s (8 h) shift applied to the file
# times below, this looks like it assumes the notebook runs on a UTC-8 host;
# confirm before running elsewhere.
t0 = datetime.fromisoformat(zrawdata.attrs['acquisition.acquisition_start_time']).timestamp()

for fpath in tqdm(flist):
    try:
        # The context manager closes the file on every path. If opening fails
        # there is no handle to close, which an explicit close() in a
        # `finally` clause would trip over.
        with h5py.File(fpath, 'r') as f:
            raw_time = f['/Acquisition/Raw[0]/RawDataTime']
            raw_data = f['/Acquisition/Raw[0]/RawData']

            # First/last sample times, truncated to 0.01 s and shifted by 8 h.
            # Presumably RawDataTime is in microseconds since the epoch (the
            # two divisions amount to /1e6); verify against the file format.
            tsp_start = int(raw_time[0]/10000)/1e2 + 28800
            tsp_end = int(raw_time[-1]/10000)/1e2 + 28800

            # round(), not int(): values such as 59.99 are not exactly
            # representable in binary floating point, so the quotient can land
            # just below the intended integer and truncation would yield an
            # index that is one too small.
            ind_start = round((tsp_start - t0)/0.01)
            ind_end = round((tsp_end - t0)/0.01)

            # Reject files outside the array's time axis; a negative index
            # would otherwise be interpreted relative to the end of the axis.
            assert 0 <= ind_start and ind_end < zrawdata.shape[1], \
                f"sample range {ind_start}..{ind_end} is outside the time axis"
            # Each file must cover exactly one aligned block of 6000 samples.
            assert (ind_start % 6000, ind_end % 6000) == (0, 5999), \
                f"sample range {ind_start}..{ind_end} is not aligned to a 6000-sample block"
            assert raw_data.shape == (6000, 2089), \
                f"unexpected RawData shape {raw_data.shape}"

            # Files are (time, channel); the archive is (channel, time).
            # Only the first 1100 channels are archived.
            zrawdata[:1100, ind_start : ind_end + 1] = raw_data[:, :1100].T

    except Exception as exc:
        # Best effort: report the file together with the reason and carry on.
        # KeyboardInterrupt is deliberately not caught, so the loop can still
        # be stopped by hand.
        print(f"{fpath}: {type(exc).__name__}: {exc}")