Copied from https://github.com/martindurant/intake-release-blog/blob/master/data_engineer.ipynb

# Data Engineering with Intake

Intake provides a way to separate the definition of datasets from the code that loads and processes them. Here we show how you might go about building entries in a YAML catalog file.


In [1]:
import intake
# intake.output_notebook()



In [2]:
# Show the names of every data-source driver currently registered with intake.
[*intake.registry]

['yaml_file_cat',
 'yaml_files_cat',
 'remote-xarray',
 'esm_metadatastore',
 'thredds_cat',
 'xarray_image',
 'netcdf',
 'opendap',
 'rasterio',
 'zarr',
 'csv',
 'textfiles',
 'catalog',
 'intake_remote',
 'numpy',
 'ndzarr']

In [11]:
# Fetch the list of candidate files from the AODN GeoServer via a WFS
# GetFeature request on the `moorings_all_map` layer, returned as CSV.
# The URL-encoded CQL filter decodes to:
#   (file_version='1' AND realtime=FALSE AND site_code='NRSROT')
import pandas as pd

# Adjacent string literals are concatenated by Python into a single URL.
url = (
    "http://geoserver-123.aodn.org.au/geoserver/ows"
    "?typeName=moorings_all_map"
    "&SERVICE=WFS"
    "&REQUEST=GetFeature"
    "&VERSION=1.0.0"
    "&outputFormat=csv"
    "&CQL_FILTER=(file_version=%271%27%20AND%20realtime=FALSE%20AND%20site_code=%27NRSROT%27)"
)
df = pd.read_csv(url)

In [12]:
# Preview the first five rows of the WFS response.
df.head(5)

Unnamed: 0,FID,file_id,url,date_created,date_published,date_updated,size,feature_type,file_version,toolbox_version,...,has_air_temperature,has_salinity,has_water_pressure,has_air_pressure,has_sea_water_velocity,has_oxygen,has_chlorophyll,has_fluorescence,has_wave_parameters,geom
0,moorings_all_map.fid--5dc4962a_16ab8e09aa3_5357,42395,IMOS/ANMN/NRS/NRSROT/Biogeochem_profiles/IMOS_...,2016-07-25T08:34:44,2016-07-25T08:51:28.685,2016-07-25T08:51:28.696,113990,profile,1,2.5.12 - PCWIN,...,False,True,True,False,False,True,False,False,False,POINT (115.4166666667 -32.0000166667)
1,moorings_all_map.fid--5dc4962a_16ab8e09aa3_5358,55419,IMOS/ANMN/NRS/NRSROT/Biogeochem_profiles/IMOS_...,2017-04-05T00:24:30,2017-04-05T00:26:30.006,2017-04-05T00:26:30.011,113742,profile,1,2.5.24 - PCWIN64,...,False,True,True,False,False,True,False,False,False,POINT (115.4166666667 -32.0000166667)
2,moorings_all_map.fid--5dc4962a_16ab8e09aa3_5359,56339,IMOS/ANMN/NRS/NRSROT/Biogeochem_profiles/IMOS_...,2017-05-04T23:34:16,2017-05-05T00:35:37.463,2017-05-05T00:35:37.486,113742,profile,1,2.5.25 - PCWIN64,...,False,True,True,False,False,True,False,False,False,POINT (115.4166666667 -32.0000166667)
3,moorings_all_map.fid--5dc4962a_16ab8e09aa3_535a,82699,IMOS/ANMN/NRS/NRSROT/Biogeochem_profiles/IMOS_...,2019-03-22T06:39:05,2019-03-25T03:33:07.524,2019-03-25T03:33:07.551,101926,profile,1,2.5.42 - PCWIN64,...,False,True,True,False,False,False,False,False,False,POINT (115.4 -32)
4,moorings_all_map.fid--5dc4962a_16ab8e09aa3_535b,82703,IMOS/ANMN/NRS/NRSROT/Biogeochem_profiles/IMOS_...,2019-03-22T07:34:32,2019-03-25T03:33:44.999,2019-03-25T03:33:45.03,101912,profile,1,2.5.42 - PCWIN64,...,False,True,True,False,False,False,False,False,False,POINT (115.4 -32)


In [23]:
# Keep only the rows describing version-1 temperature files.
is_temperature = df['data_category'] == 'Temperature'
is_version_1 = df['file_version'] == 1
tempfiles = df.loc[is_temperature & is_version_1]
len(tempfiles)

75

In [29]:
# Narrow down to files updated since the start of 2019.
# `date_updated` is read from the CSV without date parsing, so this is a
# plain string comparison; it orders correctly because the values are
# ISO-8601 timestamps (see the preview of `df` above).
updated_since_2019 = tempfiles['date_updated'].gt('2019-01-01')
tempfiles_latest = tempfiles.loc[updated_since_2019]
len(tempfiles_latest)

3

In [38]:
# Build the path of each selected file by prefixing its relative URL
# with the `imos-data/` location.
testfiles = []
for relative_url in tempfiles_latest.url:
    testfiles.append('imos-data/' + relative_url)
testfiles

['imos-data/IMOS/ANMN/NRS/NRSROT/Temperature/IMOS_ANMN-NRS_TZ_20180816T080000Z_NRSROT_FV01_NRSROT-1808-SBE39-27_END-20181214T034000Z_C-20190402T065832Z.nc',
 'imos-data/IMOS/ANMN/NRS/NRSROT/Temperature/IMOS_ANMN-NRS_TZ_20180816T080000Z_NRSROT_FV01_NRSROT-1808-SBE39-33_END-20181214T032000Z_C-20190402T065833Z.nc',
 'imos-data/IMOS/ANMN/NRS/NRSROT/Temperature/IMOS_ANMN-NRS_TZ_20180816T080000Z_NRSROT_FV01_NRSROT-1808-SBE39-43_END-20181214T030000Z_C-20190402T065833Z.nc']

In [47]:
# First attempt at opening the temperature data as an intake NetCDF source.
# NOTE: this is not a single file -- the glob below matches every path whose
# name contains `_C-2019`, which includes all three entries of `testfiles`,
# and `concat_dim='TIME'` is passed so they can be combined along TIME.
temperature_glob = 'imos-data/IMOS/ANMN/NRS/NRSROT/Temperature/IMOS_ANMN-NRS_TZ_*_C-2019*.nc'
source = intake.open_netcdf(temperature_glob, concat_dim='TIME')

# Inspect the source's schema and metadata; the actual load is left
# commented out below.
source.discover()
# d = source.read()

{'datashape': None,
 'dtype': None,
 'shape': None,
 'npartitions': None,
 'metadata': {'dims': {'TIME': 51759},
  'data_vars': {'TIMESERIES': ['LATITUDE',
    'LONGITUDE',
    'TIME',
    'NOMINAL_DEPTH'],
   'TEMP': ['LATITUDE', 'LONGITUDE', 'TIME', 'NOMINAL_DEPTH'],
   'TEMP_quality_control': ['LATITUDE', 'LONGITUDE', 'TIME', 'NOMINAL_DEPTH'],
   'DEPTH': ['LATITUDE', 'LONGITUDE', 'TIME', 'NOMINAL_DEPTH'],
   'DEPTH_quality_control': ['LATITUDE',
    'LONGITUDE',
    'TIME',
    'NOMINAL_DEPTH']},
  'coords': ('LATITUDE', 'LONGITUDE', 'TIME', 'NOMINAL_DEPTH'),
  'abstract': 'NRSROT Rottnest Island, WA, 60m mooring, Aug2018 - Dec2018. Preprocessed with DepthPP.',
  'acknowledgement': 'Any users of IMOS data are required to clearly acknowledge the source of the material derived from IMOS in the format: "Data was sourced from the Integrated Marine Observing System (IMOS) - IMOS is a national collaborative research infrastructure, supported by the Australian Government." If relevant, al

In [None]:
# Render the source definition as YAML text and print it, so that it can be
# pasted into a catalog file.
catalog_entry_yaml = source.yaml()
print(catalog_entry_yaml)