Merge pull request #223 from opendatacube/simple_replication
[Complete] Simple replication
andrewdhicks committed May 15, 2017
2 parents 3f40842 + 02fcdbc commit c490244
Showing 14 changed files with 345 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -37,7 +37,7 @@ jobs:
 conda env create -q -n agdc --file .travis/environment_py35.yaml
 source activate agdc
 conda install -q sphinx sphinx_rtd_theme mock click # Stuff for docs
-pip install .[analytics,test,interactive,docs] --no-deps --upgrade
+pip install . --no-deps --upgrade
 # - restore_cache:
 # key: projectname-{{ .Branch }}-{{ checksum "yarn.lock" }}
2 changes: 1 addition & 1 deletion .travis.yml
@@ -47,7 +47,7 @@ install:
 
 - conda env create -n agdc --file $CONDA_ENV_FILE
 - source activate agdc
-- pip install .[analytics,test,interactive] --no-deps --upgrade
+- pip install . --no-deps --upgrade
 
 - pip freeze
 
3 changes: 3 additions & 0 deletions .travis/environment_py27.yaml
@@ -24,6 +24,9 @@ dependencies:
 - pathlib
 - compliance-checker = 3.0.3
 - pygeoif = 0.6 # compliance-checker 3.0.3 fails with 0.7
+- paramiko # for simple-replicas
+- sshtunnel # for simple-replicas
+- tqdm # for simple-replicas
 - pip:
   - pypeg2
   - pytest-cov # testing
3 changes: 3 additions & 0 deletions .travis/environment_py35.yaml
@@ -25,6 +25,9 @@ dependencies:
 - pathlib
 - compliance-checker = 3.0.3
 - pygeoif = 0.6 # compliance-checker 3.0.3 fails with 0.7
+- paramiko # for simple-replicas
+- sshtunnel # for simple-replicas
+- tqdm # for simple-replicas
 - pip:
   - pypeg2
   - pytest-cov # testing
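
The three dependencies added to both environments map directly onto the transport layer of the new replication tool below: paramiko provides the SSH/SFTP connection, sshtunnel forwards the remote PostgreSQL port to a local one, and tqdm draws progress bars. A minimal sketch of how they combine, using placeholder host, user, and paths (none of these values are part of this change):

```python
from paramiko import SSHClient, WarningPolicy
from sshtunnel import SSHTunnelForwarder
from tqdm import tqdm

HOST, USER = 'remote.example.com', 'your_username'  # placeholders

# paramiko: authenticated SSH session plus SFTP for file transfer
client = SSHClient()
client.load_system_host_keys()
client.set_missing_host_key_policy(WarningPolicy())
client.connect(hostname=HOST, username=USER)
sftp = client.open_sftp()

# sshtunnel: make the remote database reachable on a local port
tunnel = SSHTunnelForwarder(HOST, ssh_username=USER,
                            remote_bind_address=('127.0.0.1', 5432))
tunnel.start()
print('Remote DB forwarded to local port', tunnel.local_bind_port)

# tqdm: wrap any iterable to report progress while downloading
for path in tqdm(['/remote/a.nc', '/remote/b.nc'], 'Downloading'):
    sftp.get(path, path.rsplit('/', 1)[-1])

tunnel.stop()
client.close()
```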
4 changes: 2 additions & 2 deletions datacube/scripts/metadata_type.py
@@ -1,8 +1,8 @@
 from __future__ import absolute_import, print_function
 
+import json
 import logging
 from pathlib import Path
-from pprint import pprint
 
 import click
 from click import echo
@@ -106,7 +106,7 @@ def show_metadata_type(index, metadata_type_name, verbose):
     print(metadata_type_obj.description)
     print('Search fields: %s' % ', '.join(sorted(metadata_type_obj.dataset_fields.keys())))
     if verbose:
-        pprint(metadata_type_obj.definition, width=100)
+        echo(json.dumps(metadata_type_obj.definition, indent=4))
 
 
 @metadata_type.command('list')
4 changes: 2 additions & 2 deletions datacube/scripts/product.py
@@ -1,8 +1,8 @@
 from __future__ import absolute_import
 
+import json
 import logging
 from pathlib import Path
-from pprint import pprint
 
 import click
 from click import echo
@@ -131,4 +131,4 @@ def show_product(index, product_name):
     Show details about a product in the index
     """
     product_def = index.products.get_by_name(product_name)
-    pprint(product_def.definition)
+    click.echo_via_pager(json.dumps(product_def.definition, indent=4))
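
Both CLI changes swap pprint for JSON output, so definitions print as valid JSON that can be piped into other tools or parsed back with `json.loads`. A small illustration of the difference, using a hypothetical cut-down definition dict (not taken from a real index):

```python
import json
from pprint import pprint

definition = {'name': 'ls5_pq_albers', 'metadata_type': 'eo'}  # hypothetical

# pprint emits a Python literal: single quotes, so not valid JSON
pprint(definition, width=100)
# {'metadata_type': 'eo', 'name': 'ls5_pq_albers'}

# json.dumps emits real JSON, which round-trips cleanly
print(json.dumps(definition, indent=4))
```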
201 changes: 201 additions & 0 deletions datacube_apps/simple_replica.py
@@ -0,0 +1,201 @@
#!/usr/bin/env python
"""
A Simple Data Cube Replication Tool

Connects to a remote Data Cube via SSH, and downloads database records and files to a local file system and database.

Provide a configuration file in ~/.datacube.replication.conf in YAML format, or specify an alternate location
on the command line.

For example, the following config will download 3 PQ products for the specified time and space range. Queries
are specified in the same way as when using the API to search for datasets.

.. code-block:: yaml

    remote_host: raijin.nci.org.au
    remote_user: dra547
    db_password: xxxxxxxxxxxx
    remote_dir: /g/data/
    local_dir: C:/datacube/

    replicated_data:
    - product: ls5_pq_albers
      crs: EPSG:3577
      x: [1200000, 1300000]
      y: [-4200000, -4300000]
      time: [2008-01-01, 2010-01-01]

    - product: ls7_pq_albers
      crs: EPSG:3577
      x: [1200000, 1300000]
      y: [-4200000, -4300000]
      time: [2008-01-01, 2010-01-01]

    - product: ls8_pq_albers
      crs: EPSG:3577
      x: [1200000, 1300000]
      y: [-4200000, -4300000]
      time: [2008-01-01, 2010-01-01]
"""

import logging
import os.path
from configparser import ConfigParser
from pathlib import Path

import click
import yaml
from paramiko import SSHClient, WarningPolicy
from sshtunnel import SSHTunnelForwarder
from tqdm import tqdm

from datacube import Datacube
from datacube.config import LocalConfig, _DEFAULT_CONF
from datacube.index import index_connect
from datacube.ui.click import global_cli_options

LOG = logging.getLogger('simple_replicator')

DEFAULT_REPLICATION_CONFIG = os.path.expanduser('~/.datacube.replication.conf')


def uri_to_path(uri):
    return uri.replace('file://', '')


class DatacubeReplicator(object):
    def __init__(self, config):
        self.remote_host = config['remote_host']
        self.remote_user = config['remote_user']
        self.db_password = config['db_password']
        self.remote_dir = config['remote_dir']
        self.local_dir = config['local_dir']
        self.replication_defns = config['replicated_data']

        self.client = None
        self.sftp = None
        self.tunnel = None
        self.remote_dc_config = None
        self.remote_dc = None
        self.local_index = index_connect()

    def run(self):
        self.connect()
        self.read_remote_config()
        self.connect_to_db()
        self.replicate_all()
        self.disconnect()

    def connect(self):
        client = SSHClient()
        client.load_system_host_keys()
        client.set_missing_host_key_policy(WarningPolicy())
        client.connect(hostname=self.remote_host, username=self.remote_user)

        LOG.debug(client)
        self.client = client
        self.sftp = client.open_sftp()

    def disconnect(self):
        self.client.close()
        self.tunnel.stop()

    def read_remote_config(self):
        remote_config = ConfigParser()
        remote_config.read_string(_DEFAULT_CONF)
        with self.sftp.open('.datacube.conf') as fin:
            remote_config.read_file(fin)
        self.remote_dc_config = LocalConfig(remote_config)

    def connect_to_db(self):
        self.tunnel = SSHTunnelForwarder(
            self.remote_host,
            ssh_username=self.remote_user,
            remote_bind_address=(self.remote_dc_config.db_hostname, int(self.remote_dc_config.db_port)))
        self.tunnel.start()

        # pylint: disable=protected-access
        self.remote_dc_config._config['datacube']['db_hostname'] = '127.0.0.1'
        self.remote_dc_config._config['datacube']['db_port'] = str(self.tunnel.local_bind_port)
        self.remote_dc_config._config['datacube']['db_username'] = self.remote_user
        self.remote_dc_config._config['datacube']['db_password'] = self.db_password

        # This requires the password from somewhere
        # Parsing it out of .pgpass sounds error-prone and fragile
        # Let's put it in the configuration for now
        LOG.debug('Remote configuration loaded %s', self.remote_dc_config)

        self.remote_dc = Datacube(config=self.remote_dc_config)

    def replicate_all(self):

        for defn in tqdm(self.replication_defns, 'Replicating products'):
            self.replicate(defn)

    def replicate_all_products(self):
        products = self.remote_dc.index.products.get_all()
        for product in products:
            self.local_index.products.add(product)

    def replicate(self, defn):
        datasets = list(self.remote_dc.find_datasets(**defn))

        if not datasets:
            LOG.info('No remote datasets found matching %s', defn)
            return

        # TODO: use generator not list
        product = datasets[0].type
        LOG.info('Ensuring remote product is in local index. %s', product)

        self.local_index.products.add(product)

        for dataset in tqdm(datasets, 'Datasets'):
            # dataset = remote_dc.index.datasets.get(dataset.id, include_sources=True)
            # We would need to pull the parent products down too
            # TODO: Include parent source datasets + product definitions
            dataset.sources = {}

            LOG.debug('Replicating dataset %s', dataset)
            remote_path = uri_to_path(dataset.local_uri)
            local_path = self.remote_to_local(uri_to_path(dataset.local_uri))

            # Ensure local path exists
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)

            # Download file
            self.sftp.get(remote_path, local_path)

            # Add to local index
            dataset.local_uri = 'file://' + local_path
            self.local_index.datasets.add(dataset)
            LOG.debug('Downloaded to %s', local_path)

    def remote_to_local(self, remote):
        return remote.replace(self.remote_dir, self.local_dir)


def replicate_data(config):
    replicator = DatacubeReplicator(config)
    replicator.run()


@click.command()
@click.argument('config_path', required=False)
@global_cli_options
def replicate(config_path):
    """
    Connect to a remote Datacube, and replicate data locally.
    """
    if config_path is None:
        config_path = DEFAULT_REPLICATION_CONFIG
    LOG.debug('Config path: %s', config_path)
    with open(config_path) as fin:
        config = yaml.load(fin)

    replicate_data(config)


if __name__ == '__main__':
    replicate()
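
Beyond the `datacube-simple-replica` entry point registered in docs/conf.py below, the module can be driven programmatically through `replicate_data`. A minimal sketch, assuming a reachable remote host, valid SSH credentials, and a configured local index (host, user, and paths are placeholders):

```python
from datacube_apps.simple_replica import replicate_data

config = {
    'remote_host': 'remote.example.com',   # placeholder host
    'remote_user': 'your_username',        # placeholder user
    'db_password': 'xxxxxxxxxxxx',
    'remote_dir': '/g/data/',
    'local_dir': '/home/user/datacube/',
    'replicated_data': [
        {'product': 'ls5_pq_albers',
         'crs': 'EPSG:3577',
         'x': [1200000, 1300000],
         'y': [-4200000, -4300000],
         'time': ['2008-01-01', '2010-01-01']},
    ],
}

replicate_data(config)  # connects via SSH, downloads the files, indexes them locally
```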
4 changes: 4 additions & 0 deletions docs/Makefile
@@ -175,3 +175,7 @@ pseudoxml:
 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
 	@echo
 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+livehtml:
+	sphinx-autobuild -b html -p 8123 --ignore "*_tmp_*" --ignore "*_old_*" $(ALLSPHINXOPTS) $(BUILDDIR)/html
+
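Note that the new livehtml target depends on sphinx-autobuild, which is not in the environment files above and presumably needs installing separately; `make livehtml` then rebuilds the docs on change and serves them on port 8123 (the `-p 8123` flag), skipping transient `*_tmp_*` and `*_old_*` files.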
7 changes: 7 additions & 0 deletions docs/_templates/layout.html
@@ -0,0 +1,7 @@
{# Fix to allow changing the syntax highlighting colour scheme.
See https://github.com/rtfd/sphinx_rtd_theme/issues/166 for more info #}
{# layout.html #}
{# Import the theme's layout. #}
{% extends "!layout.html" %}

{% set css_files = css_files + ['_static/pygments.css'] %}
5 changes: 3 additions & 2 deletions docs/conf.py
@@ -134,7 +134,8 @@
 }
 
 click_utils_commands = {'datacube-search': 'datacube.scripts.search_tool:cli',
-                        'datacube': 'datacube.scripts.cli_app:cli'}
+                        'datacube': 'datacube.scripts.cli_app:cli',
+                        'datacube-simple-replica': 'datacube_apps.simple_replica:replicate'}
 
 graphviz_output_format = 'svg'
 
@@ -193,7 +194,7 @@
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
 #html_sidebars = {}
1 change: 1 addition & 0 deletions docs/index.rst
@@ -29,6 +29,7 @@ and related data from multiple satellite and other acquisition systems.
    ops/config
    ops/prepare_scripts
    ops/tools
+   ops/replication
 
 .. toctree::
    :maxdepth: 2
37 changes: 37 additions & 0 deletions docs/make.bat
@@ -0,0 +1,37 @@
@ECHO OFF

pushd %~dp0

activate agdc
REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=FooBar

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd