In [1]:
# default_exp core

In [2]:
#hide
from nbdev.showdoc import show_doc

# Core

> Core tools for working with storage.

In [3]:
#export
from abc import ABC,abstractmethod
from configparser import ConfigParser
from pathlib import Path
import azure.storage.blob,azure.core.exceptions
import boto3
import zlib,shutil,re,json,importlib,hashlib,datetime
from typing import List,Tuple,Optional,Union

In [4]:
#hide
from fastcore.test import *
from configparser import SectionProxy

In [5]:
#export
def read_config(section_name:str=None,config_name:str='secrets/settings.ini'):
    """Read the INI file `config_name`.

    Returns the `ConfigParser` itself when `section_name` is `None`, otherwise a plain
    `dict` built from the items of `section_name`.
    Raises `Exception` if `section_name` is given but not present in the parsed config.
    """
    config_path=Path(config_name)
    parser=ConfigParser()
    parser.read(config_path)
    if section_name is not None:
        if section_name in parser:
            return dict(parser.items(section_name))
        raise Exception(f'Error: [{section_name}] section not found in {config_path}')
    return parser

If `section_name` is not specified, this function will return the `ConfigParser` used to read the INI file.

In [6]:
# without a section name we get the parser back (read from the default config file)
assert isinstance(read_config(),ConfigParser)
assert isinstance(read_config()['DEFAULT'],SectionProxy)
# with a section name we get a plain dict
assert isinstance(read_config('DEFAULT'),dict)
# values can be read from a named section of a specific INI file
test_eq(read_config('local_cwd',config_name='test/settings.ini')['storage_client'],
        'storage_tools.core.LocalStorageClient')

In [7]:
#export
def parse_dataset_archive_name(name:str) -> Optional[Tuple[str,...]]:
    "Split `name` into (name,version) when it has the form `<name>.<n>.<n>.<n>.zip`; return `None` when it does not"
    found = re.match(r'^([\./\s\w-]+)\.(\d+\.\d+\.\d+)\.zip$',name)
    if found is None:
        return None
    return found.group(1,2)

In [8]:
# valid archive names: the name part may contain dots and path separators
test_eq(('dsetname', '0.0.1'), parse_dataset_archive_name('dsetname.0.0.1.zip'))
test_eq(('dsetname.txt', '0.2.1'), parse_dataset_archive_name('dsetname.txt.0.2.1.zip'))
test_eq(('path/to/dsetname', '0.0.1'), parse_dataset_archive_name('path/to/dsetname.0.0.1.zip'))
test_eq(('//path/to/dsetname', '0.0.1'), parse_dataset_archive_name('//path/to/dsetname.0.0.1.zip'))
# invalid: wrong extension, incomplete or non-numeric version, missing name or missing extension
test_eq(None, parse_dataset_archive_name('dsetname.0.0.1.csv'))
test_eq(None, parse_dataset_archive_name('dsetname.0.1.zip'))
test_eq(None, parse_dataset_archive_name('dsetname.0.a.1.csv'))
test_eq(None, parse_dataset_archive_name('.0.0.1.zip'))
test_eq(None, parse_dataset_archive_name('0.0.1.csv'))
test_eq(None, parse_dataset_archive_name('dsetname.0.0.1'))

In [9]:
#export 
def parse_dataset_archive_version(version:str) -> List[int]:
    "Convert a `major.minor.patch` string to `[major,minor,patch]` ints, raising `ValueError` if `version` is not valid"
    parsed = re.match(r'^(\d+)\.(\d+)\.(\d+)$',version)
    if parsed is not None:
        return list(map(int,parsed.group(1,2,3)))
    raise ValueError(f'Invalid version: {version}')

In [10]:
# valid versions are returned as a list of ints
test_eq([0,1,2],parse_dataset_archive_version('0.1.2'))
test_eq([5,4,3],parse_dataset_archive_version('5.4.3'))
# a trailing dot or a missing part is rejected
test_fail(lambda: parse_dataset_archive_version('0.1.2.'))
test_fail(lambda: parse_dataset_archive_version('0.1'))

In [11]:
#export
def next_version(versions:List[str]=None,increment:str='patch'):
    """Return the version that should follow the last version in `versions`

    - `versions`: existing versions; only the last item is used. `None` or an empty
      list means there are no versions yet, so we start from `0.0.0`
    - `increment`: 'patch', 'minor' or 'major'; the parts below the one incremented are reset to 0

    Raises `ValueError` if `increment` is unknown or the last version is not valid.
    """
    # `if versions` covers both None and [] - indexing [-1] on an empty list would raise IndexError
    v=parse_dataset_archive_version(versions[-1]) if versions else [0,0,0]
    if increment=='patch': v[2]+=1
    elif increment=='minor': v[1]+=1;v[2]=0
    elif increment=='major': v[0]+=1;v[1]=0;v[2]=0
    else: raise ValueError(f'Unknown increment: {increment}')
    return f'{v[0]}.{v[1]}.{v[2]}'

In [12]:
# with no existing versions we count up from 0.0.0; otherwise from the last version in the list
test_eq('0.0.1',next_version(None))
test_eq('33.55.67',next_version(['2.4.60','33.55.66']))
test_eq('0.1.0',next_version(None,'minor'))
test_eq('1.0.0',next_version(None,'major'))
test_eq('3.0.0',next_version(['2.4.60'],'major'))
# unknown increments and invalid versions are rejected
test_fail(lambda: next_version(None,'beta'))
test_fail(lambda: next_version(['2.4.60','33.55.66a']))

In [13]:
#export
def sha256(file:Union[Path,str]) -> str:
    """Return the secure hash (as a hex digest) of the specified file

    The file is hashed in fixed-size chunks so that large dataset files do not have
    to fit in memory.
    """
    m = hashlib.sha256()
    with open(file,'rb') as f:
        # read() returns b'' at end of file, which stops the iterator
        for chunk in iter(lambda: f.read(1024*1024), b''):
            m.update(chunk)
    return m.hexdigest()

In [14]:
# the same file gives the same digest whether it is named with a str or a Path
test_eq(sha256('test/test_data.csv'),sha256(Path('test/test_data.csv')))

In [15]:
#export
def _get_manifest(archive_folder):
    "Returns (archive folder path, manifest file path, manifest as dict) - the dict is empty if there is no manifest.json"
    folder=Path(archive_folder)
    manifest_path=folder/'manifest.json'
    manifest={}
    if manifest_path.is_file():
        with open(manifest_path) as fh:
            manifest=json.load(fh)
    return folder,manifest_path,manifest

def make_or_update_manifest(archive_folder:Union[Path,str]):
    """Create or update a manifest in `archive_folder`

    The manifest (`manifest.json`) records the time it was written (`datetime`) and, for
    every file found under `archive_folder`, its path relative to the folder (using `/`
    separators) and its sha256 (`files`). Other keys of an existing manifest are kept.
    """
    p,mf,m=_get_manifest(archive_folder)
    m['datetime']=datetime.datetime.utcnow().isoformat()
    # the manifest itself must not be listed: it is re-written below, so any hash recorded
    # for it would be stale and `check_archive` would fail after an update.
    # sorted so that the manifest content does not depend on directory listing order
    files=sorted(f for f in p.rglob('*') if f.is_file() and f!=mf)
    m['files']=[dict(file=f.relative_to(p).as_posix(),sha256=sha256(f)) for f in files]
    with open(mf,'w') as f: json.dump(m,f,indent=2,sort_keys=True)
        
def check_archive(archive_folder:Union[Path,str]):
    "Verify every file listed in manifest.json against its recorded sha256, raising `ValueError` on the first mismatch"
    folder,_,manifest=_get_manifest(archive_folder)
    for entry in manifest['files']:
        expected=entry['sha256']
        actual=sha256(folder/entry['file'])
        if actual==expected:
            continue
        raise ValueError(f"sha mismatch for {entry['file']}. Expected {expected} but found {actual}")

In [16]:
#export
def make_dataset_archive_folder(
        path:str, name:str, versions:List[str]=None, version:str='patch') -> str:
    """Create a new dataset archive folder in `path`

    `name` is the file or folder (inside `path`) to archive. `version` is either an
    increment ('major', 'minor' or 'patch', applied to `versions`) or a version literal.
    The source is copied to `[path]/[name].[version]`, a manifest is written and the
    folder location is returned as a string.
    """
    root=Path(path)
    source=root/name
    if not source.exists():
        raise FileNotFoundError(f'{source} not found')

    if version not in ['major','minor','patch']:
        # a literal version: only validate it (raises ValueError if invalid)
        parse_dataset_archive_version(version)
    else:
        version=next_version(versions,version)

    target=root/f'{name}.{version}'
    if target.exists():
        raise FileExistsError(f'Archive folder {target} exists')
    if source.is_dir():
        shutil.copytree(source,target)
    elif source.is_file():
        target.mkdir(parents=True)
        shutil.copy(source,target)
    else:
        shutil.copytree(source,target)
    make_or_update_manifest(target)
    return f'{path}/{name}.{version}'

In [17]:
#hide
def _rmtree(p):
    "Delete the directory tree `p`; a tree that does not exist is not an error"
    try:
        shutil.rmtree(p)
    except FileNotFoundError:
        return None

In [18]:
# config shared by the local tests below: both areas are folders on the local filesystem
test_config=dict(storage_client='storage_tools.core.LocalStorageClient',
                 local_path='test/local_path2',storage_area='test/storage_area2')

In [19]:
def _make_local_test_data():
    "Reset the test folders, write a few small files to `local_path` and return their names (relative to `local_path`)"
    local_root=test_config['local_path']
    for folder in (local_root,test_config['storage_area']):
        _rmtree(folder)
    test_files=(['a/b/test_data.2.0.0.txt']
                +[f'sub/test_data.0.0.{n}.txt' for n in range(3)]
                +['test_data.txt'])
    # each file's content ends with its index in `test_files`
    for i,name in enumerate(test_files):
        target=local_root+'/'+name
        Path(target).parent.mkdir(parents=True,exist_ok=True)
        with open(target, 'w') as _file:
            _file.write(f'a little bit of data {i}')
    return test_files

In [20]:
_make_local_test_data()

# a single file, using the default 'patch' increment with no existing versions
test_eq(test_config['local_path']+'/test_data.txt.0.0.1',
        make_dataset_archive_folder(test_config['local_path'],'test_data.txt'))
# a 'minor' increment on top of an existing version
test_eq(test_config['local_path']+'/test_data.txt.2.5.0',
        make_dataset_archive_folder(test_config['local_path'],'test_data.txt',['2.4.6'],'minor'))
# a folder can be archived too
test_eq(test_config['local_path']+'/sub.0.0.1',
        make_dataset_archive_folder(test_config['local_path'],'sub'))
# TODO: check archive folder contents

In [21]:
#export
class StorageClientABC(ABC):
    """Defines functionality common to all storage clients"""
    
    def __init__(self, config:dict):
        "Create a new storage client using the specified `config`"
        self.config=config
        
    def config_get(self,key,default=None,dtype=None):
        "Return a value via `self.config.get` optionally checking that the value is of `dtype`"
        result=self.config.get(key,default)
        # the check applies to whatever was found, including `default`
        if dtype is not None and not isinstance(result,dtype): 
            raise ValueError(f'Config[{key}] should be a {dtype} but we found {result} which is a {type(result)}')
        return result

    def ls(self, what:str='storage_area',name_starts_with:str=None) -> List[str]:
        "Return a list containing the names of files in either `storage_area` or `local_path`"
        p=Path(self.config[what])
        # NOTE: listing creates the folder if it does not exist yet
        p.mkdir(parents=True,exist_ok=True)
        # names are relative to the folder and always use '/' as the separator
        result=[f.relative_to(p).as_posix() for f in p.rglob('*') if f.is_file()]
        if name_starts_with is not None: 
            result=[r for r in result if r.startswith(name_starts_with)]
        return sorted(result)
        
    @abstractmethod
    def download(self, filename:str) -> Path: 
        "Copy `filename` from `storage_area` to `local_path`"
    
    @abstractmethod
    def upload(self, filename:str, overwrite=False) -> Union[Path,str]: 
        "Copy `filename` from `local_path` to `storage_area`"
        
    def _sort_by_dataset_archive_version(self,version):
        "Sort key for versions: (major,minor,patch), with anything that is not a valid version sorted first"
        try: return tuple(parse_dataset_archive_version(version))
        except (ValueError,TypeError): return (-1,-1,-1)

    def _compression_level(self) -> int:
        "Return the `compression_level` config setting as an int (`zlib.Z_DEFAULT_COMPRESSION` if not set)"
        level=self.config_get('compression_level',zlib.Z_DEFAULT_COMPRESSION)
        # settings read from an INI file are strings, so accept text that parses as an integer
        if isinstance(level,str):
            try: level=int(level.strip())
            except ValueError:
                raise ValueError(
                    f'Config[compression_level] should be an integer but we found {level!r}') from None
        if not isinstance(level,int):
            raise ValueError(
                f'Config[compression_level] should be an integer but we found {level!r} which is a {type(level)}')
        return int(level)
        
    def ls_versions(self, name:str, what:str='storage_area') -> Union[List[str],None]:
        "Return a list containing all versions of the specified archive `name`"
        files=[parse_dataset_archive_name(f) for f in self.ls(what)]
        result=[f[1] for f in files if f is not None and f[0]==name]
        # `None` (not an empty list) signals that no versions exist
        if not result: return None
        return sorted(result, key=self._sort_by_dataset_archive_version)
        
    def upload_dataset(self, name:str, version:str='patch') -> Union[Path,str]:
        "Create a new dataset archive and upload it to `storage_area`"
        # validate the setting first so that a bad value can't leave a new archive folder behind
        compression=self._compression_level()
        archive_folder=make_dataset_archive_folder(
                self.config['local_path'],name,self.ls_versions(name),version)
        default_compression=zlib.Z_DEFAULT_COMPRESSION
        try:
            # `shutil.make_archive` has no compression level argument, so the module level
            # default is swapped while the archive is written and always restored afterwards
            zlib.Z_DEFAULT_COMPRESSION=compression
            shutil.make_archive(archive_folder,'zip',archive_folder)
        finally:
            zlib.Z_DEFAULT_COMPRESSION=default_compression
        # upload expects a name relative to `local_path`
        return self.upload(f"{archive_folder[len(self.config['local_path'])+1:]}.zip")
        
    def download_dataset(self, name:str, version:str='latest', overwrite:bool=False) -> Path:
        "Download a dataset archive from `storage_area` and extract it to `local_path`"
        if version=='latest': 
            versions=self.ls_versions(name)
            if versions is None:
                raise ValueError('latest version requested but no versions exist in storage area')
            version=versions[-1]
        dst=Path(self.config['local_path'])/f'{name}.{version}'
        if dst.exists():
            if not overwrite: return dst
            else: shutil.rmtree(dst)
        # NOTE: `overwrite` only removes the extracted folder; the zip is fetched with
        # `self.download(filename)`, which takes no overwrite argument in this base class
        archive=self.download(f'{name}.{version}.zip')
        shutil.unpack_archive(str(archive),dst)
        check_archive(dst)
        return dst

In [22]:
# render the documentation for the constructor
show_doc(StorageClientABC.__init__)

<h4 id="StorageClientABC.__init__" class="doc_header"><code>StorageClientABC.__init__</code><a href="__main__.py#L5" class="source_link" style="float:right">[source]</a></h4>

> <code>StorageClientABC.__init__</code>(**`config`**:`dict`)

Create a new storage client using the specified `config`

In [23]:
# render the documentation for `ls`
show_doc(StorageClientABC.ls)

<h4 id="StorageClientABC.ls" class="doc_header"><code>StorageClientABC.ls</code><a href="__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>StorageClientABC.ls</code>(**`what`**:`str`=*`'storage_area'`*, **`name_starts_with`**:`str`=*`None`*)

Return a list containing the names of files in either `storage_area` or `local_path`

In [24]:
# render the documentation for `ls_versions`
show_doc(StorageClientABC.ls_versions)

<h4 id="StorageClientABC.ls_versions" class="doc_header"><code>StorageClientABC.ls_versions</code><a href="__main__.py#L38" class="source_link" style="float:right">[source]</a></h4>

> <code>StorageClientABC.ls_versions</code>(**`name`**:`str`, **`what`**:`str`=*`'storage_area'`*)

Return a list containing all versions of the specified archive `name`

In [25]:
# render the documentation for `upload_dataset`
show_doc(StorageClientABC.upload_dataset)

<h4 id="StorageClientABC.upload_dataset" class="doc_header"><code>StorageClientABC.upload_dataset</code><a href="__main__.py#L45" class="source_link" style="float:right">[source]</a></h4>

> <code>StorageClientABC.upload_dataset</code>(**`name`**:`str`, **`version`**:`str`=*`'patch'`*)

Create a new dataset archive and upload it to `storage_area`

- `name`
    - file or folder name
    - which must exist in "local_path"
    - without "local_path" prefix
    - e.g. if
        - "local_path" is "~/storage_tools/test/local_path"
        - and you want to upload "~/storage_tools/test/local_path/test_data.txt" as a dataset
        - you would pass the name "test_data.txt"
- `version`
    - "major", "minor" or "patch" to automatically create a new version or
    - version literal that matches `\d+\.\d+\.\d+` (e.g. "1.0.45")

<code>upload_dataset</code> will;
- create a folder `[local_path]/[name].[version]`
    - if this folder already exists, an error will be raised
- copy the file or folder contents (and all sub-folders) to this folder
- create a manifest in this folder
- create a zip archive, called `[name].[version].zip`, of this folder
    - Use the optional `compression_level` config setting to control the level of compression used by `zlib`
    - If you are creating a dataset of files that are already compressed (e.g. images, models etc), setting `compression_level=0` can make this step run much faster
- upload the zip archive to remote storage
- return the location of the dataset in remote storage

Why no `overwrite` option?
- It is not expected that archives will need to be overwritten
    - as we want to be able to re-run old experiments using the data as it was
- bad archives could be deleted via storage API (e.g. `storage_client.client.delete_blob('test.0.0.1.zip')`) or via storage browsers
    - we might want to add a soft delete, archive status etc to handle this kind of thing?

In [26]:
# render the documentation for `download_dataset`
show_doc(StorageClientABC.download_dataset)

<h4 id="StorageClientABC.download_dataset" class="doc_header"><code>StorageClientABC.download_dataset</code><a href="__main__.py#L58" class="source_link" style="float:right">[source]</a></h4>

> <code>StorageClientABC.download_dataset</code>(**`name`**:`str`, **`version`**:`str`=*`'latest'`*, **`overwrite`**:`bool`=*`False`*)

Download a dataset archive from `storage_area` and extract it to `local_path`

- `name`
    - dataset name
- `version`
    - "latest" download the latest version of the dataset
    - version literal that matches `\d+\.\d+\.\d+` (e.g. "1.0.45")
- `overwrite`
    - If `False` and the dataset exists in "local_path", this is a no op
    - If `True` and the dataset exists in "local_path", delete the dataset and re-download
    
<code>download_dataset</code> will;
- download a zip archive, called `[name].[version].zip`, from remote storage
- extract it to `[local_path]/[name].[version]`
- check that all files listed in manifest.json have the correct secure hash
- return the location of the dataset in local storage

In [27]:
#export
class LocalStorageClient(StorageClientABC):
    """Storage client that uses the local filesystem for both `storage_area` and `local_path`"""

    def _cp(self,from_key,to_key,filename,overwrite=False):
        "Copy `filename` from the folder configured as `from_key` to the folder configured as `to_key`"
        source=Path(self.config[from_key])/filename
        dst=Path(self.config[to_key])/filename
        if dst.exists() and not overwrite:
            raise FileExistsError(f'{dst} exists and overwrite=False')
        # create any missing sub-folders of the destination before copying
        dst.parent.mkdir(parents=True,exist_ok=True)
        shutil.copy(source,dst)
        return dst

    def download(self,filename,overwrite=False):
        "Copy `filename` from `storage_area` to `local_path`; an existing local file is kept unless `overwrite`"
        try:
            return self._cp('storage_area','local_path',filename,overwrite)
        except FileExistsError:
            # not an error for downloads: the existing local copy is used as-is
            return Path(self.config['local_path'])/filename

    def upload(self,filename,overwrite=False):
        "Copy `filename` from `local_path` to `storage_area`; raises `FileExistsError` if it exists unless `overwrite`"
        return self._cp('local_path','storage_area',filename,overwrite)

`LocalStorageClient` will most often be used for local testing.

In [28]:
# the config passed to the constructor is available as `config`; missing keys fail on direct access
storage_client=LocalStorageClient(test_config)
assert storage_client.config['storage_client']=='storage_tools.core.LocalStorageClient'
test_fail(lambda: storage_client.config['bad_key'])

In [29]:
# render the documentation for `config_get`
show_doc(StorageClientABC.config_get)

<h4 id="StorageClientABC.config_get" class="doc_header"><code>StorageClientABC.config_get</code><a href="__main__.py#L9" class="source_link" style="float:right">[source]</a></h4>

> <code>StorageClientABC.config_get</code>(**`key`**, **`default`**=*`None`*, **`dtype`**=*`None`*)

Return a value via `self.config.get` optionally checking that the value is of `dtype`

In [30]:
# missing keys give None (or the default) rather than failing
assert storage_client.config_get('storage_client')=='storage_tools.core.LocalStorageClient'
assert storage_client.config_get('bad_key')==None
# the optional dtype check passes for a str value and fails when int is requested
assert storage_client.config_get('storage_client',dtype=str)=='storage_tools.core.LocalStorageClient'
test_fail(lambda: storage_client.config_get('storage_client',dtype=int))
# the default is only used when the key is missing
assert storage_client.config_get('bad_key','but its ok')=='but its ok'
assert storage_client.config_get('storage_client','default is not used')=='storage_tools.core.LocalStorageClient'

In [31]:
#export
class AzureStorageClient(StorageClientABC):
    """Storage client that uses Azure for `storage_area` and the local filesystem for `local_path`

    Config keys read by this class: `conn_str`, `credential`, `container`, `local_path`
    and `storage_client`.
    """
    
    @property
    def client(self):
        "The container client for the configured `container`, created on first use and then re-used"
        if not hasattr(self,'_client'):
            service_client=azure.storage.blob.BlobServiceClient.from_connection_string(
                self.config['conn_str'],self.config['credential'])
            self._client=service_client.get_container_client(self.config['container'])
        return self._client
    
    def ls(self,what='storage_area',name_starts_with=None):
        "Return a sorted list of blob names in the container, or of files in `local_path`"
        if what=='local_path': return super().ls(what,name_starts_with)
        result=[b.name for b in self.client.list_blobs(name_starts_with)]
        return sorted(result)
    
    def download(self,filename,overwrite=False):
        "Download blob `filename` to `local_path`; an existing local file is kept unless `overwrite`"
        p=Path(self.config['local_path'])/filename
        if p.exists() and not overwrite: return p
        # fetch the blob before touching the local file: if the download fails we must not
        # leave an empty file behind that later calls would treat as already downloaded
        content=self.client.download_blob(filename).readall()
        p.parent.mkdir(parents=True,exist_ok=True)
        with open(p, 'wb') as f:
            f.write(content)
        return p
            
    def upload(self,filename,overwrite=False): 
        "Upload `filename` from `local_path` to the container; raises `FileExistsError` if the blob exists unless `overwrite`"
        p=Path(self.config['local_path'])/filename
        try:
            with open(p, 'rb') as f:
                self.client.upload_blob(filename,f,overwrite=overwrite)
            return f"{self.config['storage_client']}:{self.config['container']}:{filename}"
        except azure.core.exceptions.ResourceExistsError as e:
            # re-raised as the same exception type the other storage clients use
            raise FileExistsError(f'{e}\noverwrite=False') from e

In [32]:
#export
class AwsStorageClient(StorageClientABC):
    """Storage client that uses AWS for `storage_area` and the local filesystem for `local_path`

    Config keys read by this class: `service_name`, `aws_access_key_id`,
    `aws_secret_access_key`, `bucket`, `local_path` and `storage_client`.
    """

    @property
    def client(self):
        "The boto3 client, created on first use and then re-used"
        if not hasattr(self,'_client'):
            self._client=boto3.client(service_name=self.config['service_name'],
                                      aws_access_key_id=self.config['aws_access_key_id'],
                                      aws_secret_access_key=self.config['aws_secret_access_key'])
        return self._client
    
    def ls(self,what='storage_area',name_starts_with=None): 
        "Return a sorted list of non-empty object keys in the bucket, or of files in `local_path`"
        if what=='local_path': return super().ls(what,name_starts_with)
        # `Prefix` is only added when we have one (see the note below this cell)
        args=dict(Bucket=self.config['bucket'])
        if name_starts_with is not None: args['Prefix']=name_starts_with
        result=[]
        while True:
            page=self.client.list_objects_v2(**args)
            # `Contents` is missing when a page has no keys; zero-size objects are skipped
            result.extend(o['Key'] for o in page.get('Contents',[]) if o['Size']>0)
            # a single response is limited in size, so keep going until the listing is complete
            if not page.get('IsTruncated'): break
            args['ContinuationToken']=page['NextContinuationToken']
        return sorted(result)
    
    def download(self,filename,overwrite=False): 
        "Download object `filename` to `local_path`; an existing local file is kept unless `overwrite`"
        p=Path(self.config['local_path'])/filename
        if p.exists() and not overwrite: return p
        p.parent.mkdir(parents=True,exist_ok=True)
        self.client.download_file(
                Filename='/'.join([self.config['local_path'],filename]),
                Bucket=self.config['bucket'],
                Key=filename)
        return p
        
    def upload(self,filename,overwrite=False): 
        "Upload `filename` from `local_path` to the bucket; raises `FileExistsError` if the key exists unless `overwrite`"
        result=f"{self.config['storage_client']}:{self.config['bucket']}:{filename}"
        # `ls` returns every key that starts with `filename`, so look for an exact match
        if not overwrite and filename in self.ls(name_starts_with=filename):
            raise FileExistsError(f'{result} exists and overwrite=False')
        self.client.upload_file(
                Filename='/'.join([self.config['local_path'],filename]),
                Bucket=self.config['bucket'],
                Key=filename)
        return result

Ideally, we would use the same property keys in settings.ini as the boto3 API but `ConfigParser` converts keys to lower case by default.

So if boto3 has a parameter called `Bucket`, we need `bucket=a-bucket-name` in settings.ini.

#hide
If we pass `None` to `Prefix` when listing objects (e.g. `list_objects_v2(Prefix=None)`) AWS raises an error.
This is why we have to create and unpack the `args` dictionary when we call `list_objects_v2`.

In [33]:
#export
def new_storage_client(storage_name:str,config_name:str='secrets/settings.ini'):
    """Returns a storage client based on the configured `storage_client`

    Reads the `storage_name` section of `config_name`, imports the module named by the
    `storage_client` setting and calls the class it names with the section's config.
    Any failure is re-raised as a `ValueError` chained to the original exception.
    """
    try:
        config=read_config(storage_name,config_name=config_name)
        qualified_name=config['storage_client']
        # everything before the last dot is the module, the rest is the class name
        split_at=qualified_name.rindex('.')
        module=importlib.import_module(qualified_name[:split_at])
        client_class=getattr(module,qualified_name[split_at+1:])
        return client_class(config)
    except Exception as ex:
        message=f'Failed to create storage client. storage_name={storage_name}, config_name={config_name}'
        raise ValueError(message) from ex

This function reads the `storage_name` section of `config_name`. 

A key config setting is `storage_client` which should;
- specify the storage client implementation to use (fully qualified with package name)
- refer to a module that can be imported
- refer to a class that is a `StorageClientABC` (that has not changed the signature of `__init__`)

This makes it possible to use a storage client implementation that is not defined in this project.

In [34]:
# a configured section gives a client carrying that section's config
storage_client=new_storage_client('local_test','test/settings.ini')
assert storage_client.config['storage_client']=='storage_tools.core.LocalStorageClient'
# creating a client for the 'gcp_dummy' section is expected to fail
test_fail(lambda: new_storage_client('gcp_dummy','test/settings.ini'))

In [35]:
# start from a clean slate: remove any leftover local and storage folders
for p in [test_config['local_path'],test_config['storage_area']]: _rmtree(p)

storage_client=LocalStorageClient(test_config)
assert isinstance(storage_client,LocalStorageClient)
assert storage_client.config['storage_client']=='storage_tools.core.LocalStorageClient'
# both areas are empty before any test data is created
test_eq([],storage_client.ls())
test_eq([],storage_client.ls('local_path'))
    
# test data is written to local_path only, so storage_area stays empty
test_files=_make_local_test_data()
test_eq([],storage_client.ls())
test_eq(test_files,storage_client.ls('local_path'))
test_eq(['a/b/test_data.2.0.0.txt'],storage_client.ls('local_path','a/b'))
test_eq([],storage_client.ls('local_path','does/not/exist'))
        
# after uploading, storage_area lists the same files; removing local_path leaves it intact
for f in test_files: storage_client.upload(f)
test_eq(test_files,storage_client.ls())
test_eq(['a/b/test_data.2.0.0.txt'],storage_client.ls(name_starts_with='a/b'))
test_eq([],storage_client.ls(name_starts_with='does_no_exist'))
test_eq(test_files,storage_client.ls('local_path'))
_rmtree(test_config['local_path'])
test_eq([],storage_client.ls('local_path'))

# downloading restores the files (and their content) in local_path
for f in test_files: storage_client.download(f)
test_eq(test_files,storage_client.ls('local_path'))
test_eq('a little bit of data 4',open(test_config['local_path']+'/test_data.txt').read())

# a local change survives a plain download and is replaced when overwrite is True
with open(test_config['local_path']+'/test_data.txt', 'w') as _file: _file.write('upd')
test_eq('upd',open(test_config['local_path']+'/test_data.txt').read())
storage_client.download('test_data.txt')
test_eq('upd',open(test_config['local_path']+'/test_data.txt').read())
storage_client.download('test_data.txt',True)
test_eq('a little bit of data 4',open(test_config['local_path']+'/test_data.txt').read())

# uploading a file that already exists in storage fails unless overwrite is True
test_fail(lambda: storage_client.upload('test_data.txt'))
storage_client.upload('test_data.txt',True)

# no versions of an unknown archive name
test_eq(None,storage_client.ls_versions('this/does/not/exitst'))

In [36]:
# start from a clean slate and re-create the local test data
for p in [test_config['local_path'],test_config['storage_area']]: _rmtree(p)
    
storage_client=LocalStorageClient(test_config)
test_files=_make_local_test_data()

# helper: upload a dataset and check where the archive ends up in storage_area
def _t(expected,upload_name,version='patch'):
    test_eq(Path(test_config['storage_area'])/expected,storage_client.upload_dataset(upload_name,version))
_t('test_data.txt.0.0.1.zip','test_data.txt')
_t('sub.0.0.1.zip','sub')
_t('sub/test_data.0.0.2.txt.0.0.1.zip','sub/test_data.0.0.2.txt')
# switch off compression when creating zip archives
test_config['compression_level']=0
_t('a.3.0.0.zip','a','3.0.0')
_t('a.3.0.1.zip','a')
# TODO: check zip contents
# remove the local archive folders so that the downloads below have to extract them again
_rmtree(test_config['local_path']+'/a.3.0.0')
_rmtree(test_config['local_path']+'/a.3.0.1')
# 'latest' is the default version; a repeated download returns the same folder, with or without overwrite
test_eq(Path(test_config['local_path'])/'a.3.0.1',storage_client.download_dataset('a'))
test_eq(Path(test_config['local_path'])/'a.3.0.0',storage_client.download_dataset('a','3.0.0'))
test_eq(Path(test_config['local_path'])/'a.3.0.0',storage_client.download_dataset('a','3.0.0'))
test_eq(Path(test_config['local_path'])/'a.3.0.0',storage_client.download_dataset('a','3.0.0',True))
# check manifest was created and downloaded as part of the dataset
assert Path(test_config['local_path']+'/a.3.0.0/manifest.json').is_file()
check_archive(test_config['local_path']+'/a.3.0.0')
# if we change the secure hash for any of the files in the dataset, check_archive should fail
manifest=json.load(open(test_config['local_path']+'/a.3.0.0/manifest.json'))
manifest['files'][0]['sha256']='thisllnevermatch'
json.dump(manifest,open(test_config['local_path']+'/a.3.0.0/manifest.json','w'))
test_fail(lambda: check_archive(test_config['local_path']+'/a.3.0.0'))
# only files listed in the manifest are checked
manifest['files']=[]
json.dump(manifest,open(test_config['local_path']+'/a.3.0.0/manifest.json','w'))
check_archive(test_config['local_path']+'/a.3.0.0')
# check_archive will fail if it can't find a manifest
Path(test_config['local_path']+'/a.3.0.0/manifest.json').unlink()
test_fail(lambda: check_archive(test_config['local_path']+'/a.3.0.0'))

In [37]:
# the remote clients can be created from config; only the class of the result is checked here
storage_client=new_storage_client('azure_dummy','test/settings.ini')
test_eq(storage_client.__class__.__name__,'AzureStorageClient')
storage_client=new_storage_client('aws_dummy','test/settings.ini')
test_eq(storage_client.__class__.__name__,'AwsStorageClient')

In [38]:
# clean-up test data: remove both folders used by the local tests
for p in [test_config['local_path'],test_config['storage_area']]: _rmtree(p)