In [None]:
# default_exp core

In [None]:
#hide
from nbdev.showdoc import show_doc

# Core

> Core tools for working with storage.

In [None]:
#export
from abc import ABC,abstractmethod
from configparser import ConfigParser
from pathlib import Path
import azure.storage.blob,azure.core.exceptions
import boto3
import shutil,re

In [None]:
from fastcore.test import *
from configparser import SectionProxy

In [None]:
#export
def read_config(section_name=None,config_name='secrets/settings.ini'):
    """Load settings from the ini file `config_name`.

    Returns the whole `ConfigParser` when `section_name` is None, otherwise a
    plain dict of the options of that section.
    Raises `Exception` if `section_name` is given but not present.
    """
    ini_path=Path(config_name)
    parser=ConfigParser()
    parser.read(ini_path)
    if section_name is not None:
        if section_name in parser:
            return dict(parser.items(section_name))
        raise Exception(f'Error: [{section_name}] section not found in {ini_path}')
    return parser

In [None]:
# without a section name the whole parser is returned; with one, a plain dict of that section
assert isinstance(read_config(),ConfigParser)
assert isinstance(read_config()['DEFAULT'],SectionProxy)
assert isinstance(read_config('DEFAULT'),dict)
# a different ini file can be selected via `config_name`
assert read_config('local_cwd',config_name='test/settings.ini')['storage_type']=='local'

In [None]:
#export
def parse_dataset_archive_name(name):
    "Returns (name,version) if `name` is a dataset archive name, `None` otherwise"
    # Raw string: `\.`, `\s`, `\w` and `\d` are regex escapes, not Python string escapes
    # (a non-raw literal triggers invalid-escape-sequence warnings).
    # Expected shape: <name>.<major>.<minor>.<patch>.zip where <name> may contain
    # dots, slashes, whitespace, word characters and hyphens.
    match = re.match(r'^([\./\s\w-]+)\.(\d+\.\d+\.\d+)\.zip$',name)
    if match is None: return None
    return match.group(1,2)

In [None]:
# valid archive names split into (name, version); the name may contain dots and slashes
test_eq(('dsetname', '0.0.1'), parse_dataset_archive_name('dsetname.0.0.1.zip'))
test_eq(('dsetname.txt', '0.2.1'), parse_dataset_archive_name('dsetname.txt.0.2.1.zip'))
test_eq(('path/to/dsetname', '0.0.1'), parse_dataset_archive_name('path/to/dsetname.0.0.1.zip'))
test_eq(('//path/to/dsetname', '0.0.1'), parse_dataset_archive_name('//path/to/dsetname.0.0.1.zip'))
# wrong extension, incomplete or non-numeric version, or missing name are all rejected
test_eq(None, parse_dataset_archive_name('dsetname.0.0.1.csv'))
test_eq(None, parse_dataset_archive_name('dsetname.0.1.zip'))
test_eq(None, parse_dataset_archive_name('dsetname.0.a.1.csv'))
test_eq(None, parse_dataset_archive_name('.0.0.1.zip'))
test_eq(None, parse_dataset_archive_name('0.0.1.csv'))
test_eq(None, parse_dataset_archive_name('dsetname.0.0.1'))

In [None]:
#export 
def parse_dataset_archive_version(version):
    "Returns `[major,minor,patch]` as ints if `version` looks like `1.0.45`, `None` otherwise"
    # Raw string so that `\d` and `\.` reach the regex engine without relying on
    # Python passing unknown string escapes through (which emits a warning).
    match = re.match(r'^(\d+)\.(\d+)\.(\d+)$',version)
    if match is None: return None
    # a list rather than a tuple, so a caller can increment one component in place
    return [int(s) for s in match.group(1,2,3)]

In [None]:
# three dot-separated integers are parsed into their numeric components
test_eq((0, 1, 2),parse_dataset_archive_version('0.1.2'))
test_eq((5, 4, 3),parse_dataset_archive_version('5.4.3'))
# a trailing dot or a missing component is not a valid version
test_eq(None,parse_dataset_archive_version('0.1.2.'))
test_eq(None,parse_dataset_archive_version('0.1'))

In [None]:
#export
def next_version(versions,increment='patch'):
    """Return the version string that follows the last entry of `versions`.

    - `versions`: list of `major.minor.patch` strings whose last entry is taken as the
      current version, or `None`/empty to start from `0.0.0`
    - `increment`: which component to bump: "patch", "minor" or "major"

    Only the chosen component changes; the others are kept as they are
    (e.g. "2.4.60" with "major" gives "3.4.60").
    Raises `ValueError` for an unknown `increment`.
    """
    # `not versions` also covers an empty list, which would otherwise fail on `versions[-1]`
    v=[0,0,0] if not versions else parse_dataset_archive_version(versions[-1])
    if increment=='patch': v[2]+=1
    elif increment=='minor': v[1]+=1
    elif increment=='major': v[0]+=1
    else: raise ValueError(f'Unknown increment: {increment}')
    return f'{v[0]}.{v[1]}.{v[2]}'

In [None]:
# with no existing versions, counting starts from 0.0.0
test_eq('0.0.1',next_version(None))
# the last entry of the list is the one that gets incremented
test_eq('33.55.67',next_version(['2.4.60','33.55.66']))
test_eq('0.1.0',next_version(None,'minor'))
test_eq('1.0.0',next_version(None,'major'))
# bumping a component leaves the other components untouched
test_eq('3.4.60',next_version(['2.4.60'],'major'))
# only 'patch', 'minor' and 'major' are accepted
test_fail(lambda: next_version(None,'beta'))

In [None]:
#export
def make_dataset_archive_folder(path,versions,name,version='patch'):
    """Create a new dataset archive folder `path/name.version` from the file or folder `path/name`.

    - `versions`: existing versions, passed to `next_version` when `version` is an increment
    - `version`: "major", "minor" or "patch" to derive the next version, or a version literal

    Returns the path of the new folder. Raises `FileNotFoundError` if `path/name` is missing,
    `ValueError` for an invalid version literal and `FileExistsError` if the folder already exists.
    """
    root=Path(path)
    source=root/name
    if not source.exists():
        raise FileNotFoundError(f'{source} not found')
    if version in ('major','minor','patch'):
        version=next_version(versions,version)
    elif parse_dataset_archive_version(version) is None:
        raise ValueError(f'Invalid version: {version}')
    target=root/'.'.join((name,version))
    if target.exists():
        raise FileExistsError(f'Archive folder {target} exists')
    if source.is_file():
        # a single file is placed inside a newly created archive folder
        target.mkdir(parents=True)
        shutil.copy(source,target)
    else:
        # a folder is copied as a whole, including sub-folders
        shutil.copytree(source,target)
    # TODO: create/update manifest
    return target

In [None]:
def _rmtree(p):
    try: shutil.rmtree(p)
    except FileNotFoundError: pass

In [None]:
def _make_local_test_data():
    test_files=['a/b/test_data.2.0.0.txt','test_data.txt']
    for i in reversed(range(3)): test_files.insert(1,f'sub/test_data.0.0.{i}.txt')
    for i,f in enumerate(test_files):
        f='test/local_path/'+f
        Path(f).parent.mkdir(parents=True,exist_ok=True)
        with open(f, 'w') as _file: _file.write(f'a little bit of data {i}')
    return test_files

In [None]:
# start from empty test folders, then recreate the sample files under test/local_path
for p in ['test/local_path','test/storage_area']: _rmtree(p)
_make_local_test_data()

# a single file: with no existing versions the first archive folder is 0.0.1
test_eq(Path('test/local_path/test_data.txt.0.0.1'),
        make_dataset_archive_folder('test/local_path',None,'test_data.txt'))
# 'minor' increments the middle component of the last known version
test_eq(Path('test/local_path/test_data.txt.2.5.6'),
        make_dataset_archive_folder('test/local_path',['2.4.6'],'test_data.txt','minor'))
# a folder can be archived in the same way as a file
test_eq(Path('test/local_path/sub.0.0.1'),
        make_dataset_archive_folder('test/local_path',None,'sub'))
# TODO: check archive folder contents

In [None]:
#export
class StorageClientABC(ABC):
    """Defines functionality common to all storage clients"""

    def __init__(self,storage_name,config_name='secrets/settings.ini'):
        "Create a new storage client using the `storage_name` section of `config_name`"
        self.config=read_config(storage_name,config_name=config_name)

    def _ls(self,p,result,len_path_prefix=None):
        """Append the names of all files below folder `p` (recursively) to `result`.

        Names are relative to `p` and always use `/` as separator.
        `len_path_prefix` is accepted for backward compatibility only and is ignored.
        """
        def _walk(folder):
            for child in folder.iterdir():
                if child.is_dir(): _walk(child)
                # relative_to (rather than slicing off a prefix by string length) also
                # works for roots such as `.` whose text is not a prefix of child paths
                else: result.append(child.relative_to(p).as_posix())
        _walk(p)

    def ls(self,what='storage_area'):
        """Return a sorted list containing the names of files in either `storage_area` or `local_path`.

        The folder is created if it does not exist yet.
        """
        result,p=[],Path(self.config[what])
        p.mkdir(parents=True,exist_ok=True)
        self._ls(p,result)
        # sorted() returns a new list, so its result has to be the value handed back
        return sorted(result)

    @abstractmethod
    def download(self,filename,overwrite=False):
        "Copy `filename` from `storage_area` to `local_path`"

    @abstractmethod
    def upload(self,filename,overwrite=False):
        "Copy `filename` from `local_path` to `storage_area`"

    def ls_versions(self,name,what='storage_area'):
        """Return a list containing all versions of the specified archive `name`.

        Versions are ordered numerically, oldest first; `None` is returned if there are none.
        """
        files=[parse_dataset_archive_name(f) for f in self.ls(what)]
        result=[f[1] for f in files if f is not None and f[0]==name]
        if not result: return None
        # compare as lists of ints so that e.g. 0.0.10 sorts after 0.0.9
        return sorted(result, key=parse_dataset_archive_version)

    def upload_dataset(self,name,version='patch'):
        """Create a new dataset archive and upload it to `storage_area`.

        `version` is "major", "minor" or "patch" (relative to the versions already in
        `storage_area`) or a version literal. Returns whatever `upload` returns.
        """
        archive_folder=make_dataset_archive_folder(
                self.config['local_path'],self.ls_versions(name),name,version)
        # zip the folder next to itself: <archive_folder>.zip
        archive=shutil.make_archive(archive_folder,'zip',archive_folder)
        return self.upload(Path(archive).name)

    def download_dataset(self,name,version='latest'):
        "Download a dataset archive from `storage_area` and extract it to `local_path`"
        # TODO: not implemented yet - currently does nothing and returns None

In [None]:
# show_doc(StorageClientABC.upload_dataset)

`upload_dataset`

- `name`
    - file or folder name
- `version`
    - "major", "minor" or "patch" to automatically create a new version or
    - version literal `\d+\.\d+\.\d+` (e.g. "1.0.45")

The new archive will be named `[folder name|file name].[version].zip` (e.g. `test_data.txt.0.0.1.zip`) and will contain
- the specified file or all files in the specified folder (and all sub-folders)
- a manifest describing archive contents, data owner etc (TODO: manifest details TBC)

If a folder called `[local_path][name][version]` already exists, we will
- create a manifest in this folder (if it doesn't already exist)
- archive and upload this folder

Otherwise, we will 
- create a folder called `[local_path][name][version]`
- copy the file or folder contents to `[local_path][name][version]`
- create a manifest in this folder
- archive and upload this folder

Why no `overwrite` option?
- It is not expected that archives will need to be overwritten
    - as we want to be able to re-run old experiments using the data as it was
- bad archives could be deleted via the storage API (e.g. `storage_client.client.delete_blob('test.0.0.1.zip')`) or via storage browsers
    - we might want to add a soft delete, archive status etc to handle this kind of thing?

In [None]:
#export
class LocalStorageClient(StorageClientABC):
    """Storage client whose `storage_area` and `local_path` are both folders on the local filesystem"""

    def _cp(self,from_key,to_key,filename,overwrite=False):
        """Copy `filename` from the folder configured as `from_key` to the one configured as `to_key`.

        Returns the destination path; raises `FileExistsError` if the destination
        exists and `overwrite` is false.
        """
        source=Path(self.config[from_key])/filename
        target=Path(self.config[to_key])/filename
        if target.exists() and not overwrite:
            raise FileExistsError(f'{target} exists and overwrite=False')
        # make sure the destination folder is there before copying
        target.parent.mkdir(parents=True,exist_ok=True)
        shutil.copy(source,target)
        return target

    def download(self,filename,overwrite=False):
        "Copy `filename` from `storage_area` to `local_path`, keeping an existing local copy unless `overwrite`"
        try:
            self._cp('storage_area','local_path',filename,overwrite)
        except FileExistsError:
            # an existing local file is silently left as it is
            return None

    def upload(self,filename,overwrite=False):
        "Copy `filename` from `local_path` to `storage_area` and return the path of the copy"
        return self._cp('local_path','storage_area',filename,overwrite)

`LocalStorageClient` will most often be used for local testing.

In [None]:
# a client can be created directly from a named section of an ini file
storage_client=LocalStorageClient('local_test','test/settings.ini')
assert storage_client.config['storage_type']=='local'

In [None]:
#export
class AzureStorageClient(StorageClientABC):
    """Storage client that uses Azure for `storage_area` and the local filesystem `local_path`"""

    @property
    def client(self):
        "Container client for the configured `container`; created on first access, then reused"
        if not hasattr(self,'_client'):
            service_client=azure.storage.blob.BlobServiceClient.from_connection_string(
                self.config['conn_str'],self.config['credential'])
            self._client=service_client.get_container_client(self.config['container'])
        return self._client

    def ls(self,what='storage_area'):
        "Return a sorted list containing the names of blobs in the container or of files in `local_path`"
        if what=='local_path': return super().ls(what)
        # sorted() returns a new list, so its result has to be the value handed back
        return sorted(b.name for b in self.client.list_blobs())

    def download(self,filename,overwrite=False):
        "Copy blob `filename` to `local_path`; an existing local file is kept unless `overwrite`"
        p=Path(self.config['local_path'])/filename
        if p.exists() and not overwrite: return
        # fetch before touching the local file: if the download fails, no empty file
        # is left behind that would make later calls think the download was done
        data=self.client.download_blob(filename).readall()
        p.parent.mkdir(parents=True,exist_ok=True)
        with open(p, 'wb') as f:
            f.write(data)

    def upload(self,filename,overwrite=False):
        """Copy `filename` from `local_path` to the container.

        Returns `storage_type:container:filename`; raises `FileExistsError` if the
        blob already exists and `overwrite` is false.
        """
        p=Path(self.config['local_path'])/filename
        try:
            with open(p, 'rb') as f:
                self.client.upload_blob(filename,f,overwrite=overwrite)
        except azure.core.exceptions.ResourceExistsError as e:
            # translate to the same exception type the local client raises
            raise FileExistsError(f'{e}\noverwrite=False') from e
        # settings live on the instance - there is no module-level `config`
        return f"{self.config['storage_type']}:{self.config['container']}:{filename}"

In [None]:
#export
class AwsStorageClient(StorageClientABC):
    """Storage client that uses AWS for `storage_area` and the local filesystem `local_path`"""
    # TODO: not implemented yet - every method below does nothing and returns None
    def ls(self,what='storage_area'): pass 
    # accepts `overwrite` like the other storage clients so callers can treat all clients alike
    def download(self,filename,overwrite=False): pass 
    def upload(self,filename,overwrite=False): pass

In [None]:
#export
def new_storage_client(storage_name,config_name='secrets/settings.ini'):
    """Create the storage client that matches the `storage_type` configured for `storage_name`.

    Raises `ValueError` if the configured `storage_type` is not "local", "azure" or "aws".
    """
    storage_type=read_config(storage_name,config_name=config_name)['storage_type']
    # one client class per supported storage type
    client_types={
        'local':LocalStorageClient,
        'azure':AzureStorageClient,
        'aws':AwsStorageClient,
    }
    if storage_type not in client_types:
        raise ValueError(f'Unknown storage_type: {storage_type}')
    return client_types[storage_type](storage_name, config_name)

In [None]:
# expected to fail: `gcp_dummy` presumably configures a storage_type without a matching client
# (see test/settings.ini - TODO confirm)
test_fail(lambda: new_storage_client('gcp_dummy','test/settings.ini'))

In [None]:
# start with both folders removed: a new client sees nothing in either place
for p in ['test/local_path','test/storage_area']: _rmtree(p)
    
storage_client=new_storage_client('local_test','test/settings.ini')
assert isinstance(storage_client,LocalStorageClient)
assert storage_client.config['storage_type']=='local'
test_eq([],storage_client.ls())
test_eq([],storage_client.ls('local_path'))
    
# creating local files does not affect the storage area
test_files=_make_local_test_data()
test_eq([],storage_client.ls())
test_eq(test_files,storage_client.ls('local_path'))
        
# upload everything, then wipe the local copies
for f in test_files: storage_client.upload(f)
test_eq(test_files,storage_client.ls())
test_eq(test_files,storage_client.ls('local_path'))
_rmtree('test/local_path')
test_eq([],storage_client.ls('local_path'))

# download restores the local copies with their original content
for f in test_files: storage_client.download(f)
test_eq(test_files,storage_client.ls('local_path'))
test_eq('a little bit of data 4',open('test/local_path/test_data.txt').read())

# a locally modified file is only replaced by download when overwrite is requested
with open('test/local_path/test_data.txt', 'w') as _file: _file.write('upd')
test_eq('upd',open('test/local_path/test_data.txt').read())
storage_client.download('test_data.txt')
test_eq('upd',open('test/local_path/test_data.txt').read())
storage_client.download('test_data.txt',True)
test_eq('a little bit of data 4',open('test/local_path/test_data.txt').read())

# upload refuses to replace an existing file unless overwrite is requested
test_fail(lambda: storage_client.upload('test_data.txt'))
storage_client.upload('test_data.txt',True)

# an archive name that is not in storage has no versions
test_eq(None,storage_client.ls_versions('this/does/not/exitst'))

In [None]:
# for the local client, upload_dataset returns the path of the new archive in storage_area;
# with no earlier versions in storage the first version is 0.0.1
test_eq(Path('test/storage_area/test_data.txt.0.0.1.zip'),storage_client.upload_dataset('test_data.txt'))
test_eq(Path('test/storage_area/sub.0.0.1.zip'),storage_client.upload_dataset('sub'))
# an explicit version literal is used as given; the next default upload is a patch on top of it
test_eq(Path('test/storage_area/a.1.0.0.zip'),storage_client.upload_dataset('a','1.0.0'))
test_eq(Path('test/storage_area/a.1.0.1.zip'),storage_client.upload_dataset('a'))

In [None]:
# the configured storage_type decides which client class is created
# (only construction is checked here - nothing is sent to a remote service)
storage_client=new_storage_client('azure_dummy','test/settings.ini')
assert isinstance(storage_client,AzureStorageClient)
storage_client=new_storage_client('aws_dummy','test/settings.ini')
assert isinstance(storage_client,AwsStorageClient)

In [None]:
# clean-up test data: remove both folders created by the tests above
for p in ['test/local_path','test/storage_area']: _rmtree(p)