In [1]:
import sys
# Put the parent directory at the front of the import path (only once),
# presumably so the local package can be imported from this notebook.
if "../" not in sys.path:
    sys.path.insert(0, "../")

## Building utilities for marynlp

Creating multiple utilities for building models, downloading files, and a ton of other things.

In [None]:
# downloading the data

def cache_from_google_bucket(online_path: str, save_path: str, credentials_json_path: str, bucket_name: str = STORAGE_BUCKET, overwrite: bool = True, **bucket_blob_args):
    """Download a blob from a Google Cloud Storage bucket to a local file.

    Args:
        online_path: Name of the blob inside the bucket.
        save_path: Local file path the blob contents are written to.
        credentials_json_path: Path to the service-account JSON key used to
            build the storage client.
        bucket_name: Name of the bucket to read from.
        overwrite: When ``False`` and ``save_path`` already exists, nothing
            is downloaded.
        **bucket_blob_args: Extra keyword arguments forwarded to
            ``bucket.blob``.

    Returns:
        ``True`` when the blob was downloaded, ``False`` when the download
        was skipped because ``save_path`` exists and ``overwrite`` is off.

    Raises:
        DefaultCredentialsError: Logged and re-raised when the client cannot
            be built from ``credentials_json_path``.
    """
    # Gets the google client associated with the service account
    # NOTE: a valid service account needs to have access to the
    # marynlp-private storage
    try:
        google_client = storage.Client.from_service_account_json(str(credentials_json_path))
    except DefaultCredentialsError:
        logger.error('Unable to get client from credentials path')
        raise

    save_path = Path(save_path)

    if save_path.exists():
        logger.info("Data already exists in '%s'", save_path)

        if not overwrite:
            # Say why nothing happens instead of emitting an empty log line.
            logger.info('Skipping download; overwrite is disabled')
            return False

        # overwrite is true: fall through and replace the existing file
        logger.info('Overwriting contents')

    bucket = google_client.get_bucket(bucket_name)
    blob = bucket.blob(online_path, **bucket_blob_args)
    blob.download_to_filename(str(save_path))

    logger.info("Contents in [%s:%s] have been cached to '%s'", bucket_name, online_path, save_path)

    # Explicit success value so callers can tell "downloaded" from "skipped".
    return True
    

### For working with files in GCP

Utilities for working with files stored in Google Cloud Platform (GCP) storage buckets.

In [78]:
import os
from typing import Union, Optional

# stored in `marynlp.utils.storage.google`

from google.cloud.storage import Client as GCStorageClient
from google.cloud.storage.bucket import Bucket as GCBucket
from google.cloud.storage.blob import Blob as GCBlob

# Helper functions for downloading contents from a Google Cloud Storage bucket.
# TODO: create utilities that report helpful information about the output of the model.
def get_google_bucket_storage_client_from_credential_file(credential_file: Union[str, os.PathLike]) -> GCStorageClient:
    """Build a storage client from a service-account JSON credential file.

    ``credential_file`` is converted with ``str()`` and handed to
    ``GCStorageClient.from_service_account_json``; the resulting client is
    returned.
    """
    credential_path = str(credential_file)
    storage_client = GCStorageClient.from_service_account_json(credential_path)
    return storage_client

def get_bucket_from_client(bucket_name: str, storage_client: GCStorageClient, *args, **kwargs) -> GCBucket:
    """Look up ``bucket_name`` through ``storage_client`` and return the bucket.

    Any extra positional or keyword arguments are forwarded unchanged to
    ``storage_client.get_bucket``.
    """
    return storage_client.get_bucket(bucket_name, *args, **kwargs)

def get_blob_from_bucket(blob_name: str, bucket: GCBucket, *args, **kwargs) -> GCBlob:
    """Return the blob object that ``bucket.blob`` yields for ``blob_name``.

    Any extra positional or keyword arguments are forwarded unchanged to
    ``bucket.blob``.
    """
    blob_handle = bucket.blob(blob_name, *args, **kwargs)
    return blob_handle

def save_to_file(save_to_path: str, blob: GCBlob):
    """Download the contents of ``blob`` into the local file ``save_to_path``."""
    destination = save_to_path
    blob.download_to_filename(destination)

In [76]:
# Size unit of 256 KB expressed in bytes; used below for the blob chunk size.
_256KB = 1024 * 256;

# Path to the service-account credentials JSON file, relative to this notebook.
storage_client_path = "../resources/mary_africa_credentials_key.json"

# get_google_bucket_storage_client_from_credential_file("wrong_file")
# Build the storage client, look up the bucket, then get a blob object for 'ner.txt'.
# NOTE(review): `gc` is also the name of a standard-library module; the name
# is kept here because other cells may rely on it.
gc = get_google_bucket_storage_client_from_credential_file(storage_client_path)
bucket = get_bucket_from_client("inspired-nlp-bucket", gc)
# NOTE(review): chunk_size is given as a multiple of 256 KB, presumably because
# the storage API expects that; confirm against the Blob documentation.
blob = get_blob_from_bucket('ner.txt', bucket, chunk_size=1 * _256KB)


In [None]:
## file_system
def get_local_path(*args, **kwargs):
    """Placeholder for a local file-system path helper (not implemented yet).

    The cell previously held only an unfinished ``def`` header, which is a
    syntax error. This stub keeps the name defined and fails loudly when
    called until the real signature and behaviour are decided.

    Raises:
        NotImplementedError: Always.
    """
    # TODO: define the real parameters and return value.
    raise NotImplementedError("get_local_path is not implemented yet")

In [89]:
# Store file
import logging
# Name of the logger used by the helpers in this notebook.
LOGGER_KEY = "marynlp"

# Logger the download helpers report their progress on.
nena_logger = logging.getLogger(name=LOGGER_KEY)


def download_from_bucket(blob_name: str, save_filename: str, bucket: GCBucket):
    """Download the blob ``blob_name`` from ``bucket`` into ``save_filename``.

    Progress is reported at INFO level on ``nena_logger``.

    Args:
        blob_name: Name of the blob to fetch from the bucket.
        save_filename: Local file the blob contents are written to.
        bucket: Bucket object the blob is looked up in.
    """
    # Normalise once so a path-like destination works for both the download
    # call and the log message (string concatenation would raise TypeError).
    save_filename = str(save_filename)

    # Pass values as logging arguments so the message is only formatted when
    # the record is actually emitted.
    nena_logger.info("Downloading '%s'...", blob_name)
    save_to_file(save_filename, get_blob_from_bucket(blob_name, bucket))
    nena_logger.info('Download complete: %s', save_filename)
    
# Example run: fetch 'ner.txt' from `bucket` (created in an earlier cell) into the local file 'sample_file'.
download_from_bucket('ner.txt', 'sample_file', bucket)

INFO:marynlp:Downloading 'ner.txt'...
INFO:marynlp:Download complete: sample_file


In [90]:
## unzip the file
from typing import Optional

def unzip_file(file_to_zip: str, save_to_path: Optional[str] = None):
    """Extract every member of the zip archive ``file_to_zip``.

    Members are written under ``save_to_path``; when it is ``None`` the
    ``ZipFile.extractall`` default (the current working directory) is used.
    """
    import zipfile

    with zipfile.ZipFile(file_to_zip) as archive:
        archive.extractall(save_to_path)

# unzip_file("../resources/operate_on/assets_20201211204302.zip", "here")


In [54]:
# Scratch check: bytes(60) is 60 zero bytes, so decoding as ASCII gives a string of 60 NUL characters.
bytes(60).decode('ascii')

'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'