In [1]:
import sys

# Put the parent directory at the front of the import path (only once),
# presumably so the in-repo `marynlp` package can be imported from this notebook.
if "../" not in sys.path:
    sys.path.insert(0, "../")

## Building utilities for marynlp

Creating multiple utilities for building models, downloading the files and a ton of other things

In [2]:
## Module downloading sequence

In [3]:
# Model alias -> zip blob name (presumably the object path inside the bucket).
flair_gcp = dict([
    ('sw-exp-sent_analy-small', 'flair/classifier/sw-exp-sent_analy-small.zip'),
    ('early-wpc', 'flair/classifier/exp-wpc-small.zip'),
    ('early-sentiment-hasf', 'flair/classifier/sw-ft100-ffw-bilstm-exp-sent_analy-small-h256-noreproj.zip'),
    ('early-alpha-tag-ner', 'flair/taggers/sw-ner-gen1f-base.zip'),
    ('early-alpha-tag-pos', 'flair/taggers/sw-pos-early-h256.zip'),
])

# Blob name of a small model archive; the same literal is passed to the
# download calls in later cells.
small_one = "flair-h_128-nl_1-f-char_lvl.zip"

In [9]:
from marynlp.utils import storage

def get_model_path_from_bucket(model_path: str):
    """Placeholder: not implemented yet.

    Accepts ``model_path`` but does nothing with it and returns ``None``.
    """
    return None

# Open the bucket with the service-account key file.
# NOTE(review): the key-file location and bucket name are hard-coded here;
# consider moving them to a configuration cell.
bucket = storage.get_bucket("../resources/mary_africa_credentials_key.json", "marynlp-private")

# create the path
from pathlib import Path
save_path = Path("./temp/stuff.zip")
# parents=True keeps this working if the target is ever nested more deeply.
save_path.parent.mkdir(parents=True, exist_ok=True)
# Reuse `save_path` rather than repeating the literal, so the directory that
# was just created and the download destination cannot drift apart.
path = storage.localize_google_cloud_file(
    "flair-h_128-nl_1-f-char_lvl.zip",
    bucket=bucket,
    save_to_path=str(save_path),
)


In [11]:
# Unzip the archive at `path` (the value returned by the download call in the
# previous cell) into the local './stuff_contents' folder.
# NOTE(review): relies on `path` still being defined in the kernel.
storage.unzip_file(path, './stuff_contents')

In [14]:
# Ask the storage helper for a temporary location for this blob name,
# then display it as the cell result.
temp_path = storage.local.get_temp_path("flair-h_128-nl_1-f-char_lvl.zip")
temp_path

'/tmp/tmp7m30mcf_/flair-h_128-nl_1-f-char_lvl.zip'

In [28]:
from marynlp.utils import storage


def download_file_from_google_temporary(cloud_file_blob_name: str, bucket):
    """Download a bucket blob to a temporary local path.

    The destination is obtained from ``storage.local.get_temp_path`` for the
    blob name; the transfer is delegated to ``download_file_from_google``
    and its result is returned unchanged.
    """
    return download_file_from_google(
        cloud_file_blob_name,
        bucket,
        save_to_path=storage.local.get_temp_path(cloud_file_blob_name),
    )

def download_file_from_google_to_store(cloud_file_blob_name: str, bucket):
    """Download a bucket blob into the local store.

    The destination is obtained from ``storage.local.get_path_from_store``
    for the blob name; the transfer is delegated to
    ``download_file_from_google`` and its result is returned unchanged.
    """
    return download_file_from_google(
        cloud_file_blob_name,
        bucket,
        save_to_path=storage.local.get_path_from_store(cloud_file_blob_name),
    )

def download_file_from_google(cloud_file_blob_name: str, bucket, save_to_path: str):
    """Download a bucket blob to ``save_to_path``, creating parent folders first.

    Args:
        cloud_file_blob_name: Name of the blob to fetch.
        bucket: Bucket object handed through to
            ``storage.localize_google_cloud_file``.
        save_to_path: Local file path the blob should be written to.

    Returns:
        Whatever ``storage.localize_google_cloud_file`` returns.
    """
    # create parent folders; parents=True is required because blob names can be
    # nested (e.g. "flair/classifier/x.zip"), so several directory levels may
    # be missing at once.
    Path(save_to_path).parent.mkdir(parents=True, exist_ok=True)
    return storage.localize_google_cloud_file(cloud_file_blob_name, bucket=bucket, save_to_path=save_to_path)

def prepare_zipped_model_from_google(cloud_file_blob_name: str, bucket, folder_name_for_contents: str = None):
    """Download a zipped blob to a temporary path and unzip it into the store.

    Args:
        cloud_file_blob_name: Name of the zipped blob to fetch.
        bucket: Bucket object passed on to the download helper.
        folder_name_for_contents: Name of the store folder to unzip into.
            When ``None``, the blob name with its last ``.extension`` removed
            is used.

    Returns:
        The store folder path, as a string, that the archive was unzipped to.
    """
    # download model to temporary location
    print('Downloading:', cloud_file_blob_name)
    temp_zipped_path = download_file_from_google_temporary(cloud_file_blob_name, bucket)
    print("Temp path:", temp_zipped_path)

    store_folder_name = folder_name_for_contents
    if folder_name_for_contents is None:
        # Drop the last extension. If that leaves nothing (the blob name has
        # no usable stem), fall back to the full name instead of an empty
        # folder name, which would unzip straight into the store root.
        store_folder_name = cloud_file_blob_name.rpartition(".")[0] or cloud_file_blob_name

    # unzip to a different location
    store_folder_path = storage.local.get_path_from_store(store_folder_name)
    storage.unzip_file(temp_zipped_path, store_folder_path)
    print("Unzipped to path:", store_folder_path)

    return str(store_folder_path)

In [29]:
# Fetch and unpack the small model archive; keep the resulting store folder path.
model_location = prepare_zipped_model_from_google(
    cloud_file_blob_name="flair-h_128-nl_1-f-char_lvl.zip",
    bucket=bucket,
)

Downloading: flair-h_128-nl_1-f-char_lvl.zip
Temp path: /tmp/tmpbv4tdenj/flair-h_128-nl_1-f-char_lvl.zip
Unzipped to path: /home/iam-kevin/.marynlp/store/flair-h_128-nl_1-f-char_lvl


In [30]:
from marynlp import funcutils as f

# Flair-related models.
# Each table maps a model alias to a (blob name, zipped) pair; every entry
# listed here is a zip archive, so the flag is always True.
FLAIR_TEXT_CLASSIFIERS = {
    alias: (blob_name, True)
    for alias, blob_name in (
        ('sw-exp-sent_analy-small', 'flair/classifier/sw-exp-sent_analy-small.zip'),
        ('early-wpc', 'flair/classifier/exp-wpc-small.zip'),
        ('early-sentiment-hasf', 'flair/classifier/sw-ft100-ffw-bilstm-exp-sent_analy-small-h256-noreproj.zip'),
    )
}

FLAIR_SEQUENCE_TAGGERS = {
    alias: (blob_name, True)
    for alias, blob_name in (
        ('early-alpha-tag-ner', 'flair/taggers/sw-ner-gen1f-base.zip'),
        ('early-alpha-tag-pos', 'flair/taggers/sw-pos-early-h256.zip'),
    )
}

def get_model_from_google_bucket(src: str, flair_model_path_dict, bucket):
    """Resolve ``src`` to a local model path, downloading it when it is a known alias.

    Args:
        src: Either a key of ``flair_model_path_dict`` or a path to an
            existing local model.
        flair_model_path_dict: Mapping of alias -> ``(blob_name, zipped)``.
        bucket: Bucket object passed on to the download helpers.

    Returns:
        ``src`` itself when it is not a known alias and exists locally;
        otherwise the result of the matching download helper.

    Raises:
        AssertionError: If ``src`` is not a known alias and no such local
            path exists.
    """
    if src not in flair_model_path_dict:
        # Raised explicitly rather than via an `assert` statement, so the check
        # still runs under `python -O`. The exception type and message are
        # kept so existing callers see the same failure.
        if not Path(src).exists():
            raise AssertionError("The model path '%s' doesn't exist" % src)
        return src

    # download the model and provide it
    blob_name, zipped = flair_model_path_dict[src]

    if zipped:
        # downloads the model and unzips it
        return prepare_zipped_model_from_google(blob_name, bucket)

    # if not zipped
    return download_file_from_google_to_store(blob_name, bucket)

# Compose `SequenceTagger.load` with `get_model_from_google_bucket` via `f.apply`.
# NOTE(review): `SequenceTagger` is not imported in any visible cell, so this line
# raises NameError on a fresh kernel — presumably `flair.models.SequenceTagger`; confirm and import it.
# NOTE(review): the annotation says `SequenceTagger`, but the value looks like a
# callable built by `f.apply` — confirm against `marynlp.funcutils.apply`.
build_sequence_tagger: SequenceTagger = f.apply(SequenceTagger.load)(get_model_from_google_bucket)