Skip to content
This repository has been archived by the owner on Aug 13, 2021. It is now read-only.

Commit

Permalink
[326 precursor] Factored in utils from 326_rename (#329)
Browse files Browse the repository at this point in the history
* file renaming factored from 326_nih

* revert autobatch

* removed references to old health_data terminology in favour of nih

* traded lambda for partial

* factored out utils

* factored in utils from 326_rename

* added docstrings and comments, and small amount of refactoring

* added pk tests

* test for auto pkey check

* test for generate pk

* added retrieve pk tests

* added delete stmt test

* added delete stmt test

* added merge tests

* added bucket keys tests

* backwards compatibility for older pipelines

* fixes wrt new insert method

* added minor fix to GtR routine due to Tanzania

* added sleeps to test to avoid overhitting the api
  • Loading branch information
Joel Klinger committed Nov 11, 2020
1 parent b46ad14 commit e2d13fd
Show file tree
Hide file tree
Showing 9 changed files with 426 additions and 48 deletions.
13 changes: 7 additions & 6 deletions nesta/core/batchables/gtr/collect_gtr/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from nesta.packages.gtr.get_gtr_data import TOP_URL

from nesta.core.orms.orm_utils import insert_data
from nesta.core.orms.orm_utils import orm_column_names
from nesta.core.orms.orm_utils import get_class_by_tablename
from nesta.core.orms.gtr_orm import Base
from nesta.core.luigihacks.s3 import parse_s3_path
Expand Down Expand Up @@ -54,9 +55,10 @@ def run():
objs = []
for table_name, rows in data.items():
_class = get_class_by_tablename(Base, f"gtr_{table_name}")
# Remove any fields that aren't in the ORM
cleaned_rows = [{k:v for k, v in row.items() if k in _class.__dict__}
for row in rows]
# Remove any fields that aren't in the ORM and set NULL as default
field_names = orm_column_names(_class)
cleaned_rows = [{field: (row[field] if field in row else None)
for field in field_names} for row in rows]
objs += insert_data("BATCHPAR_config", "mysqldb", db,
Base, _class, cleaned_rows)

Expand All @@ -73,9 +75,8 @@ def run():
if __name__ == "__main__":
# Local testing
if "BATCHPAR_outinfo" not in os.environ:
os.environ['BATCHPAR_TOPURL'] = "https://gtr.ukri.org/gtr/api/projects"
os.environ['BATCHPAR_PAGESIZE'] = "100"
os.environ['BATCHPAR_page'] = "647"
os.environ['BATCHPAR_PAGESIZE'] = "10"
os.environ['BATCHPAR_page'] = "2"
os.environ["BATCHPAR_db"] = "dev"
os.environ["BATCHPAR_outinfo"] = ""
os.environ["BATCHPAR_config"] = os.environ["MYSQLDBCONF"]
Expand Down
17 changes: 16 additions & 1 deletion nesta/core/luigihacks/misctools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from functools import lru_cache
import yaml
from datetime import datetime as dt

import boto3

def get_config(file_name, header, path="core/config/"):
'''Get the configuration from a file in the luigi config path
Expand Down Expand Up @@ -125,3 +125,18 @@ def extract_task_info(luigi_task):
task_name = type(luigi_task).__name__
routine_id = f'{task_name}-{luigi_task.date}-{test}'
return test, routine_id


@lru_cache()
def bucket_keys(bucket_name):
    """Return the key of every object held in an S3 bucket.

    Results are memoised by ``lru_cache``, so calling again with the
    same bucket name reuses the first listing.

    Args:
        bucket_name (str): Name of the bucket to list.
    Returns:
        keys (set): The keys of all objects in the bucket.
    """
    bucket = boto3.resource('s3').Bucket(bucket_name)
    return {s3_object.key for s3_object in bucket.objects.all()}
6 changes: 6 additions & 0 deletions nesta/core/luigihacks/tests/test_elasticsearchplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest import mock
from alphabet_detector import AlphabetDetector
from collections import Counter
import time

from nesta.core.luigihacks.elasticsearchplus import Translator

Expand Down Expand Up @@ -94,6 +95,7 @@ def test_sentence_chunks():

def test_auto_translate_true_short(row):
"""The translator shouldn't be applied for short pieces of text"""
time.sleep(5)
_row = _auto_translate(row, TRANSLATOR, 1000)
assert not _row.pop(TRANS_TAG)
assert len(_row.pop(LANGS_TAG)) == 0
Expand All @@ -102,7 +104,9 @@ def test_auto_translate_true_short(row):
assert row == _row

def test_auto_translate_true_long_small_chunks(row):
time.sleep(5)
_row_1 = _auto_translate(row, TRANSLATOR, 10, chunksize=1)
time.sleep(5)
_row_2 = _auto_translate(row, TRANSLATOR, 10, chunksize=10000)
assert _row_1.pop('mixed_lang') != _row_2.pop('mixed_lang')

Expand All @@ -129,6 +133,7 @@ def test_auto_translate_true_long_small_chunks(row):
assert _row_1 == _row_2

def test_auto_translate_true_long(row):
time.sleep(5)
_row = _auto_translate(row, TRANSLATOR, 10)
assert row.pop('korean') != _row['korean']
assert row.pop('mixed_lang') != _row['mixed_lang']
Expand All @@ -147,6 +152,7 @@ def test_auto_translate_true_long(row):
def test_auto_translate_false(row):
row.pop('korean')
row.pop('mixed_lang')
time.sleep(5)
_row = _auto_translate(row, TRANSLATOR)
assert not _row.pop(TRANS_TAG)
assert _row.pop(LANGS_TAG) == ['en']
Expand Down
20 changes: 19 additions & 1 deletion nesta/core/luigihacks/tests/test_misctools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest import TestCase
from unittest import TestCase, mock
from nesta.core.luigihacks.misctools import get_config
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
from nesta.core.luigihacks.misctools import bucket_keys


class TestMiscTools(TestCase):
Expand All @@ -15,3 +16,20 @@ def test_find_filepath_from_pathstub(self):
find_filepath_from_pathstub("nesta/packages")
with self.assertRaises(FileNotFoundError):
find_filepath_from_pathstub("nesta/package")


@mock.patch('nesta.core.luigihacks.misctools.boto3')
def test_bucket_keys(mocked_boto3):
    """bucket_keys should return the key of every object in the bucket."""
    expected_keys = {'foo', 'bar', 'baz'}

    # Stand-in bucket: one mocked object per expected key
    fake_objects = [mock.Mock(key=key) for key in expected_keys]
    fake_bucket = mocked_boto3.resource.return_value.Bucket.return_value
    fake_bucket.objects.all.return_value = fake_objects

    # The reported keys must match those held by the stand-in bucket
    assert bucket_keys('dummy') == expected_keys

0 comments on commit e2d13fd

Please sign in to comment.