Add additional comments and improve variable names
pkalita-lbl committed Oct 19, 2023
1 parent ca5c780 commit b88bb77
Showing 5 changed files with 29 additions and 14 deletions.
10 changes: 5 additions & 5 deletions nmdc_runtime/site/graphs.py
@@ -41,7 +41,7 @@
     get_neon_pipeline_mms_data_product,
     get_neon_pipeline_sls_data_product,
     get_submission_portal_pipeline_inputs,
-    get_csv_file_from_url,
+    get_csv_rows_from_url,
 )


@@ -143,8 +143,8 @@ def translate_metadata_submission_to_nmdc_schema_database():
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
-    omics_processing_mapping = get_csv_file_from_url(omics_processing_mapping_file_url)
-    data_object_mapping = get_csv_file_from_url(data_object_mapping_file_url)
+    omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
+    data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
 
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission, omics_processing_mapping, data_object_mapping
@@ -167,8 +167,8 @@ def ingest_metadata_submission():
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
-    omics_processing_mapping = get_csv_file_from_url(omics_processing_mapping_file_url)
-    data_object_mapping = get_csv_file_from_url(data_object_mapping_file_url)
+    omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
+    data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
 
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission, omics_processing_mapping, data_object_mapping
14 changes: 13 additions & 1 deletion nmdc_runtime/site/ops.py
@@ -809,12 +809,24 @@ def nmdc_schema_database_export_filename_neon() -> str:
 
 
 @op
-def get_csv_file_from_url(url: str) -> List[Dict]:
+def get_csv_rows_from_url(url: str) -> List[Dict]:
+    """Download and parse a CSV file from a remote URL.
+
+    This method fetches data from the given URL and parses that data as CSV. The parsed data
+    is returned as a list (each element corresponds to a row) of dicts (each key is a column
+    name and the value is the corresponding cell value). The dict will *not* contain keys
+    for columns where the cell was empty.
+
+    :param url: Url to fetch and parse
+    :return: List[Dict]
+    """
     if not url:
         return []
 
     response = requests.get(url)
     response.raise_for_status()
 
     reader = csv.DictReader(response.text.splitlines())
+    # Collect all the rows into a list of dicts while stripping out (valfilter) cells where the
+    # value is an empty string (identity returns a Falsy value).
     return [valfilter(identity, row) for row in reader]
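
The empty-cell behavior described by the new docstring and comment can be sketched in isolation. A minimal, self-contained illustration (not part of the commit), assuming `valfilter` and `identity` come from `toolz` as they do in ops.py, with a made-up CSV payload:

    import csv

    from toolz import identity, valfilter

    text = 'a,b,c\n1,,apple\n2,wow,'
    reader = csv.DictReader(text.splitlines())
    rows = [valfilter(identity, row) for row in reader]
    # Cells that were empty strings are dropped entirely, so the
    # resulting dicts simply lack those keys:
    assert rows == [{"a": "1", "c": "apple"}, {"a": "2", "b": "wow"}]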
11 changes: 6 additions & 5 deletions nmdc_runtime/site/translation/submission_portal_translator.py
@@ -67,7 +67,7 @@ def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[str], None]:
         """Get DOI information from the context form data
 
         :param metadata_submission: submission portal entry
-        :return: string or None
+        :return: list of strings or None
         """
         dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
         if not dataset_doi:
@@ -380,7 +380,7 @@ def _transform_dict_for_class(self, raw_values: dict, class_name: str) -> dict:
         raw_values is a dict where the keys are slot names and the values are plain strings.
         Each of the items in this dict will be transformed by the _transform_value_for_slot
         method. If the slot is multivalued each individual value will be transformed. If the
-        slot is multivalued and the value is a string it will be split in pipe characters
+        slot is multivalued and the value is a string it will be split at pipe characters
         before transforming.
         """
         slot_names = self.schema_view.class_slots(class_name)
@@ -392,11 +392,12 @@ def _transform_dict_for_class(self, raw_values: dict, class_name: str) -> dict:
 
             slot_definition = self.schema_view.induced_slot(column, class_name)
             if slot_definition.multivalued:
+                value_list = value
                 if isinstance(value, str):
-                    value = [v.strip() for v in value.split("|")]
+                    value_list = [v.strip() for v in value.split("|")]
                 transformed_value = [
                     self._transform_value_for_slot(item, slot_definition)
-                    for item in value
+                    for item in value_list
                 ]
             else:
                 transformed_value = self._transform_value_for_slot(
@@ -535,7 +536,7 @@ def get_database(self) -> nmdc.Database:
                 for data_object_row in data_objects_by_sample_data_id.get(
                     sample_data_id, []
                 ):
-                    # For each row in the DataObject mapping file that correspond to the sample ID,
+                    # For each row in the DataObject mapping file that corresponds to the sample ID,
                     # transform the raw row data according to the DataObject class's slots, generate
                     # an instance, and connect that instance's minted ID to the OmicsProcessing
                     # instance
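
A minimal sketch of the multivalued handling that the `value_list` rename clarifies, with the per-item `_transform_value_for_slot` call reduced to a pass-through (illustrative only, not code from the commit):

    def split_if_string(value):
        # Mirrors the diff above: only strings are split at pipe characters;
        # values that are already lists pass through unchanged.
        value_list = value
        if isinstance(value, str):
            value_list = [v.strip() for v in value.split("|")]
        return value_list

    assert split_if_string("soil | water|air") == ["soil", "water", "air"]
    assert split_if_string(["kept", "as-is"]) == ["kept", "as-is"]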
2 changes: 2 additions & 0 deletions tests/test_data/test_submission_portal_translator.py
@@ -263,6 +263,8 @@ def now(cls, **kwargs):
         test_datasets = yaml.safe_load_all(f)
 
         for test_data in test_datasets:
+            # Reset the random number seed here so that fake IDs generated by the `test_minter`
+            # fixture are stable across test runs
            random.seed(0)
            translator = SubmissionPortalTranslator(
                **test_data["input"], id_minter=test_minter
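
The rationale for the new comment: seeding Python's `random` module makes subsequent draws deterministic, so IDs minted from it come out identical on every run. A standalone illustration (not from the commit):

    import random

    random.seed(0)
    first = [random.randint(0, 9) for _ in range(3)]
    random.seed(0)
    second = [random.randint(0, 9) for _ in range(3)]
    assert first == second  # same seed, same sequence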
@@ -3,7 +3,7 @@
 import requests_mock
 from dagster import build_op_context
 
-from nmdc_runtime.site.ops import get_csv_file_from_url
+from nmdc_runtime.site.ops import get_csv_rows_from_url
 
 
 def test_valid_data():
@@ -14,7 +14,7 @@ def test_valid_data():
             text='a,b,c\n1,hello,"apple, banana"\n2,wow,great',
         )
 
-        result = get_csv_file_from_url(context, "http://www.example.com/data.csv")
+        result = get_csv_rows_from_url(context, "http://www.example.com/data.csv")
         assert result == [
             {"a": "1", "b": "hello", "c": "apple, banana"},
             {"a": "2", "b": "wow", "c": "great"},
@@ -27,4 +27,4 @@ def test_not_found():
         mock.get("http://www.example.com/data.csv", status_code=404)
 
         with pytest.raises(requests.HTTPError):
-            get_csv_file_from_url(context, "http://www.example.com/data.csv")
+            get_csv_rows_from_url(context, "http://www.example.com/data.csv")
