Add additional comments and improve variable names
pkalita-lbl committed Oct 19, 2023
1 parent ca5c780 commit b88bb77
Showing 5 changed files with 29 additions and 14 deletions.
10 changes: 5 additions & 5 deletions nmdc_runtime/site/graphs.py
@@ -41,7 +41,7 @@
     get_neon_pipeline_mms_data_product,
     get_neon_pipeline_sls_data_product,
     get_submission_portal_pipeline_inputs,
-    get_csv_file_from_url,
+    get_csv_rows_from_url,
 )


@@ -143,8 +143,8 @@ def translate_metadata_submission_to_nmdc_schema_database():
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
-    omics_processing_mapping = get_csv_file_from_url(omics_processing_mapping_file_url)
-    data_object_mapping = get_csv_file_from_url(data_object_mapping_file_url)
+    omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
+    data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
 
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission, omics_processing_mapping, data_object_mapping
@@ -167,8 +167,8 @@ def ingest_metadata_submission():
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
-    omics_processing_mapping = get_csv_file_from_url(omics_processing_mapping_file_url)
-    data_object_mapping = get_csv_file_from_url(data_object_mapping_file_url)
+    omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
+    data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
 
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission, omics_processing_mapping, data_object_mapping
14 changes: 13 additions & 1 deletion nmdc_runtime/site/ops.py
@@ -809,12 +809,24 @@ def nmdc_schema_database_export_filename_neon() -> str:
 
 
 @op
-def get_csv_file_from_url(url: str) -> List[Dict]:
+def get_csv_rows_from_url(url: str) -> List[Dict]:
+    """Download and parse a CSV file from a remote URL.
+
+    This method fetches data from the given URL and parses that data as CSV. The parsed data
+    is returned as a list (each element corresponds to a row) of dicts (each key is a column
+    name and the value is the corresponding cell value). The dict will *not* contain keys
+    for columns where the cell was empty.
+
+    :param url: Url to fetch and parse
+    :return: List[Dict]
+    """
     if not url:
         return []
 
     response = requests.get(url)
     response.raise_for_status()
 
     reader = csv.DictReader(response.text.splitlines())
+    # Collect all the rows into a list of dicts while stripping out (valfilter) cells where the
+    # value is an empty string (identity returns a Falsy value).
     return [valfilter(identity, row) for row in reader]
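
The empty-cell behavior described by the new docstring and comment can be sketched in isolation. A minimal, self-contained illustration (not part of the commit), assuming `valfilter` and `identity` come from `toolz` as they do in ops.py, with a made-up CSV payload:

    import csv

    from toolz import identity, valfilter

    text = 'a,b,c\n1,,apple\n2,wow,'
    reader = csv.DictReader(text.splitlines())
    rows = [valfilter(identity, row) for row in reader]
    # Cells that were empty strings are dropped entirely, so the
    # resulting dicts simply lack those keys:
    assert rows == [{"a": "1", "c": "apple"}, {"a": "2", "b": "wow"}]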
11 changes: 6 additions & 5 deletions nmdc_runtime/site/translation/submission_portal_translator.py
@@ -67,7 +67,7 @@ def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[str], None]:
         """Get DOI information from the context form data
 
         :param metadata_submission: submission portal entry
-        :return: string or None
+        :return: list of strings or None
         """
         dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
         if not dataset_doi:
@@ -380,7 +380,7 @@ def _transform_dict_for_class(self, raw_values: dict, class_name: str) -> dict:
         raw_values is a dict where the keys are slot names and the values are plain strings.
         Each of the items in this dict will be transformed by the _transform_value_for_slot
         method. If the slot is multivalued each individual value will be transformed. If the
-        slot is multivalued and the value is a string it will be split in pipe characters
+        slot is multivalued and the value is a string it will be split at pipe characters
         before transforming.
         """
         slot_names = self.schema_view.class_slots(class_name)
@@ -392,11 +392,12 @@ def _transform_dict_for_class(self, raw_values: dict, class_name: str) -> dict:
 
             slot_definition = self.schema_view.induced_slot(column, class_name)
             if slot_definition.multivalued:
+                value_list = value
                 if isinstance(value, str):
-                    value = [v.strip() for v in value.split("|")]
+                    value_list = [v.strip() for v in value.split("|")]
                 transformed_value = [
                     self._transform_value_for_slot(item, slot_definition)
-                    for item in value
+                    for item in value_list
                 ]
             else:
                 transformed_value = self._transform_value_for_slot(
@@ -535,7 +536,7 @@ def get_database(self) -> nmdc.Database:
                 for data_object_row in data_objects_by_sample_data_id.get(
                     sample_data_id, []
                 ):
-                    # For each row in the DataObject mapping file that correspond to the sample ID,
+                    # For each row in the DataObject mapping file that corresponds to the sample ID,
                     # transform the raw row data according to the DataObject class's slots, generate
                     # an instance, and connect that instance's minted ID to the OmicsProcessing
                     # instance
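
A minimal sketch of the multivalued handling that the `value_list` rename clarifies, with the per-item `_transform_value_for_slot` call reduced to a pass-through (illustrative only, not code from the commit):

    def split_if_string(value):
        # Mirrors the diff above: only strings are split at pipe characters;
        # values that are already lists pass through unchanged.
        value_list = value
        if isinstance(value, str):
            value_list = [v.strip() for v in value.split("|")]
        return value_list

    assert split_if_string("soil | water|air") == ["soil", "water", "air"]
    assert split_if_string(["kept", "as-is"]) == ["kept", "as-is"]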
2 changes: 2 additions & 0 deletions tests/test_data/test_submission_portal_translator.py
@@ -263,6 +263,8 @@ def now(cls, **kwargs):
         test_datasets = yaml.safe_load_all(f)
 
         for test_data in test_datasets:
+            # Reset the random number seed here so that fake IDs generated by the `test_minter`
+            # fixture are stable across test runs
            random.seed(0)
            translator = SubmissionPortalTranslator(
                **test_data["input"], id_minter=test_minter
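
The rationale for the new comment: seeding Python's `random` module makes subsequent draws deterministic, so IDs minted from it come out identical on every run. A standalone illustration (not from the commit):

    import random

    random.seed(0)
    first = [random.randint(0, 9) for _ in range(3)]
    random.seed(0)
    second = [random.randint(0, 9) for _ in range(3)]
    assert first == second  # same seed, same sequence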
@@ -3,7 +3,7 @@
 import requests_mock
 from dagster import build_op_context
 
-from nmdc_runtime.site.ops import get_csv_file_from_url
+from nmdc_runtime.site.ops import get_csv_rows_from_url
 
 
 def test_valid_data():
@@ -14,7 +14,7 @@ def test_valid_data():
             text='a,b,c\n1,hello,"apple, banana"\n2,wow,great',
         )
 
-        result = get_csv_file_from_url(context, "http://www.example.com/data.csv")
+        result = get_csv_rows_from_url(context, "http://www.example.com/data.csv")
         assert result == [
             {"a": "1", "b": "hello", "c": "apple, banana"},
             {"a": "2", "b": "wow", "c": "great"},
@@ -27,4 +27,4 @@ def test_not_found():
         mock.get("http://www.example.com/data.csv", status_code=404)
 
         with pytest.raises(requests.HTTPError):
-            get_csv_file_from_url(context, "http://www.example.com/data.csv")
+            get_csv_rows_from_url(context, "http://www.example.com/data.csv")
