Skip to content

Commit

Permalink
Merge pull request #183 from Tehsurfer/s3-path-fix
Browse files Browse the repository at this point in the history
S3 path fix
  • Loading branch information
Tehsurfer committed Aug 22, 2023
2 parents bf47b39 + f8e6f9c commit 14ea83a
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 16 deletions.
53 changes: 38 additions & 15 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from app.scicrunch_process_results import process_results, process_get_first_scaffold_info, reform_aggregation_results, \
reform_curies_results, reform_dataset_results, reform_related_terms, reform_anatomy_results
from app.serializer import ContactRequestSchema
from app.utilities import img_to_base64_str
from app.utilities import img_to_base64_str, get_path_from_mangled_list
from app.osparc.osparc import start_simulation as do_start_simulation
from app.osparc.osparc import check_simulation as do_check_simulation
from app.biolucida_process_results import process_results as process_biolucida_results
Expand Down Expand Up @@ -417,33 +417,56 @@ def get_discover_path():

return abort(404, description=f'Failed to retrieve uri {uri}')

# Reverse proxy for objects from S3, a simple get object
# operation. This is used by scaffoldvuer and its
# important to keep the relative <path> for accessing
# other required files.
@app.route("/s3-resource/<path:path>")
def direct_download_url(path, bucket_name=Config.DEFAULT_S3_BUCKET_NAME):

query_args = request.args
s3BucketName = query_args.get("s3BucketName", bucket_name)

def s3_header_check(path, bucket_name):
    """Validate an S3 key with a HEAD request before attempting a download.

    Returns a ``(status_code, message)`` tuple: ``(200, 'OK')`` when the
    object exists and is under the size limit, or ``(404, ...)`` when the
    key is missing, so the caller can retry with a de-mangled path.
    Aborts the request outright (never returns) when the object exceeds
    ``Config.DIRECT_DOWNLOAD_LIMIT`` or S3 reports any other error.
    """
    try:
        head_response = s3.head_object(
            Bucket=bucket_name,
            Key=path,
            RequestPayer="requester"
        )
        # Missing ContentLength defaults to the limit itself, which also
        # triggers the 413 below (fail closed rather than stream blindly).
        content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
        if content_length and not content_length < Config.DIRECT_DOWNLOAD_LIMIT:  # default limit is 20 MB
            abort(413, description=f"File too big to download: {content_length}")
    except botocore.exceptions.ClientError as err:
        # NOTE: head_object reports a missing key via ClientError because of
        # https://github.com/boto/boto3/issues/2442
        if err.response["Error"]["Code"] == "404":
            # Return a tuple (do not abort) so the caller can attempt
            # name de-mangling and re-check the modified path.
            return (404, 'Provided path was not found on the s3 resource')
        else:
            # The error code arrives as a string; abort() needs an int
            # status or it raises LookupError instead of the HTTP error.
            abort(int(err.response["Error"]["Code"]), err.response["Error"]["Message"])
    else:
        return (200, 'OK')

# Reverse proxy for objects from S3, a simple get object
# operation. This is used by scaffoldvuer, and it's
# important to keep the relative <path> for accessing
# other required files.
@app.route("/s3-resource/<path:path>")
def direct_download_url(path, bucket_name=Config.DEFAULT_S3_BUCKET_NAME):

query_args = request.args
s3BucketName = query_args.get("s3BucketName", bucket_name)
s3_path = path # Will modify s3_path if we find name mangling

# Check the header to see if too large or does not exist
response = s3_header_check(path, s3BucketName)

# If the file does not exist, check if the name was mangled
if response[0] == 404:
s3_path_modified = get_path_from_mangled_list(path)
if s3_path_modified == s3_path:
abort(404, description=f'Provided path was not found on the s3 resource') # Abort if path did not change

# Check the modified path
response2 = s3_header_check(s3_path_modified, s3BucketName)
if response2[0] == 200:
s3_path = s3_path_modified # Modify the path if de-mangling was successful
elif response2[0] == 404:
abort(404, description=f'Provided path was not found on the s3 resource')


response = s3.get_object(
Bucket=s3BucketName,
Key=path,
Key=s3_path,
RequestPayer="requester"
)

Expand Down
4 changes: 3 additions & 1 deletion app/manifest_name_to_discover_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -5008,5 +5008,7 @@
'files/primary/sub-Cadaver-Subject-2/Ultrasound_raw_Cadaver_Subject 2.mp4': 'files/primary/sub-Cadaver-Subject-2/Ultrasound_raw_Cadaver_Subject_2.mp4',
'files/primary/sub-Cadaver-Subject-3/Ultrasound_raw_Cadaver_Subject 3.mp4': 'files/primary/sub-Cadaver-Subject-3/Ultrasound_raw_Cadaver_Subject_3.mp4',
'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-3_Pgp9.5.jpx': 'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-3 Pgp9.5.jpx',
'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-4_Pgp9.5.jpx': 'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-4 Pgp9.5.jpx',
'files/derivative/mapped_Pig 7_thumbnail.jpeg': 'files/derivative/mapped_Pig_7_thumbnail.jpeg',
'files/derivative/mapped_Pig 16_thumbnail.jpeg': 'files/derivative/mapped_Pig_16_thumbnail.jpeg'
}
9 changes: 9 additions & 0 deletions app/utilities.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
import base64
from io import BytesIO
from app.manifest_name_to_discover_name import name_map

def get_path_from_mangled_list(s3_path):
    """Map a name-mangled file path back to its Discover (S3) name.

    S3 keys carry a dataset/version prefix ahead of the ``files/``
    segment, e.g. ``328/1/files/...``.  Only the ``files/...`` suffix is
    looked up in ``name_map``; the prefix is re-attached unchanged.
    Returns the path untouched when it has no ``files/`` segment or no
    mapping is listed for it.
    """
    # partition (unlike split) keeps any later 'files/' occurrences in the
    # remainder intact, and lets us handle a missing marker gracefully
    # instead of raising IndexError.
    prefix, marker, remainder = s3_path.partition('files/')
    if not marker:
        return s3_path
    file_path = marker + remainder
    # Switch the path to the mapped one if it is listed.
    return prefix + name_map.get(file_path, file_path)

def img_to_base64_str(img):
"""
Expand Down
24 changes: 24 additions & 0 deletions tests/test_dataset_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pytest
from app import app

from app.config import Config

from timeit import default_timer as timer

from app.scicrunch_processing_common import SCAFFOLD_FILE, PLOT_FILE, COMMON_IMAGES, THUMBNAIL_IMAGE, NAME, BIOLUCIDA_3D, VIDEO, SEGMENTATION_FILES, BIOLUCIDA_2D
Expand Down Expand Up @@ -273,3 +275,25 @@ def test_pennsieve_identifier_dataset_search(client):
print(first_result)
for r in result[BIOLUCIDA_3D]:
print(r['dataset']['path'])

def test_name_mangling_for_s3_resource(client):
    # Dataset 328 holds a file whose space was converted to an underscore;
    # a 200 here shows the name de-mangling path resolves it.
    url = ('/s3-resource/328/1/files/derivative/mapped_Pig%207_thumbnail.jpeg'
           '?s3BucketName=prd-sparc-discover-use1')
    response = client.get(url)
    assert response.status_code == 200

def test_size_limit_on_mangled_s3_resource(client):
    # Checks that a mangled file that exceeds the download limit returns 413.
    config_download_limit = Config.DIRECT_DOWNLOAD_LIMIT  # Store download limit
    Config.DIRECT_DOWNLOAD_LIMIT = 20  # set limit to 20 bytes to force a 413

    # try/finally guarantees the limit is restored even if the request
    # raises; the original bare `except: pass` swallowed every error and
    # left `r` unbound, turning any failure into a confusing NameError.
    try:
        r = client.get('/s3-resource/328/1/files/derivative/mapped_Pig%207_thumbnail.jpeg?s3BucketName=prd-sparc-discover-use1')
    finally:
        Config.DIRECT_DOWNLOAD_LIMIT = config_download_limit  # set limit back

    assert r.status_code == 413  # Check we got the correct response

0 comments on commit 14ea83a

Please sign in to comment.