Skip to content

Commit

Permalink
Merge pull request #183 from Tehsurfer/s3-path-fix
Browse files Browse the repository at this point in the history
S3 path fix
  • Loading branch information
Tehsurfer committed Aug 22, 2023
2 parents bf47b39 + f8e6f9c commit 14ea83a
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 16 deletions.
53 changes: 38 additions & 15 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from app.scicrunch_process_results import process_results, process_get_first_scaffold_info, reform_aggregation_results, \
reform_curies_results, reform_dataset_results, reform_related_terms, reform_anatomy_results
from app.serializer import ContactRequestSchema
from app.utilities import img_to_base64_str
from app.utilities import img_to_base64_str, get_path_from_mangled_list
from app.osparc.osparc import start_simulation as do_start_simulation
from app.osparc.osparc import check_simulation as do_check_simulation
from app.biolucida_process_results import process_results as process_biolucida_results
Expand Down Expand Up @@ -417,33 +417,56 @@ def get_discover_path():

return abort(404, description=f'Failed to retrieve uri {uri}')

# Reverse proxy for objects from S3, a simple get object
# operation. This is used by scaffoldvuer and its
# important to keep the relative <path> for accessing
# other required files.
@app.route("/s3-resource/<path:path>")
def direct_download_url(path, bucket_name=Config.DEFAULT_S3_BUCKET_NAME):

query_args = request.args
s3BucketName = query_args.get("s3BucketName", bucket_name)

def s3_header_check(path, bucket_name):
    """Validate an S3 key with a HEAD request before attempting a download.

    Returns a ``(status_code, message)`` tuple: ``(200, 'OK')`` when the
    object exists and is under the size limit, or ``(404, ...)`` when the
    key is missing, so the caller can retry with a de-mangled path.
    Aborts the request outright (never returns) when the object exceeds
    ``Config.DIRECT_DOWNLOAD_LIMIT`` or S3 reports any other error.
    """
    try:
        head_response = s3.head_object(
            Bucket=bucket_name,
            Key=path,
            RequestPayer="requester"
        )
        # Missing ContentLength defaults to the limit itself, which also
        # triggers the 413 below (fail closed rather than stream blindly).
        content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
        if content_length and not content_length < Config.DIRECT_DOWNLOAD_LIMIT:  # default limit is 20 MB
            abort(413, description=f"File too big to download: {content_length}")
    except botocore.exceptions.ClientError as err:
        # NOTE: head_object reports a missing key via ClientError because of
        # https://github.com/boto/boto3/issues/2442
        if err.response["Error"]["Code"] == "404":
            # Return a tuple (do not abort) so the caller can attempt
            # name de-mangling and re-check the modified path.
            return (404, 'Provided path was not found on the s3 resource')
        else:
            # The error code arrives as a string; abort() needs an int
            # status or it raises LookupError instead of the HTTP error.
            abort(int(err.response["Error"]["Code"]), err.response["Error"]["Message"])
    else:
        return (200, 'OK')

# Reverse proxy for objects from S3, a simple get object
# operation. This is used by scaffoldvuer, and it's
# important to keep the relative <path> for accessing
# other required files.
@app.route("/s3-resource/<path:path>")
def direct_download_url(path, bucket_name=Config.DEFAULT_S3_BUCKET_NAME):

query_args = request.args
s3BucketName = query_args.get("s3BucketName", bucket_name)
s3_path = path # Will modify s3_path if we find name mangling

# Check the header to see if too large or does not exist
response = s3_header_check(path, s3BucketName)

# If the file does not exist, check if the name was mangled
if response[0] == 404:
s3_path_modified = get_path_from_mangled_list(path)
if s3_path_modified == s3_path:
abort(404, description=f'Provided path was not found on the s3 resource') # Abort if path did not change

# Check the modified path
response2 = s3_header_check(s3_path_modified, s3BucketName)
if response2[0] == 200:
s3_path = s3_path_modified # Modify the path if de-mangling was successful
elif response2[0] == 404:
abort(404, description=f'Provided path was not found on the s3 resource')


response = s3.get_object(
Bucket=s3BucketName,
Key=path,
Key=s3_path,
RequestPayer="requester"
)

Expand Down
4 changes: 3 additions & 1 deletion app/manifest_name_to_discover_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -5008,5 +5008,7 @@
'files/primary/sub-Cadaver-Subject-2/Ultrasound_raw_Cadaver_Subject 2.mp4': 'files/primary/sub-Cadaver-Subject-2/Ultrasound_raw_Cadaver_Subject_2.mp4',
'files/primary/sub-Cadaver-Subject-3/Ultrasound_raw_Cadaver_Subject 3.mp4': 'files/primary/sub-Cadaver-Subject-3/Ultrasound_raw_Cadaver_Subject_3.mp4',
'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-3_Pgp9.5.jpx': 'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-3 Pgp9.5.jpx',
'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-4_Pgp9.5.jpx': 'files/derivative/sam-P21 BAT/ses-Pgp9.5/P21-4 Pgp9.5.jpx',
'files/derivative/mapped_Pig 7_thumbnail.jpeg': 'files/derivative/mapped_Pig_7_thumbnail.jpeg',
'files/derivative/mapped_Pig 16_thumbnail.jpeg': 'files/derivative/mapped_Pig_16_thumbnail.jpeg'
}
9 changes: 9 additions & 0 deletions app/utilities.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
import base64
from io import BytesIO
from app.manifest_name_to_discover_name import name_map

def get_path_from_mangled_list(s3_path):
    """Map a name-mangled file path back to its Discover (S3) name.

    S3 keys carry a dataset/version prefix ahead of the ``files/``
    segment, e.g. ``328/1/files/...``.  Only the ``files/...`` suffix is
    looked up in ``name_map``; the prefix is re-attached unchanged.
    Returns the path untouched when it has no ``files/`` segment or no
    mapping is listed for it.
    """
    # partition (unlike split) keeps any later 'files/' occurrences in the
    # remainder intact, and lets us handle a missing marker gracefully
    # instead of raising IndexError.
    prefix, marker, remainder = s3_path.partition('files/')
    if not marker:
        return s3_path
    file_path = marker + remainder
    # Switch the path to the mapped one if it is listed.
    return prefix + name_map.get(file_path, file_path)

def img_to_base64_str(img):
"""
Expand Down
24 changes: 24 additions & 0 deletions tests/test_dataset_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pytest
from app import app

from app.config import Config

from timeit import default_timer as timer

from app.scicrunch_processing_common import SCAFFOLD_FILE, PLOT_FILE, COMMON_IMAGES, THUMBNAIL_IMAGE, NAME, BIOLUCIDA_3D, VIDEO, SEGMENTATION_FILES, BIOLUCIDA_2D
Expand Down Expand Up @@ -273,3 +275,25 @@ def test_pennsieve_identifier_dataset_search(client):
print(first_result)
for r in result[BIOLUCIDA_3D]:
print(r['dataset']['path'])

def test_name_mangling_for_s3_resource(client):
    # Dataset 328 holds a file whose space was converted to an underscore;
    # a 200 here shows the name de-mangling path resolves it.
    url = ('/s3-resource/328/1/files/derivative/mapped_Pig%207_thumbnail.jpeg'
           '?s3BucketName=prd-sparc-discover-use1')
    response = client.get(url)
    assert response.status_code == 200

def test_size_limit_on_mangled_s3_resource(client):
    # Checks that a mangled file that exceeds the download limit returns 413.
    config_download_limit = Config.DIRECT_DOWNLOAD_LIMIT  # Store download limit
    Config.DIRECT_DOWNLOAD_LIMIT = 20  # set limit to 20 bytes to force a 413

    # try/finally guarantees the limit is restored even if the request
    # raises; the original bare `except: pass` swallowed every error and
    # left `r` unbound, turning any failure into a confusing NameError.
    try:
        r = client.get('/s3-resource/328/1/files/derivative/mapped_Pig%207_thumbnail.jpeg?s3BucketName=prd-sparc-discover-use1')
    finally:
        Config.DIRECT_DOWNLOAD_LIMIT = config_download_limit  # set limit back

    assert r.status_code == 413  # Check we got the correct response

0 comments on commit 14ea83a

Please sign in to comment.