Merge ded9796 into 4de45aa

MrAlecJohnson committed Sep 30, 2021
2 parents 4de45aa + ded9796 commit cf9a833
Showing 4 changed files with 217 additions and 64 deletions.
109 changes: 106 additions & 3 deletions dataengineeringutils3/s3.py
@@ -1,10 +1,12 @@
 import gzip
+from io import StringIO
 import json
+from pathlib import Path
+from typing import Union
 
 import boto3
 import botocore
 
-from io import StringIO


def gzip_string_write_to_s3(file_as_string, s3_path):
"""
@@ -199,7 +201,7 @@ def check_for_s3_file(s3_path):

def write_local_file_to_s3(local_file_path, s3_path, overwrite=False):
"""
-    Checks if a file exists in the S3 path provided.
+    Copy a file from a local folder to a location on s3.
:param local_file_path: "myfolder/myfile.json"
:param s3_path: "s3://path/to/myfile.json"
@@ -215,3 +217,104 @@ def write_local_file_to_s3(local_file_path, s3_path, overwrite=False):
resp = s3_resource.meta.client.upload_file(local_file_path, bucket, key)

return resp
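
For orientation, a minimal usage sketch of the upload helper above; the paths mirror the docstring's examples:

    # With overwrite=False (the default), this raises if the object already exists
    write_local_file_to_s3("myfolder/myfile.json", "s3://path/to/myfile.json")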


def write_local_folder_to_s3(
root_folder: Union[Path, str],
s3_path: str,
overwrite: bool = False,
include_hidden_files: bool = True,
current_folder: Union[Path, str] = None,
) -> None:
"""Copy a local folder and all its contents to s3, keeping its directory structure.
:param root_folder: the folder whose contents you want to upload
:param s3_path: where you want the folder to be located when it's uploaded
:param overwrite: if True, overwrite existing files in the target location
if False, raise ValueError if existing files are found in the target location
:param include_hidden_files: if False, ignore files whose names start with a .
:param current_folder: leave as None - only used during recursion
:returns: None
"""
# On initial run, set current folder and make sure the s3 path has a slash on the end
if not current_folder:
current_folder = root_folder
s3_path = _add_slash(s3_path)

for obj in Path(current_folder).iterdir():
if obj.is_file():
# Ignore hidden files if requested
if include_hidden_files or not obj.name.startswith("."):
# Construct relative path to retain local folder structure
relative_to_root = str(obj.relative_to(root_folder))
file_s3_path = f"{s3_path}{relative_to_root}"
write_local_file_to_s3(str(obj), file_s3_path, overwrite)
else:
# If not a file, it's a directory - so rerun the process recursively
write_local_folder_to_s3(
root_folder, s3_path, overwrite, include_hidden_files, obj
)
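
For orientation, a minimal usage sketch of the new recursive upload; the folder and bucket names here are hypothetical:

    # Hypothetical paths: mirror data/exports into s3://my-bucket/exports/,
    # skipping dotfiles and raising if any target object already exists
    write_local_folder_to_s3(
        root_folder="data/exports",
        s3_path="s3://my-bucket/exports",
        overwrite=False,
        include_hidden_files=False,
    )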


def write_s3_file_to_local(
s3_path: str, local_file_path: Union[Path, str], overwrite: bool = False,
) -> None:
"""Save a file from an s3 path to a local folder.
:param s3_path: full s3 path of the file you want to download
:param local_file_path: Path or str for where to save the file
:param overwrite: if True, overwrite an existing file at the local_file_path
:returns: None
"""
# Check if there's already a file there
if not overwrite:
location = Path(local_file_path)
if location.is_file():
raise FileExistsError

# Create the folder if it doesn't yet exist
folder = str(local_file_path).rsplit("/", 1)[0]
Path(folder).mkdir(parents=True, exist_ok=True)

# Download the file
s3 = boto3.client("s3")
bucket, key = s3_path_to_bucket_key(s3_path)
s3.download_file(bucket, key, str(local_file_path))
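
A minimal usage sketch, with a hypothetical bucket and key:

    # Downloads one object, creating the local "downloads" folder if needed;
    # overwrite=True replaces any existing copy at that path
    write_s3_file_to_local(
        "s3://my-bucket/exports/report.csv",
        "downloads/report.csv",
        overwrite=True,
    )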


def write_s3_folder_to_local(
s3_path: str, local_folder_path: Union[Path, str], overwrite: bool = False
) -> None:
"""Copy files from an s3 'folder' to a local folder, keeping directory structure.
:param s3_path: full s3 path of the folder whose contents you want to download
:param local_folder_path: Path or str for where to save the contents of s3_path
:param overwrite: if False, raise an error if any of the files already exist
:returns: None
"""
# Prepare local root folder
root = Path(local_folder_path)
root.mkdir(parents=True, exist_ok=True)

# Get an object representing the bucket
s3 = boto3.resource("s3")
bucket_name, s3_folder = s3_path_to_bucket_key(s3_path)
bucket = s3.Bucket(bucket_name)

# For each file in bucket, check if it needs a new subfolder, then download it
for obj in bucket.objects.filter(Prefix=s3_folder):
# Split up s3 path to work out directory structure for the local file
s3_subfolder, filename = obj.key.rsplit("/", 1)
local_subfolder = root / s3_subfolder
destination = local_subfolder / filename

# Raise an error if file already exists and not overwriting
if not overwrite and destination.is_file():
raise FileExistsError

# Make the local folder if it doesn't exist, then download the file
local_subfolder.mkdir(parents=True, exist_ok=True)
bucket.download_file(obj.key, str(destination))
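
And a matching sketch for the folder download, again with hypothetical paths:

    # Pulls every object under the exports/ prefix into downloads/,
    # recreating the prefix's directory structure locally
    write_s3_folder_to_local("s3://my-bucket/exports", "downloads")
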
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dataengineeringutils3"
version = "1.2.1"
version = "1.3.0"
description = "Data engineering utils Python 3 version"
authors = ["Data Engineering <dataengineering@digital.justice.gov.uk>"]
license = "MIT"
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -24,6 +24,15 @@ def s3(aws_credentials):
yield boto3.resource("s3", region_name="eu-west-1")


@pytest.fixture(scope="function")
def bucket(s3):
with mock_s3():
yield s3.meta.client.create_bucket(
Bucket="test",
CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
)
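
A sketch of how a test might use this fixture; the test body is illustrative, not part of the commit:

    import boto3
    from dataengineeringutils3.s3 import write_local_file_to_s3

    # moto's mock_s3 is active via the fixture, so the upload goes to the
    # in-memory "test" bucket rather than real AWS
    def test_write_local_file_to_s3(bucket, tmp_path):
        source = tmp_path / "myfile.json"
        source.write_text("{}")
        write_local_file_to_s3(str(source), "s3://test/myfile.json")
        contents = boto3.client("s3").list_objects_v2(Bucket="test")["Contents"]
        assert [obj["Key"] for obj in contents] == ["myfile.json"]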


@pytest.fixture(scope="function")
def sts(aws_credentials):
with mock_sts():