TST: Refactor s3 resource (#53803)
* TST: Refactor s3 resource

* Fix api usage

* boto3 instead of cli

* refactor call

* Make cleanup and bucket names more unique

* Use external error raised
mroeschke committed Jun 23, 2023
1 parent a7a5b13 commit 8bab235
Showing 9 changed files with 180 additions and 132 deletions.
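The heart of the change is in conftest.py below: the single shared "pandas-test" and "cant_get_it" buckets are replaced by per-test buckets whose names carry a uuid suffix, created and torn down by small layered fixtures. The pattern in isolation, as a minimal sketch (it assumes the `s3_base` fixture from the diff, which yields the endpoint URL of the mocked moto S3 server):

import uuid

import boto3
import pytest


@pytest.fixture
def s3_resource(s3_base):
    # s3_base yields the endpoint URL of the mocked (moto) S3 server.
    return boto3.resource("s3", endpoint_url=s3_base)


@pytest.fixture
def s3_public_bucket(s3_resource):
    # A uuid4 suffix makes each test's bucket name unique, so parallel
    # workers and repeated runs cannot collide on a shared bucket.
    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
    bucket.create()
    yield bucket
    # Teardown: S3 requires a bucket to be emptied before it can be deleted.
    bucket.objects.delete()
    bucket.delete()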
112 changes: 58 additions & 54 deletions pandas/tests/io/conftest.py
@@ -1,6 +1,7 @@
 import shlex
 import subprocess
 import time
+import uuid
 
 import pytest
 
@@ -54,13 +55,13 @@ def s3so(worker_id):
     return {"client_kwargs": {"endpoint_url": url}}
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function" if is_ci_environment() else "session")
 def monkeysession():
     with pytest.MonkeyPatch.context() as mp:
         yield mp
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function" if is_ci_environment() else "session")
 def s3_base(worker_id, monkeysession):
     """
     Fixture for mocking S3 interaction.
@@ -123,24 +124,67 @@ def s3_base(worker_id, monkeysession):
 
 
 @pytest.fixture
-def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
-    """
-    Sets up S3 bucket with contents
-
+def s3_resource(s3_base):
+    import boto3
+
+    s3 = boto3.resource("s3", endpoint_url=s3_base)
+    return s3
+
+
-    The primary bucket name is "pandas-test". The following datasets
+@pytest.fixture
+def s3_public_bucket(s3_resource):
+    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
+    bucket.create()
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
+
+@pytest.fixture
+def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_file):
+    """
+    The following datasets
     are loaded.
 
     - tips.csv
     - tips.csv.gz
     - tips.csv.bz2
     - items.jsonl
+    """
+    test_s3_files = [
+        ("tips#1.csv", tips_file),
+        ("tips.csv", tips_file),
+        ("tips.csv.gz", tips_file + ".gz"),
+        ("tips.csv.bz2", tips_file + ".bz2"),
+        ("items.jsonl", jsonl_file),
+        ("simple_dataset.feather", feather_file),
+    ]
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_public_bucket.put_object(Key=s3_key, Body=f)
+    return s3_public_bucket
+
+
+@pytest.fixture
+def s3_private_bucket(s3_resource):
+    bucket = s3_resource.Bucket(f"cant_get_it-{uuid.uuid4()}")
+    bucket.create(ACL="private")
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
 
-    A private bucket "cant_get_it" is also created. The boto3 s3 resource
-    is yielded by the fixture.
+@pytest.fixture
+def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feather_file):
     """
-    import boto3
-    import s3fs
+    The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+    """
     test_s3_files = [
         ("tips#1.csv", tips_file),
         ("tips.csv", tips_file),
@@ -149,50 +193,10 @@ def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
         ("items.jsonl", jsonl_file),
         ("simple_dataset.feather", feather_file),
     ]
-
-    def add_tips_files(bucket_name):
-        for s3_key, file_name in test_s3_files:
-            with open(file_name, "rb") as f:
-                cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)
-
-    bucket = "pandas-test"
-    conn = boto3.resource("s3", endpoint_url=s3_base)
-    cli = boto3.client("s3", endpoint_url=s3_base)
-
-    try:
-        cli.create_bucket(Bucket=bucket)
-    except Exception:
-        # OK is bucket already exists
-        pass
-    try:
-        cli.create_bucket(Bucket="cant_get_it", ACL="private")
-    except Exception:
-        # OK is bucket already exists
-        pass
-    timeout = 2
-    while not cli.list_buckets()["Buckets"] and timeout > 0:
-        time.sleep(0.1)
-        timeout -= 0.1
-
-    add_tips_files(bucket)
-    add_tips_files("cant_get_it")
-    s3fs.S3FileSystem.clear_instance_cache()
-    yield conn
-
-    s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})
-
-    try:
-        s3.rm(bucket, recursive=True)
-    except Exception:
-        pass
-    try:
-        s3.rm("cant_get_it", recursive=True)
-    except Exception:
-        pass
-    timeout = 2
-    while cli.list_buckets()["Buckets"] and timeout > 0:
-        time.sleep(0.1)
-        timeout -= 0.1
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_private_bucket.put_object(Key=s3_key, Body=f)
+    return s3_private_bucket
 
 
 _compression_formats_params = [
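For consumers, the migration is mechanical: depend on `s3_public_bucket` (or `s3_public_bucket_with_data` / `s3_private_bucket_with_data` when pre-seeded files are needed) instead of `s3_resource`, and build URLs from `bucket.name` rather than hard-coding "pandas-test". A hypothetical test in this style — the test name and DataFrame are illustrative, not part of this commit:

import pandas as pd
import pandas._testing as tm
import pytest


@pytest.mark.single_cpu
def test_csv_roundtrip_s3(s3_public_bucket, s3so):
    # Write to the per-test bucket, then read the same object back.
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    url = f"s3://{s3_public_bucket.name}/roundtrip.csv"
    df.to_csv(url, index=False, storage_options=s3so)
    result = pd.read_csv(url, storage_options=s3so)
    tm.assert_frame_equal(result, df)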
16 changes: 8 additions & 8 deletions pandas/tests/io/excel/test_readers.py
@@ -894,29 +894,29 @@ def test_read_from_http_url(self, read_ext):
 
     @td.skip_if_not_us_locale
     @pytest.mark.single_cpu
-    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
-        # Bucket "pandas-test" created in tests/io/conftest.py
+    def test_read_from_s3_url(self, read_ext, s3_public_bucket, s3so):
+        # Bucket created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
+            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
 
-        url = "s3://pandas-test/test1" + read_ext
+        url = f"s3://{s3_public_bucket.name}/test1" + read_ext
 
         url_table = pd.read_excel(url, storage_options=s3so)
         local_table = pd.read_excel("test1" + read_ext)
         tm.assert_frame_equal(url_table, local_table)
 
     @pytest.mark.single_cpu
-    def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
+    def test_read_from_s3_object(self, read_ext, s3_public_bucket, s3so):
         # GH 38788
-        # Bucket "pandas-test" created in tests/io/conftest.py
+        # Bucket created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
+            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
 
         import s3fs
 
         s3 = s3fs.S3FileSystem(**s3so)
 
-        with s3.open("s3://pandas-test/test1" + read_ext) as f:
+        with s3.open(f"s3://{s3_public_bucket.name}/test1" + read_ext) as f:
             url_table = pd.read_excel(f)
 
         local_table = pd.read_excel("test1" + read_ext)
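The two tests above reach the mock server by different routes, but both are driven by the same `s3so` dict from conftest.py: pandas forwards `storage_options` to s3fs, while `test_read_from_s3_object` builds the filesystem itself. Roughly equivalent, as a sketch — the endpoint URL and bucket name are placeholders, not values from this commit:

import pandas as pd
import s3fs

# Placeholder; the s3so fixture returns {"client_kwargs": {"endpoint_url": ...}}
# with whatever port the moto server bound for this worker.
s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000/"}}

# Route 1: pandas builds the s3fs filesystem from storage_options.
df1 = pd.read_excel("s3://some-bucket/test1.xlsx", storage_options=s3so)

# Route 2: build the filesystem directly and hand pandas an open file object.
fs = s3fs.S3FileSystem(**s3so)
with fs.open("s3://some-bucket/test1.xlsx") as f:
    df2 = pd.read_excel(f)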
8 changes: 3 additions & 5 deletions pandas/tests/io/excel/test_style.py
@@ -274,18 +274,16 @@ def custom_converter(css):
 
 @pytest.mark.single_cpu
 @td.skip_if_not_us_locale
-def test_styler_to_s3(s3_resource, s3so):
+def test_styler_to_s3(s3_public_bucket, s3so):
     # GH#46381
 
-    mock_bucket_name, target_file = "pandas-test", "test.xlsx"
+    mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx"
     df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
     styler = df.style.set_sticky(axis="index")
     styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
     timeout = 5
     while True:
-        if target_file in (
-            obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-        ):
+        if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
             break
         time.sleep(0.1)
         timeout -= 0.1
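The tail of `test_styler_to_s3` is truncated in this view, but the visible lines are a poll-until-visible loop: list the bucket until the key shows up, sleeping 0.1 s per attempt against a 5-second budget. The same pattern as a self-contained helper — the behavior on expiry is an assumption, since the lines past `break` are not shown here:

import time


def wait_for_key(bucket, key, timeout=5.0, poll=0.1):
    # Poll the bucket listing until `key` appears or the time budget runs out.
    while timeout > 0:
        if key in (obj.key for obj in bucket.objects.all()):
            return True
        time.sleep(poll)
        timeout -= poll
    return False  # assumption: the truncated test fails/raises at this point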
10 changes: 6 additions & 4 deletions pandas/tests/io/json/test_compression.py
@@ -41,17 +41,19 @@ def test_read_zipped_json(datapath):
 
 @td.skip_if_not_us_locale
 @pytest.mark.single_cpu
-def test_with_s3_url(compression, s3_resource, s3so):
-    # Bucket "pandas-test" created in tests/io/conftest.py
+def test_with_s3_url(compression, s3_public_bucket, s3so):
+    # Bucket created in tests/io/conftest.py
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
 
     with tm.ensure_clean() as path:
         df.to_json(path, compression=compression)
         with open(path, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)
+            s3_public_bucket.put_object(Key="test-1", Body=f)
 
     roundtripped_df = pd.read_json(
-        "s3://pandas-test/test-1", compression=compression, storage_options=s3so
+        f"s3://{s3_public_bucket.name}/test-1",
+        compression=compression,
+        storage_options=s3so,
     )
     tm.assert_frame_equal(df, roundtripped_df)
 
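One detail worth noting in `test_with_s3_url`: the object is uploaded under the extension-less key `test-1`, so pandas cannot infer the codec from the filename, and the explicit `compression=compression` on `read_json` is what makes the roundtrip work. With a conventional suffix, the default inference would handle both ends — a hypothetical sketch, with placeholder bucket name and endpoint:

import pandas as pd

s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000/"}}  # placeholder
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# The ".gz" suffix lets compression="infer" (the default) pick gzip on both ends.
url = "s3://some-bucket/test-1.json.gz"
df.to_json(url, storage_options=s3so)
roundtripped = pd.read_json(url, storage_options=s3so)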
14 changes: 7 additions & 7 deletions pandas/tests/io/json/test_pandas.py
@@ -1267,11 +1267,13 @@ def test_read_inline_jsonl(self):
 
     @pytest.mark.single_cpu
     @td.skip_if_not_us_locale
-    def test_read_s3_jsonl(self, s3_resource, s3so):
+    def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
         # GH17200
 
         result = read_json(
-            "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
+            f"s3n://{s3_public_bucket_with_data.name}/items.jsonl",
+            lines=True,
+            storage_options=s3so,
         )
         expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
@@ -1843,16 +1845,14 @@ def test_json_multiindex(self, dataframe, expected):
         assert result == expected
 
     @pytest.mark.single_cpu
-    def test_to_s3(self, s3_resource, s3so):
+    def test_to_s3(self, s3_public_bucket, s3so):
         # GH 28375
-        mock_bucket_name, target_file = "pandas-test", "test.json"
+        mock_bucket_name, target_file = s3_public_bucket.name, "test.json"
         df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
         df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
         timeout = 5
         while True:
-            if target_file in (
-                obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-            ):
+            if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
                 break
             time.sleep(0.1)
             timeout -= 0.1
