TST: Refactor s3 resource (#53803)
* TST: Refactor s3 resource

* Fix api usage

* boto3 instead of cli

* refactor call

* Make cleanup and bucket names more unique

* Use external error raised
mroeschke committed Jun 23, 2023
1 parent a7a5b13 commit 8bab235
Showing 9 changed files with 180 additions and 132 deletions.
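The heart of the change is in conftest.py below: the single shared "pandas-test" and "cant_get_it" buckets are replaced by per-test buckets whose names carry a uuid suffix, created and torn down by small layered fixtures. The pattern in isolation, as a minimal sketch (it assumes the `s3_base` fixture from the diff, which yields the endpoint URL of the mocked moto S3 server):

import uuid

import boto3
import pytest


@pytest.fixture
def s3_resource(s3_base):
    # s3_base yields the endpoint URL of the mocked (moto) S3 server.
    return boto3.resource("s3", endpoint_url=s3_base)


@pytest.fixture
def s3_public_bucket(s3_resource):
    # A uuid4 suffix makes each test's bucket name unique, so parallel
    # workers and repeated runs cannot collide on a shared bucket.
    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
    bucket.create()
    yield bucket
    # Teardown: S3 requires a bucket to be emptied before it can be deleted.
    bucket.objects.delete()
    bucket.delete()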
112 changes: 58 additions & 54 deletions pandas/tests/io/conftest.py
@@ -1,6 +1,7 @@
 import shlex
 import subprocess
 import time
+import uuid
 
 import pytest
 
@@ -54,13 +55,13 @@ def s3so(worker_id):
     return {"client_kwargs": {"endpoint_url": url}}
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function" if is_ci_environment() else "session")
 def monkeysession():
     with pytest.MonkeyPatch.context() as mp:
         yield mp
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function" if is_ci_environment() else "session")
 def s3_base(worker_id, monkeysession):
     """
     Fixture for mocking S3 interaction.
@@ -123,24 +124,67 @@ def s3_base(worker_id, monkeysession):
 
 
 @pytest.fixture
-def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
-    """
-    Sets up S3 bucket with contents
-
+def s3_resource(s3_base):
+    import boto3
+
+    s3 = boto3.resource("s3", endpoint_url=s3_base)
+    return s3
+
+
-    The primary bucket name is "pandas-test". The following datasets
+@pytest.fixture
+def s3_public_bucket(s3_resource):
+    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
+    bucket.create()
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
+
+@pytest.fixture
+def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_file):
+    """
+    The following datasets
     are loaded.
 
     - tips.csv
     - tips.csv.gz
     - tips.csv.bz2
     - items.jsonl
+    """
+    test_s3_files = [
+        ("tips#1.csv", tips_file),
+        ("tips.csv", tips_file),
+        ("tips.csv.gz", tips_file + ".gz"),
+        ("tips.csv.bz2", tips_file + ".bz2"),
+        ("items.jsonl", jsonl_file),
+        ("simple_dataset.feather", feather_file),
+    ]
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_public_bucket.put_object(Key=s3_key, Body=f)
+    return s3_public_bucket
+
+
+@pytest.fixture
+def s3_private_bucket(s3_resource):
+    bucket = s3_resource.Bucket(f"cant_get_it-{uuid.uuid4()}")
+    bucket.create(ACL="private")
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
 
-    A private bucket "cant_get_it" is also created. The boto3 s3 resource
-    is yielded by the fixture.
+@pytest.fixture
+def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feather_file):
     """
-    import boto3
-    import s3fs
+    The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+    """
     test_s3_files = [
         ("tips#1.csv", tips_file),
         ("tips.csv", tips_file),
@@ -149,50 +193,10 @@ def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
         ("items.jsonl", jsonl_file),
         ("simple_dataset.feather", feather_file),
     ]
-
-    def add_tips_files(bucket_name):
-        for s3_key, file_name in test_s3_files:
-            with open(file_name, "rb") as f:
-                cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)
-
-    bucket = "pandas-test"
-    conn = boto3.resource("s3", endpoint_url=s3_base)
-    cli = boto3.client("s3", endpoint_url=s3_base)
-
-    try:
-        cli.create_bucket(Bucket=bucket)
-    except Exception:
-        # OK is bucket already exists
-        pass
-    try:
-        cli.create_bucket(Bucket="cant_get_it", ACL="private")
-    except Exception:
-        # OK is bucket already exists
-        pass
-    timeout = 2
-    while not cli.list_buckets()["Buckets"] and timeout > 0:
-        time.sleep(0.1)
-        timeout -= 0.1
-
-    add_tips_files(bucket)
-    add_tips_files("cant_get_it")
-    s3fs.S3FileSystem.clear_instance_cache()
-    yield conn
-
-    s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})
-
-    try:
-        s3.rm(bucket, recursive=True)
-    except Exception:
-        pass
-    try:
-        s3.rm("cant_get_it", recursive=True)
-    except Exception:
-        pass
-    timeout = 2
-    while cli.list_buckets()["Buckets"] and timeout > 0:
-        time.sleep(0.1)
-        timeout -= 0.1
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_private_bucket.put_object(Key=s3_key, Body=f)
+    return s3_private_bucket
 
 
 _compression_formats_params = [
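For consumers, the migration is mechanical: depend on `s3_public_bucket` (or `s3_public_bucket_with_data` / `s3_private_bucket_with_data` when pre-seeded files are needed) instead of `s3_resource`, and build URLs from `bucket.name` rather than hard-coding "pandas-test". A hypothetical test in this style — the test name and DataFrame are illustrative, not part of this commit:

import pandas as pd
import pandas._testing as tm
import pytest


@pytest.mark.single_cpu
def test_csv_roundtrip_s3(s3_public_bucket, s3so):
    # Write to the per-test bucket, then read the same object back.
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    url = f"s3://{s3_public_bucket.name}/roundtrip.csv"
    df.to_csv(url, index=False, storage_options=s3so)
    result = pd.read_csv(url, storage_options=s3so)
    tm.assert_frame_equal(result, df)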
16 changes: 8 additions & 8 deletions pandas/tests/io/excel/test_readers.py
@@ -894,29 +894,29 @@ def test_read_from_http_url(self, read_ext):
 
     @td.skip_if_not_us_locale
     @pytest.mark.single_cpu
-    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
-        # Bucket "pandas-test" created in tests/io/conftest.py
+    def test_read_from_s3_url(self, read_ext, s3_public_bucket, s3so):
+        # Bucket created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
+            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
 
-        url = "s3://pandas-test/test1" + read_ext
+        url = f"s3://{s3_public_bucket.name}/test1" + read_ext
 
         url_table = pd.read_excel(url, storage_options=s3so)
         local_table = pd.read_excel("test1" + read_ext)
         tm.assert_frame_equal(url_table, local_table)
 
     @pytest.mark.single_cpu
-    def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
+    def test_read_from_s3_object(self, read_ext, s3_public_bucket, s3so):
         # GH 38788
-        # Bucket "pandas-test" created in tests/io/conftest.py
+        # Bucket created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
+            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
 
         import s3fs
 
         s3 = s3fs.S3FileSystem(**s3so)
 
-        with s3.open("s3://pandas-test/test1" + read_ext) as f:
+        with s3.open(f"s3://{s3_public_bucket.name}/test1" + read_ext) as f:
             url_table = pd.read_excel(f)
 
         local_table = pd.read_excel("test1" + read_ext)
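The two tests above reach the mock server by different routes, but both are driven by the same `s3so` dict from conftest.py: pandas forwards `storage_options` to s3fs, while `test_read_from_s3_object` builds the filesystem itself. Roughly equivalent, as a sketch — the endpoint URL and bucket name are placeholders, not values from this commit:

import pandas as pd
import s3fs

# Placeholder; the s3so fixture returns {"client_kwargs": {"endpoint_url": ...}}
# with whatever port the moto server bound for this worker.
s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000/"}}

# Route 1: pandas builds the s3fs filesystem from storage_options.
df1 = pd.read_excel("s3://some-bucket/test1.xlsx", storage_options=s3so)

# Route 2: build the filesystem directly and hand pandas an open file object.
fs = s3fs.S3FileSystem(**s3so)
with fs.open("s3://some-bucket/test1.xlsx") as f:
    df2 = pd.read_excel(f)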
8 changes: 3 additions & 5 deletions pandas/tests/io/excel/test_style.py
@@ -274,18 +274,16 @@ def custom_converter(css):
 
 @pytest.mark.single_cpu
 @td.skip_if_not_us_locale
-def test_styler_to_s3(s3_resource, s3so):
+def test_styler_to_s3(s3_public_bucket, s3so):
     # GH#46381
 
-    mock_bucket_name, target_file = "pandas-test", "test.xlsx"
+    mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx"
     df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
     styler = df.style.set_sticky(axis="index")
     styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
     timeout = 5
     while True:
-        if target_file in (
-            obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-        ):
+        if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
             break
         time.sleep(0.1)
         timeout -= 0.1
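The tail of `test_styler_to_s3` is truncated in this view, but the visible lines are a poll-until-visible loop: list the bucket until the key shows up, sleeping 0.1 s per attempt against a 5-second budget. The same pattern as a self-contained helper — the behavior on expiry is an assumption, since the lines past `break` are not shown here:

import time


def wait_for_key(bucket, key, timeout=5.0, poll=0.1):
    # Poll the bucket listing until `key` appears or the time budget runs out.
    while timeout > 0:
        if key in (obj.key for obj in bucket.objects.all()):
            return True
        time.sleep(poll)
        timeout -= poll
    return False  # assumption: the truncated test fails/raises at this point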
10 changes: 6 additions & 4 deletions pandas/tests/io/json/test_compression.py
@@ -41,17 +41,19 @@ def test_read_zipped_json(datapath):
 
 @td.skip_if_not_us_locale
 @pytest.mark.single_cpu
-def test_with_s3_url(compression, s3_resource, s3so):
-    # Bucket "pandas-test" created in tests/io/conftest.py
+def test_with_s3_url(compression, s3_public_bucket, s3so):
+    # Bucket created in tests/io/conftest.py
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
 
     with tm.ensure_clean() as path:
         df.to_json(path, compression=compression)
         with open(path, "rb") as f:
-            s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)
+            s3_public_bucket.put_object(Key="test-1", Body=f)
 
     roundtripped_df = pd.read_json(
-        "s3://pandas-test/test-1", compression=compression, storage_options=s3so
+        f"s3://{s3_public_bucket.name}/test-1",
+        compression=compression,
+        storage_options=s3so,
     )
     tm.assert_frame_equal(df, roundtripped_df)
 
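One detail worth noting in `test_with_s3_url`: the object is uploaded under the extension-less key `test-1`, so pandas cannot infer the codec from the filename, and the explicit `compression=compression` on `read_json` is what makes the roundtrip work. With a conventional suffix, the default inference would handle both ends — a hypothetical sketch, with placeholder bucket name and endpoint:

import pandas as pd

s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000/"}}  # placeholder
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# The ".gz" suffix lets compression="infer" (the default) pick gzip on both ends.
url = "s3://some-bucket/test-1.json.gz"
df.to_json(url, storage_options=s3so)
roundtripped = pd.read_json(url, storage_options=s3so)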
14 changes: 7 additions & 7 deletions pandas/tests/io/json/test_pandas.py
@@ -1267,11 +1267,13 @@ def test_read_inline_jsonl(self):
 
     @pytest.mark.single_cpu
     @td.skip_if_not_us_locale
-    def test_read_s3_jsonl(self, s3_resource, s3so):
+    def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
         # GH17200
 
         result = read_json(
-            "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
+            f"s3n://{s3_public_bucket_with_data.name}/items.jsonl",
+            lines=True,
+            storage_options=s3so,
         )
         expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
@@ -1843,16 +1845,14 @@ def test_json_multiindex(self, dataframe, expected):
         assert result == expected
 
     @pytest.mark.single_cpu
-    def test_to_s3(self, s3_resource, s3so):
+    def test_to_s3(self, s3_public_bucket, s3so):
         # GH 28375
-        mock_bucket_name, target_file = "pandas-test", "test.json"
+        mock_bucket_name, target_file = s3_public_bucket.name, "test.json"
         df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
         df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
         timeout = 5
         while True:
-            if target_file in (
-                obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-            ):
+            if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
                 break
             time.sleep(0.1)
             timeout -= 0.1
