Skip to content

Commit

Permalink
TST: Made s3 related tests mock boto
Browse files Browse the repository at this point in the history
Kept a couple around for testing things like accessing a private bucket as that's hard to mock.
  • Loading branch information
kirkhansen committed Sep 1, 2017
1 parent 062f6f1 commit 23aa1d1
Show file tree
Hide file tree
Showing 17 changed files with 202 additions and 154 deletions.
5 changes: 5 additions & 0 deletions appveyor.yml
Expand Up @@ -80,6 +80,11 @@ install:
- cmd: conda list -n pandas
- cmd: echo "installing requirements from %REQ% - done"

# add some pip only reqs to the env
- SET REQ=ci\requirements-%PYTHON_VERSION%_WIN.pip
- cmd: echo "installing requirements from %REQ%"
- cmd: pip install -Ur %REQ%

# build em using the local source checkout in the correct windows env
- cmd: '%CMD_IN_ENV% python setup.py build_ext --inplace'

Expand Down
1 change: 1 addition & 0 deletions ci/requirements-2.7.run
Expand Up @@ -18,3 +18,4 @@ patsy
pymysql=0.6.3
jinja2=2.8
xarray=0.8.0
moto
1 change: 1 addition & 0 deletions ci/requirements-2.7_SLOW.run
Expand Up @@ -17,3 +17,4 @@ psycopg2
pymysql
html5lib
beautiful-soup
moto
Empty file added ci/requirements-2.7_WIN.pip
Empty file.
1 change: 1 addition & 0 deletions ci/requirements-2.7_WIN.run
Expand Up @@ -16,3 +16,4 @@ bottleneck
html5lib
beautiful-soup
jinja2=2.8
moto
1 change: 1 addition & 0 deletions ci/requirements-3.5.run
Expand Up @@ -18,3 +18,4 @@ psycopg2
s3fs
beautifulsoup4
ipython
moto
1 change: 1 addition & 0 deletions ci/requirements-3.5_OSX.run
Expand Up @@ -14,3 +14,4 @@ bottleneck
xarray
s3fs
beautifulsoup4
moto
1 change: 1 addition & 0 deletions ci/requirements-3.6.run
Expand Up @@ -23,3 +23,4 @@ beautifulsoup4
s3fs
xarray
ipython
moto
1 change: 1 addition & 0 deletions ci/requirements-3.6_LOCALE.run
Expand Up @@ -20,3 +20,4 @@ beautifulsoup4
s3fs
xarray
ipython
moto
1 change: 1 addition & 0 deletions ci/requirements-3.6_LOCALE_SLOW.run
Expand Up @@ -20,3 +20,4 @@ beautifulsoup4
s3fs
xarray
ipython
moto
Empty file.
Empty file added ci/requirements-3.6_WIN.pip
Empty file.
Binary file added pandas/tests/io/parser/data/tips.csv.bz2
Binary file not shown.
Binary file added pandas/tests/io/parser/data/tips.csv.gz
Binary file not shown.
284 changes: 159 additions & 125 deletions pandas/tests/io/parser/test_network.py
Expand Up @@ -4,21 +4,64 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""

import os

import pytest
import six

import pandas.util.testing as tm
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table


@pytest.fixture(scope='module')
def tips_file():
    """Path to the tips dataset inside the test data directory."""
    data_dir = tm.get_data_path()
    return os.path.join(data_dir, 'tips.csv')


@pytest.fixture(scope='module')
def salaries_table():
    """Salaries dataset, read once per module from the test data dir."""
    salaries_path = os.path.join(tm.get_data_path(), 'salaries.csv')
    return read_table(salaries_path)


@pytest.fixture(scope='module')
def test_s3_resource(request, tips_file):
    """Module-scoped boto3 S3 resource backed by a moto mock.

    Creates two mocked buckets, each populated with the tips dataset in
    plain, gzip and bz2 form:

    * ``pandas-test`` -- ordinary bucket used by most tests (see gh-16135).
    * ``cant_get_it`` -- created with ``ACL='private'`` to exercise
      permission-error paths.

    Returns the boto3 S3 service resource; tests can reach the low-level
    client via ``.meta.client``. Skips if s3fs, boto3 or moto is missing.
    """
    pytest.importorskip('s3fs')
    boto3 = pytest.importorskip('boto3')
    moto = pytest.importorskip('moto')

    # Keep a reference to the mock we start so teardown stops the *same*
    # object.  The previous code called ``moto.mock_s3().stop()`` in the
    # finalizer, which constructs a brand-new, never-started mock and
    # stops that -- leaving the original patching active for the rest of
    # the test session.
    s3_mock = moto.mock_s3()
    s3_mock.start()

    test_s3_files = [
        ('tips.csv', tips_file),
        ('tips.csv.gz', tips_file + '.gz'),
        ('tips.csv.bz2', tips_file + '.bz2'),
    ]

    def add_tips_files(bucket_name):
        # Upload each variant of the tips file under its extension key.
        for s3_key, file_name in test_s3_files:
            with open(file_name, 'rb') as f:
                conn.Bucket(bucket_name).put_object(
                    Key=s3_key,
                    Body=f)

    # see gh-16135
    bucket = 'pandas-test'

    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket)
    add_tips_files(bucket)

    conn.create_bucket(Bucket='cant_get_it', ACL='private')
    add_tips_files('cant_get_it')

    def teardown():
        # Stop the mock we actually started above, not a fresh instance.
        s3_mock.stop()
    request.addfinalizer(teardown)

    return conn


@pytest.mark.network
@pytest.mark.parametrize(
"compression,extension",
Expand Down Expand Up @@ -50,151 +93,142 @@ def check_compressed_urls(salaries_table, compression, extension, mode,
tm.assert_frame_equal(url_table, salaries_table)


class TestS3(object):
@tm.network
def test_parse_public_s3_bucket():
pytest.importorskip('s3fs')
# more of an integration test due to the not-public contents portion
# can probably mock this though.
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
df = read_csv('s3://pandas-test/tips.csv' +
ext, compression=comp)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(read_csv(
tm.get_data_path('tips.csv')), df)

# Read public file from bucket with not-public contents
df = read_csv('s3://cant_get_it/tips.csv')
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)


def test_parse_public_s3n_bucket(test_s3_resource):
    """An ``s3n://`` URL reads the first 10 rows of the tips file."""
    df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
    assert isinstance(df, DataFrame)
    assert not df.empty
    expected = read_csv(tm.get_data_path('tips.csv')).iloc[:10]
    tm.assert_frame_equal(expected, df)

    def setup_method(self, method):
        # Skip every test in this class when the optional s3fs
        # dependency (needed for s3:// URL support in read_csv) is
        # not installed.
        try:
            import s3fs  # noqa
        except ImportError:
            pytest.skip("s3fs not installed")

@tm.network
def test_parse_public_s3_bucket(self):
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
df = read_csv('s3://pandas-test/tips.csv' +
ext, compression=comp)
def test_parse_public_s3a_bucket(test_s3_resource):
# Read from AWS s3 as "s3a" URL
df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(read_csv(
tm.get_data_path('tips.csv')).iloc[:10], df)


def test_parse_public_s3_bucket_nrows(test_s3_resource):
    """``nrows`` is honoured for plain, gzip and bz2 S3 files."""
    expected = read_csv(tm.get_data_path('tips.csv')).iloc[:10]
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext,
                      nrows=10, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(expected, df)


def test_parse_public_s3_bucket_chunked(test_s3_resource):
    """Chunked S3 reads match the matching slices of the local file."""
    chunksize = 5
    local_tips = read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp)
        assert df_reader.chunksize == chunksize
        for i_chunk in range(3):
            # Pull a few chunks and compare each against the
            # corresponding slice of the locally-read frame.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            start = chunksize * i_chunk
            expected = local_tips.iloc[start:start + chunksize]
            tm.assert_frame_equal(expected, df)


def test_parse_public_s3_bucket_chunked_python(test_s3_resource):
# Read with a chunksize using the Python parser
chunksize = 5
local_tips = read_csv(tm.get_data_path('tips.csv'))
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
chunksize=chunksize, compression=comp,
engine='python')
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(read_csv(
tm.get_data_path('tips.csv')), df)
true_df = local_tips.iloc[
chunksize * i_chunk: chunksize * (i_chunk + 1)]
tm.assert_frame_equal(true_df, df)


# Read public file from bucket with not-public contents
df = read_csv('s3://cant_get_it/tips.csv')
def test_parse_public_s3_bucket_python(test_s3_resource):
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
compression=comp)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)
tm.assert_frame_equal(read_csv(
tm.get_data_path('tips.csv')), df)

@tm.network
def test_parse_public_s3n_bucket(self):
# Read from AWS s3 as "s3n" URL
df = read_csv('s3n://pandas-test/tips.csv', nrows=10)

def test_infer_s3_compression(test_s3_resource):
for ext in ['', '.gz', '.bz2']:
df = read_csv('s3://pandas-test/tips.csv' + ext,
engine='python', compression='infer')
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(read_csv(
tm.get_data_path('tips.csv')).iloc[:10], df)
tm.get_data_path('tips.csv')), df)


@tm.network
def test_parse_public_s3a_bucket(self):
# Read from AWS s3 as "s3a" URL
df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
def test_parse_public_s3_bucket_nrows_python(test_s3_resource):
    """Python engine honours ``nrows`` for plain and compressed files."""
    expected = read_csv(tm.get_data_path('tips.csv')).iloc[:10]
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                      nrows=10, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(expected, df)

    @tm.network
    def test_parse_public_s3_bucket_nrows(self):
        # nrows should limit the rows fetched from S3 for each
        # compression variant of the tips file.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' +
                          ext, nrows=10, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')).iloc[:10], df)

    @tm.network
    def test_parse_public_s3_bucket_chunked(self):
        # Read with a chunksize; each chunk must equal the matching
        # slice of the locally-read tips file.
        chunksize = 5
        local_tips = read_csv(tm.get_data_path('tips.csv'))
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp)
            assert df_reader.chunksize == chunksize
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                assert isinstance(df, DataFrame)
                assert not df.empty
                true_df = local_tips.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    @tm.network
    def test_parse_public_s3_bucket_chunked_python(self):
        # Read with a chunksize using the Python parser; chunks must
        # equal the matching slices of the locally-read tips file.
        chunksize = 5
        local_tips = read_csv(tm.get_data_path('tips.csv'))
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp,
                                 engine='python')
            assert df_reader.chunksize == chunksize
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them properly.
                df = df_reader.get_chunk()
                assert isinstance(df, DataFrame)
                assert not df.empty
                true_df = local_tips.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                tm.assert_frame_equal(true_df, df)

    @tm.network
    def test_parse_public_s3_bucket_python(self):
        # Python engine must read each compression variant from S3 and
        # match the locally-read tips file exactly.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                          compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')), df)

    @tm.network
    def test_infer_s3_compression(self):
        # compression='infer' must detect gzip/bz2 from the file
        # extension of the S3 key.
        for ext in ['', '.gz', '.bz2']:
            df = read_csv('s3://pandas-test/tips.csv' + ext,
                          engine='python', compression='infer')
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')), df)

    @tm.network
    def test_parse_public_s3_bucket_nrows_python(self):
        # Python engine must honour nrows for each compression variant.
        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                          nrows=10, compression=comp)
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(read_csv(
                tm.get_data_path('tips.csv')).iloc[:10], df)

@tm.network
def test_s3_fails(self):
with pytest.raises(IOError):
read_csv('s3://nyqpug/asdf.csv')
def test_s3_fails(test_s3_resource):
with pytest.raises(IOError):
read_csv('s3://nyqpug/asdf.csv')

# Receive a permission error when trying to read a private bucket.
# It's irrelevant here that this isn't actually a table.
with pytest.raises(IOError):
read_csv('s3://cant_get_it/')
# Receive a permission error when trying to read a private bucket.
# It's irrelevant here that this isn't actually a table.
with pytest.raises(IOError):
read_csv('s3://cant_get_it/')

@tm.network
def boto3_client_s3(self):
# see gh-16135

# boto3 is a dependency of s3fs
import boto3
client = boto3.client("s3")
def test_read_csv__handles_boto_s3_object(test_s3_resource, tips_file):
# see gh-16135

key = "/tips.csv"
bucket = "pandas-test"
s3_object = client.get_object(Bucket=bucket, Key=key)
s3_object = test_s3_resource.meta.client.get_object(Bucket='pandas-test',
Key='tips.csv')

result = read_csv(s3_object["Body"])
assert isinstance(result, DataFrame)
assert not result.empty
result = read_csv(six.BytesIO(s3_object["Body"].read()), encoding='utf8')
assert isinstance(result, DataFrame)
assert not result.empty

expected = read_csv(tm.get_data_path('tips.csv'))
tm.assert_frame_equal(result, expected)
expected = read_csv(tips_file)
tm.assert_frame_equal(result, expected)

0 comments on commit 23aa1d1

Please sign in to comment.