-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
Description
Pandas version checks
-
I have checked that this issue has not already been reported.
-
I have confirmed this bug exists on the latest version of pandas.
-
I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import base64
import zlib
import awswrangler as wr
import boto3
import pandas as pd
# DL DEV
AWS_ACCESS_KEY_ID = "<Redacted>"
AWS_SECRET_ACCESS_KEY = "<Redacted>"
AWS_SESSION_TOKEN = "<Redacted>"
session_west = boto3.Session(
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
aws_session_token=AWS_SESSION_TOKEN,
region_name="eu-west-1",
)
client = session_west.client("s3")
localpath = "<Redacted>"
bigfile = "bigfile.csv"
smallfile = "smallfile.csv"
bucket = "checksum-test-bucket"
s3path = "checksum-test"
for filetype in [smallfile, bigfile]:
with open(f"{localpath}{filetype}", "rb") as file:
# Calculate CRC32 ourselves for reference
crcval = zlib.crc32(file.read())
crc_bytes = crcval.to_bytes(4, "big")
crc = base64.b64encode(crc_bytes).decode("utf-8")
print(f"{filetype} - {crc}")
with open(f"{localpath}{filetype}", "rb") as file:
client.put_object(
Bucket=bucket, Key=f"{s3path}/put_object/{filetype}", Body=file
)
client.upload_file(
Bucket=bucket,
Key=f"{s3path}/upload_file/{filetype}",
Filename=f"{localpath}{filetype}",
)
for filetype in [smallfile, bigfile]:
for upload_method in ["put_object", "upload_file"]:
path = f"s3://{bucket}/{s3path}/{upload_method}/{filetype}"
print(path)
try:
fw: pd.DataFrame = wr.s3.read_csv(
path=path,
dtype="object",
boto3_session=session_west,
)
print(fw.shape)
except Exception as e:
print(f"wrangler failed - {e}")
try:
fp = pd.read_csv(
path,
storage_options={
"key": AWS_ACCESS_KEY_ID,
"secret": AWS_SECRET_ACCESS_KEY,
"token": AWS_SESSION_TOKEN,
},
)
print(fp.shape)
except Exception as e:
print(f"Pandas fail - {e}")
try:
client = session_west.client("s3")
fb = client.get_object(
Bucket=bucket,
Key=f"{s3path}/{upload_method}/{filetype}",
ChecksumMode="ENABLED",
)
print(f'{fb["ChecksumCRC32"]} - {fb["ChecksumType"]}')
except Exception as e:
print(f"boto error - {e}")
Issue Description
Boto3 >=1.36.0 has modified behaviour to add CRC32 checksum by default where supported.
When accessing S3 objects with pd.read_csv, any object stored with a COMPOSITE checksum fails to read, because the checksum computed for comparison is calculated as a FULL_OBJECT checksum.
A composite checksum appears to be calculated when an object exceeds ~10 MB when using boto3 upload_file(); it seemingly switches to a multi-part upload behind the scenes at that threshold. Other explicit multi-part uploads will presumably have the same behaviour.
Included test using both Pandas and Awswrangler for completeness
Output for failing versions
Name: boto3
Version: 1.36.5
Name: botocore
Version: 1.36.5
Name: s3transfer
Version: 0.11.2
Name: pandas
Version: 2.2.3
Name: awswrangler
Version: 3.11.0
smallfile.csv - CbsfmA==
bigfile.csv - vGPIeA==
s3://checksum-test-bucket/checksum-test/put_object/smallfile.csv
(1461, 91)
(1461, 91)
CbsfmA== - FULL_OBJECT
s3://checksum-test-bucket/checksum-test/upload_file/smallfile.csv
(1461, 91)
(1461, 91)
CbsfmA== - FULL_OBJECT
s3://checksum-test-bucket/checksum-test/put_object/bigfile.csv
(20467, 91)
(20467, 91)
vGPIeA== - FULL_OBJECT
s3://checksum-test-bucket/checksum-test/upload_file/bigfile.csv
wrangler failed - Expected checksum DIoExg== did not match calculated checksum: vGPIeA==
Pandas fail - Expected checksum DIoExg== did not match calculated checksum: vGPIeA==
DIoExg==-2 - COMPOSITE
Using boto3 <1.36 all scenarios from the example code work
Test with older version
pip install "boto3<1.36.0"
Output from working version
Name: boto3
Version: 1.35.99
Name: botocore
Version: 1.35.99
Name: s3transfer
Version: 0.10.4
Name: pandas
Version: 2.2.3
Name: awswrangler
Version: 3.11.0
smallfile.csv - CbsfmA==
bigfile.csv - vGPIeA==
s3://checksum-test-bucket/checksum-test/put_object/smallfile.csv
(1461, 91)
(1461, 91)
None - None
s3://checksum-test-bucket/checksum-test/upload_file/smallfile.csv
(1461, 91)
(1461, 91)
None - None
s3://checksum-test-bucket/checksum-test/put_object/bigfile.csv
(20467, 91)
(20467, 91)
None - None
s3://checksum-test-bucket/checksum-test/upload_file/bigfile.csv
(20467, 91)
(20467, 91)
None - None
### Expected Behavior
When reading using pd.read_csv the checksum calculated for comparison should be aware of whether the stored checksum is FULL_OBJECT or COMPOSITE and handle it correctly.
### Installed Versions
<details>
Working versions
INSTALLED VERSIONS
------------------
commit : 0691c5cf90477d3503834d983f69350f250a6ff7
python : 3.11.11
python-bits : 64
OS : Linux
OS-release : 6.8.0-1021-aws
Version : #23~22.04.1-Ubuntu SMP Tue Dec 10 16:50:46 UTC 2024
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.utf8
LOCALE : en_US.UTF-8
pandas : 2.2.3
numpy : 1.26.4
pytz : 2024.1
dateutil : 2.9.0.post0
pip : 24.3.1
Cython : None
sphinx : None
IPython : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
blosc : None
bottleneck : None
dataframe-api-compat : None
fastparquet : None
fsspec : 2024.12.0
html5lib : None
hypothesis : None
gcsfs : None
jinja2 : 3.1.4
lxml.etree : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
psycopg2 : None
pymysql : None
pyarrow : 16.0.0
pyreadstat : None
pytest : 8.2.0
python-calamine : None
pyxlsb : None
s3fs : 2024.12.0
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlsxwriter : None
zstandard : None
tzdata : 2024.1
qtpy : None
pyqt5 : None
None
Failing versions
INSTALLED VERSIONS
------------------
commit : 0691c5cf90477d3503834d983f69350f250a6ff7
python : 3.11.11
python-bits : 64
OS : Linux
OS-release : 6.8.0-1021-aws
Version : #23~22.04.1-Ubuntu SMP Tue Dec 10 16:50:46 UTC 2024
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.utf8
LOCALE : en_US.UTF-8
pandas : 2.2.3
numpy : 1.26.4
pytz : 2024.1
dateutil : 2.9.0.post0
pip : 24.3.1
Cython : None
sphinx : None
IPython : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
blosc : None
bottleneck : None
dataframe-api-compat : None
fastparquet : None
fsspec : 2024.12.0
html5lib : None
hypothesis : None
gcsfs : None
jinja2 : 3.1.4
lxml.etree : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
psycopg2 : None
pymysql : None
pyarrow : 16.0.0
pyreadstat : None
pytest : 8.2.0
python-calamine : None
pyxlsb : None
s3fs : 2024.12.0
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlsxwriter : None
zstandard : None
tzdata : 2024.1
qtpy : None
pyqt5 : None
None
</details>