# Round-trip tests for reading and writing to and from S3

Run this notebook to ensure reading and writing behave as expected when using S3.

In [None]:
import tempfile

from pandas.testing import assert_frame_equal

from arrow_pd_parser import reader, writer
from arrow_pd_parser.utils import FileFormat

import awswrangler as wr
import pandas as pd
import itertools

First clean up the test directory.

In [None]:
# Remove anything left over from a previous run. The path is matched as a key
# prefix, so the trailing slash limits the deletion to the test "directory";
# without it, sibling keys that merely start with the same text
# (e.g. "arrow_pd_parser_testing_other/...") would be deleted as well.
wr.s3.delete_objects("s3://alpha-everyone/arrow_pd_parser_testing/")

Read the test dataframe with the existing metadata.

In [None]:
# (name, type, type_category) for every column of the test CSV.
column_specs = [
    ("my_float", "float64", "float"),
    ("my_bool", "bool_", "boolean"),
    ("my_nullable_bool", "bool_", "boolean"),
    ("my_date", "date32", "timestamp"),
    ("my_datetime", "timestamp(s)", "timestamp"),
    ("my_int", "int64", "integer"),
    ("my_string", "string", "string"),
]

# Expand the compact specs into the metadata dict handed to the reader.
meta_read = {
    "columns": [
        {"name": name, "type": dtype, "type_category": category}
        for name, dtype, category in column_specs
    ]
}

# Load the local CSV using that metadata.
df = reader.read("tests/data/all_types.csv", metadata=meta_read)

Now add a `sorted_index` column, which can later be used to sort the dataframes back into their original order so that values can be compared after round-trip reads/writes. This requires a new metadata object that includes the extra column. Then write the dataframe to each format in S3.

In [None]:
# Keep the original row order in a `sorted_index` column so frames can be sorted
# back into that order before they are compared.
# The guard makes the cell idempotent: re-running it would otherwise add a
# second column that also gets renamed to `sorted_index`.
if "sorted_index" not in df.columns:
    df = df.reset_index().rename(columns={"index": "sorted_index"})

# Metadata for the extended frame: the new integer column followed by the
# same column definitions used for the initial read.
meta = {
    "columns": [
        {"name": "sorted_index", "type": "int64", "type_category": "integer"},
        {"name": "my_float", "type": "float64", "type_category": "float"},
        {"name": "my_bool", "type": "bool_", "type_category": "boolean"},
        {"name": "my_nullable_bool", "type": "bool_", "type_category": "boolean"},
        {"name": "my_date", "type": "date32", "type_category": "timestamp"},
        {
            "name": "my_datetime",
            "type": "timestamp(s)",
            "type_category": "timestamp",
        },
        {"name": "my_int", "type": "int64", "type_category": "integer"},
        {"name": "my_string", "type": "string", "type_category": "string"},
    ]
}

formats = ["csv", "parquet", "jsonl"]

# Write the same frame once per format; these objects are the inputs that the
# round-trip cells read back.
for file_format in formats:
    writer.write(
        df,
        f"s3://alpha-everyone/arrow_pd_parser_testing/all_types.{file_format}",
        metadata=meta,
    )

In [None]:
# Show what is currently stored for the test. The trailing slash restricts the
# listing to the test "directory" instead of every key that merely starts with
# the same text.
wr.s3.list_objects("s3://alpha-everyone/arrow_pd_parser_testing/")

Read the dataframe from each format, write to each format, read again and make sure everything's the same after each round-trip.

In [None]:
# Clear outputs from earlier runs. The path is used as a key prefix, so it
# matches every "all_types_output_from_<format>.<format>" object written below.
wr.s3.delete_objects("s3://alpha-everyone/arrow_pd_parser_testing/all_types_output")

base_path = "s3://alpha-everyone/arrow_pd_parser_testing"
failures = []

# Try every (read format, write format) pair: read the source file, write it in
# the target format, read that back and compare with what was originally read.
for read_format, write_format in itertools.product(formats, repeat=2):
    print(read_format, "->", write_format)
    source_path = f"{base_path}/all_types.{read_format}"
    output_path = f"{base_path}/all_types_output_from_{read_format}.{write_format}"
    try:
        d = reader.read(source_path, metadata=meta)
        writer.write(d, output_path, metadata=meta)
        final = reader.read(output_path, metadata=meta)
        assert_frame_equal(d, final)
    except Exception as e:
        # Keep going so every combination is reported, but remember the failure
        # so the cell cannot silently pass.
        print('Failed')
        print(e)
        failures.append(f"{read_format} -> {write_format}: {e!r}")
    else:
        print('Succeeded')

# Surface failures to "Run All" / automated runs instead of only printing them.
if failures:
    raise AssertionError(
        f"{len(failures)} round-trip combination(s) failed:\n" + "\n".join(failures)
    )

Same as above, except reading and writing in chunks. The dataframes need to be sorted by the added `sorted_index` column and the actual index reset before comparisons are made.

In [None]:
# Clear outputs from earlier runs. The path is used as a key prefix, so it
# matches every "all_types_output_from_<format>.<format>" object written below.
wr.s3.delete_objects("s3://alpha-everyone/arrow_pd_parser_testing/all_types_output")

base_path = "s3://alpha-everyone/arrow_pd_parser_testing"
chunk_rows = 2  # chunksize passed to the reader for the chunked read
failures = []

# Same matrix as the whole-file round trip, but the data written comes from a
# chunked read of the source file.
for read_format, write_format in itertools.product(formats, repeat=2):
    print(read_format, "->", write_format)
    source_path = f"{base_path}/all_types.{read_format}"
    output_path = f"{base_path}/all_types_output_from_{read_format}.{write_format}"
    try:
        # Whole-file read: the reference the final result is compared against.
        d = reader.read(source_path, metadata=meta)
        # Chunked read of the same file, handed straight to the writer.
        d_iter = reader.read(source_path, metadata=meta, chunksize=chunk_rows)
        writer.write(d_iter, output_path, metadata=meta)
        final = reader.read(output_path, metadata=meta)
        # Sort both frames by `sorted_index` and reset the index so the
        # comparison does not depend on the row order of the written file.
        assert_frame_equal(
            d.sort_values(by="sorted_index").reset_index(drop=True),
            final.sort_values(by="sorted_index").reset_index(drop=True),
        )
    except Exception as e:
        # Keep going so every combination is reported, but remember the failure
        # so the cell cannot silently pass.
        print('Failed')
        print(e)
        failures.append(f"{read_format} -> {write_format}: {e!r}")
    else:
        print('Succeeded')

# Surface failures to "Run All" / automated runs instead of only printing them.
if failures:
    raise AssertionError(
        f"{len(failures)} chunked round-trip combination(s) failed:\n"
        + "\n".join(failures)
    )