In [3]:
import dlt
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import fsspec
from aiohttp import ClientTimeout

In [3]:

def load_parquet_to_minio(parquet_url, batch_size=50000, timeout_seconds=600):
    """
    Stream a single remote Parquet file over HTTPS and yield it as pandas DataFrames.

    The file is opened through an fsspec ``https`` filesystem and read with
    ``pyarrow.parquet.ParquetFile.iter_batches``; each record batch is converted
    to pandas before being yielded. Despite its name, this function does not
    write anything to MinIO itself: it only produces the batches, and the
    consumer (for example a dlt pipeline) decides where they are stored.

    This is a generator, so argument validation and the first network access
    happen when iteration starts, not when the function is called.

    Args:
        parquet_url (str): Full URL of the remote Parquet file.
        batch_size (int, optional): Maximum number of rows per yielded batch.
            Must be positive. Default is 50,000.
        timeout_seconds (float, optional): Value passed as ``total`` to
            ``aiohttp.ClientTimeout`` for the HTTP client. Default is 600.

    Yields:
        pandas.DataFrame: One batch of rows converted from Arrow to pandas.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.

    Example:
        >>> for df in load_parquet_to_minio("https://.../posts/2022.parquet"):
        ...     process(df)
    """
    # Fail early with a clear message rather than handing a meaningless batch
    # size to pyarrow after the remote file has already been opened.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    timeout = ClientTimeout(total=timeout_seconds)
    fs = fsspec.filesystem("https", client_kwargs={"timeout": timeout})

    # The file handle must stay open while batches are being produced, so the
    # iteration happens inside the ``with`` block.
    with fs.open(parquet_url) as f:
        parquet_file = pq.ParquetFile(f)
        for batch in parquet_file.iter_batches(batch_size=batch_size):
            yield batch.to_pandas()

# POSTS 2022
@dlt.resource(table_name="posts_2022")
def posts_2022():
    """dlt resource yielding the 2022 Stack Overflow posts Parquet file as DataFrame batches."""
    source_url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2022.parquet"
    # Delegate batching to the shared loader; every batch is passed through unchanged.
    yield from load_parquet_to_minio(source_url)

# POSTS 2023
@dlt.resource(table_name="posts_2023")
def posts_2023():
    """dlt resource yielding the 2023 Stack Overflow posts Parquet file as DataFrame batches."""
    source_url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2023.parquet"
    # Delegate batching to the shared loader; every batch is passed through unchanged.
    yield from load_parquet_to_minio(source_url)

# VOTES 2022
@dlt.resource(table_name="votes_2022")
def votes_2022():
    """dlt resource yielding the 2022 Stack Overflow votes Parquet file as DataFrame batches."""
    source_url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2022.parquet"
    # Delegate batching to the shared loader; every batch is passed through unchanged.
    yield from load_parquet_to_minio(source_url)

# VOTES 2023
@dlt.resource(table_name="votes_2023")
def votes_2023():
    """dlt resource yielding the 2023 Stack Overflow votes Parquet file as DataFrame batches."""
    source_url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2023.parquet"
    # Delegate batching to the shared loader; every batch is passed through unchanged.
    yield from load_parquet_to_minio(source_url)

In [4]:
#pipeline_posts = dlt.pipeline(
#    pipeline_name="parquet_to_minio",
#    destination="filesystem",
#    dataset_name="posts",
#)

In [5]:
# Pipeline used by the votes cells below: writes the "votes" dataset through
# dlt's "filesystem" destination.
# NOTE(review): no bucket URL or credentials are passed here; presumably they
# come from dlt config/secrets or environment variables — confirm.
pipeline_votes = dlt.pipeline(
    dataset_name="votes",
    destination="filesystem",
    pipeline_name="parquet_to_minio",
)

In [5]:
# POSTS 2022
# try:
#     load_info = pipeline_posts.run(
#         posts_2022(),
#         table_name="2022",
#         loader_file_format="parquet",
#         write_disposition="replace"
#     )
#     print("✅ posts/2022 loaded:", load_info)
# except Exception as e:
#     print("❌ Error loading posts/2022:", e)



In [6]:
# POSTS 2023
# try:
#     load_info = pipeline_posts.run(
#         posts_2023(),
#         table_name="2023",
#         loader_file_format="parquet",
#         write_disposition="replace"
#     )
#     print("✅ posts/2023 loaded:", load_info)
# except Exception as e:
#     print("❌ Error loading posts/2023:", e)

In [None]:
# VOTES 2022: run the votes_2022 resource through pipeline_votes, writing
# Parquet files with the "replace" write disposition.
# NOTE(review): table_name="2022" is passed here although the resource itself
# declares table_name="votes_2022" — confirm which name is intended.
try:
    load_info = pipeline_votes.run(
        votes_2022(),
        write_disposition="replace",
        loader_file_format="parquet",
        table_name="2022",
    )
    print("✅ votes/2022 loaded:", load_info)
except Exception as exc:
    # The failure is printed rather than re-raised, so no traceback is shown.
    print("❌ Error loading votes/2022:", exc)


In [None]:
# VOTES 2023: run the votes_2023 resource through pipeline_votes, writing
# Parquet files with the "replace" write disposition.
# NOTE(review): table_name="2023" is passed here although the resource itself
# declares table_name="votes_2023" — confirm which name is intended.
try:
    load_info = pipeline_votes.run(
        votes_2023(),
        write_disposition="replace",
        loader_file_format="parquet",
        table_name="2023",
    )
    print("✅ votes/2023 loaded:", load_info)
except Exception as exc:
    # The failure is printed rather than re-raised, so no traceback is shown.
    print("❌ Error loading votes/2023:", exc)