In [1]:
import dlt
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import fsspec
from aiohttp import ClientTimeout

In [2]:
pip show pyarrow

Name: pyarrow
Version: 21.0.0
Summary: Python library for Apache Arrow
Home-page: 
Author: 
Author-email: 
License: Apache Software License
Location: /opt/conda/lib/python3.11/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.


***Si la version de Pyarrow es menor a 14.0.0 se debe ejecutar la siguiente celda para actualizarla y que el resto de codigo funcione :p***

In [3]:
pip install --upgrade "pyarrow>=14.0.0"

Collecting pyarrow>=14.0.0
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 13.0.0
    Uninstalling pyarrow-13.0.0:
      Successfully uninstalled pyarrow-13.0.0
Successfully installed pyarrow-21.0.0
Note: you may need to restart the kernel to use updated packages.


In [4]:

def load_parquet_to_minio(parquet_url, batch_size=50000):
    """
    Downloads and processes a remote Parquet file in batches, yielding pandas DataFrames.

    This version is optimized for single Parquet files (not directory-partitioned).
    It uses `fsspec` to stream the file over HTTPS and `pyarrow.parquet.ParquetFile`
    to read it efficiently in row groups or batches. Ideal for scalable ingestion
    in data pipelines like DLT, especially when loading into destinations such as MinIO.

    Args:
        parquet_url (str): Full URL of the remote Parquet file.
        batch_size (int, optional): Number of rows per batch. Default is 50,000.

    Yields:
        pandas.DataFrame: A batch of rows converted from Parquet to pandas format.

    Example:
        >>> for df in load_parquet_to_minio("https://.../posts/2022.parquet"):
        >>>     process(df)
    """
    timeout = ClientTimeout(total=600)
    fs = fsspec.filesystem("https", client_kwargs={"timeout": timeout})

    with fs.open(parquet_url) as f:
        parquet_file = pq.ParquetFile(f)
        for batch in parquet_file.iter_batches(batch_size=batch_size):
            yield batch.to_pandas()

# POSTS 2022
@dlt.resource(table_name="posts_2022")
def posts_2022():
    url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2022.parquet"
    yield from load_parquet_to_minio(url)

# POSTS 2023
@dlt.resource(table_name="posts_2023")
def posts_2023():
    url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2023.parquet"
    yield from load_parquet_to_minio(url)

# VOTES 2022
@dlt.resource(table_name="votes_2022")
def votes_2022():
    url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2022.parquet"
    yield from load_parquet_to_minio(url)

# VOTES 2023
@dlt.resource(table_name="votes_2023")
def votes_2023():
    url = "https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2023.parquet"
    yield from load_parquet_to_minio(url)

In [5]:
pipeline_posts = dlt.pipeline(
    pipeline_name="parquet_to_minio",
    destination="filesystem",
    dataset_name="posts",
)

In [6]:
pipeline_votes = dlt.pipeline(
    pipeline_name="parquet_to_minio",
    destination="filesystem",
    dataset_name="votes",
)

In [9]:
# POSTS 2022
try:
    load_info = pipeline_posts.run(
        posts_2022(),
        table_name="2022",
        loader_file_format="parquet",
        write_disposition="replace"
    )
    print("✅ posts/2022 loaded:", load_info)
except Exception as e:
    print("❌ Error loading posts/2023:", e)



✅ posts/2022 loaded: Pipeline parquet_to_minio load step completed in 1 minute and 12.62 seconds
1 load package(s) were loaded to destination filesystem and into dataset posts
The filesystem destination used s3://bronze location to store data
Load package 1760342501.494119 is LOADED and contains no failed jobs


In [10]:
# POSTS 2023
try:
    load_info = pipeline_posts.run(
        posts_2023(),
        table_name="2023",
        loader_file_format="parquet",
        write_disposition="replace"
    )
    print("✅ posts/2023 loaded:", load_info)
except Exception as e:
    print("❌ Error loading posts/2023:", e)



✅ posts/2023 loaded: Pipeline parquet_to_minio load step completed in 38.75 seconds
1 load package(s) were loaded to destination filesystem and into dataset posts
The filesystem destination used s3://bronze location to store data
Load package 1760343432.290405 is LOADED and contains no failed jobs


In [8]:
# VOTES 2022
try:
    load_info = pipeline_votes.run(
        votes_2022(),
        table_name="2022",
        loader_file_format="parquet",
        write_disposition="replace"
    )
    print("✅ votes/2022 loaded:", load_info)
except Exception as e:
    print("❌ Error loading votes/2022:", e)


✅ votes/2022 loaded: Pipeline parquet_to_minio load step completed in 5.91 seconds
1 load package(s) were loaded to destination filesystem and into dataset votes
The filesystem destination used s3://bronze location to store data
Load package 1760342427.0589252 is LOADED and contains no failed jobs


In [7]:
# VOTES 2023
try:
    load_info = pipeline_votes.run(
        votes_2023(),
        table_name="2023",
        loader_file_format="parquet",
        write_disposition="replace"
    )
    print("✅ votes/2023 loaded:", load_info)
except Exception as e:
    print("❌ Error loading votes/2023:", e)

✅ votes/2023 loaded: Pipeline parquet_to_minio load step completed in 3.92 seconds
1 load package(s) were loaded to destination filesystem and into dataset votes
The filesystem destination used s3://bronze location to store data
Load package 1760341788.5734918 is LOADED and contains no failed jobs
