# 1. HTTP to bucket

This notebook demonstrates how to load a Parquet file from a remote HTTP source into a MinIO object storage bucket using a dlt pipeline.

In [1]:
# Install the libraries this notebook relies on into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different releases over time — consider pinning.
%pip install pandas pyarrow fsspec dlt[filesystem] s3fs adlfs tzdata

Note: you may need to restart the kernel to use updated packages.


In [1]:
import logging
from typing import Iterator

import dlt
import fsspec
import pandas as pd
import pyarrow.parquet as pq

In [2]:
# Root logging setup: emit INFO and above, each record prefixed with its
# timestamp and level name.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Named logger used by the cells below.
logger = logging.getLogger("parquet_pipeline")

In [3]:
# Remote Parquet file handed to the `my_df` resource when the pipeline runs
# (going by the file name: yellow trip data for 2025-01).
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"

In [None]:

@dlt.resource(table_name="df_data")
def my_df(url: str) -> Iterator[pd.DataFrame]:
    """Yield the Parquet file at ``url`` as a single pandas DataFrame.

    The file is opened through fsspec, read completely into an Arrow table
    and converted to pandas, so the whole dataset is held in memory at once.

    Args:
        url: Location of the Parquet file, in any form ``fsspec.open`` accepts.

    Yields:
        One DataFrame containing every row of the file.

    Raises:
        Exception: Any error raised while opening, reading, converting or
            handing over the data is logged and then re-raised unchanged.
    """
    try:
        with fsspec.open(url, mode="rb") as f:
            table = pq.read_table(f)
        # The Arrow table is fully materialised here, so the remote handle is
        # released before control passes to the consumer of this generator.
        yield table.to_pandas()

    except Exception as e:
        logger.error(f"Error: {e}")
        raise

In [5]:
# Keyword arguments for dlt.pipeline(). No storage location is given here;
# presumably the "filesystem" destination reads it from dlt's own
# configuration — verify against the environment this notebook runs in.
pipeline_config = dict(
    pipeline_name="parquet_to_minio",
    destination="filesystem",
    dataset_name="grupo_2_parquet",
)

# Build the pipeline object; any failure is logged and then propagated so the
# cell stops instead of leaving `pipeline` undefined for later cells.
try:
    pipeline = dlt.pipeline(**pipeline_config)
    logger.info("Pipeline configured successfully.")
except Exception as exc:
    logger.error(f"Error configuring the pipeline: {exc}")
    raise

2025-09-12 15:44:59,221 - INFO - Pipeline configured successfully.


In [6]:
# Run the pipeline against the `my_df` resource: data is written as Parquet
# and the "replace" disposition is requested for the target table.
# Failures are logged and re-raised so the cell visibly errors.
try:
    load_info = pipeline.run(
        my_df(url),
        write_disposition="replace",
        loader_file_format="parquet",
    )
    logger.info(f"Pipeline sucessfully executed. Load info: {load_info}")
except Exception as exc:
    logger.error(f"Error while executing pipeline: {exc}")
    raise

2025-09-12 15:45:10,947 - INFO - Pipeline sucessfully executed. Load info: Pipeline parquet_to_minio load step completed in 1.11 seconds
1 load package(s) were loaded to destination filesystem and into dataset grupo_2_parquet
The filesystem destination used s3://grupo-2 location to store data
Load package 1757691902.3208005 is LOADED and contains no failed jobs
