In [1]:
%pip install pandas pyarrow fsspec dlt[filesystem] s3fs adlfs

Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 91.2/91.2 kB 2.2 MB/s eta 0:00:00
Collecting pyarrow
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting s3fs
  Downloading s3fs-2025.9.0-py3-none-any.whl.metadata (1.4 kB)
Collecting adlfs
  Downloading adlfs-2025.8.0-py3-none-any.whl.metadata (7.7 kB)
Collecting dlt[filesystem]
  Downloading dlt-1.15.0-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.1/62.1 kB 3.0 MB/s eta 0:00:00
Collecting tzdata>=2022.7 (from pandas)
 

In [1]:
import logging
from typing import Iterator

import dlt
import fsspec
import pandas as pd
import pyarrow.parquet as pq

In [2]:
# Root logging setup: INFO and above, rendered as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Named logger used by the later cells of this notebook.
logger = logging.getLogger("parquet_pipeline")

In [3]:
# Remote Parquet file that the pipeline ingests.
# NOTE(review): judging by the file name this is yellow-taxi trip data for
# January 2025 — confirm the dataset's provenance.
url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
)

In [4]:
@dlt.resource(table_name="df_data")
def my_df(url: str) -> Iterator[pd.DataFrame]:
    """Read a Parquet file and yield its contents as a single DataFrame.

    Args:
        url: Location of the Parquet file; anything ``fsspec.open`` accepts.

    Yields:
        One ``pandas.DataFrame`` holding the whole file. The file is read
        completely into memory before the frame is yielded.

    Raises:
        Exception: Any error raised while opening, reading or converting the
            file is logged and then re-raised unchanged.
    """
    try:
        # read_table materialises the whole table, so the remote file can be
        # closed before the (potentially slow) conversion to pandas.
        with fsspec.open(url, mode="rb") as f:
            table = pq.read_table(f)
        df = table.to_pandas()
    except Exception as e:
        logger.error(f"Error al leer el archivo parquet desde {url}: {e}")
        raise
    # Yield outside the try block so that failures raised by the consumer are
    # not reported as read errors.
    yield df

In [5]:
# Keyword arguments for dlt.pipeline(). The destination is the generic
# "filesystem" one; no bucket location is set in this cell, so it has to come
# from dlt's external configuration.
pipeline_config = {
    "pipeline_name": "parquet_to_minio",
    "destination": "filesystem",
    "dataset_name": "grupo_2_parquet",
}

# Build the pipeline object; a failure is logged and re-raised so the notebook
# stops at this cell.
try:
    pipeline = dlt.pipeline(**pipeline_config)
except Exception as init_error:
    logger.error(f"Error al inicializar el pipeline: {init_error}")
    raise
else:
    logger.info("Pipeline inicializado correctamente.")

2025-09-11 00:33:54,712 - INFO - Pipeline inicializado correctamente.


In [6]:
# Run the load: the resource's data is written as Parquet, with the "replace"
# write disposition. Errors are logged and re-raised.
try:
    load_info = pipeline.run(
        my_df(url),
        write_disposition="replace",
        loader_file_format="parquet",
    )
    logger.info(f"Pipeline ejecutado con éxito. Info de carga: {load_info}")
except Exception as run_error:
    logger.error(f"Error al ejecutar el pipeline: {run_error}")
    raise

2025-09-11 00:34:02,916 - INFO - Pipeline ejecutado con éxito. Info de carga: Pipeline parquet_to_minio load step completed in 0.85 seconds
1 load package(s) were loaded to destination filesystem and into dataset grupo_2_parquet
The filesystem destination used s3://grupo-2 location to store data
Load package 1757550836.4585238 is LOADED and contains no failed jobs
