In [1]:
!pip install pandas pyarrow fsspec dlt[filesystem] s3fs adlfs



In [2]:
import logging
from typing import Iterator

import dlt
import fsspec
import pandas as pd
import pyarrow.parquet as pq

In [3]:
# Root logging setup: INFO and above, formatted as "timestamp - level - message".
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=log_format, level=logging.INFO)

# Named logger used by the cells of this notebook.
logger = logging.getLogger("parquet_pipeline")

In [4]:
# Location of the Parquet file that is read and loaded by the pipeline.
url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "yellow_tripdata_2025-01.parquet"
)

In [5]:
@dlt.resource(table_name="df_data")
def my_df(url: str) -> Iterator[pd.DataFrame]:
    """Read a Parquet file and yield its contents as a single DataFrame.

    Registered as a dlt resource with ``table_name="df_data"``.

    Args:
        url: Location of the Parquet file; it is opened in binary mode with
            ``fsspec.open``.

    Yields:
        One ``pandas.DataFrame`` containing the whole file.

    Raises:
        Exception: Any error raised while opening or reading the file is
            logged and then re-raised unchanged.
    """
    try:
        with fsspec.open(url, mode="rb") as f:
            df = pq.read_table(f).to_pandas()
    except Exception as e:
        # This step reads the source file; it does not initialise the pipeline,
        # so the message names the actual failing operation.
        logger.error(f"Error al leer el archivo parquet desde {url}: {e}")
        raise

    # Yield only after the file is closed and outside the try block, so the
    # handle is not held open while the generator is suspended and consumer
    # errors are not reported as read errors.
    yield df

In [6]:
# Keyword arguments forwarded verbatim to dlt.pipeline().
pipeline_config = dict(
    pipeline_name="parquet_to_minio",
    destination="filesystem",
    dataset_name="grupo_2_parquet",
)

# Build the pipeline object; log and re-raise if construction fails.
try:
    pipeline = dlt.pipeline(
        **pipeline_config
    )
    logger.info(
        "Pipeline inicializado correctamente."
    )
except Exception as e:
    logger.error(
        f"Error al inicializar el pipeline: {e}"
    )
    raise

2025-09-09 05:45:21,036 - INFO - Pipeline inicializado correctamente.


In [7]:
# Run the load: the resource output is written as Parquet, replacing any
# data previously loaded for it. Failures are logged and re-raised.
try:
    trips_resource = my_df(url)
    load_info = pipeline.run(
        trips_resource,
        write_disposition="replace",
        loader_file_format="parquet",
    )
    logger.info(
        f"Pipeline ejecutado con éxito. Info de carga: {load_info}"
    )
except Exception as e:
    logger.error(
        f"Error al ejecutar el pipeline: {e}"
    )
    raise

2025-09-09 05:45:33,774 - INFO - Pipeline ejecutado con éxito. Info de carga: Pipeline parquet_to_minio load step completed in 1.46 seconds
1 load package(s) were loaded to destination filesystem and into dataset grupo_2_parquet
The filesystem destination used s3://grupo-2 location to store data
Load package 1757396721.711363 is LOADED and contains no failed jobs
