In [1]:
!pip install dlt



In [2]:
import dlt
import pyarrow.parquet as pq
import requests
import pandas as pd
import os
import tempfile
import pyarrow.parquet as pq
from minio import Minio

In [3]:
# Configuración de la URL
URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
@dlt.resource(name="yellow_trip_data", write_disposition="replace")
def yellow_trip_data():
    try:
        print("Descargando archivo parquet...")
        response = requests.get(URL, stream=True)
        response.raise_for_status()

        with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
            tmp.write(response.content)
            tmp.flush()

            # Leer en formato Arrow
            table = pq.read_table(tmp.name)
            print(f"Datos cargados en Arrow: {table.num_rows} filas, {table.num_columns} columnas")
            yield table

    except Exception as e:
        print(f"Error al descargar o procesar datos: {e}")
        raise

def main():
    try:
        # Configuración de MinIO
        minio_endpoint = os.getenv('MINIO_ENDPOINT', 'minio:9000')  # Cambiado a 'minio' para Docker
        bucket_name = os.getenv('BUCKET_NAME', 'warehouse')         # Usando el bucket existente
        access_key = os.getenv('MINIO_ACCESS_KEY', 'admin')
        secret_key = os.getenv('MINIO_SECRET_KEY', 'password')
        
        # Construir la URL de MinIO (S3 compatible)
        minio_url = f"s3://{bucket_name}?endpoint=http://{minio_endpoint}&access_key={access_key}&secret_key={secret_key}&region=us-east-1"
        print(f"MinIO URL: {minio_url}")
        print("pipeline configured")
        # Configurar pipeline
        
        pipeline = dlt.pipeline(
            pipeline_name="yellow_trip_data_pipeline",
            destination=dlt.destinations.filesystem(
                bucket_url="s3://warehouse",
                aws_access_key_id = "admin",
                aws_secret_access_key = "password", # copy the secret access key here
                endpoint_url = "minio:9000", # copy your endpoint URL here
            ),
            dataset_name="yellow_trip",
            full_refresh=True,
        )
        
        # Ejecutar el pipeline
        print("Iniciando ingesta de datos...")
        load_info = pipeline.run(
            yellow_trip_data(),
            loader_file_format="parquet",
        )
        
        print("Pipeline ejecutado exitosamente!")
        print(f"Load info: {load_info}")
        
    except Exception as e:
        print(f"Error en el pipeline: {e}")
        raise

if __name__ == "__main__":
    main()

MinIO URL: s3://warehouse?endpoint=http://http://minio:9000&access_key=admin&secret_key=password&region=us-east-1
pipeline configured
Iniciando ingesta de datos...
Descargando archivo parquet...


  full_refresh_argument_deprecated("pipeline", full_refresh)


Datos cargados en Arrow: 3475226 filas, 20 columnas
Pipeline ejecutado exitosamente!
Load info: Pipeline yellow_trip_data_pipeline load step completed in 1.71 seconds
1 load package(s) were loaded to destination filesystem and into dataset yellow_trip_20250912011514
The filesystem destination used s3://warehouse location to store data
Load package 1757639714.3133307 is LOADED and contains no failed jobs


In [4]:
!cat .dlt/secrets.toml

[yellow_trip_data_pipeline.destination.filesystem]
bucket_url = "s3://warehouse" # replace with your bucket name,

[yellow_trip_data_pipeline.destination.filesystem.credentials]
aws_access_key_id = "admin" # copy the access key here
aws_secret_access_key = "password" # copy the secret access key here
endpoint_url = "http://minio:9000" # copy your endpoint URL here