In [1]:
import dlt
from dlt.destinations.filesystem import filesystem
import pyarrow.parquet as pq
import requests
import pandas as pd
import os
from minio import Minio
from io import BytesIO

In [None]:
# Source URL of the Parquet file to ingest (yellow taxi trip data, 2025-01).
URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"

@dlt.resource(name="yellow_trip_data", write_disposition="replace")
def yellow_trip_data():
    """Yield the trip records read from the Parquet file at ``URL``.

    The file is loaded into a pandas DataFrame in one go, then handed to dlt
    in fixed-size batches (lists of row dicts). Converting slice by slice
    avoids building a dict for every row of the file at once, which is what
    a single ``df.to_dict(orient="records")`` over the whole frame does.

    Yields:
        list[dict]: up to ``batch_size`` rows, one dict per row keyed by
        column name.

    Raises:
        Exception: any error raised while downloading or converting the data
        is printed and re-raised unchanged.
    """
    # Number of rows converted to dicts per yielded batch; bounds the extra
    # memory needed on top of the DataFrame itself.
    batch_size = 50_000

    try:
        print("Descargando datos desde la URL...")
        df = pd.read_parquet(URL)
        print(f"Datos descargados. Shape: {df.shape}")

        # dlt accepts a yielded list as a batch of items, so the loaded rows
        # are the same as when yielding them one at a time.
        for start in range(0, len(df), batch_size):
            yield df.iloc[start:start + batch_size].to_dict(orient="records")

    except Exception as e:
        print(f"Error al descargar datos: {e}")
        raise

def main():
    """Load the yellow-trip resource into a MinIO bucket as Parquet via dlt.

    Connection settings are read from the environment, with these defaults:

    * ``MINIO_ENDPOINT``   -- ``minio:9000`` (host:port, reached over http)
    * ``BUCKET_NAME``      -- ``warehouse``
    * ``MINIO_ACCESS_KEY`` -- ``admin``
    * ``MINIO_SECRET_KEY`` -- ``password``

    Raises:
        Exception: any error from configuring or running the pipeline is
        printed and re-raised unchanged.
    """
    try:
        # MinIO connection settings
        minio_endpoint = os.getenv('MINIO_ENDPOINT', 'minio:9000')
        bucket_name = os.getenv('BUCKET_NAME', 'warehouse')
        access_key = os.getenv('MINIO_ACCESS_KEY', 'admin')
        secret_key = os.getenv('MINIO_SECRET_KEY', 'password')

        bucket_url = f"s3://{bucket_name}"
        endpoint_url = f"http://{minio_endpoint}"
        # Log only the non-secret parts of the target; the access and secret
        # keys must never be written to stdout.
        print(f"MinIO URL: {bucket_url} (endpoint: {endpoint_url})")

        # Credentials and the custom S3 endpoint are passed through the
        # destination's `credentials` argument rather than as a query string
        # on the bucket URL.
        destination = dlt.destinations.filesystem(
            bucket_url=bucket_url,
            credentials={
                "aws_access_key_id": access_key,
                "aws_secret_access_key": secret_key,
                "endpoint_url": endpoint_url,
                "region_name": "us-east-1",
            },
        )

        # Configure the pipeline with the fully specified destination.
        # NOTE(review): newer dlt releases appear to rename `full_refresh`
        # to `dev_mode` -- confirm against the installed version.
        pipeline = dlt.pipeline(
            pipeline_name="yellow_trip_data_pipeline",
            destination=destination,
            dataset_name="yellow_trip",
            full_refresh=True,
        )

        # Run the pipeline
        print("Iniciando ingesta de datos...")
        load_info = pipeline.run(
            yellow_trip_data(),
            loader_file_format="parquet",
        )

        print("Pipeline ejecutado exitosamente!")
        print(f"Load info: {load_info}")

    except Exception as e:
        print(f"Error en el pipeline: {e}")
        raise

# Run the ingestion only when this file is executed as the main module.
if __name__ == "__main__":
    main()

Fetching data from URL: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet
Setting up DLT pipeline...
Running DLT pipeline...
