In [1]:
import os
import sys
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests
import tempfile

print("Imports completados")

Imports completados


In [2]:


# --- Snowflake connection settings -------------------------------------------
# Everything is read from the environment so no credential lives in the notebook.
SNOWFLAKE_ACCOUNT = os.getenv('SNOWFLAKE_ACCOUNT')
SNOWFLAKE_USER = os.getenv('SNOWFLAKE_USER')
SNOWFLAKE_PASSWORD = os.getenv('SNOWFLAKE_PASSWORD')
# NOTE(review): ACCOUNTADMIN is kept as the default for backward compatibility;
# consider a least-privilege role for routine ingestion runs.
SNOWFLAKE_ROLE = os.getenv('SNOWFLAKE_ROLE', 'ACCOUNTADMIN')
SNOWFLAKE_DATABASE = os.getenv('SNOWFLAKE_DATABASE')
SNOWFLAKE_WAREHOUSE = os.getenv('SNOWFLAKE_WAREHOUSE')
SNOWFLAKE_SCHEMA_RAW = os.getenv('SNOWFLAKE_SCHEMA_RAW', 'RAW')

# --- Data parameters ----------------------------------------------------------
START_YEAR = int(os.getenv('START_YEAR', '2015'))
END_YEAR = int(os.getenv('END_YEAR', '2025'))
# Strip whitespace and drop empty entries so values such as "yellow, green" or a
# trailing comma do not produce service names that break URLs and table names.
SERVICES = [
    name.strip()
    for name in os.getenv('SERVICES', 'yellow,green').split(',')
    if name.strip()
]
# Identifier stamped on every ingested row; defaults to a timestamp-based value.
RUN_ID = os.getenv('RUN_ID', f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}")

# Base URL
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"

# Report unset connection settings right away instead of letting them surface
# later as a malformed Snowflake URL or an authentication error.
_required_settings = {
    'SNOWFLAKE_ACCOUNT': SNOWFLAKE_ACCOUNT,
    'SNOWFLAKE_USER': SNOWFLAKE_USER,
    'SNOWFLAKE_PASSWORD': SNOWFLAKE_PASSWORD,
    'SNOWFLAKE_DATABASE': SNOWFLAKE_DATABASE,
    'SNOWFLAKE_WAREHOUSE': SNOWFLAKE_WAREHOUSE,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    print(f"ADVERTENCIA: variables de entorno sin definir: {', '.join(_missing_settings)}")
if START_YEAR > END_YEAR:
    print(f"ADVERTENCIA: START_YEAR ({START_YEAR}) es mayor que END_YEAR ({END_YEAR})")


print("CONFIGURACIÓN DE INGESTA")

# The password is deliberately never printed.
print(f"RUN_ID: {RUN_ID}")
print(f"Base de datos: {SNOWFLAKE_DATABASE}")
print(f"Schema RAW: {SNOWFLAKE_SCHEMA_RAW}")
print(f"Años: {START_YEAR} - {END_YEAR}")
print(f"Servicios: {SERVICES}")



CONFIGURACIÓN DE INGESTA
RUN_ID: manual
Base de datos: NYC_TLC_P03
Schema RAW: RAW
Años: 2015 - 2025
Servicios: ['yellow', 'green']


In [3]:
print("\n Inicializando Spark...")

# Maven coordinates fetched at session start-up: the Snowflake Spark connector
# and the Snowflake JDBC driver.
# NOTE(review): the connector artifact is the "spark_3.3" build — confirm it
# matches the Spark version this kernel actually runs.
snowflake_packages = ",".join([
    "net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3",
    "net.snowflake:snowflake-jdbc:3.13.30",
])

spark = (
    SparkSession.builder
    .appName("NYC_TLC_Ingesta_Raw")
    .config("spark.jars.packages", snowflake_packages)
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Keep the notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")

print(" Spark {} inicializado correctamente".format(spark.version))
print(" Spark UI disponible en: http://localhost:4040")


 Inicializando Spark...
 Spark 3.5.0 inicializado correctamente
 Spark UI disponible en: http://localhost:4040


In [4]:
# Connector options shared by every Snowflake read and write in this notebook.
sfOptions = dict(
    sfURL="{}.snowflakecomputing.com".format(SNOWFLAKE_ACCOUNT),
    sfUser=SNOWFLAKE_USER,
    sfPassword=SNOWFLAKE_PASSWORD,
    sfDatabase=SNOWFLAKE_DATABASE,
    sfSchema=SNOWFLAKE_SCHEMA_RAW,
    sfWarehouse=SNOWFLAKE_WAREHOUSE,
    sfRole=SNOWFLAKE_ROLE,
)

print(" Configuración de Snowflake establecida")

 Configuración de Snowflake establecida


In [5]:
print("\n Verificando que las tablas existan en Snowflake...")


def _probe_table(table_name):
    """Build a one-row DataFrame over ``table_name`` using the shared ``sfOptions``.

    Any exception raised while building the reader propagates to the caller,
    which reports it as the table not existing.
    """
    reader = spark.read.format("snowflake").options(**sfOptions)
    return reader.option("dbtable", table_name).load().limit(1)


# Each probe is independent: a failure is reported and the next table is still checked.
try:
    test_yellow = _probe_table("YELLOW_TRIPS")
    print(" Tabla YELLOW_TRIPS existe")
except Exception as e:
    print(f" Tabla YELLOW_TRIPS NO existe: {e}")

try:
    test_green = _probe_table("GREEN_TRIPS")
    print("Tabla GREEN_TRIPS existe")
except Exception as e:
    print(f" Tabla GREEN_TRIPS NO existe: {e}")

try:
    test_audit = _probe_table("INGESTION_AUDIT")
    print(" Tabla INGESTION_AUDIT existe")
except Exception as e:
    print(f" Tabla INGESTION_AUDIT NO existe: {e}")

print(" Verificación completada")



 Verificando que las tablas existan en Snowflake...
 Tabla YELLOW_TRIPS existe
Tabla GREEN_TRIPS existe
 Tabla INGESTION_AUDIT existe
 Verificación completada


In [6]:

def check_parquet_exists(url):
    """Return True when a HEAD request to ``url`` answers with HTTP 200.

    Args:
        url: Address of the Parquet file to probe.

    Returns:
        bool: True only for a 200 response. Any other status code, or a
        request-level failure (connection error, timeout, invalid URL), gives
        False; the failure is printed so that an outage is distinguishable
        from a file that really is missing.
    """
    try:
        response = requests.head(url, timeout=10)
    except requests.RequestException as exc:
        # Catch only request failures: a bare ``except`` would also swallow
        # KeyboardInterrupt and hide programming errors.
        print(f" No se pudo verificar {url}: {exc}")
        return False
    return response.status_code == 200

def get_parquet_url(service, year, month):
    """Build the download URL of one monthly trip-data Parquet file.

    The file name is ``<service>_tripdata_<year>-<MM>.parquet`` (month
    zero-padded to two digits), placed under ``BASE_URL``.
    """
    month_tag = "{:02d}".format(month)
    return "/".join((BASE_URL, f"{service}_tripdata_{year}-{month_tag}.parquet"))

def download_parquet(url):
    """Stream ``url`` into a new temporary ``.parquet`` file and return its path.

    Args:
        url: Address of the Parquet file to download.

    Returns:
        str: Path of the downloaded file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status. The temporary file is removed before the
            exception propagates, so failed downloads leave nothing behind.
    """
    # delete=False: the file must outlive this handle so it can be read later.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
    tmp_path = tmp_file.name
    tmp_file.close()

    print(f" Descargando a: {tmp_path}")

    try:
        # The context manager releases the pooled connection even on errors,
        # which a streamed response does not do on its own.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
    except BaseException:
        # Do not leave a partial or empty file behind; the caller never
        # receives the path when the download fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    # Report the size of the downloaded file
    file_size = os.path.getsize(tmp_path) / (1024*1024)  # MB
    print(f" Descargado: {file_size:.2f} MB")

    return tmp_path

def add_metadata_columns(df, service, year, month, source_path):
    """Return ``df`` with the ingestion metadata columns appended.

    Columns are added in this order: ``run_id`` (from ``RUN_ID``),
    ``service_type``, ``source_year``, ``source_month``, ``ingested_at_utc``
    (Spark's ``current_timestamp()``) and ``source_path``.
    """
    metadata_columns = (
        ("run_id", lit(RUN_ID)),
        ("service_type", lit(service)),
        ("source_year", lit(year)),
        ("source_month", lit(month)),
        ("ingested_at_utc", current_timestamp()),
        ("source_path", lit(source_path)),
    )

    enriched = df
    for column_name, column_expr in metadata_columns:
        enriched = enriched.withColumn(column_name, column_expr)
    return enriched

print(" Funciones de utilidad definidas")

 Funciones de utilidad definidas


In [7]:
def ingest_month(service, year, month):
    """Download one monthly Parquet file and append it to ``<SERVICE>_TRIPS``.

    NOTE: a later cell of this notebook defines ``ingest_month`` again; when
    the cells run in order, that later definition is the one being called.

    Steps: HEAD-check the URL, download it to a temporary file, read it with
    Spark, add the metadata columns, delete rows previously loaded for the
    same service/year/month, then append the new rows.

    Args:
        service: Service name used in the file name and table name (e.g. 'yellow').
        year: Year of the file.
        month: Month of the file (1-12).

    Returns:
        dict: keys ``service``, ``year``, ``month``, ``status`` ('SUCCESS',
        'FAILED' or 'NOT_FOUND'), ``records`` (rows read), ``duration``
        (seconds) and ``error`` (message or None). Errors are reported through
        this dict rather than raised.
    """
    import traceback  # only needed for the failure report

    start_time = datetime.now()
    url = get_parquet_url(service, year, month)
    table_name = f"{service.upper()}_TRIPS"
    tmp_path = None

    print(f"Procesando: {service.upper()} {year}-{month:02d}")
    print(f"URL: {url}")

    # Check that the file exists before downloading
    print(" Verificando si el archivo existe...")
    if not check_parquet_exists(url):
        print(f" Archivo no encontrado: {url}")
        return {
            'service': service,
            'year': year,
            'month': month,
            'status': 'NOT_FOUND',
            'records': 0,
            'duration': 0,
            'error': 'Archivo Parquet no disponible'
        }

    print(" Archivo encontrado")

    try:
        # Download the file
        tmp_path = download_parquet(url)

        # Read the local Parquet file with Spark
        print("Leyendo Parquet con Spark...")
        df = spark.read.parquet(tmp_path)

        record_count = df.count()
        print(f"Registros leídos: {record_count:,}")

        # Show the schema
        print(" Schema del archivo:")
        df.printSchema()

        # Show a data sample
        print(" Muestra de datos:")
        df.show(3, truncate=True)

        # Add metadata
        print("  Agregando metadatos...")
        df = add_metadata_columns(df, service, year, month, url)

        # IDEMPOTENCE: remove rows of the same year/month/service before inserting
        print(f" Aplicando idempotencia...")
        escaped_service = str(service).replace("'", "''")
        delete_query = (
            f"DELETE FROM {table_name} "
            f"WHERE source_year = {int(year)} "
            f"AND source_month = {int(month)} "
            f"AND service_type = '{escaped_service}'"
        )

        try:
            # The connector's "query" read option is meant for SELECT
            # statements; DML has to go through Utils.runQuery, otherwise the
            # DELETE never takes effect and reruns duplicate the period.
            spark.sparkContext._jvm.net.snowflake.spark.snowflake.Utils.runQuery(
                sfOptions, delete_query
            )
            print(f"Registros previos eliminados (si existían)")
        except Exception as e_del:
            # Best effort, as in the original flow: the load continues.
            print(f" No se pudieron eliminar registros previos (puede ser primera carga): {e_del}")

        # Write to Snowflake with Spark
        print(f"Escribiendo a Snowflake: {SNOWFLAKE_SCHEMA_RAW}.{table_name}...")

        df.write \
            .format("snowflake") \
            .options(**sfOptions) \
            .option("dbtable", table_name) \
            .mode("append") \
            .save()

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        print(f" Carga exitosa: {record_count:,} registros en {duration:.2f}s")

        return {
            'service': service,
            'year': year,
            'month': month,
            'status': 'SUCCESS',
            'records': record_count,
            'duration': duration,
            'error': None
        }

    except Exception as e:
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        # Print the full error
        error_msg = traceback.format_exc()

        print(f" Error en la carga:")
        print(error_msg)

        return {
            'service': service,
            'year': year,
            'month': month,
            'status': 'FAILED',
            'records': 0,
            'duration': duration,
            'error': str(e)
        }

    finally:
        # Single cleanup point for both the success and the failure path.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
                print("  Archivo temporal eliminado")
            except OSError as e_rm:
                print(f"  No se pudo eliminar el archivo temporal {tmp_path}: {e_rm}")

print(" Función de ingesta con Spark definida")

 Función de ingesta con Spark definida


In [13]:


import urllib.request
import snowflake.connector

# Ingestion metadata columns produced by add_metadata_columns, placed last.
_METADATA_COLUMNS = (
    "run_id",
    "service_type",
    "source_year",
    "source_month",
    "ingested_at_utc",
    "source_path",
)

# Per-service layout: the pickup/dropoff columns cast to timestamp, and the
# trip columns in the order selected for the target table.
_SERVICE_LAYOUTS = {
    'yellow': {
        'timestamps': ("tpep_pickup_datetime", "tpep_dropoff_datetime"),
        'trip_columns': (
            "tpep_pickup_datetime",
            "tpep_dropoff_datetime",
            "PULocationID",
            "DOLocationID",
            "passenger_count",
            "trip_distance",
            "RatecodeID",
            "store_and_fwd_flag",
            "payment_type",
            "VendorID",
            "fare_amount",
            "extra",
            "mta_tax",
            "tip_amount",
            "tolls_amount",
            "improvement_surcharge",
            "total_amount",
            "congestion_surcharge",
            "Airport_fee",
        ),
    },
    'green': {
        'timestamps': ("lpep_pickup_datetime", "lpep_dropoff_datetime"),
        'trip_columns': (
            "lpep_pickup_datetime",
            "lpep_dropoff_datetime",
            "PULocationID",
            "DOLocationID",
            "passenger_count",
            "trip_distance",
            "RatecodeID",
            "store_and_fwd_flag",
            "payment_type",
            "VendorID",
            "trip_type",
            "fare_amount",
            "extra",
            "mta_tax",
            "tip_amount",
            "tolls_amount",
            "improvement_surcharge",
            "total_amount",
            "congestion_surcharge",
        ),
    },
}


def ingest_month(service, year, month):
    """Ingest one month of trip data from Parquet into the Snowflake RAW table.

    Steps: HEAD-check the URL, download the file locally, read it with Spark,
    cast the pickup/dropoff columns to timestamp, add the metadata columns,
    select the columns in table order, delete rows previously loaded for the
    same service/year/month, then append the new rows.

    Args:
        service: 'yellow' or 'green'; any other value yields a FAILED result.
        year: Year of the file.
        month: Month of the file (1-12).

    Returns:
        dict: keys ``service``, ``year``, ``month``, ``status`` ('SUCCESS',
        'FAILED' or 'NOT_FOUND'), ``records`` (rows read), ``duration``
        (seconds) and ``error`` (message or None). Errors are reported through
        this dict rather than raised.
    """
    import traceback  # only needed for the failure report

    start_time = datetime.now()
    url = get_parquet_url(service, year, month)
    table_name = f"{service.upper()}_TRIPS"
    tmp_path = None

    print(f"\n{'='*70}")
    print(f" Procesando: {service.upper()} {year}-{month:02d}")
    print(f"URL: {url}")
    print(f"{'='*70}")

    # Check that the file exists before downloading
    print("Verificando si el archivo existe...")
    if not check_parquet_exists(url):
        print(f" Archivo no encontrado: {url}")
        return {
            'service': service,
            'year': year,
            'month': month,
            'status': 'NOT_FOUND',
            'records': 0,
            'duration': 0,
            'error': 'Archivo Parquet no disponible'
        }

    print("Archivo encontrado")

    try:
        # Reject unknown services explicitly instead of silently applying the
        # green layout to them.
        layout = _SERVICE_LAYOUTS.get(service)
        if layout is None:
            raise ValueError(
                f"Servicio no soportado: {service!r} (esperado: {sorted(_SERVICE_LAYOUTS)})"
            )

        # Download the file locally (system temp dir rather than a literal /tmp)
        tmp_path = os.path.join(
            tempfile.gettempdir(),
            f"{service}_tripdata_{year}-{month:02d}.parquet",
        )
        print(f" Descargando a: {tmp_path}")

        # NOTE(review): urlretrieve takes no timeout argument — a stalled
        # connection can block this call indefinitely.
        urllib.request.urlretrieve(url, tmp_path)

        file_size = os.path.getsize(tmp_path) / (1024*1024)  # MB
        print(f" Descargado: {file_size:.2f} MB")

        # Read the LOCAL Parquet file with Spark
        print(" Leyendo Parquet con Spark...")
        df = spark.read.parquet(tmp_path)

        record_count = df.count()
        print(f" Registros leídos: {record_count:,}")

        # Show a sample
        print(" Muestra de datos:")
        df.show(3, truncate=True)

        # Cast the pickup/dropoff columns to timestamp for Snowflake
        print("Convirtiendo tipos de datos...")
        for ts_column in layout['timestamps']:
            df = df.withColumn(ts_column, col(ts_column).cast("timestamp"))

        # Add metadata
        print("  Agregando metadatos...")
        df = add_metadata_columns(df, service, year, month, url)

        # Reorder columns to match the target table
        print(" Reordenando columnas para coincidir con la tabla...")
        df_ordered = df.select(*layout['trip_columns'], *_METADATA_COLUMNS)
        print("Columnas reordenadas correctamente")

        print(" Aplicando idempotencia (eliminando datos previos del periodo)...")

        try:
            conn = snowflake.connector.connect(
                account=SNOWFLAKE_ACCOUNT,
                user=SNOWFLAKE_USER,
                password=SNOWFLAKE_PASSWORD,
                role=SNOWFLAKE_ROLE,
                warehouse=SNOWFLAKE_WAREHOUSE,
                database=SNOWFLAKE_DATABASE,
                schema=SNOWFLAKE_SCHEMA_RAW
            )
            try:
                cursor = conn.cursor()
                try:
                    # Values are bound, not interpolated. The table name cannot
                    # be bound, but it derives from a validated service name.
                    cursor.execute(
                        f"DELETE FROM {table_name} "
                        "WHERE source_year = %s "
                        "AND source_month = %s "
                        "AND service_type = %s",
                        (year, month, service),
                    )
                    deleted_rows = cursor.rowcount
                finally:
                    cursor.close()
            finally:
                # Always release the session, even when the DELETE fails.
                conn.close()

            print(f"Registros previos eliminados: {deleted_rows}")

        except Exception as e_del:
            # Best effort, as in the original flow: the load continues.
            print(f"No se pudieron eliminar registros previos: {e_del}")

        print(f" Escribiendo a Snowflake: {SNOWFLAKE_SCHEMA_RAW}.{table_name}...")

        df_ordered.write \
            .format("snowflake") \
            .options(**sfOptions) \
            .option("dbtable", table_name) \
            .mode("append") \
            .save()

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        print(f"Carga exitosa: {record_count:,} registros en {duration:.2f}s")

        return {
            'service': service,
            'year': year,
            'month': month,
            'status': 'SUCCESS',
            'records': record_count,
            'duration': duration,
            'error': None
        }

    except Exception as e:
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        # Print the full error
        error_msg = traceback.format_exc()

        print(f" Error en la carga:")
        print(error_msg)

        return {
            'service': service,
            'year': year,
            'month': month,
            'status': 'FAILED',
            'records': 0,
            'duration': duration,
            'error': str(e)
        }

    finally:
        # Single cleanup point for both the success and the failure path.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
                print("  Archivo temporal eliminado")
            except OSError as e_rm:
                print(f"  No se pudo eliminar el archivo temporal {tmp_path}: {e_rm}")

print("Función de ingesta con descarga e idempotencia definida")

Función de ingesta con descarga e idempotencia definida


In [14]:

print(" PRUEBA CON UN SOLO MES")


# Smoke test: ingest a single recent month (2024-01) before the full run
result = ingest_month(service='yellow', year=2024, month=1)

print("\n RESULTADO:")
print("Status: {status}".format(**result))
print("Records: {records:,}".format(**result))
print("Duration: {duration:.2f}s".format(**result))
print("Error: {error}".format(**result))

 PRUEBA CON UN SOLO MES

 Procesando: YELLOW 2024-01
URL: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet
Verificando si el archivo existe...
Archivo encontrado
 Descargando a: /tmp/yellow_tripdata_2024-01.parquet
 Descargado: 47.65 MB
 Leyendo Parquet con Spark...
 Registros leídos: 2,964,624
 Muestra de datos:
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------

In [15]:
print(" PRUEBA CON UN SOLO MES")


# Smoke test: ingest a single recent month (2024-01) of the green service
result = ingest_month(service='green', year=2024, month=1)

print("\n RESULTADO:")
print("Status: {status}".format(**result))
print("Records: {records:,}".format(**result))
print("Duration: {duration:.2f}s".format(**result))
print("Error: {error}".format(**result))

 PRUEBA CON UN SOLO MES

 Procesando: GREEN 2024-01
URL: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
Verificando si el archivo existe...
Archivo encontrado
 Descargando a: /tmp/green_tripdata_2024-01.parquet
 Descargado: 1.30 MB
 Leyendo Parquet con Spark...
 Registros leídos: 56,551
 Muestra de datos:
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+-------

In [16]:


import time  # imported once here instead of on every loop iteration

print(" INICIANDO INGESTA COMPLETA")

# Last (year, month) expected to be published upstream. Later periods are
# skipped without issuing a request. The default reproduces the previous
# hard-coded cut-off; override with LAST_AVAILABLE_PERIOD=YYYY-MM.
_last_year, _last_month = (
    int(part) for part in os.getenv('LAST_AVAILABLE_PERIOD', '2025-08').split('-')
)

# Short pause between files so the source is not saturated.
PAUSE_SECONDS = 2

# One result dict per attempted service/year/month
results = []

# Iterate over services, years and months
for service in SERVICES:
    for year in range(START_YEAR, END_YEAR + 1):
        for month in range(1, 13):

            # Tuple comparison also skips whole years after the cut-off.
            if (year, month) > (_last_year, _last_month):
                continue

            result = ingest_month(service, year, month)
            results.append(result)

            time.sleep(PAUSE_SECONDS)


print("INGESTA COMPLETADA")


 INICIANDO INGESTA COMPLETA

 Procesando: YELLOW 2015-01
URL: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet
Verificando si el archivo existe...
Archivo encontrado
 Descargando a: /tmp/yellow_tripdata_2015-01.parquet
 Descargado: 167.20 MB
 Leyendo Parquet con Spark...
 Registros leídos: 12,741,035
 Muestra de datos:
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------

In [18]:


import pandas as pd

print("\n RESUMEN DE INGESTA:")

# One row per attempted file, built from the dicts returned by ingest_month.
results_pd = pd.DataFrame(results)

if results_pd.empty:
    # Without this guard an empty run fails on the missing 'records' column
    # and on the division used for the per-file average.
    print(" No hay resultados de ingesta para resumir")
else:
    results_pd['records'] = results_pd['records'].astype(int)
    results_pd['duration'] = results_pd['duration'].astype(float)

    print("\n Por Status:")
    status_summary = results_pd.groupby('status').size().reset_index(name='count')
    print(status_summary.to_string(index=False))

    # Named aggregation over 'records' yields the same two columns without
    # aggregating the grouping key itself.
    print("\n Por Servicio:")
    service_summary = results_pd.groupby(['service', 'status']).agg(
        archivos=('records', 'size'),
        total_registros=('records', 'sum'),
    )
    print(service_summary.to_string())

    print("\n Por Año:")
    year_summary = results_pd.groupby(['year', 'status']).agg(
        archivos=('records', 'size'),
        total_registros=('records', 'sum'),
    )
    print(year_summary.to_string())

    total_success = len(results_pd[results_pd['status'] == 'SUCCESS'])
    total_failed = len(results_pd[results_pd['status'] == 'FAILED'])
    total_notfound = len(results_pd[results_pd['status'] == 'NOT_FOUND'])
    total_records = results_pd['records'].sum()
    total_duration = results_pd['duration'].sum()

    print("\n ESTADÍSTICAS GENERALES:")

    print(f" Exitosos: {total_success}")
    print(f" Fallidos: {total_failed}")
    print(f" No encontrados: {total_notfound}")
    print(f" Total registros cargados: {total_records:,}")
    print(f"  Tiempo total: {total_duration/60:.2f} minutos")
    print(f"  Tiempo promedio por archivo: {total_duration/len(results_pd):.2f} segundos")

    # List every file that did not load, with the recorded reason.
    if total_failed > 0 or total_notfound > 0:
        print("\n  ARCHIVOS CON PROBLEMAS:")

        problemas = results_pd[results_pd['status'].isin(['FAILED', 'NOT_FOUND'])]
        print(problemas[['service', 'year', 'month', 'status', 'error']].to_string(index=False))


 RESUMEN DE INGESTA:

 Por Status:
   status  count
NOT_FOUND      1
  SUCCESS    255

 Por Servicio:
                   archivos  total_registros
service status                              
green   SUCCESS         128         68045597
yellow  NOT_FOUND         1                0
        SUCCESS         127        772827410

 Por Año:
                archivos  total_registros
year status                              
2015 NOT_FOUND         1                0
     SUCCESS          23        153713330
2016 SUCCESS          24        147517346
2017 SUCCESS          24        125237386
2018 SUCCESS          24        111771105
2019 SUCCESS          24         90899429
2020 SUCCESS          24         26383268
2021 SUCCESS          24         31973063
2022 SUCCESS          24         40496500
2023 SUCCESS          24         39097286
2024 SUCCESS          24         41829938
2025 SUCCESS          16         31954356

 ESTADÍSTICAS GENERALES:
 Exitosos: 255
 Fallidos: 0
 No encontrados: 1


In [20]:

print("\n VERIFICANDO DATOS EN SNOWFLAKE:")


def _snowflake_query(sql):
    """Return the DataFrame the Snowflake connector builds for ``sql`` using ``sfOptions``."""
    reader = spark.read.format("snowflake").options(**sfOptions)
    return reader.option("query", sql).load()


# Row count of YELLOW_TRIPS
yellow_count_df = _snowflake_query("SELECT COUNT(*) as COUNT FROM YELLOW_TRIPS")
yellow_count = yellow_count_df.collect()[0]['COUNT']
print(f" YELLOW_TRIPS: {yellow_count:,} registros")

# Row count of GREEN_TRIPS
green_count_df = _snowflake_query("SELECT COUNT(*) as COUNT FROM GREEN_TRIPS")
green_count = green_count_df.collect()[0]['COUNT']
print(f" GREEN_TRIPS: {green_count:,} registros")

print(f"\nTOTAL EN SNOWFLAKE: {yellow_count + green_count:,} registros")


# Rows and distinct loaded months per source year, one table at a time
print("\n Distribución YELLOW por año:")
yellow_by_year = _snowflake_query("""
        SELECT 
            source_year as YEAR,
            COUNT(*) as REGISTROS,
            COUNT(DISTINCT source_month) as MESES_CARGADOS
        FROM YELLOW_TRIPS
        GROUP BY source_year
        ORDER BY source_year
    """)

yellow_by_year.show(20, truncate=False)

print("\n Distribución GREEN por año:")
green_by_year = _snowflake_query("""
        SELECT 
            source_year as YEAR,
            COUNT(*) as REGISTROS,
            COUNT(DISTINCT source_month) as MESES_CARGADOS
        FROM GREEN_TRIPS
        GROUP BY source_year
        ORDER BY source_year
    """)

green_by_year.show(20, truncate=False)

print("\n Verificación en Snowflake completada")


 VERIFICANDO DATOS EN SNOWFLAKE:
 YELLOW_TRIPS: 772,827,410 registros
 GREEN_TRIPS: 68,045,597 registros

TOTAL EN SNOWFLAKE: 840,873,007 registros

 Distribución YELLOW por año:
+----+---------+--------------+
|YEAR|REGISTROS|MESES_CARGADOS|
+----+---------+--------------+
|2015|134479565|11            |
|2016|131131805|12            |
|2017|113500327|12            |
|2018|102871387|12            |
|2019|84598444 |12            |
|2020|24649092 |12            |
|2021|30904308 |12            |
|2022|39656098 |12            |
|2023|38310226 |12            |
|2024|41169720 |12            |
|2025|31556438 |8             |
+----+---------+--------------+


 Distribución GREEN por año:
+----+---------+--------------+
|YEAR|REGISTROS|MESES_CARGADOS|
+----+---------+--------------+
|2015|19233765 |12            |
|2016|16385541 |12            |
|2017|11737059 |12            |
|2018|8899718  |12            |
|2019|6300985  |12            |
|2020|1734176  |12            |
|2021|1068755  |12   

In [22]:


import pandas as pd

print("\n Guardando matriz de cobertura...")

coverage_df = pd.DataFrame(results)

# reindex (rather than [[...]]) keeps this working when `results` is empty.
coverage_matrix = (
    coverage_df
    .reindex(columns=['service', 'year', 'month', 'status', 'records', 'duration'])
    .sort_values(['service', 'year', 'month'])
)

print("\n MATRIZ DE COBERTURA (primeras 20 filas):")

print(coverage_matrix.head(20).to_string(index=False))

# The output location is configurable; the default keeps the previous path.
csv_path = os.getenv('COVERAGE_CSV_PATH', '/home/jovyan/work/coverage_matrix.csv')
os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
coverage_matrix.to_csv(csv_path, index=False)
print(f"\n✓ Matriz guardada en: {csv_path}")


print("\n RESUMEN VISUAL DE COBERTURA:")

# Follow the configured SERVICES list instead of a hard-coded pair of names.
for service in SERVICES:

    print(f"Servicio: {service.upper()}")
    print(f"{'='*70}")

    service_data = coverage_matrix[coverage_matrix['service'] == service]

    if service_data.empty:
        # Nothing to pivot for a service with no recorded attempts.
        print("  (sin resultados)")
        continue

    # Year x month grid holding the status recorded for each file
    pivot = service_data.pivot_table(
        index='year',
        columns='month',
        values='status',
        aggfunc='first',
        fill_value='MISSING'
    )

    pivot = pivot.replace({
        'SUCCESS': '✓',
        'NOT_FOUND': '✗',
        'FAILED': '✗',
        'MISSING': '-'
    })

    print(pivot.to_string())

# Coverage statistics
print("\n\n ESTADÍSTICAS DE COBERTURA:")


for service in SERVICES:
    service_data = coverage_matrix[coverage_matrix['service'] == service]

    total_esperados = len(service_data)
    exitosos = len(service_data[service_data['status'] == 'SUCCESS'])
    fallidos = len(service_data[service_data['status'] == 'FAILED'])
    no_encontrados = len(service_data[service_data['status'] == 'NOT_FOUND'])

    # Guard against division by zero for a service with no attempts.
    cobertura_pct = (exitosos / total_esperados * 100) if total_esperados > 0 else 0

    print(f"\n{service.upper()}:")
    print(f"  Total esperados: {total_esperados}")
    print(f"   Exitosos: {exitosos}")
    print(f"   Fallidos: {fallidos}")
    print(f"    No encontrados: {no_encontrados}")
    print(f"   Cobertura: {cobertura_pct:.1f}%")


print(" PROCESO DE INGESTA COMPLETADO!")

print("\n Archivos generados:")
print(f"  - {csv_path}")



 Guardando matriz de cobertura...

 MATRIZ DE COBERTURA (primeras 20 filas):
service  year  month  status  records  duration
  green  2015      1 SUCCESS  1508493 44.034139
  green  2015      2 SUCCESS  1574830 44.752931
  green  2015      3 SUCCESS  1722574 45.236302
  green  2015      4 SUCCESS  1664394 46.644627
  green  2015      5 SUCCESS  1786848 48.034444
  green  2015      6 SUCCESS  1638868 45.511620
  green  2015      7 SUCCESS  1541671 44.064280
  green  2015      8 SUCCESS  1532343 44.355789
  green  2015      9 SUCCESS  1494927 43.068413
  green  2015     10 SUCCESS  1630536 45.179705
  green  2015     11 SUCCESS  1529984 46.048084
  green  2015     12 SUCCESS  1608297 46.589371
  green  2016      1 SUCCESS  1445292 43.957365
  green  2016      2 SUCCESS  1510722 44.334377
  green  2016      3 SUCCESS  1576393 44.731798
  green  2016      4 SUCCESS  1543926 48.838017
  green  2016      5 SUCCESS  1536979 48.410926
  green  2016      6 SUCCESS  1404727 46.896115
  green  2