In [17]:
import os
import requests
import pandas as pd
from datetime import datetime
from io import StringIO
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, trim, upper, coalesce
from pyspark.sql.types import *
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

In [18]:
#Crear SparkSession
spark = SparkSession.builder \
    .appName("NYC_TLC_Enriquecimiento") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

print(f"Spark Version: {spark.version}")
print(f"Spark UI: http://localhost:4040")

Spark Version: 3.5.0
Spark UI: http://localhost:4040


In [19]:
SNOWFLAKE_ACCOUNT = os.getenv("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_USER = os.getenv("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.getenv("SNOWFLAKE_PASSWORD")
SNOWFLAKE_WAREHOUSE = os.getenv("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_DATABASE = os.getenv("SNOWFLAKE_DATABASE")
SNOWFLAKE_ROLE = os.getenv("SNOWFLAKE_ROLE")
SNOWFLAKE_SCHEMA_RAW = os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW")

print(f"\nConfiguración:")
print(f"  - Database: {SNOWFLAKE_DATABASE}")
print(f"  - Schema RAW: {SNOWFLAKE_SCHEMA_RAW}")


Configuración:
  - Database: NYC_TAXI
  - Schema RAW: RAW


In [20]:
SNOWFLAKE_OPTIONS = {
    "sfURL": f"{SNOWFLAKE_ACCOUNT}.snowflakecomputing.com",
    "sfUser": SNOWFLAKE_USER,
    "sfPassword": SNOWFLAKE_PASSWORD,
    "sfDatabase": SNOWFLAKE_DATABASE,
    "sfWarehouse": SNOWFLAKE_WAREHOUSE,
    "sfRole": SNOWFLAKE_ROLE,
}

print(f"\nConfiguración:")
print(f"  - Database: {SNOWFLAKE_DATABASE}")
print(f"  - Schema RAW: {SNOWFLAKE_SCHEMA_RAW}")


Configuración:
  - Database: NYC_TAXI
  - Schema RAW: RAW


In [None]:
def get_snowflake_conn():
    return snowflake.connector.connect(
        user=os.getenv("SNOWFLAKE_USER"),
        password=os.getenv("SNOWFLAKE_PASSWORD"),
        account=os.getenv("SNOWFLAKE_ACCOUNT"),
        warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
        database=os.getenv("SNOWFLAKE_DATABASE"),
        schema=os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW"),
        role=os.getenv("SNOWFLAKE_ROLE"),
        client_session_keep_alive=True,
    )
print("\nProbando conexión a Snowflake...")
try:
    conn = get_snowflake_conn()
    cur = conn.cursor()
    cur.execute("SELECT CURRENT_VERSION()")
    version = cur.fetchone()[0]
    print(f"Conectado a Snowflake versión: {version}")
    cur.close()
    conn.close()
except Exception as e:
    print(f"Error de conexión: {e}")
    raise



Probando conexión a Snowflake...
Conectado a Snowflake versión: 9.37.0


In [22]:
def load_taxi_zones():
    url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
    table_name = "TAXI_ZONES"
    
    print(f"\n{'='*80}")
    print(f" CARGANDO TAXI ZONES")
    print(f"{'='*80}")
    print(f" Descargando desde: {url}")
    
    try:
        # Descargar CSV
        df = pd.read_csv(url)
        print(f" Archivo descargado: {len(df)} filas, {len(df.columns)} columnas")
        print(f"   Columnas: {list(df.columns)}")
        
        # Normalizar nombres de columnas
        df.columns = [col.strip().upper().replace(" ", "_") for col in df.columns]
        
        # Mostrar muestra
        print("\n Muestra de datos:")
        print(df.head())
        
        # Verificar si la tabla ya existe
        conn = get_snowflake_conn()
        cur = conn.cursor()
        
        cur.execute(f"""
            SELECT COUNT(*) 
            FROM {SNOWFLAKE_DATABASE}.INFORMATION_SCHEMA.TABLES 
            WHERE TABLE_SCHEMA = '{SNOWFLAKE_SCHEMA_RAW}'
              AND TABLE_NAME = '{table_name}'
        """)
        table_exists = cur.fetchone()[0] > 0
        
        if table_exists:
            cur.execute(f"SELECT COUNT(*) FROM {SNOWFLAKE_SCHEMA_RAW}.{table_name}")
            existing_rows = cur.fetchone()[0]
            print(f"\nTabla {table_name} ya existe con {existing_rows} filas")
            
            response = input("¿Deseas reemplazarla? (y/n): ").lower()
            if response != 'y':
                print(" Carga omitida")
                cur.close()
                conn.close()
                return
            else:
                cur.execute(f"TRUNCATE TABLE {SNOWFLAKE_SCHEMA_RAW}.{table_name}")
                print(f"Tabla truncada")
        
        # Escribir a Snowflake
        success, nchunks, nrows, _ = write_pandas(
            conn,
            df,
            table_name=table_name,
            auto_create_table=True,
            overwrite=False,
            quote_identifiers=True,
        )
        
        if success:
            print(f"{nrows} filas exportadas a {table_name} ({nchunks} chunk(s))")
        else:
            print(f"Error en exportación")
        
        cur.close()
        conn.close()
        
    except Exception as e:
        print(f"Error cargando Taxi Zones: {e}")
        raise

# Ejecutar carga
load_taxi_zones()


 CARGANDO TAXI ZONES
 Descargando desde: https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
 Archivo descargado: 265 filas, 4 columnas
   Columnas: ['LocationID', 'Borough', 'Zone', 'service_zone']

 Muestra de datos:
   LOCATIONID        BOROUGH                     ZONE SERVICE_ZONE
0           1            EWR           Newark Airport          EWR
1           2         Queens              Jamaica Bay    Boro Zone
2           3          Bronx  Allerton/Pelham Gardens    Boro Zone
3           4      Manhattan            Alphabet City  Yellow Zone
4           5  Staten Island            Arden Heights    Boro Zone

Tabla TAXI_ZONES ya existe con 265 filas


¿Deseas reemplazarla? (y/n):  y


Tabla truncada
265 filas exportadas a TAXI_ZONES (1 chunk(s))


In [24]:
def create_payment_type_catalog():
    table_name = "DIM_PAYMENT_TYPE"
    
    print(f"\n{'='*80}")
    print(f"CREANDO CATÁLOGO: PAYMENT TYPE")
    print(f"{'='*80}")
    
    payment_types = pd.DataFrame({
        'PAYMENT_TYPE_ID': [0, 1, 2, 3, 4, 5, 6],
        'PAYMENT_TYPE_DESC': [
            'Flex Fare trip',
            'Credit card',
            'Cash',
            'No charge',
            'Dispute',
            'Unknown',
            'Voided trip'
        ]
    })
    
    print(payment_types)
    
    conn = get_snowflake_conn()
    cur = conn.cursor()
    
    # Crear tabla si no existe
    cur.execute(f"""
        CREATE TABLE IF NOT EXISTS {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.{table_name} (
            PAYMENT_TYPE_ID NUMBER PRIMARY KEY,
            PAYMENT_TYPE_DESC VARCHAR(50)
        )
    """)
    
    # Truncar y cargar
    cur.execute(f"TRUNCATE TABLE {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.{table_name}")
    
    success, nchunks, nrows, _ = write_pandas(
        conn,
        payment_types,
        table_name=table_name,
        auto_create_table=False,
        overwrite=False,
        quote_identifiers=True,
    )
    
    print(f"{nrows} tipos de pago cargados")
    
    cur.close()
    conn.close()

create_payment_type_catalog()



CREANDO CATÁLOGO: PAYMENT TYPE
   PAYMENT_TYPE_ID PAYMENT_TYPE_DESC
0                0    Flex Fare trip
1                1       Credit card
2                2              Cash
3                3         No charge
4                4           Dispute
5                5           Unknown
6                6       Voided trip
7 tipos de pago cargados


In [25]:
def create_rate_code_catalog():
    table_name = "DIM_RATE_CODE"
    
    print(f"\n{'='*80}")
    print(f"CREANDO CATÁLOGO: RATE CODE")
    print(f"{'='*80}")
    
    rate_codes = pd.DataFrame({
        'RATE_CODE_ID': [1, 2, 3, 4, 5, 6, 99],
        'RATE_CODE_DESC': [
            'Standard rate',
            'JFK',
            'Newark',
            'Nassau or Westchester',
            'Negotiated fare',
            'Group ride',
            'Unknown'
        ]
    })
    
    print(rate_codes)
    
    conn = get_snowflake_conn()
    cur = conn.cursor()
    
    # Crear tabla si no existe
    cur.execute(f"""
        CREATE TABLE IF NOT EXISTS {SNOWFLAKE_SCHEMA_RAW}.{table_name} (
            RATE_CODE_ID NUMBER PRIMARY KEY,
            RATE_CODE_DESC VARCHAR(50)
        )
    """)
    
    # Truncar y cargar
    cur.execute(f"TRUNCATE TABLE {SNOWFLAKE_SCHEMA_RAW}.{table_name}")
    
    success, nchunks, nrows, _ = write_pandas(
        conn,
        rate_codes,
        table_name=table_name,
        auto_create_table=False,
        overwrite=False,
        quote_identifiers=True,
    )
    
    print(f"{nrows} rate codes cargados")
    
    cur.close()
    conn.close()

create_rate_code_catalog()


CREANDO CATÁLOGO: RATE CODE
   RATE_CODE_ID         RATE_CODE_DESC
0             1          Standard rate
1             2                    JFK
2             3                 Newark
3             4  Nassau or Westchester
4             5        Negotiated fare
5             6             Group ride
6            99                Unknown
7 rate codes cargados


In [11]:
def create_vendor_catalog():
    table_name = "DIM_VENDOR"
    
    print(f"\n{'='*80}")
    print(f"CREANDO CATÁLOGO: VENDOR")
    print(f"{'='*80}")
    
    vendors = pd.DataFrame({
        'VENDOR_ID': [1, 2, 6, 7],
        'VENDOR_NAME': [
            'Creative Mobile Technologies, LLC',
            'Curb Mobility, LLC',
            'Myle Technologies Inc',
            'Helix'
        ]
    })
    
    print(vendors)
    
    conn = get_snowflake_conn()
    cur = conn.cursor()
    
    # Crear tabla si no existe
    cur.execute(f"""
        CREATE TABLE IF NOT EXISTS {SNOWFLAKE_SCHEMA_RAW}.{table_name} (
            VENDOR_ID NUMBER PRIMARY KEY,
            VENDOR_NAME VARCHAR(100)
        )
    """)
    
    # Truncar y cargar
    cur.execute(f"TRUNCATE TABLE {SNOWFLAKE_SCHEMA_RAW}.{table_name}")
    
    success, nchunks, nrows, _ = write_pandas(
        conn,
        vendors,
        table_name=table_name,
        auto_create_table=False,
        overwrite=False,
        quote_identifiers=True,
    )
    
    print(f"{nrows} vendors cargados")
    
    cur.close()
    conn.close()

create_vendor_catalog()


CREANDO CATÁLOGO: VENDOR
   VENDOR_ID                        VENDOR_NAME
0          1  Creative Mobile Technologies, LLC
1          2                 Curb Mobility, LLC
2          6              Myle Technologies Inc
3          7                              Helix
4 vendors cargados


In [26]:
def create_trip_type_catalog():

    table_name = "DIM_TRIP_TYPE"
    
    print(f"\n{'='*80}")
    print(f"CREANDO CATÁLOGO: TRIP TYPE (Green)")
    print(f"{'='*80}")
    
    trip_types = pd.DataFrame({
        'TRIP_TYPE_ID': [1, 2],
        'TRIP_TYPE_DESC': [
            'Street-hail',
            'Dispatch'
        ]
    })
    
    print(trip_types)
    
    conn = get_snowflake_conn()
    cur = conn.cursor()
    
    # Crear tabla si no existe
    cur.execute(f"""
        CREATE TABLE IF NOT EXISTS {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.{table_name} (
            TRIP_TYPE_ID NUMBER PRIMARY KEY,
            TRIP_TYPE_DESC VARCHAR(50)
        )
    """)
    
    # Truncar y cargar
    cur.execute(f"TRUNCATE TABLE {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.{table_name}")
    
    success, nchunks, nrows, _ = write_pandas(
        conn,
        trip_types,
        table_name=table_name,
        auto_create_table=False,
        overwrite=False,
        quote_identifiers=True,
    )
    
    print(f"{nrows} trip types cargados")
    
    cur.close()
    conn.close()

create_trip_type_catalog()


CREANDO CATÁLOGO: TRIP TYPE (Green)
   TRIP_TYPE_ID TRIP_TYPE_DESC
0             1    Street-hail
1             2       Dispatch
2 trip types cargados


In [13]:
def validate_location_ids():
    print(f"\n{'='*80}")
    print(f"VALIDANDO INTEGRIDAD REFERENCIAL")
    print(f"{'='*80}")
    
    conn = get_snowflake_conn()
    cur = conn.cursor()
    
    # Validar Yellow
    print("\nValidando YELLOW_TAXIS...")
    try:
        cur.execute(f"""
            SELECT COUNT(DISTINCT y.PULOCATIONID) as missing_pu
            FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.YELLOW_TAXIS y
            LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz 
                ON y.PULOCATIONID = tz.LOCATIONID
            WHERE tz.LOCATIONID IS NULL 
              AND y.PULOCATIONID IS NOT NULL
        """)
        missing_pu = cur.fetchone()[0]
        
        cur.execute(f"""
            SELECT COUNT(DISTINCT y.DOLOCATIONID) as missing_do
            FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.YELLOW_TAXIS y
            LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz 
                ON y.DOLOCATIONID = tz.LOCATIONID
            WHERE tz.LOCATIONID IS NULL 
              AND y.DOLOCATIONID IS NOT NULL
        """)
        missing_do = cur.fetchone()[0]
        
        if missing_pu > 0 or missing_do > 0:
            print(f"PU LocationIDs sin match: {missing_pu}")
            print(f"DO LocationIDs sin match: {missing_do}")
        else:
            print(f" Todos los LocationIDs tienen match")
    except Exception as e:
        print(f" No se pudo validar: {e}")
    
    # Validar Green
    print("\nValidando GREEN_TAXIS...")
    try:
        cur.execute(f"""
            SELECT COUNT(DISTINCT g.PULOCATIONID) as missing_pu
            FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.GREEN_TAXIS g
            LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz 
                ON g.PULOCATIONID = tz.LOCATIONID
            WHERE tz.LOCATIONID IS NULL 
              AND g.PULOCATIONID IS NOT NULL
        """)
        missing_pu = cur.fetchone()[0]
        
        cur.execute(f"""
            SELECT COUNT(DISTINCT g.DOLOCATIONID) as missing_do
            FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.GREEN_TAXIS g
            LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz 
                ON g.DOLOCATIONID = tz.LOCATIONID
            WHERE tz.LOCATIONID IS NULL 
              AND g.DOLOCATIONID IS NOT NULL
        """)
        missing_do = cur.fetchone()[0]
        
        if missing_pu > 0 or missing_do > 0:
            print(f"  PU LocationIDs sin match: {missing_pu}")
            print(f" DO LocationIDs sin match: {missing_do}")
        else:
            print(f" Todos los LocationIDs tienen match")
    except Exception as e:
        print(f"  No se pudo validar: {e}")
    
    cur.close()
    conn.close()

validate_location_ids()


VALIDANDO INTEGRIDAD REFERENCIAL

Validando YELLOW_TAXIS...
 Todos los LocationIDs tienen match

Validando GREEN_TAXIS...
 Todos los LocationIDs tienen match


In [27]:
def show_catalog_summary():
    print(f"\n{'='*80}")
    print(f"RESUMEN DE CATÁLOGOS")
    print(f"{'='*80}")
    
    conn = get_snowflake_conn()
    cur = conn.cursor()
    
    catalogs = [
        "TAXI_ZONES",
        "DIM_PAYMENT_TYPE",
        "DIM_RATE_CODE",
        "DIM_VENDOR",
        "DIM_TRIP_TYPE"
    ]
    
    for catalog in catalogs:
        try:
            cur.execute(f"SELECT COUNT(*) FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.{catalog}")
            count = cur.fetchone()[0]
            print(f" {catalog:25} {count:>5} registros")
        except Exception as e:
            print(f" {catalog:25} Error: {e}")
    
    cur.close()
    conn.close()

show_catalog_summary()

print(f"\n{'='*80}")
print(f"TABLAS EN SCHEMA {SNOWFLAKE_SCHEMA_RAW}")
print(f"{'='*80}")

conn = get_snowflake_conn()
cur = conn.cursor()

cur.execute(f"""
    SELECT TABLE_NAME, ROW_COUNT 
    FROM {SNOWFLAKE_DATABASE}.INFORMATION_SCHEMA.TABLES 
    WHERE TABLE_SCHEMA = '{SNOWFLAKE_SCHEMA_RAW}'
    ORDER BY TABLE_NAME
""")

tables = cur.fetchall()
for table in tables:
    print(f" {table[0]:30} {table[1]:>10} filas")

cur.close()
conn.close()


RESUMEN DE CATÁLOGOS
 TAXI_ZONES                  265 registros
 DIM_PAYMENT_TYPE              7 registros
 DIM_RATE_CODE                 7 registros
 DIM_VENDOR                    4 registros
 DIM_TRIP_TYPE                 2 registros

TABLAS EN SCHEMA RAW
 AUDIT_GREEN                            48 filas
 AUDIT_YELLOW                           49 filas
 DIM_PAYMENT_TYPE                        7 filas
 DIM_RATE_CODE                           7 filas
 DIM_TRIP_TYPE                           2 filas
 DIM_VENDOR                              4 filas
 GREEN_TAXIS                      56256083 filas
 TAXI_ZONES                            265 filas
 UNIFIED_TRIPS                   312790342 filas
 YELLOW_TAXIS                    493542750 filas


In [28]:
print(f"\n{'='*80}")
print(f"UNIFICANDO YELLOW Y GREEN")
print(f"{'='*80}")

conn = get_snowflake_conn()
cur = conn.cursor()

print("\nCreando tabla UNIFIED_TRIPS...")

# Crear tabla unificada con SQL
create_unified_sql = f"""
CREATE OR REPLACE TABLE {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.UNIFIED_TRIPSV2 AS

-- Yellow Taxis estandarizado
SELECT 
    VENDORID AS VENDOR_ID,
    TPEP_PICKUP_DATETIME AS PICKUP_DATETIME,
    TPEP_DROPOFF_DATETIME AS DROPOFF_DATETIME,
    PASSENGER_COUNT,
    TRIP_DISTANCE,
    RATECODEID AS RATE_CODE_ID,
    STORE_AND_FWD_FLAG,
    PULOCATIONID AS PU_LOCATION_ID,
    DOLOCATIONID AS DO_LOCATION_ID,
    PAYMENT_TYPE,
    FARE_AMOUNT,
    EXTRA,
    MTA_TAX,
    TIP_AMOUNT,
    TOLLS_AMOUNT,
    IMPROVEMENT_SURCHARGE,
    TOTAL_AMOUNT,
    CONGESTION_SURCHARGE,
    AIRPORT_FEE,
    COALESCE(CBD_CONGESTION_FEE, 0.0) AS CBD_CONGESTION_FEE,
    NULL AS EHAIL_FEE,
    NULL AS TRIP_TYPE,
    RUN_ID,
    INGESTED_AT_UTC,
    SOURCE_YEAR,
    SOURCE_MONTH,
    'yellow' AS SERVICE_TYPE,
    -- Enriquecimiento con zonas PU
    tz_pu.BOROUGH AS PU_BOROUGH,
    tz_pu.ZONE AS PU_ZONE,
    tz_pu.SERVICE_ZONE AS PU_SERVICE_ZONE,
    -- Enriquecimiento con zonas DO
    tz_do.BOROUGH AS DO_BOROUGH,
    tz_do.ZONE AS DO_ZONE,
    tz_do.SERVICE_ZONE AS DO_SERVICE_ZONE,
    -- Enriquecimiento con catálogos
    pt.PAYMENT_TYPE_DESC,
    rc.RATE_CODE_DESC,
    v.VENDOR_NAME,
    NULL AS TRIP_TYPE_DESC
FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.YELLOW_TAXIS
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz_pu 
    ON PULOCATIONID = tz_pu.LOCATIONID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz_do 
    ON DOLOCATIONID = tz_do.LOCATIONID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_PAYMENT_TYPE pt 
    ON PAYMENT_TYPE = pt.PAYMENT_TYPE_ID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_RATE_CODE rc 
    ON RATECODEID = rc.RATE_CODE_ID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_VENDOR v 
    ON VENDORID = v.VENDOR_ID

UNION ALL

-- Green Taxis estandarizado
SELECT 
    VENDORID AS VENDOR_ID,
    LPEP_PICKUP_DATETIME AS PICKUP_DATETIME,
    LPEP_DROPOFF_DATETIME AS DROPOFF_DATETIME,
    PASSENGER_COUNT,
    TRIP_DISTANCE,
    RATECODEID AS RATE_CODE_ID,
    STORE_AND_FWD_FLAG,
    PULOCATIONID AS PU_LOCATION_ID,
    DOLOCATIONID AS DO_LOCATION_ID,
    PAYMENT_TYPE,
    FARE_AMOUNT,
    EXTRA,
    MTA_TAX,
    TIP_AMOUNT,
    TOLLS_AMOUNT,
    IMPROVEMENT_SURCHARGE,
    TOTAL_AMOUNT,
    CONGESTION_SURCHARGE,
    AIRPORT_FEE,
    COALESCE(CBD_CONGESTION_FEE, 0.0) AS CBD_CONGESTION_FEE,
    EHAIL_FEE,
    TRIP_TYPE,
    RUN_ID,
    INGESTED_AT_UTC,
    SOURCE_YEAR,
    SOURCE_MONTH,
    'green' AS SERVICE_TYPE,
    -- Enriquecimiento con zonas PU
    tz_pu.BOROUGH AS PU_BOROUGH,
    tz_pu.ZONE AS PU_ZONE,
    tz_pu.SERVICE_ZONE AS PU_SERVICE_ZONE,
    -- Enriquecimiento con zonas DO
    tz_do.BOROUGH AS DO_BOROUGH,
    tz_do.ZONE AS DO_ZONE,
    tz_do.SERVICE_ZONE AS DO_SERVICE_ZONE,
    -- Enriquecimiento con catálogos
    pt.PAYMENT_TYPE_DESC,
    rc.RATE_CODE_DESC,
    v.VENDOR_NAME,
    tt.TRIP_TYPE_DESC
FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.GREEN_TAXIS
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz_pu 
    ON PULOCATIONID = tz_pu.LOCATIONID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.TAXI_ZONES tz_do 
    ON DOLOCATIONID = tz_do.LOCATIONID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_PAYMENT_TYPE pt 
    ON PAYMENT_TYPE = pt.PAYMENT_TYPE_ID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_RATE_CODE rc 
    ON RATECODEID = rc.RATE_CODE_ID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_VENDOR v 
    ON VENDORID = v.VENDOR_ID
LEFT JOIN {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.DIM_TRIP_TYPE tt 
    ON TRIP_TYPE = tt.TRIP_TYPE_ID
"""

print("Ejecutando UNION ALL con enriquecimiento (esto puede tomar varios minutos)...")
cur.execute(create_unified_sql)
print(" Tabla UNIFIED_TRIPS creada")


UNIFICANDO YELLOW Y GREEN

Creando tabla UNIFIED_TRIPS...
Ejecutando UNION ALL con enriquecimiento (esto puede tomar varios minutos)...
 Tabla UNIFIED_TRIPS creada


In [16]:
print("\nVerificando tabla UNIFIED_TRIPS...")
conn = get_snowflake_conn()
cur = conn.cursor()

cur.execute(f"SELECT COUNT(*) FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.UNIFIED_TRIPSV2")
count = cur.fetchone()[0]
print(f" Total registros en Snowflake: {count:,}")

cur.execute(f"""
    SELECT SERVICE_TYPE, COUNT(*) as cnt
    FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.UNIFIED_TRIPSV2
    GROUP BY SERVICE_TYPE
""")
print("\n Por servicio:")
for row in cur.fetchall():
    print(f"    {row[0]:10} {row[1]:>15,} registros")

cur.close()
conn.close()
print(f"\n{'='*80}")
print(f"RESUMEN DE CATÁLOGOS")
print(f"{'='*80}")
    
conn = get_snowflake_conn()
cur = conn.cursor()
    
catalogs = [
        "TAXI_ZONES",
        "DIM_PAYMENT_TYPE",
        "DIM_RATE_CODE",
        "DIM_VENDOR",
        "DIM_TRIP_TYPE"
    ]
    
for catalog in catalogs:
    try:
        cur.execute(f"SELECT COUNT(*) FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA_RAW}.{catalog}")
        count = cur.fetchone()[0]
        print(f" {catalog:25} {count:>5} registros")
    except Exception as e:
        print(f"  {catalog:25} Error: {e}")
    
cur.close()
conn.close()

show_catalog_summary()



Verificando tabla UNIFIED_TRIPS...
 Total registros en Snowflake: 312,790,342

 Por servicio:
    green           35,619,306 registros
    yellow         277,171,036 registros

RESUMEN DE CATÁLOGOS
 TAXI_ZONES                  265 registros
 DIM_PAYMENT_TYPE              7 registros
 DIM_RATE_CODE                 7 registros
 DIM_VENDOR                    4 registros
 DIM_TRIP_TYPE                 2 registros

RESUMEN DE CATÁLOGOS
 TAXI_ZONES                  265 registros
 DIM_PAYMENT_TYPE              7 registros
 DIM_RATE_CODE                 7 registros
 DIM_VENDOR                    4 registros
 DIM_TRIP_TYPE                 2 registros
