In [1]:
# Standard library
import os
import sys
from datetime import datetime
# PySpark. NOTE: the two star imports below bind every public name of
# pyspark.sql.functions / pyspark.sql.types at notebook scope. That shadows
# Python built-ins such as `sum`, `min`, `max` and `round`, and Spark helpers
# such as `count` are easy to overwrite by accident with a local variable.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# HTTP download helpers
import requests
import urllib.request
# NOTE(review): pandas and snowflake.connector (like sys and datetime above)
# are not referenced in the cells visible here — confirm before pruning.
import pandas as pd
import snowflake.connector

# Marker so the cell output shows that every import resolved without error.
print(" Imports completados")

 Imports completados


In [2]:

# Snowflake connection settings, read from the process environment.
# Entries without a fallback evaluate to None when the variable is not set.
SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")

# Entries with a fallback used when the variable is not set.
# NOTE(review): the role falls back to ACCOUNTADMIN — confirm that such a
# broadly privileged default is intended for this job.
SNOWFLAKE_ROLE = os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN")
SNOWFLAKE_SCHEMA_RAW = os.environ.get("SNOWFLAKE_SCHEMA_RAW", "RAW")
SNOWFLAKE_SCHEMA_ANALYTICS = os.environ.get("SNOWFLAKE_SCHEMA_ANALYTICS", "ANALYTICS")

# Location of the taxi zone lookup CSV downloaded later in the notebook.
TAXI_ZONE_URL = "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv"

# Echo the non-secret part of the configuration (user and password are
# deliberately not printed).
print("CONFIGURACIÓN DE ENRIQUECIMIENTO")
print(
    f"Base de datos: {SNOWFLAKE_DATABASE}",
    f"Schema RAW: {SNOWFLAKE_SCHEMA_RAW}",
    f"Schema ANALYTICS: {SNOWFLAKE_SCHEMA_ANALYTICS}",
    f"Taxi Zone URL: {TAXI_ZONE_URL}",
    sep="\n",
)


CONFIGURACIÓN DE ENRIQUECIMIENTO
Base de datos: NYC_TLC_P03
Schema RAW: RAW
Schema ANALYTICS: ANALYTICS
Taxi Zone URL: https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv


In [3]:
print("\n Inicializando Spark...")

# Configure the session builder first, then create (or reuse) the session.
# `spark.jars.packages` lists the Snowflake Spark connector and the Snowflake
# JDBC driver as Maven coordinates.
# NOTE(review): the connector artifact is tagged `spark_3.3`, while the recorded
# output of this cell reports Spark 3.5.0 — confirm the pairing is supported.
session_builder = SparkSession.builder.appName("NYC_TLC_Enriquecimiento")
session_builder = session_builder.config(
    "spark.jars.packages",
    "net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3,"
    "net.snowflake:snowflake-jdbc:3.13.30",
)
spark = session_builder.getOrCreate()

# Only warnings and errors from Spark are logged from here on.
spark.sparkContext.setLogLevel("WARN")

print(f" Spark {spark.version} inicializado")


 Inicializando Spark...
 Spark 3.5.0 inicializado


In [5]:
# Build the option dict passed to the Snowflake Spark connector.
#
# The settings come from environment variables and are None when unset. Without
# a check, a missing account would silently yield the URL
# "None.snowflakecomputing.com" and the failure would only surface later as an
# opaque connection error. Fail fast here with a message naming what is missing.
_required_settings = {
    "SNOWFLAKE_ACCOUNT": SNOWFLAKE_ACCOUNT,
    "SNOWFLAKE_USER": SNOWFLAKE_USER,
    "SNOWFLAKE_PASSWORD": SNOWFLAKE_PASSWORD,
    "SNOWFLAKE_DATABASE": SNOWFLAKE_DATABASE,
}
_missing_settings = [
    setting_name
    for setting_name, setting_value in _required_settings.items()
    if not setting_value
]
if _missing_settings:
    raise ValueError(
        "Faltan variables de entorno requeridas para Snowflake: "
        + ", ".join(_missing_settings)
    )

sfOptions = {
    "sfURL": f"{SNOWFLAKE_ACCOUNT}.snowflakecomputing.com",
    "sfUser": SNOWFLAKE_USER,
    "sfPassword": SNOWFLAKE_PASSWORD,
    "sfDatabase": SNOWFLAKE_DATABASE,
    "sfSchema": SNOWFLAKE_SCHEMA_ANALYTICS,
    "sfRole": SNOWFLAKE_ROLE,
}

# The warehouse is only forwarded when configured, so a None value is never
# handed to the connector as an option.
if SNOWFLAKE_WAREHOUSE:
    sfOptions["sfWarehouse"] = SNOWFLAKE_WAREHOUSE

print("Configuración de Snowflake establecida")

Configuración de Snowflake establecida


In [9]:


print(" DESCARGANDO TAXI ZONE LOOKUP")

# Download the lookup CSV to a local temporary file.
# The request carries an explicit timeout so an unresponsive server cannot hang
# the notebook indefinitely. The body is fully received and checked before
# anything is written, so a failed download never leaves a partial file behind.
tmp_zones_path = "/tmp/taxi_zone_lookup.csv"
print(f"  Descargando: {TAXI_ZONE_URL}")
zones_response = requests.get(TAXI_ZONE_URL, timeout=60)
zones_response.raise_for_status()  # surface HTTP 4xx/5xx as an exception
if not zones_response.content:
    raise ValueError(f"Descarga vacía desde {TAXI_ZONE_URL}")
with open(tmp_zones_path, "wb") as zones_file:
    zones_file.write(zones_response.content)
print(f" Descargado a: {tmp_zones_path}")

# Read the CSV with Spark; the first line is the header and column types are
# inferred from the data.
print(" Leyendo CSV con Spark...")
taxi_zones_df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(tmp_zones_path)

print(f" Registros leídos: {taxi_zones_df.count()}")

# Show the inferred schema and a sample of rows.
print("\nSchema:")
taxi_zones_df.printSchema()

print("\nMuestra de datos:")
taxi_zones_df.show(10, truncate=False)

# Do NOT delete the temporary file yet: the DataFrame is evaluated lazily, so
# Spark still needs the file when the data is written to Snowflake.
print("  Archivo temporal mantiene para escritura a Snowflake")

 DESCARGANDO TAXI ZONE LOOKUP
  Descargando: https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv
 Descargado a: /tmp/taxi_zone_lookup.csv
 Leyendo CSV con Spark...
 Registros leídos: 265

Schema:
root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)


Muestra de datos:
+----------+-------------+-----------------------+------------+
|LocationID|Borough      |Zone                   |service_zone|
+----------+-------------+-----------------------+------------+
|1         |EWR          |Newark Airport         |EWR         |
|2         |Queens       |Jamaica Bay            |Boro Zone   |
|3         |Bronx        |Allerton/Pelham Gardens|Boro Zone   |
|4         |Manhattan    |Alphabet City          |Yellow Zone |
|5         |Staten Island|Arden Heights          |Boro Zone   |
|6         |Staten Island|Arrochar/Fort Wadsworth|Boro Zone   |
|7         |Queens       |As

In [10]:

print(" CARGANDO TAXI ZONES A SNOWFLAKE")

# Connector options targeting the ANALYTICS schema (copy, so the base options
# dict is left untouched).
sfOptions_analytics = sfOptions.copy()
sfOptions_analytics["sfSchema"] = SNOWFLAKE_SCHEMA_ANALYTICS

print(f" Escribiendo a {SNOWFLAKE_SCHEMA_ANALYTICS}.TAXI_ZONES...")

# Cache the DataFrame before writing. cache() is lazy, so the count() below
# both materialises the cache and records how many rows we expect to land.
taxi_zones_df.cache()
source_zones_count = taxi_zones_df.count()

# Write to Snowflake; "overwrite" mode replaces any existing table contents.
taxi_zones_df.write \
    .format("snowflake") \
    .options(**sfOptions_analytics) \
    .option("dbtable", "TAXI_ZONES") \
    .mode("overwrite") \
    .save()

print("Taxi Zones cargados exitosamente")

# Verify: read the table back and compare against the source row count, so a
# partial or empty load fails loudly instead of only being printed.
zones_count = spark.read \
    .format("snowflake") \
    .options(**sfOptions_analytics) \
    .option("dbtable", "TAXI_ZONES") \
    .load() \
    .count()

if zones_count != source_zones_count:
    raise RuntimeError(
        f"ANALYTICS.TAXI_ZONES tiene {zones_count} filas, "
        f"se esperaban {source_zones_count}"
    )

print(f" Verificado: {zones_count} zonas en ANALYTICS.TAXI_ZONES")

# The data is now in Snowflake, so the temporary file can be removed.
if os.path.exists(tmp_zones_path):
    os.remove(tmp_zones_path)
    print("  Archivo temporal eliminado")

# Release the cache. The trailing semicolon stops the notebook from echoing
# the DataFrame repr that unpersist() returns.
taxi_zones_df.unpersist();

 CARGANDO TAXI ZONES A SNOWFLAKE
 Escribiendo a ANALYTICS.TAXI_ZONES...
Taxi Zones cargados exitosamente
 Verificado: 265 zonas en ANALYTICS.TAXI_ZONES
  Archivo temporal eliminado


DataFrame[LocationID: int, Borough: string, Zone: string, service_zone: string]

In [11]:
print(" CREANDO CATÁLOGO: PAYMENT_TYPE")

# Payment-type catalogue as (id, description) pairs, per the NYC TLC docs.
# NOTE(review): newer TLC data dictionaries reportedly also list code 0 —
# confirm against the distinct payment_type values present in the trips data.
payment_types_data = list({
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided trip",
}.items())

payment_types_df = spark.createDataFrame(
    data=payment_types_data,
    schema=["payment_type_id", "payment_type_desc"],
)

print("Catálogo Payment Type:")
payment_types_df.show(truncate=False)

# Persist the catalogue to Snowflake, replacing any previous version.
print(f" Escribiendo a {SNOWFLAKE_SCHEMA_ANALYTICS}.PAYMENT_TYPE_LOOKUP...")

payment_type_writer = (
    payment_types_df.write
    .format("snowflake")
    .options(**sfOptions_analytics)
    .option("dbtable", "PAYMENT_TYPE_LOOKUP")
    .mode("overwrite")
)
payment_type_writer.save()

print(" Catálogo Payment Type cargado")

 CREANDO CATÁLOGO: PAYMENT_TYPE
Catálogo Payment Type:
+---------------+-----------------+
|payment_type_id|payment_type_desc|
+---------------+-----------------+
|1              |Credit card      |
|2              |Cash             |
|3              |No charge        |
|4              |Dispute          |
|5              |Unknown          |
|6              |Voided trip      |
+---------------+-----------------+

 Escribiendo a ANALYTICS.PAYMENT_TYPE_LOOKUP...
 Catálogo Payment Type cargado


In [12]:
print(" CREANDO CATÁLOGO: RATE_CODE")

# Rate-code catalogue as (id, description) pairs, per the NYC TLC docs.
# NOTE(review): trip files are known to carry an "unknown" rate code (99) in
# some periods — confirm whether it needs an entry here.
rate_code_data = list({
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
}.items())

rate_code_df = spark.createDataFrame(
    data=rate_code_data,
    schema=["rate_code_id", "rate_code_desc"],
)

print(" Catálogo Rate Code:")
rate_code_df.show(truncate=False)

# Persist the catalogue to Snowflake, replacing any previous version.
print(f" Escribiendo a {SNOWFLAKE_SCHEMA_ANALYTICS}.RATE_CODE_LOOKUP...")

rate_code_writer = (
    rate_code_df.write
    .format("snowflake")
    .options(**sfOptions_analytics)
    .option("dbtable", "RATE_CODE_LOOKUP")
    .mode("overwrite")
)
rate_code_writer.save()

print(" Catálogo Rate Code cargado")

 CREANDO CATÁLOGO: RATE_CODE
 Catálogo Rate Code:
+------------+---------------------+
|rate_code_id|rate_code_desc       |
+------------+---------------------+
|1           |Standard rate        |
|2           |JFK                  |
|3           |Newark               |
|4           |Nassau or Westchester|
|5           |Negotiated fare      |
|6           |Group ride           |
+------------+---------------------+

 Escribiendo a ANALYTICS.RATE_CODE_LOOKUP...
 Catálogo Rate Code cargado


In [13]:
print(" CREANDO CATÁLOGO: VENDOR")

# Vendor catalogue as (id, name) pairs, per the NYC TLC docs.
# NOTE(review): later TLC data dictionaries may list additional or renamed
# vendors — confirm against the vendor IDs present in the trips data.
vendor_data = list({
    1: "Creative Mobile Technologies, LLC",
    2: "VeriFone Inc.",
}.items())

vendor_df = spark.createDataFrame(
    data=vendor_data,
    schema=["vendor_id", "vendor_name"],
)

print(" Catálogo Vendor:")
vendor_df.show(truncate=False)

# Persist the catalogue to Snowflake, replacing any previous version.
print(f" Escribiendo a {SNOWFLAKE_SCHEMA_ANALYTICS}.VENDOR_LOOKUP...")

vendor_writer = (
    vendor_df.write
    .format("snowflake")
    .options(**sfOptions_analytics)
    .option("dbtable", "VENDOR_LOOKUP")
    .mode("overwrite")
)
vendor_writer.save()

print(" Catálogo Vendor cargado")

 CREANDO CATÁLOGO: VENDOR
 Catálogo Vendor:
+---------+---------------------------------+
|vendor_id|vendor_name                      |
+---------+---------------------------------+
|1        |Creative Mobile Technologies, LLC|
|2        |VeriFone Inc.                    |
+---------+---------------------------------+

 Escribiendo a ANALYTICS.VENDOR_LOOKUP...
 Catálogo Vendor cargado


In [14]:
print(" CREANDO CATÁLOGO: TRIP_TYPE (Green Taxi)")

# Trip-type catalogue as (id, description) pairs, per the NYC TLC docs.
trip_type_data = list({
    1: "Street-hail",
    2: "Dispatch",
}.items())

trip_type_df = spark.createDataFrame(
    data=trip_type_data,
    schema=["trip_type_id", "trip_type_desc"],
)

print(" Catálogo Trip Type:")
trip_type_df.show(truncate=False)

# Persist the catalogue to Snowflake, replacing any previous version.
print(f" Escribiendo a {SNOWFLAKE_SCHEMA_ANALYTICS}.TRIP_TYPE_LOOKUP...")

trip_type_writer = (
    trip_type_df.write
    .format("snowflake")
    .options(**sfOptions_analytics)
    .option("dbtable", "TRIP_TYPE_LOOKUP")
    .mode("overwrite")
)
trip_type_writer.save()

print(" Catálogo Trip Type cargado")

 CREANDO CATÁLOGO: TRIP_TYPE (Green Taxi)
 Catálogo Trip Type:
+------------+--------------+
|trip_type_id|trip_type_desc|
+------------+--------------+
|1           |Street-hail   |
|2           |Dispatch      |
+------------+--------------+

 Escribiendo a ANALYTICS.TRIP_TYPE_LOOKUP...
 Catálogo Trip Type cargado


In [15]:
print(" VERIFICANDO TABLAS DE LOOKUP EN ANALYTICS")

# Lookup tables written by the previous cells.
lookup_tables = [
    "TAXI_ZONES",
    "PAYMENT_TYPE_LOOKUP",
    "RATE_CODE_LOOKUP",
    "VENDOR_LOOKUP",
    "TRIP_TYPE_LOOKUP"
]

# Every table is checked even if an earlier one fails; failures are collected
# so the cell cannot report success when a table is missing or unreadable.
failed_tables = []

for table in lookup_tables:
    try:
        # Table names come from the fixed list above, never from user input,
        # so interpolating them into the query text is safe here.
        count_df = spark.read \
            .format("snowflake") \
            .options(**sfOptions_analytics) \
            .option("query", f"SELECT COUNT(*) as COUNT FROM {table}") \
            .load()

        # Named `row_count`, not `count`: this notebook star-imports
        # pyspark.sql.functions, and rebinding `count` would clobber the Spark
        # aggregate function for every later cell.
        row_count = count_df.collect()[0]['COUNT']
        print(f" {table}: {row_count:,} registros")
    except Exception as e:
        failed_tables.append(table)
        print(f" {table}: Error - {e}")

if failed_tables:
    raise RuntimeError(
        "Verificación fallida para: " + ", ".join(failed_tables)
    )

print("\n Verificación completada")

 VERIFICANDO TABLAS DE LOOKUP EN ANALYTICS
 TAXI_ZONES: 265 registros
 PAYMENT_TYPE_LOOKUP: 6 registros
 RATE_CODE_LOOKUP: 6 registros
 VENDOR_LOOKUP: 2 registros
 TRIP_TYPE_LOOKUP: 2 registros

 Verificación completada


In [17]:
print(" RESUMEN DE ENRIQUECIMIENTO")

# Static recap of the lookup tables this notebook writes: a heading followed by
# one line per table, printed in order.
summary_lines = (
    "\n Tablas creadas en ANALYTICS:",
    "  1. TAXI_ZONES - Mapeo de LocationID a Zone/Borough",
    "  2. PAYMENT_TYPE_LOOKUP - Descripciones de tipos de pago",
    "  3. RATE_CODE_LOOKUP - Descripciones de códigos de tarifa",
    "  4. VENDOR_LOOKUP - Nombres de vendors",
    "  5. TRIP_TYPE_LOOKUP - Tipos de viaje (Green)",
)
for summary_line in summary_lines:
    print(summary_line)





 RESUMEN DE ENRIQUECIMIENTO

 Tablas creadas en ANALYTICS:
  1. TAXI_ZONES - Mapeo de LocationID a Zone/Borough
  2. PAYMENT_TYPE_LOOKUP - Descripciones de tipos de pago
  3. RATE_CODE_LOOKUP - Descripciones de códigos de tarifa
  4. VENDOR_LOOKUP - Nombres de vendors
  5. TRIP_TYPE_LOOKUP - Tipos de viaje (Green)
