In [1]:
import os
import sys
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

print(" Imports completados")

 Imports completados


In [2]:
SNOWFLAKE_ACCOUNT = os.getenv('SNOWFLAKE_ACCOUNT')
SNOWFLAKE_USER = os.getenv('SNOWFLAKE_USER')
SNOWFLAKE_PASSWORD = os.getenv('SNOWFLAKE_PASSWORD')
SNOWFLAKE_ROLE = os.getenv('SNOWFLAKE_ROLE', 'ACCOUNTADMIN')
SNOWFLAKE_DATABASE = os.getenv('SNOWFLAKE_DATABASE')
SNOWFLAKE_WAREHOUSE = os.getenv('SNOWFLAKE_WAREHOUSE')
SNOWFLAKE_SCHEMA_ANALYTICS = os.getenv('SNOWFLAKE_SCHEMA_ANALYTICS', 'ANALYTICS')


print("ANÁLISIS DE DATOS - PREGUNTAS DE NEGOCIO")


ANÁLISIS DE DATOS - PREGUNTAS DE NEGOCIO


In [3]:
print("\n Inicializando Spark...")

spark = SparkSession.builder \
    .appName("NYC_TLC_Data_Analysis") \
    .config("spark.jars.packages", 
            "net.snowflake:spark-snowflake_2.12:2.11.0-spark_3.3,"
            "net.snowflake:snowflake-jdbc:3.13.30") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")
print(f" Spark {spark.version} inicializado")


 Inicializando Spark...
 Spark 3.5.0 inicializado


In [4]:
sfOptions = {
    "sfURL": f"{SNOWFLAKE_ACCOUNT}.snowflakecomputing.com",
    "sfUser": SNOWFLAKE_USER,
    "sfPassword": SNOWFLAKE_PASSWORD,
    "sfDatabase": SNOWFLAKE_DATABASE,
    "sfSchema": SNOWFLAKE_SCHEMA_ANALYTICS,
    "sfWarehouse": SNOWFLAKE_WAREHOUSE,
    "sfRole": SNOWFLAKE_ROLE
}

print(" Configuración establecida")

 Configuración establecida


In [6]:
print("\n Cargando OBT_TRIPS...")

obt = spark.read \
    .format("snowflake") \
    .options(**sfOptions) \
    .option("dbtable", "OBT_TRIPS") \
    .load()

print(f" OBT cargada: {obt.count():,} registros")



 Cargando OBT_TRIPS...
 OBT cargada: 837,099,213 registros


In [7]:
print("PREGUNTA 1: Top 10 zonas de pickup por volumen mensual")

q1 = obt.groupBy("pu_zone", "year", "month").agg(
    count("*").alias("total_viajes")
).orderBy(col("total_viajes").desc())

print("\n Top 10 zonas de pickup con más viajes:")
q1.show(10, truncate=False)




PREGUNTA 1: Top 10 zonas de pickup por volumen mensual

 Top 10 zonas de pickup con más viajes:
+---------------------+----+-----+------------+
|pu_zone              |year|month|total_viajes|
+---------------------+----+-----+------------+
|Upper East Side South|2015|4    |497024      |
|Upper East Side South|2015|5    |483931      |
|Upper East Side South|2015|3    |472809      |
|Upper East Side South|2015|10   |467922      |
|Upper East Side South|2015|1    |466551      |
|Upper East Side South|2016|5    |464800      |
|Midtown Center       |2015|3    |463430      |
|Midtown Center       |2015|4    |460102      |
|Upper East Side North|2015|4    |457471      |
|Midtown Center       |2016|3    |457249      |
+---------------------+----+-----+------------+
only showing top 10 rows



In [8]:
print("PREGUNTA 2: Top 10 zonas de dropoff por volumen mensual")

q2 = obt.groupBy("do_zone", "year", "month").agg(
    count("*").alias("total_viajes")
).orderBy(col("total_viajes").desc())

print("\n Top 10 zonas de dropoff con más viajes:")
q2.show(10, truncate=False)



PREGUNTA 2: Top 10 zonas de dropoff por volumen mensual

 Top 10 zonas de dropoff con más viajes:
+---------------------+----+-----+------------+
|do_zone              |year|month|total_viajes|
+---------------------+----+-----+------------+
|Midtown Center       |2015|3    |506836      |
|Midtown Center       |2015|4    |502304      |
|Midtown Center       |2015|5    |480522      |
|Midtown Center       |2015|6    |475046      |
|Midtown Center       |2015|1    |472899      |
|Upper East Side North|2015|4    |468546      |
|Midtown Center       |2016|3    |467714      |
|Upper East Side North|2015|5    |464654      |
|Midtown Center       |2015|2    |462104      |
|Midtown Center       |2015|10   |458016      |
+---------------------+----+-----+------------+
only showing top 10 rows



In [9]:

print("PREGUNTA 3: Evolución mensual de total_amount y tip_pct por borough")

# Filtrar solo años válidos (2015-2025)
q3 = obt.filter((col("year") >= 2015) & (col("year") <= 2025)) \
    .groupBy("pu_borough", "year", "month").agg(
        round(avg("total_amount"), 2).alias("avg_total_amount"),
        round(avg("tip_pct"), 2).alias("avg_tip_pct"),
        count("*").alias("num_viajes")
    ).orderBy("year", "month", "pu_borough")

print("\n Evolución por borough (últimos 12 meses de datos válidos):")
q3.orderBy(col("year").desc(), col("month").desc()).show(24, truncate=False)



PREGUNTA 3: Evolución mensual de total_amount y tip_pct por borough

 Evolución por borough (últimos 12 meses de datos válidos):
+-------------+----+-----+----------------+-----------+----------+
|pu_borough   |year|month|avg_total_amount|avg_tip_pct|num_viajes|
+-------------+----+-----+----------------+-----------+----------+
|Manhattan    |2025|9    |20.06           |17.26      |16        |
|Queens       |2025|9    |73.74           |11.15      |2         |
|Manhattan    |2025|8    |23.02           |17.44      |2872561   |
|Bronx        |2025|8    |28.75           |1.25       |32721     |
|Unknown      |2025|8    |32.83           |20.28      |5344      |
|Staten Island|2025|8    |45.8            |2.71       |344       |
|Queens       |2025|8    |62.34           |12.71      |399280    |
|Brooklyn     |2025|8    |25.72           |3.08       |171323    |
|EWR          |2025|8    |97.78           |32.74      |548       |
|N/A          |2025|8    |105.22          |20.58      |2114      |


In [10]:
print("PREGUNTA 4: Ticket promedio (avg total_amount) por service_type y mes")



q4 = obt.filter((col("year") >= 2015) & (col("year") <= 2025)) \
    .groupBy("service_type", "year", "month").agg(
        round(avg("total_amount"), 2).alias("ticket_promedio"),
        count("*").alias("num_viajes")
    ).orderBy("year", "month", "service_type")

print("\n Ticket promedio por servicio (últimos 12 meses de datos válidos):")
q4.orderBy(col("year").desc(), col("month").desc()).show(24, truncate=False)




PREGUNTA 4: Ticket promedio (avg total_amount) por service_type y mes

 Ticket promedio por servicio (últimos 12 meses de datos válidos):
+------------+----+-----+---------------+----------+
|service_type|year|month|ticket_promedio|num_viajes|
+------------+----+-----+---------------+----------+
|green       |2025|9    |26.04          |17        |
|yellow      |2025|9    |25.62          |1         |
|yellow      |2025|8    |27.79          |3438133   |
|green       |2025|8    |27.98          |46102     |
|yellow      |2025|7    |28.01          |3766256   |
|green       |2025|7    |25.79          |48017     |
|yellow      |2025|6    |28.36          |4179896   |
|green       |2025|6    |25.96          |48903     |
|yellow      |2025|5    |28.15          |4414719   |
|green       |2025|5    |25.52          |54879     |
|yellow      |2025|4    |27.65          |3862911   |
|green       |2025|4    |24.77          |51629     |
|yellow      |2025|3    |27.18          |4054091   |
|green       |

In [11]:
print("PREGUNTA 5: Viajes por hora del día y día de semana (picos)")

q5_hour = obt.groupBy("pickup_hour").agg(
    count("*").alias("total_viajes")
).orderBy("pickup_hour")

print("\n Distribución por hora del día:")
q5_hour.show(24, truncate=False)

q5_dow = obt.groupBy("day_of_week").agg(
    count("*").alias("total_viajes")
).orderBy(col("total_viajes").desc())

print("\n Distribución por día de la semana:")
q5_dow.show(truncate=False)




PREGUNTA 5: Viajes por hora del día y día de semana (picos)

 Distribución por hora del día:
+-----------+------------+
|pickup_hour|total_viajes|
+-----------+------------+
|0          |27498217    |
|1          |19638327    |
|2          |14012385    |
|3          |10092567    |
|4          |7785011     |
|5          |7783950     |
|6          |16826641    |
|7          |28943795    |
|8          |36708840    |
|9          |38143591    |
|10         |38490034    |
|11         |40361164    |
|12         |42799662    |
|13         |43305907    |
|14         |45740630    |
|15         |45737075    |
|16         |43309881    |
|17         |48859752    |
|18         |54276654    |
|19         |52009802    |
|20         |47371097    |
|21         |46630128    |
|22         |44010436    |
|23         |36763667    |
+-----------+------------+


 Distribución por día de la semana:
+-----------+------------+
|day_of_week|total_viajes|
+-----------+------------+
|Friday     |128124162   |
|Thur

In [12]:
print("PREGUNTA 6: Percentiles 50 y 90 de duración por borough de pickup")

q6 = obt.groupBy("pu_borough").agg(
    expr("percentile_approx(trip_duration_min, 0.5)").alias("p50_duration"),
    expr("percentile_approx(trip_duration_min, 0.9)").alias("p90_duration"),
    round(avg("trip_duration_min"), 2).alias("avg_duration"),
    count("*").alias("num_viajes")
).orderBy("pu_borough")

print("\n Percentiles de duración por borough:")
q6.show(truncate=False)



PREGUNTA 6: Percentiles 50 y 90 de duración por borough de pickup

 Percentiles de duración por borough:
+-------------+------------------+------------------+------------+----------+
|pu_borough   |p50_duration      |p90_duration      |avg_duration|num_viajes|
+-------------+------------------+------------------+------------+----------+
|Bronx        |13.516666666666667|38.016666666666666|22.25       |5235996   |
|Brooklyn     |13.0              |32.85             |21.42       |35006948  |
|EWR          |0.3               |1.7166666666666666|4.87        |72111     |
|Manhattan    |10.833333333333334|25.1              |15.38       |715675898 |
|N/A          |1.1833333333333333|60.0              |20.74       |622691    |
|Queens       |24.45             |54.4              |31.34       |70910644  |
|Staten Island|23.2              |71.0              |34.79       |52825     |
|Unknown      |10.6              |28.183333333333334|15.48       |9522100   |
+-------------+------------------+---

In [13]:
print("PREGUNTA 7: Velocidad promedio por franja horaria y borough")

q7 = obt.filter(col("avg_speed_mph").isNotNull()) \
    .withColumn("franja_horaria", 
        when((col("pickup_hour") >= 6) & (col("pickup_hour") < 9), "Morning Rush (6-9)")
        .when((col("pickup_hour") >= 17) & (col("pickup_hour") < 20), "Evening Rush (17-20)")
        .otherwise("Other")) \
    .groupBy("pu_borough", "franja_horaria").agg(
        round(avg("avg_speed_mph"), 2).alias("avg_speed"),
        count("*").alias("num_viajes")
    ).orderBy("pu_borough", "franja_horaria")

print("\n Velocidad promedio por franja horaria:")
q7.show(20, truncate=False)



PREGUNTA 7: Velocidad promedio por franja horaria y borough

 Velocidad promedio por franja horaria:
+-------------+--------------------+---------+----------+
|pu_borough   |franja_horaria      |avg_speed|num_viajes|
+-------------+--------------------+---------+----------+
|Bronx        |Evening Rush (17-20)|18.76    |772163    |
|Bronx        |Morning Rush (6-9)  |21.36    |732797    |
|Bronx        |Other               |22.62    |3452471   |
|Brooklyn     |Evening Rush (17-20)|14.64    |5734307   |
|Brooklyn     |Morning Rush (6-9)  |16.06    |3098783   |
|Brooklyn     |Other               |17.04    |25526719  |
|EWR          |Evening Rush (17-20)|776.43   |4357      |
|EWR          |Morning Rush (6-9)  |757.18   |2727      |
|EWR          |Other               |853.86   |14134     |
|Manhattan    |Evening Rush (17-20)|10.38    |132534519 |
|Manhattan    |Morning Rush (6-9)  |12.83    |70969056  |
|Manhattan    |Other               |11.81    |507969750 |
|N/A          |Evening Rush (

In [14]:
print("PREGUNTA 8: Participación por tipo de pago y relación con propinas")

q8 = obt.groupBy("payment_type_desc").agg(
    count("*").alias("num_viajes"),
    round(avg("tip_pct"), 2).alias("avg_tip_pct"),
    round(avg("total_amount"), 2).alias("avg_amount")
).orderBy(col("num_viajes").desc())

print("\n Análisis por tipo de pago:")
q8.show(truncate=False)

total_trips = obt.count()
q8_with_pct = q8.withColumn("participacion_pct", 
    round((col("num_viajes") / total_trips * 100), 2))

print("\n Con porcentaje de participación:")
q8_with_pct.show(truncate=False)



PREGUNTA 8: Participación por tipo de pago y relación con propinas

 Análisis por tipo de pago:
+-----------------+----------+-----------+----------+
|payment_type_desc|num_viajes|avg_tip_pct|avg_amount|
+-----------------+----------+-----------+----------+
|Credit card      |562846971 |25.34      |20.13     |
|Cash             |250048332 |0.0        |14.41     |
|NULL             |18630567  |5.44       |26.65     |
|No charge        |3452226   |0.06       |16.16     |
|Dispute          |2118125   |0.07       |21.98     |
|Unknown          |2992      |1.78       |14.79     |
+-----------------+----------+-----------+----------+


 Con porcentaje de participación:
+-----------------+----------+-----------+----------+-----------------+
|payment_type_desc|num_viajes|avg_tip_pct|avg_amount|participacion_pct|
+-----------------+----------+-----------+----------+-----------------+
|Credit card      |562846971 |25.34      |20.13     |67.24            |
|Cash             |250048332 |0.0       

In [15]:
print("PREGUNTA 9: Rate codes que concentran mayor distancia y monto")

q9 = obt.groupBy("rate_code_desc").agg(
    count("*").alias("num_viajes"),
    round(avg("trip_distance"), 2).alias("avg_distance"),
    round(avg("total_amount"), 2).alias("avg_amount"),
    round(sum("trip_distance"), 2).alias("total_distance"),
    round(sum("total_amount"), 2).alias("total_amount_sum")
).orderBy(col("total_amount_sum").desc())

print("\n Análisis por rate code:")
q9.show(truncate=False)



PREGUNTA 9: Rate codes que concentran mayor distancia y monto

 Análisis por rate code:
+---------------------+----------+------------+----------+---------------+-----------------+
|rate_code_desc       |num_viajes|avg_distance|avg_amount|total_distance |total_amount_sum |
+---------------------+----------+------------+----------+---------------+-----------------+
|Standard rate        |790602686 |2.62        |16.57     |2.07307329013E9|1.310348787429E10|
|JFK                  |19198949  |17.17       |71.36     |3.2972178351E8 |1.36994861935E9  |
|NULL                 |19847936  |4.53        |27.22     |8.988812851E7  |5.402063931E8    |
|Negotiated fare      |5064460   |5.44        |57.09     |2.756358327E7  |2.8912734685E8   |
|Newark               |1687799   |16.18       |94.0      |2.730458822E7  |1.5864482665E8   |
|Nassau or Westchester|690608    |19.26       |100.51    |1.330271168E7  |6.941186885E7    |
|Group ride           |6775      |1.28        |23.61     |8642.8         |1

In [16]:
print("PREGUNTA 10: Mix de yellow vs green por mes y borough")

q10 = obt.groupBy("pu_borough", "service_type", "year", "month").agg(
    count("*").alias("num_viajes")
).orderBy("year", "month", "pu_borough", "service_type")

print("\n Distribución Yellow vs Green (últimos 12 meses):")
q10.orderBy(col("year").desc(), col("month").desc()).show(24, truncate=False)



PREGUNTA 10: Mix de yellow vs green por mes y borough

 Distribución Yellow vs Green (últimos 12 meses):
+----------+------------+----+-----+----------+
|pu_borough|service_type|year|month|num_viajes|
+----------+------------+----+-----+----------+
|Manhattan |yellow      |2098|9    |1         |
|Queens    |yellow      |2090|12   |1         |
|Manhattan |yellow      |2088|1    |2         |
|Manhattan |yellow      |2084|11   |8         |
|Queens    |green       |2081|6    |1         |
|Queens    |yellow      |2070|8    |1         |
|Manhattan |yellow      |2066|12   |1         |
|Queens    |green       |2062|8    |1         |
|Manhattan |yellow      |2058|12   |3         |
|Brooklyn  |yellow      |2053|3    |1         |
|Manhattan |yellow      |2042|12   |1         |
|Manhattan |yellow      |2041|11   |1         |
|Queens    |green       |2041|8    |1         |
|Manhattan |yellow      |2041|6    |1         |
|Manhattan |yellow      |2041|3    |1         |
|Manhattan |yellow      |2038|2

In [17]:
print("PREGUNTA 11: Top 20 flujos origen-destino más frecuentes")

q11 = obt.groupBy("pu_zone", "do_zone").agg(
    count("*").alias("num_viajes"),
    round(avg("total_amount"), 2).alias("ticket_promedio"),
    round(avg("trip_distance"), 2).alias("avg_distance")
).orderBy(col("num_viajes").desc())

print("\n Top 20 flujos más frecuentes:")
q11.show(20, truncate=False)



PREGUNTA 11: Top 20 flujos origen-destino más frecuentes

 Top 20 flujos más frecuentes:
+----------------------------+----------------------------+----------+---------------+------------+
|pu_zone                     |do_zone                     |num_viajes|ticket_promedio|avg_distance|
+----------------------------+----------------------------+----------+---------------+------------+
|N/A                         |N/A                         |7474738   |17.42          |2.73        |
|Upper East Side South       |Upper East Side North       |4423767   |10.38          |1.06        |
|Upper East Side North       |Upper East Side South       |3779796   |11.18          |1.05        |
|Upper East Side North       |Upper East Side North       |3503791   |8.69           |0.63        |
|Upper East Side South       |Upper East Side South       |3342900   |9.27           |0.66        |
|Upper West Side South       |Upper West Side North       |1975989   |8.95           |0.84        |
|Upper West

In [18]:
print("PREGUNTA 12: Distribución de pasajeros y efecto en el monto total")

q12 = obt.filter(col("passenger_count").isNotNull()) \
    .groupBy("passenger_count").agg(
        count("*").alias("num_viajes"),
        round(avg("total_amount"), 2).alias("avg_amount"),
        round(avg("trip_distance"), 2).alias("avg_distance"),
        round(avg("tip_pct"), 2).alias("avg_tip_pct")
    ).orderBy("passenger_count")

print("\n Análisis por número de pasajeros:")
q12.show(truncate=False)




PREGUNTA 12: Distribución de pasajeros y efecto en el monto total

 Análisis por número de pasajeros:
+---------------+----------+----------+------------+-----------+
|passenger_count|num_viajes|avg_amount|avg_distance|avg_tip_pct|
+---------------+----------+----------+------------+-----------+
|0.0            |5826244   |19.5      |2.78        |26.6       |
|1.0            |597054517 |18.16     |2.97        |17.82      |
|2.0            |115285762 |19.61     |3.29        |16.7       |
|3.0            |31921078  |19.02     |3.19        |15.86      |
|4.0            |15400697  |20.12     |3.32        |15.22      |
|5.0            |32823642  |17.06     |3.05        |15.26      |
|6.0            |20147107  |16.9      |3.0         |14.97      |
|7.0            |3750      |47.45     |2.76        |24.65      |
|8.0            |3853      |49.73     |3.22        |17.74      |
|9.0            |1989      |62.89     |4.73        |102.64     |
|32.0           |1         |60.35     |16.32       |0

In [19]:
print("PREGUNTA 13: Impacto de peajes y cargos de congestión por zona")

q13 = obt.filter((col("tolls_amount") > 0) | (col("congestion_surcharge") > 0)) \
    .groupBy("pu_zone").agg(
        count("*").alias("num_viajes"),
        round(avg("tolls_amount"), 2).alias("avg_tolls"),
        round(avg("congestion_surcharge"), 2).alias("avg_congestion"),
        round(avg("total_amount"), 2).alias("avg_total")
    ).orderBy(col("num_viajes").desc())

print("\n Top zonas con peajes/congestión:")
q13.show(15, truncate=False)



PREGUNTA 13: Impacto de peajes y cargos de congestión por zona

 Top zonas con peajes/congestión:
+----------------------------+----------+---------+--------------+---------+
|pu_zone                     |num_viajes|avg_tolls|avg_congestion|avg_total|
+----------------------------+----------+---------+--------------+---------+
|LaGuardia Airport           |13417699  |5.78     |2.15          |52.63    |
|Upper East Side South       |12653287  |0.18     |2.5           |17.81    |
|Midtown Center              |12019761  |0.69     |2.49          |23.16    |
|JFK Airport                 |11489007  |5.77     |2.1           |76.35    |
|Upper East Side North       |11399320  |0.26     |2.5           |18.46    |
|Midtown East                |9925383   |0.69     |2.49          |22.49    |
|Penn Station/Madison Sq West|9746675   |0.5      |2.49          |22.27    |
|Times Sq/Theatre District   |9447604   |1.19     |2.47          |26.79    |
|Murray Hill                 |8716571   |0.8      |2.49

In [20]:
print("PREGUNTA 14: Proporción de viajes cortos vs largos por borough")

q14 = obt.withColumn("trip_category",
    when(col("trip_distance") < 2, "Corto (<2 mi)")
    .when(col("trip_distance") < 10, "Medio (2-10 mi)")
    .otherwise("Largo (>10 mi)")
).groupBy("pu_borough", "trip_category").agg(
    count("*").alias("num_viajes")
).orderBy("pu_borough", "trip_category")

print("\n Distribución por categoría de distancia:")
q14.show(20, truncate=False)

# Calcular porcentajes
window_spec = Window.partitionBy("pu_borough")
q14_pct = q14.withColumn("porcentaje",
    round((col("num_viajes") / sum("num_viajes").over(window_spec) * 100), 2)
)

print("\n Con porcentajes:")
q14_pct.show(20, truncate=False)

PREGUNTA 14: Proporción de viajes cortos vs largos por borough

 Distribución por categoría de distancia:
+-------------+---------------+----------+
|pu_borough   |trip_category  |num_viajes|
+-------------+---------------+----------+
|Bronx        |Corto (<2 mi)  |2141308   |
|Bronx        |Largo (>10 mi) |593236    |
|Bronx        |Medio (2-10 mi)|2501452   |
|Brooklyn     |Corto (<2 mi)  |14733344  |
|Brooklyn     |Largo (>10 mi) |2007926   |
|Brooklyn     |Medio (2-10 mi)|18265678  |
|EWR          |Corto (<2 mi)  |64050     |
|EWR          |Largo (>10 mi) |5713      |
|EWR          |Medio (2-10 mi)|2348      |
|Manhattan    |Corto (<2 mi)  |435753318 |
|Manhattan    |Largo (>10 mi) |18784883  |
|Manhattan    |Medio (2-10 mi)|261137697 |
|N/A          |Corto (<2 mi)  |344610    |
|N/A          |Largo (>10 mi) |122963    |
|N/A          |Medio (2-10 mi)|155118    |
|Queens       |Corto (<2 mi)  |15171315  |
|Queens       |Largo (>10 mi) |29568088  |
|Queens       |Medio (2-10 mi)|261

In [21]:
print("PREGUNTA 15: Diferencias por vendor en velocidad y duración")

q15 = obt.filter(col("avg_speed_mph").isNotNull()) \
    .groupBy("vendor_name").agg(
        count("*").alias("num_viajes"),
        round(avg("avg_speed_mph"), 2).alias("avg_speed"),
        round(avg("trip_duration_min"), 2).alias("avg_duration"),
        round(avg("trip_distance"), 2).alias("avg_distance")
    ).orderBy("vendor_name")

print("\n Comparación por vendor:")
q15.show(truncate=False)

PREGUNTA 15: Diferencias por vendor en velocidad y duración

 Comparación por vendor:
+---------------------------------+----------+---------+------------+------------+
|vendor_name                      |num_viajes|avg_speed|avg_duration|avg_distance|
+---------------------------------+----------+---------+------------+------------+
|NULL                             |946544    |167.01   |20.5        |3.99        |
|Creative Mobile Technologies, LLC|312994844 |15.24    |14.39       |2.93        |
|VeriFone Inc.                    |515511136 |12.27    |18.73       |3.18        |
+---------------------------------+----------+---------+------------+------------+



In [22]:
print("PREGUNTA 16: Relación entre método de pago y propina por hora del día")

q16 = obt.groupBy("pickup_hour", "payment_type_desc").agg(
    count("*").alias("num_viajes"),
    round(avg("tip_amount"), 2).alias("avg_tip_amount"),
    round(avg("tip_pct"), 2).alias("avg_tip_pct")
).orderBy("pickup_hour", col("num_viajes").desc())

print("\n Propinas por hora y método de pago (primeras 20):")
q16.show(20, truncate=False)

PREGUNTA 16: Relación entre método de pago y propina por hora del día

 Propinas por hora y método de pago (primeras 20):
+-----------+-----------------+----------+--------------+-----------+
|pickup_hour|payment_type_desc|num_viajes|avg_tip_amount|avg_tip_pct|
+-----------+-----------------+----------+--------------+-----------+
|0          |Credit card      |18531350  |3.02          |26.89      |
|0          |Cash             |8035751   |0.0           |0.0        |
|0          |NULL             |717204    |0.75          |3.89       |
|0          |No charge        |133372    |0.0           |0.03       |
|0          |Dispute          |80444     |0.01          |0.11       |
|0          |Unknown          |96        |0.0           |0.0        |
|1          |Credit card      |13090315  |2.81          |27.83      |
|1          |Cash             |5905387   |0.0           |0.0        |
|1          |NULL             |472532    |0.72          |3.72       |
|1          |No charge        |109082 

In [7]:


print("PREGUNTA 17: Zonas con duraciones/distancias extremas")

q17 = obt.filter((col("year") >= 2015) & (col("year") <= 2025)) \
    .groupBy("pu_zone").agg(
        count("*").alias("num_viajes"),
        round(avg("trip_duration_min"), 2).alias("avg_duration"),
        round(max("trip_duration_min"), 2).alias("max_duration"),
        round(avg("trip_distance"), 2).alias("avg_distance"),
        round(max("trip_distance"), 2).alias("max_distance")
    ).filter((col("max_duration") > 180) | (col("max_distance") > 100)) \
    .orderBy(col("max_duration").desc())

print("\n Top 15 zonas con valores extremos:")
print("(Zonas donde se han registrado viajes muy largos)")

q17.limit(15).show(truncate=False)



PREGUNTA 17: Zonas con duraciones/distancias extremas

 Top 15 zonas con valores extremos:
(Zonas donde se han registrado viajes muy largos)
+-----------------------------+----------+------------+------------+------------+------------+
|pu_zone                      |num_viajes|avg_duration|max_duration|avg_distance|max_distance|
+-----------------------------+----------+------------+------------+------------+------------+
|Financial District North     |5523971   |20.47       |1439.98     |4.38        |442.0       |
|Midtown East                 |25970033  |15.51       |1439.98     |2.4         |425.0       |
|Upper East Side South        |31713007  |12.96       |1439.98     |1.81        |401.5       |
|East Harlem North            |7055489   |16.15       |1439.98     |2.48        |249.6       |
|East Flatbush/Farragut       |139021    |28.22       |1439.98     |4.88        |169.1       |
|Sutton Place/Turtle Bay North|14649645  |13.79       |1439.98     |2.2         |365.9       |
|Tri

In [8]:
print("PREGUNTA 18: Rendimiento por milla (total_amount/trip_distance)")

q18 = obt.filter((col("trip_distance") > 0) & (col("total_amount") > 0)) \
    .withColumn("yield_per_mile", col("total_amount") / col("trip_distance")) \
    .groupBy("pu_borough", "pickup_hour").agg(
        round(avg("yield_per_mile"), 2).alias("avg_yield_per_mile"),
        count("*").alias("num_viajes")
    ).orderBy("pu_borough", col("avg_yield_per_mile").desc())

print("\n Top yields por borough (top 15):")
q18.show(15, truncate=False)

PREGUNTA 18: Rendimiento por milla (total_amount/trip_distance)

 Top yields por borough (top 15):
+----------+-----------+------------------+----------+
|pu_borough|pickup_hour|avg_yield_per_mile|num_viajes|
+----------+-----------+------------------+----------+
|Bronx     |17         |18.13             |286112    |
|Bronx     |2          |17.67             |69174     |
|Bronx     |18         |17.14             |260148    |
|Bronx     |16         |16.87             |280074    |
|Bronx     |19         |16.04             |220693    |
|Bronx     |3          |15.98             |59289     |
|Bronx     |1          |15.89             |97108     |
|Bronx     |0          |14.38             |131743    |
|Bronx     |20         |13.96             |195473    |
|Bronx     |4          |13.53             |73668     |
|Bronx     |21         |13.5              |186666    |
|Bronx     |23         |13.47             |164941    |
|Bronx     |22         |13.23             |178277    |
|Bronx     |5        

In [9]:
print("PREGUNTA 19: Cambios año sobre año (YoY)")

q19 = obt.groupBy("service_type", "year").agg(
    count("*").alias("num_viajes"),
    round(avg("total_amount"), 2).alias("ticket_promedio")
).orderBy("service_type", "year")

print("\n Evolución anual por servicio:")
q19.show(25, truncate=False)

window_spec = Window.partitionBy("service_type").orderBy("year")
q19_yoy = q19.withColumn("prev_viajes", lag("num_viajes").over(window_spec)) \
    .withColumn("prev_ticket", lag("ticket_promedio").over(window_spec)) \
    .withColumn("cambio_viajes_pct", 
        round(((col("num_viajes") - col("prev_viajes")) / col("prev_viajes") * 100), 2)) \
    .withColumn("cambio_ticket_pct",
        round(((col("ticket_promedio") - col("prev_ticket")) / col("prev_ticket") * 100), 2))

print("\n Con cambios YoY:")
q19_yoy.show(25, truncate=False)

PREGUNTA 19: Cambios año sobre año (YoY)

 Evolución anual por servicio:
+------------+----+----------+---------------+
|service_type|year|num_viajes|ticket_promedio|
+------------+----+----------+---------------+
|green       |2008|112       |13.07          |
|green       |2009|311       |16.54          |
|green       |2010|346       |18.03          |
|green       |2012|3         |9.79           |
|green       |2015|19189832  |14.83          |
|green       |2016|16341257  |14.69          |
|green       |2017|11703607  |14.29          |
|green       |2018|8870778   |16.15          |
|green       |2019|6248023   |18.35          |
|green       |2020|1726073   |20.23          |
|green       |2021|1063303   |23.99          |
|green       |2022|835867    |19.38          |
|green       |2023|783243    |23.96          |
|green       |2024|657148    |24.38          |
|green       |2025|394832    |24.99          |
|green       |2030|2         |6.15           |
|green       |2035|1         |0.0 

In [10]:
print("PREGUNTA 20: Efecto de congestión en el monto total")

q20 = obt.withColumn("congestion_level",
    when(col("congestion_surcharge") > 0, "Con cargo de congestión")
    .otherwise("Sin cargo de congestión")
).groupBy("congestion_level").agg(
    count("*").alias("num_viajes"),
    round(avg("total_amount"), 2).alias("avg_total"),
    round(avg("trip_duration_min"), 2).alias("avg_duration"),
    round(avg("avg_speed_mph"), 2).alias("avg_speed")
).orderBy("congestion_level")

print("\n Comparación con/sin cargo de congestión:")
q20.show(truncate=False)

PREGUNTA 20: Efecto de congestión en el monto total

 Comparación con/sin cargo de congestión:
+-----------------------+----------+---------+------------+---------+
|congestion_level       |num_viajes|avg_total|avg_duration|avg_speed|
+-----------------------+----------+---------+------------+---------+
|Con cargo de congestión|244193258 |21.81    |16.63       |11.56    |
|Sin cargo de congestión|592905955 |17.21    |17.19       |14.4     |
+-----------------------+----------+---------+------------+---------+

