# Big Data Management Project 1:
## Analyzing New York City Taxi Data

In [2]:
!pip install shapely

Collecting shapely
  Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: shapely
Successfully installed shapely-2.0.7


In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, unix_timestamp, col, lag, avg, lead, sum as spark_sum
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

import json

from shapely.geometry import shape, Point

In [4]:
# Session for this project: getOrCreate() hands back an already running
# session if there is one, otherwise it starts a new one.
# enableHiveSupport() enables Hive support (persistent Hive metastore).
spark = SparkSession.builder.appName('BDM_Project1').enableHiveSupport().getOrCreate()

### NYC Borough Data

In [6]:
# Parse the borough boundaries from the GeoJSON file
with open('input/nyc-boroughs.geojson') as geojson_file:
    geo_data = json.load(geojson_file)

# Broadcasting data to workers
broadcast_geo_data = spark.sparkContext.broadcast(geo_data)

# TODO: is using a dictionary fine? "dataframe can be created out of it"
# Two lookups keyed by borough code:
#   polygons[code] -> list of shapely geometries belonging to that borough
#   b_names[code]  -> borough name
polygons = {}
b_names = {}

for feature in broadcast_geo_data.value['features']:
    props = feature['properties']
    code = props['boroughCode']
    name = props['borough']

    # setdefault only inserts the first time a code is seen, so every borough
    # keeps the name of its first feature and a single list of geometries.
    b_names.setdefault(code, name)
    polygons.setdefault(code, []).append(shape(feature['geometry']))

# Within each borough, put the polygons with the largest area first
for borough_polygons in polygons.values():
    borough_polygons.sort(key=lambda geom: geom.area, reverse=True)

# Open question: also order the boroughs themselves by total area?
# As written, boroughs keep the order in which their codes first appear
# in the GeoJSON features.


In [7]:
# UDF: longitude, latitude -> borough
def get_borough(long, lat):
    """Return the code of the borough whose polygon contains a coordinate.

    Args:
        long: Longitude of the point (x coordinate).
        lat: Latitude of the point (y coordinate).

    Returns:
        The key of ``polygons`` under which the first containing polygon is
        stored, or None when a coordinate is missing or no polygon contains
        the point.
    """
    # Null column values reach a Python UDF as None. A Point cannot be built
    # from them, and an exception here would fail the whole Spark job, so a
    # missing coordinate is reported as "no borough" instead.
    if long is None or lat is None:
        return None

    point = Point(long, lat)

    # Boroughs are tried in the dictionary's order; inside a borough the
    # polygons are tried in the order of the stored list.
    for code, pols in polygons.items():
        for polygon in pols:
            if polygon.contains(point):
                return code

    return None

get_borough_udf = udf(get_borough, IntegerType())

### NYC Taxi Data

In [48]:
# Read the trip records: comma separated, first row is the header,
# column types inferred by Spark.
taxi_df = spark.read.csv(
    "input/Sample NYC Data.csv",
    sep=",",
    header=True,
    inferSchema=True,
)

taxi_df = (
    taxi_df
    # Keep only the columns used by the queries
    .select(
        "hack_license",
        "pickup_latitude",
        "pickup_longitude",
        "pickup_datetime",
        "dropoff_latitude",
        "dropoff_longitude",
        "dropoff_datetime",
    )
    # Parse the datetime strings into unix timestamps (seconds)
    .withColumn("pickup_ts", unix_timestamp("pickup_datetime", "dd-MM-yy HH:mm"))
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime", "dd-MM-yy HH:mm"))
    # Ride duration in seconds
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts"))
    # Keep rides with a strictly positive duration of at most 4 hours
    .filter((col("duration") > 0) & (col("duration") <= 4 * 60 * 60))
    # Tag both ends of every trip with a borough code
    .withColumn("pickup_borough", get_borough_udf("pickup_longitude", "pickup_latitude"))
    .withColumn("dropoff_borough", get_borough_udf("dropoff_longitude", "dropoff_latitude"))
)
taxi_df.show(3, truncate=False)

+--------------------------------+---------------+----------------+---------------+----------------+-----------------+----------------+----------+----------+--------+--------------+---------------+
|hack_license                    |pickup_latitude|pickup_longitude|pickup_datetime|dropoff_latitude|dropoff_longitude|dropoff_datetime|pickup_ts |dropoff_ts|duration|pickup_borough|dropoff_borough|
+--------------------------------+---------------+----------------+---------------+----------------+-----------------+----------------+----------+----------+--------+--------------+---------------+
|BA96DE419E711691B9445D6A6307C170|40.757977      |-73.978165      |01-01-13 15:11 |40.751171       |-73.989838       |01-01-13 15:18  |1357053060|1357053480|420     |1             |1              |
|9FD8F69F0804BDB5549F40E9DA1BE472|40.731781      |-74.006683      |06-01-13 00:18 |40.75066        |-73.994499       |06-01-13 00:22  |1357431480|1357431720|240     |1             |1              |
|9FD8F69F0

### Query 1
Utilization: total idle time per driver (`hack_license`), counting only gaps of at most 4 hours between consecutive trips

In [50]:
# Longest gap (in seconds) between two consecutive trips that still counts as
# idle time: 4 hours. Longer gaps are left out of the sum.
MAX_IDLE_SECONDS = 4 * 60 * 60

# Copy taxi_df dataframe
taxi_copy = taxi_df.select("*")

# Ensure all trips of the same driver are together
taxi_copy = taxi_copy.repartition("hack_license")

# Partition by driver and then order by pickup time with Window
window_sp = Window.partitionBy("hack_license").orderBy("pickup_ts")

# Drop-off time of the same driver's previous trip (null for the first trip)
taxi_copy = taxi_copy.withColumn("prev_dropoff_ts", lag("dropoff_ts").over(window_sp))

# Idle time (seconds): gap between the previous drop-off and this pickup
taxi_copy = taxi_copy.withColumn("idle_time", col("pickup_ts") - col("prev_dropoff_ts"))

# Keep only valid idle gaps:
#  - null: the driver's first trip has no previous drop-off;
#  - negative: this pickup lies before the previous drop-off, i.e. the two
#    records overlap; summing such values would shrink the driver's total;
#  - above MAX_IDLE_SECONDS: gap is too long to count as idle time.
taxi_copy = taxi_copy.filter(
    col("idle_time").isNotNull()
    & (col("idle_time") >= 0)
    & (col("idle_time") <= MAX_IDLE_SECONDS)
)

# Group by driver and sum idle time (seconds)
result = taxi_copy.groupBy("hack_license").agg(spark_sum("idle_time").alias("total_idle_time"))

result.show()

+--------------------+---------------+
|        hack_license|total_idle_time|
+--------------------+---------------+
|001C8AAB90AEE49F3...|          12960|
|0025133AD810DBE80...|           2400|
|00447A6197DBB329F...|          13440|
|006313464EC98A24B...|          31500|
|007439EEDB510EF82...|           3240|
|00927C48BA4C1B2B1...|          14460|
|00AE05F56D451E89E...|          22200|
|00B442110FA2D04A1...|          10680|
|00BB5ECED533BF463...|          10380|
|00BF52E4A8E6DBB01...|           9720|
|00D0B6CE0ADA00D70...|           8940|
|01060D63D29CE42C8...|           4200|
|011707FD64AD1EBEA...|           8220|
|011EB4B6E7DE7B08C...|           8220|
|01202D837DD4454C7...|           8100|
|0124A558E529199A8...|          19620|
|013B64EE51129C462...|          14340|
|014BB395ECEE67BEC...|           3360|
|015B18400858D89EE...|           2700|
|015D33FBAB8A7C5CE...|          18660|
+--------------------+---------------+
only showing top 20 rows



### Query 2
The average time (in minutes) it takes for a taxi to find its next fare (trip), per destination borough

In [25]:
# Trips of one driver, ordered by pickup time
window_spec = Window.partitionBy("hack_license").orderBy("pickup_ts")

# For every trip, look up the pickup time of the same driver's next trip
next_pick_up = taxi_df.withColumn("next_pickup_ts", lead("pickup_ts").over(window_spec))

# Waiting time in minutes between this drop-off and the next pickup
wait_time = next_pick_up.withColumn(
    "waiting_time", (col("next_pickup_ts") - col("dropoff_ts")) / 60
)

# Keep only rows that describe a real wait:
#  - a driver's last trip has no next pickup (null);
#  - trips without a matched drop-off borough cannot be attributed;
#  - a negative value means the next pickup lies before this drop-off, i.e.
#    the two records overlap; averaging it in would understate the wait.
# NOTE(review): unlike Query 1, gaps longer than 4 hours are not excluded
# here - confirm that this is intended.
wait_time = wait_time.filter(
    col("next_pickup_ts").isNotNull()
    & col("dropoff_borough").isNotNull()
    & (col("waiting_time") >= 0)
)

result = wait_time.groupBy("dropoff_borough").agg(avg("waiting_time").alias("avg_waiting_time"))
print("In minutes")
result.show()

In minutes
+---------------+------------------+
|dropoff_borough|  avg_waiting_time|
+---------------+------------------+
|              1|30.784491026167967|
|              3|111.74982332155477|
|              5|167.88888888888889|
|              4|100.00736478711163|
|              2| 79.10493827160494|
+---------------+------------------+



### Query 3
The number of trips that started and ended within the same borough

In [9]:
# Count the trips whose pickup and drop-off borough codes are equal
same_borough = col("pickup_borough") == col("dropoff_borough")
sumOfTripsBoroughSame = taxi_df.filter(same_borough).count()
print(sumOfTripsBoroughSame)

85944


### Query 4
The number of trips that started in one borough and ended in another one

In [26]:
# Count the trips whose pickup and drop-off borough codes differ
different_borough = col("pickup_borough") != col("dropoff_borough")
sumOfTripsBoroughDifferent = taxi_df.filter(different_borough).count()
print(sumOfTripsBoroughDifferent)

11431
