# Big Data Management Project 2:
## DEBS GRAND CHALLENGE 2015

In [1]:
!pip install shapely

Collecting shapely
  Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: shapely
Successfully installed shapely-2.0.7


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, unix_timestamp, col, lag, avg, lead, count, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType
from pyspark.sql.window import Window

import json
import time

In [12]:
# Obtain the Spark session used by the rest of the notebook: named 'BDM_Project2',
# with Hive support enabled; getOrCreate() hands back an existing session if there is one.
spark = SparkSession.builder.appName('BDM_Project2').enableHiveSupport().getOrCreate()

### Query 0
Data Cleansing and Setup

In [13]:
# Wall-clock timer for this cell. Spark transformations are lazy, so the elapsed
# time printed at the end covers building the query plan plus the `show(5)` action,
# not a full pass over the dataset.
start_time = time.time()

# Longest trip kept by the cleansing step, in seconds (4 hours).
MAX_TRIP_DURATION_SECONDS = 4 * 60 * 60

# Explicit schema: avoids a schema-inference pass and fixes the column types.
# NOTE(review): with a user-supplied schema, Spark's CSV reader by default assigns
# fields by position rather than by header name, so the names below are trusted
# as-is. The public DEBS 2015 data description lists `trip_time_in_secs` as the
# fifth column -- confirm that this file really holds a passenger count there.
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])

# Raw trips, read from the single pre-sorted CSV file (first line treated as header).
taxi_df_og = (
    spark.read
    .option("header", True)
    .schema(schema)
    .csv("input/sorted_data.csv")
)

# Cleansing pipeline (the rationale is given in the project report):
#   1. keep trips with a positive passenger count and a positive distance;
#   2. keep trips whose pickup and dropoff coordinates differ;
#   3. derive Unix-second timestamps and the trip duration in seconds;
#   4. keep durations in the range (0, MAX_TRIP_DURATION_SECONDS];
#   5. keep only the columns selected below and drop rows containing nulls.
# NOTE(review): step 2 requires BOTH longitude and latitude to differ, so a trip
# along a constant longitude or latitude is discarded too. If the intent is only to
# drop trips that end exactly where they started, the two comparisons should be
# combined with `|` instead of `&` -- confirm against the report.
taxi_df = (
    taxi_df_og
    .filter(
        (col("passenger_count") > 0) &
        (col("trip_distance") > 0) &
        (col("pickup_longitude") != col("dropoff_longitude")) &
        (col("pickup_latitude") != col("dropoff_latitude"))
    )
    .withColumns({
        "pickup_ts": unix_timestamp("pickup_datetime"),
        "dropoff_ts": unix_timestamp("dropoff_datetime")
    })
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts"))
    .filter(
        (col("duration") > 0) & (col("duration") <= MAX_TRIP_DURATION_SECONDS)
    )
    .select(
        "hack_license",
        "pickup_latitude",
        "pickup_longitude",
        "pickup_ts",
        "dropoff_latitude",
        "dropoff_longitude",
        "dropoff_ts",
        "duration"
    )
    .dropna()
)

# Preview a few cleaned rows; this action is what actually triggers execution.
taxi_df.show(5, truncate=False)

print("Execution time", time.time() - start_time)

+--------------------------------+---------------+----------------+----------+----------------+-----------------+----------+--------+
|hack_license                    |pickup_latitude|pickup_longitude|pickup_ts |dropoff_latitude|dropoff_longitude|dropoff_ts|duration|
+--------------------------------+---------------+----------------+----------+----------------+-----------------+----------+--------+
|77FFDF38272A6006517D53EDA14333E2|40.768005      |-73.9701        |1356998420|40.767834       |-73.969772       |1356998482|62      |
|CDCB7729DE07243726FF7BB0BD5D06BF|40.749657      |-73.975441      |1356998414|40.751991       |-73.977333       |1356998497|83      |
|7D89374F8E98F30A19F2381EC71A16BA|40.720531      |-74.005165      |1356998440|40.725655       |-74.003929       |1356998500|60      |
|E7750A37CAB07D0DFF0AF7E3573AC141|40.716976      |-73.956528      |1356998400|40.715008       |-73.96244        |1356998520|120     |
|145038A0CC99D6982D8001BE668154CA|40.790169      |-73.95208   

### Query 1
Frequent Routes

### Query 2
Profitable Areas