# Big Data Management Project 2:
## DESB GRAND CHALLENGE 2015

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, regexp_extract, col, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType

In [2]:
spark = (SparkSession.builder
                    .appName('BDM_Project2')
                    .enableHiveSupport()
                    .getOrCreate()
        )

### Query 0
Data Cleansing and Setup

In [4]:
# Defining the schema for faster reading of data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True)
])

# Creating a single dataframe of all the trip_data files
taxi_df_og = (
    spark.read
    .option("header", False)
    .schema(schema)
    .csv("input/sorted_data.csv")
)

# Extracting ~1GB of data
taxi_df_small = taxi_df_og.sample(fraction=1/32, seed=42)  # 1GB out of 32GB (zipped 12GB, unzipped 32GB)
original_count = taxi_df_small.count()
print(f"Number of rows in the 1GB dataset: {original_count}")

Small count: 5411459


In [5]:
# Transforming the data 
taxi_df = taxi_df_small.filter(
    (regexp_extract(col("medallion"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (regexp_extract(col("hack_license"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (col("pickup_datetime").isNotNull()) &
    (col("dropoff_datetime").isNotNull()) &
    (col("trip_time_in_secs") > 0) &                 
    (col("trip_distance") > 0) &                    
    (col("fare_amount") > 0) &
    (col("tip_amount") >= 0)
)

# Convert timestamps to Unix format 
taxi_df = taxi_df.withColumns({
    "pickup_ts": unix_timestamp("pickup_datetime"),
    "dropoff_ts": unix_timestamp("dropoff_datetime")
}).withColumn(
    "duration", col("dropoff_ts") - col("pickup_ts")
).filter(
    (col("duration") > 0) & (col("duration") <= 4 * 60 * 60) 
).select(
    "*"
).dropna()  # Drop remaining null values

taxi_df.show(5, truncate=False)

+--------------------------------+--------------------------------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |passenger_count|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|pickup_ts |dropoff_ts|duration|
+--------------------------------+--------------------------------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+
|319AE2555940BA65DB0749E1DD1FBA0B|BAC146F5AA74DE3040A5D53572EA663A|2013-01-01 00:00:00|2

In [None]:
# Impact of transformations
filtered_count = taxi_df.count()
filtered_out_count = original_count - filtered_count

print(f"Original count: {original_count}") 
print(f"Filtered count: {filtered_count}")
print(f"Rows filtered out: {filtered_out_count}")

### Query 1
Frequent Routes

### Query 2
Profitable Areas