### Importing required libraries

In [21]:
from datetime import datetime
from collections import namedtuple

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql import types

### Starting Spark Session

In [3]:
spark = SparkSession.builder \
        .master("local[*]") \
        .appName('test') \
        .getOrCreate()

22/06/09 11:33:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
spark.version

'3.0.3'

### Downloading FHVHV Feb  File and repartitioning

In [9]:
!ls

basic_operations_in_spark.ipynb  head.csv	    spark_rdd.ipynb
data				 lib		    taxi+_zone_lookup.csv
download_data.sh		 schema.md	    test.ipynb
fhv_tripdata1.csv		 sparkSQL.md	    tmp
fhvhv_parquet			 spark_gcs.ipynb    week5_hw.ipynb
fhvhv_tripdata_2021-01.parquet	 spark_local.ipynb  zones
fhvhv_tripdata_2021-02.parquet	 spark_local.py


In [6]:
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.parquet

--2022-06-09 11:34:19--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_2021-02.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.217.139.241
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.217.139.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 302633211 (289M) [application/x-www-form-urlencoded]
Saving to: ‘fhvhv_tripdata_2021-02.parquet’


2022-06-09 11:34:31 (25.5 MB/s) - ‘fhvhv_tripdata_2021-02.parquet’ saved [302633211/302633211]



In [7]:
!ls

basic_operations_in_spark.ipynb  head.csv	    spark_rdd.ipynb
data				 lib		    taxi+_zone_lookup.csv
download_data.sh		 schema.md	    test.ipynb
fhv_tripdata1.csv		 sparkSQL.md	    tmp
fhvhv_parquet			 spark_gcs.ipynb    week5_hw.ipynb
fhvhv_tripdata_2021-01.parquet	 spark_local.ipynb  zones
fhvhv_tripdata_2021-02.parquet	 spark_local.py


In [11]:
schema = types.StructType([
  types.StructField('hvfhs_license_num', types.StringType(), True),
  types.StructField('dispatching_base_num', types.StringType(), True),
  types.StructField('originating_base_num', types.StringType(), True),
  types.StructField('request_datetime', types.TimestampType(), True),
  types.StructField('on_scene_datetime', types.TimestampType(), True),
  types.StructField('pickup_datetime', types.TimestampType(), True),
  types.StructField('dropoff_datetime', types.TimestampType(), True),
  types.StructField('PULocationID', types.LongType(), True),
  types.StructField('DOLocationID', types.LongType(), True),
  types.StructField('trip_miles', types.DoubleType(), True),
  types.StructField('trip_time', types.LongType(), True),
  types.StructField('base_passenger_fare', types.DoubleType(), True),
  types.StructField('tolls', types.DoubleType(), True),
  types.StructField('bcf', types.DoubleType(), True),
  types.StructField('sales_tax', types.DoubleType(), True),
  types.StructField('congestion_surcharge', types.DoubleType(), True),
  types.StructField('airport_fee', types.DoubleType(), True),
  types.StructField('tips', types.DoubleType(), True),
  types.StructField('driver_pay', types.DoubleType(), True),
  types.StructField('shared_request_flag', types.StringType(), True),
  types.StructField('shared_match_flag', types.StringType(), True),
  types.StructField('access_a_ride_flag', types.StringType(), True),
  types.StructField('wav_request_flag', types.StringType(), True),
  types.StructField('wav_match_flag', types.StringType(), True)
])

In [12]:
df = spark.read.schema(schema).parquet('fhvhv_tripdata_2021-02.parquet')

In [13]:
df.repartition(24) \
    .write \
    .parquet('fhvhv_parquet/2021/02/')

                                                                                

### Size of the folder

In [16]:
!ls -lh fhvhv_parquet/2021/02

total 511M
-rw-r--r-- 1 Naveen Naveen   0 Jun  9 11:39 _SUCCESS
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00000-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00001-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00002-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00003-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00004-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00005-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00006-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M Jun  9 11:39 part-00007-99743572-0fbb-415e-bb97-2a1fc18fce9b-c000.snappy.parquet
-rw-r--r-- 1 Naveen Naveen 22M

### Counting Trips on Feb 15?

In [18]:
df1 = spark.read.schema(schema).parquet('fhvhv_parquet/2021/02/')

In [19]:
df1.show()

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

- Using Spark itself

In [25]:
df1 \
    .withColumn('pickup_date', F.to_date(df1.pickup_datetime)) \
    .filter("pickup_date = '2021-02-15'") \
    .groupBy('pickup_date') \
    .count() \
    .show()

                                                                                

+-----------+------+
|pickup_date| count|
+-----------+------+
| 2021-02-15|367170|
+-----------+------+



- Using sparkSQL

In [26]:
df.registerTempTable('fhvhv_2021_02')

In [31]:
spark.sql("""
SELECT 
    COUNT(1)
FROM 
    fhvhv_2021_02
WHERE 
    to_date(pickup_datetime) = '2021-02-15';
""").show()



+--------+
|count(1)|
+--------+
|  367170|
+--------+



                                                                                

### Longest Trip for each day

- Using Spark Directly

In [77]:
df1 \
    .withColumn('duration', df1.dropoff_datetime.cast('long') - df1.pickup_datetime.cast('long')) \
    .withColumn('pickup_date', F.to_date(df1.pickup_datetime)) \
    .groupBy('pickup_date') \
    .max('duration') \
    .orderBy('max(duration)', ascending=False) \
    .withColumn('max(duration)', F.to_timestamp("max(duration)")) \
    .limit(5) \
    .show()



+-----------+-------------------+
|pickup_date|      max(duration)|
+-----------+-------------------+
| 2021-02-11|1970-01-01 20:59:00|
| 2021-02-17|1970-01-01 15:53:41|
| 2021-02-20|1970-01-01 12:13:59|
| 2021-02-03|1970-01-01 11:17:33|
| 2021-02-19|1970-01-01 10:26:17|
+-----------+-------------------+



                                                                                

- Using sparkSQL

In [76]:
spark.sql("""
SELECT 
    to_date(pickup_datetime) AS pickup_date,    
    CAST(MAX(CAST(dropoff_datetime AS LONG) - CAST(pickup_datetime AS LONG)) AS TIMESTAMP) AS duration
FROM 
    fhvhv_2021_02
GROUP BY 
    1
ORDER BY 
    2 DESC
LIMIT 10;
""").show()



+-----------+-------------------+
|pickup_date|           duration|
+-----------+-------------------+
| 2021-02-11|1970-01-01 20:59:00|
| 2021-02-17|1970-01-01 15:53:41|
| 2021-02-20|1970-01-01 12:13:59|
| 2021-02-03|1970-01-01 11:17:33|
| 2021-02-19|1970-01-01 10:26:17|
| 2021-02-25|1970-01-01 09:43:30|
| 2021-02-18|1970-01-01 09:36:52|
| 2021-02-10|1970-01-01 09:29:29|
| 2021-02-21|1970-01-01 08:57:03|
| 2021-02-09|1970-01-01 08:54:47|
+-----------+-------------------+



                                                                                

### Most Frequent Dispatching Base Num

In [74]:
df1 \
    .withColumn('dispatching_base_num', df1.dispatching_base_num) \
    .groupBy('dispatching_base_num') \
    .count() \
    .orderBy('count', ascending=False) \
    .limit(5) \
    .show()



+--------------------+-------+
|dispatching_base_num|  count|
+--------------------+-------+
|              B02510|3233664|
|              B02764| 965568|
|              B02872| 882689|
|              B02875| 685390|
|              B02765| 559768|
+--------------------+-------+



                                                                                

In [75]:
spark.sql("""
SELECT 
    dispatching_base_num,    
    COUNT(1)
FROM 
    fhvhv_2021_02
GROUP BY 
    1
ORDER BY 
    2 DESC
LIMIT 10;
""").show()



+--------------------+--------+
|dispatching_base_num|count(1)|
+--------------------+--------+
|              B02510| 3233664|
|              B02764|  965568|
|              B02872|  882689|
|              B02875|  685390|
|              B02765|  559768|
|              B02869|  429720|
|              B02887|  322331|
|              B02871|  312364|
|              B02864|  311603|
|              B02866|  311089|
+--------------------+--------+



                                                                                

### Most common Location Pairs

- Using sparkSQL

In [81]:
df_zones = spark.read.parquet('zones')
df_zones.show(3)

+----------+-------+--------------------+------------+
|LocationID|Borough|                Zone|service_zone|
+----------+-------+--------------------+------------+
|         1|    EWR|      Newark Airport|         EWR|
|         2| Queens|         Jamaica Bay|   Boro Zone|
|         3|  Bronx|Allerton/Pelham G...|   Boro Zone|
+----------+-------+--------------------+------------+
only showing top 3 rows



In [82]:
df_zones.registerTempTable('zones')

In [80]:
df1.show(2)

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

In [86]:
spark.sql("""
SELECT 
    CONCAT(pul.Zone, ' / ', dol.Zone) AS pu_do_pair, 
    COUNT(1)
FROM 
    fhvhv_2021_02 fhv LEFT JOIN zones pul on pul.LocationID=fhv.PULocationID
                      LEFT JOIN zones dol on dol.LocationID=fhv.DOLocationID
GROUP BY 
    1
ORDER BY 
    2 DESC
LIMIT 5;
""").take(5)

                                                                                

[Row(pu_do_pair='East New York / East New York', count(1)=45041),
 Row(pu_do_pair='Borough Park / Borough Park', count(1)=37329),
 Row(pu_do_pair='Canarsie / Canarsie', count(1)=28026),
 Row(pu_do_pair='Crown Heights North / Crown Heights North', count(1)=25976),
 Row(pu_do_pair='Bay Ridge / Bay Ridge', count(1)=17934)]