In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/03/04 20:44:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
spark.version

'3.3.2'

In [13]:
src_filename='fhv_tripdata_2019-10.csv.gz'

In [3]:
df = spark.read \
    .option("header", "true") \
    .csv(src_filename)

df.show()

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   null|                B00014|
|     B00021         |2019-10-01 00:00:4

In [4]:
import pandas as pd

df_pd = pd.read_csv(src_filename)
df_pd.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2019-10-01 00:23:00,2019-10-01 00:35:00,264.0,264.0,,B00009
1,B00013,2019-10-01 00:11:29,2019-10-01 00:13:22,264.0,264.0,,B00013
2,B00014,2019-10-01 00:11:43,2019-10-01 00:37:20,264.0,264.0,,B00014
3,B00014,2019-10-01 00:56:29,2019-10-01 00:57:47,264.0,264.0,,B00014
4,B00014,2019-10-01 00:23:09,2019-10-01 00:28:27,264.0,264.0,,B00014


In [8]:
from pyspark.sql import types

schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True),    
    types.StructField('pickup_datetime', types.TimestampType(), True), 
    types.StructField('dropOff_datetime',types.TimestampType(), True), 
    types.StructField('PULocationID', types.IntegerType(), True), 
    types.StructField('DOLocationID', types.IntegerType(), True), 
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv(src_filename)

In [11]:
df.show(10)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   null|                B00014|
|     B00021         |2019-10-01 00:00:4

In [14]:
df = df.repartition(6)

df.write.parquet('fhvhv/2019/10/', mode="overwrite")

                                                                                

In [15]:
df = spark.read.parquet('fhvhv/2019/10/')

In [17]:
df.show(5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02784|2019-10-01 09:55:38|2019-10-01 10:05:43|          89|          85|   null|                  null|
|              B02429|2019-10-21 04:15:47|2019-10-21 04:36:04|         264|         264|   null|                B02429|
|              B01482|2019-10-19 12:00:00|2019-10-19 12:20:00|         264|         264|   null|                B01482|
|              B03015|2019-10-11 14:28:00|2019-10-11 14:32:44|         264|         216|   null|                B03015|
|              B01529|2019-10-21 18:00:26|2019-10-21 18:07:21|         264|          80|   null|                B01529|
+--------------------+------------------

In [18]:
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [19]:
from pyspark.sql import functions as F

In [23]:
df \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .withColumn('dropoff_date', F.to_date(df.dropOff_datetime)) \
    .select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID') \
    .show()

1897493

In [26]:
df.filter(F.to_date(df.pickup_datetime) == '2019-10-15').count()

62610

In [57]:
from datetime import datetime

def dt_diff_in_hours(dt_start, dt_end):
    if (dt_start is None or dt_end is None):
        return 0
    return (dt_end - dt_start).total_seconds() / 3600


In [62]:
fmt = '%Y-%m-%d %H:%M:%S'
t1 = datetime.strptime('2019-10-01 10:05:43', fmt)
t2 = datetime.strptime('2019-10-02 10:05:43', fmt)
dt_diff_in_hours(t1, F.to_timestamp('2019-10-02 10:05:43'))

TypeError: 'Column' object is not callable

In [68]:
dt_diff_in_hours_udf=F.udf(dt_diff_in_hours,  returnType= types.DoubleType())

In [69]:
df \
    .withColumn('hours', dt_diff_in_hours_udf(F.to_timestamp(df.pickup_datetime), F.to_timestamp(df.dropOff_datetime)) \
    .select('pickup_datetime', 'dropOff_datetime', 'hours') \
    .show()


SyntaxError: incomplete input (482567243.py, line 4)

In [None]:
#spark.stop()