### Import libraries

In [0]:
import pyspark.sql.functions as F
import datetime

### Loading the New York taxi datasets

In [0]:
# df:   yellow-taxi trips read from the Delta table under /databricks-datasets.
# p_df: the parquet dataset on the /mnt/dls mount; presumably a partitioned copy
#       of the same data (going by the path name) — TODO confirm partition column(s).
df = spark.read.load(
    '/databricks-datasets/nyctaxi/tables/nyctaxi_yellow',
    format='delta',
)
p_df = spark.read.load(
    '/mnt/dls/data/big/nyctaxi_yellow_partitioned',
    format='parquet',
)
# Optional check of how many partitions Spark uses for the Delta source:
# df.rdd.getNumPartitions()

In [0]:
# Row count of the Delta-backed DataFrame. count() is an action, so this runs a Spark job.
df.count()

In [0]:
# Row count of the parquet-backed DataFrame, for comparison with df.count().
p_df.count()

### Solving it using the non-partitioned dataset

In [0]:
# Add a surcharge column equal to 10% of each row's total_amount.
df2 = df.withColumn(
    'surcharge_amount',
    F.col('total_amount') * 0.1,
)

In [0]:
# Boolean flag: True when trip_distance is greater than 10
# (in whatever distance unit the dataset uses).
df3 = df2.withColumn(
    'is_long_trip',
    F.col('trip_distance') > 10,
)

In [0]:
# Bucket each trip by passenger_count: <= 2, then <= 4, then everything else.
# NOTE(review): a null passenger_count satisfies neither condition, so such rows
# fall through to 'big group' — confirm that is intended.
df4 = df3.withColumn(
    'trip_category',
    F.when(F.col('passenger_count') <= 2, F.lit('small group'))
    .when(F.col('passenger_count') <= 4, F.lit('medium group'))
    .otherwise(F.lit('big group')),
)

In [0]:
# Keep only the rows whose vendor_id is exactly 'VTS'.
df5 = df4.where(F.col('vendor_id') == 'VTS')

In [0]:
# Project the final output: the source columns used above plus the three derived ones.
df6 = df5.select(
    'vendor_id',
    'total_amount',
    'surcharge_amount',
    'trip_distance',
    'is_long_trip',
    'passenger_count',
    'trip_category',
)

In [0]:
# Number of rows in the final result; this action executes the whole transformation chain.
df6.count()

In [0]:
# Cross-check: the same vendor filter applied directly to the source. The added
# columns and the projection do not change the number of rows, so this value
# should equal df6.count().
df.where(F.col('vendor_id') == 'VTS').count()

In [0]:
# Render the result in the notebook's table viewer.
# NOTE(review): display() appears to be the Databricks notebook helper rather than
# an open-source PySpark DataFrame method — confirm before running elsewhere.
df6.display()

In [0]:
# Print the physical plan Spark will use to compute df6.
df6.explain()

In [0]:
# Time the write of the non-partitioned result. The write is an action, so it
# executes the full chain of transformations that defines df6.
output_file_path_non_partitioned = '/mnt/dls/results/oscar/ex01_non_partitioned'
ts = datetime.datetime.now()
# mode('overwrite') replaces whatever already exists at the output path.
df6.write.mode('overwrite').format('parquet').save(output_file_path_non_partitioned)
# total_seconds() returns the complete elapsed duration with sub-second precision.
# timedelta.seconds is only the seconds *component*: it ignores days and truncates
# microseconds, which can yield 0 and break the later ratio computation.
pt = round((datetime.datetime.now() - ts).total_seconds(), 2)
print(f'The processing time was {pt} seconds')

In [0]:
# Read the freshly written output back and count its rows, to verify the write.
c_non_partitioned = spark.read.load(
    output_file_path_non_partitioned,
    format='parquet',
).count()
c_non_partitioned

### Solving it using the partitioned dataset

In [0]:
# Same transformation steps as the non-partitioned section, starting from p_df.

# 1. Surcharge: 10% of total_amount.
p_df2 = p_df.withColumn(
    'surcharge_amount',
    F.col('total_amount') * 0.1,
)

# 2. Long-trip flag: trip_distance greater than 10.
p_df3 = p_df2.withColumn(
    'is_long_trip',
    F.col('trip_distance') > 10,
)

# 3. Group-size bucket from passenger_count (<= 2, <= 4, otherwise).
p_df4 = p_df3.withColumn(
    'trip_category',
    F.when(F.col('passenger_count') <= 2, F.lit('small group'))
    .when(F.col('passenger_count') <= 4, F.lit('medium group'))
    .otherwise(F.lit('big group')),
)

# 4. Keep only vendor 'VTS'.
p_df5 = p_df4.where(F.col('vendor_id') == 'VTS')

# 5. Final projection.
p_df6 = p_df5.select(
    'vendor_id',
    'total_amount',
    'surcharge_amount',
    'trip_distance',
    'is_long_trip',
    'passenger_count',
    'trip_category',
)

In [0]:
# Print the extended plan output (logical plans as well as the physical plan).
p_df6.explain(extended=True)

In [0]:
# Time the write of the partitioned-source result. The write is an action, so it
# executes the full chain of transformations that defines p_df6.
output_file_path_partitioned = '/mnt/dls/results/oscar/ex01_partitioned'
ts = datetime.datetime.now()
# mode('overwrite') replaces whatever already exists at the output path.
p_df6.write.mode('overwrite').format('parquet').save(output_file_path_partitioned)
# total_seconds() returns the complete elapsed duration with sub-second precision.
# timedelta.seconds is only the seconds *component*: it ignores days and truncates
# microseconds, which can yield 0 and break the later ratio computation.
p_pt = round((datetime.datetime.now() - ts).total_seconds(), 2)
print(f'The processing time was {p_pt} seconds')

In [0]:
# Read the freshly written output back and count its rows, to verify the write.
c_partitioned = spark.read.load(
    output_file_path_partitioned,
    format='parquet',
).count()
c_partitioned

In [0]:
# Both pipelines apply the same filter, so their outputs must have the same number
# of rows. Include both counts in the failure message so a mismatch can be
# diagnosed without re-running the expensive jobs.
assert c_non_partitioned == c_partitioned, (
    f'Row counts differ: non-partitioned={c_non_partitioned}, '
    f'partitioned={c_partitioned}'
)

In [0]:
# Compare the two write timings: pt (non-partitioned source) vs p_pt (partitioned source).
if pt > 0 and p_pt > 0:
    # d is the magnitude of the relative change with respect to the non-partitioned run.
    d = round(100 * abs(pt - p_pt) / pt, 2)
    # Report the actual direction instead of always claiming an improvement.
    direction = 'reduced' if p_pt <= pt else 'increased'
    speed_up = round(pt / p_pt, 2)
    print(
        f'Using the partitioned source {direction} execution time by {d}% '
        f'compared to the non-partitioned one. The speed-up factor is x{speed_up}'
    )
else:
    # A zero (or negative) timing makes both ratios undefined; report it rather
    # than raising ZeroDivisionError.
    d = None
    print(
        f'Cannot compare timings: non-partitioned={pt}s, partitioned={p_pt}s '
        f'(both must be greater than zero)'
    )