In [2]:
%autosave 120

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 


Autosaving every 120 seconds


# PySpark Training Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [3]:
#%help

####  Run this cell to set up and start your interactive session.

In [8]:
%idle_timeout 30
%glue_version 5.0
%worker_type G.1X
%number_of_workers 4

Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 3


In [10]:
%%configure
{
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://dip-pyspark-training/spark_ui_tmp/",
    "--enable-metrics": "true",
    "--enable-observability-metrics": "true",
    "--conf": "spark.sql.ui.retainedExecutions=100",
    "--conf": "spark.sql.ui.retainedStages=100",
    "--conf": "spark.sql.codegen.comments=true"
}

The following configurations have been updated: {'--enable-spark-ui': 'true', '--spark-event-logs-path': 's3://dip-pyspark-training/spark_ui_tmp/', '--enable-metrics': 'true', '--enable-observability-metrics': 'true', '--conf': 'spark.sql.codegen.comments=true'}


In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 3
Idle Timeout: 30
Session ID: 8c4a0016-6f30-452a-a956-637b30f693ce
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--enable-spark-ui true
--spark-event-logs-path s3://dip-pyspark-training/spark_ui_tmp/
--enable-metrics true
--enable-observability-metrics true
--conf spark.sql.codegen.comments=true
Waiting for session 8c4a0016-6f30-452a-a956-637b30f693ce to get into ready status...
Session 8c4a0016-6f30-452a-a956-637b30f693ce has been created.



### Get spark configuration

In [2]:
dynamic_allocation_enabled = spark.sparkContext.getConf().get('spark.dynamicAllocation.enabled')
dynamic_min_executors = spark.sparkContext.getConf().get('spark.dynamicAllocation.minExecutors')
dynamic_max_executors = spark.sparkContext.getConf().get('spark.dynamicAllocation.maxExecutors')
dynamic_initial_executors = spark.sparkContext.getConf().get('spark.dynamicAllocation.initialExecutors')

executor_instances = spark.sparkContext.getConf().get('spark.executor.instances')
executor_cores = spark.sparkContext.getConf().get('spark.executor.cores')
executor_memory = spark.sparkContext.getConf().get('spark.executor.memory')

driver_cores = spark.sparkContext.getConf().get('spark.driver.cores')
driver_memory = spark.sparkContext.getConf().get('spark.driver.memory')

print(f'''
Dynamic allocation enabled: {dynamic_allocation_enabled}
Dynamic min executors: {dynamic_min_executors}
Dynamic max executors: {dynamic_max_executors}
Dynamic initial executors: {dynamic_initial_executors}
----------------------------------------
Executor instances: {executor_instances}
Executor cores: {executor_cores}
Executor memory: {executor_memory}
----------------------------------------
Driver cores: {driver_cores}
Driver memory: {driver_memory}
''')


Dynamic allocation enabled: false
Dynamic min executors: 1
Dynamic max executors: 2
Dynamic initial executors: 3
----------------------------------------
Executor instances: 2
Executor cores: 4
Executor memory: 10g
----------------------------------------
Driver cores: 4
Driver memory: 10g


### Import libraries

In [3]:
import pyspark.sql.functions as F
import datetime




### Load data NY Taxi dataset

In [4]:
df = spark.read.format('parquet').load('s3://dip-pyspark-training/data/big/ny-taxi-dataset/')
p_df = spark.read.format('parquet').load('s3://dip-pyspark-training//data/big/ny-taxi-dataset-partitioned/')
#df.rdd.getNumPartitions()




In [11]:
#df.count()

In [12]:
#p_df.count()

In [None]:
df.schema

In [6]:
#df.show()

+---------+-------------------+-------------------+---------------+-------------+----------------+---------------+------------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|pickup_longitude|pickup_latitude|rate_code_id|store_and_fwd_flag|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|total_amount|
+---------+-------------------+-------------------+---------------+-------------+----------------+---------------+------------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+------------+
|      VTS|2009-11-07 19:44:00|2009-11-07 19:49:00|              2|         0.74|      -73.992127|      40.734658|        NULL|              NULL|        -73.99197|       40.729115|        CASH|        4.5|  0.0|    0.5|      

In [11]:
#df.filter(F.col('vendor_id') == 'VTS').explain(True)

In [6]:
df2 = df.withColumn('surcharge_amount', F.col('total_amount') * 0.1)




In [7]:
df3 = df2.withColumn('is_long_trip', F.col('trip_distance') > 10)




In [8]:
df4 = df3.withColumn('trip_category', F.when(F.col('passenger_count') <= 2, F.lit('small group')).when(F.col('passenger_count') <= 4, F.lit('medium group')).otherwise(F.lit('big group')))




In [9]:
df5 = df4.filter(F.col('vendor_id') == 'VTS')




In [12]:
df6 = df5.select(['vendor_id', 'total_amount', 'surcharge_amount', 'trip_distance', 'is_long_trip', 'passenger_count', 'trip_category'])




In [11]:
df6.count()

513853109


In [14]:
df6.explain(True)

== Parsed Logical Plan ==
'Project ['vendor_id, 'total_amount, 'surcharge_amount, 'trip_distance, 'is_long_trip, 'passenger_count, 'trip_category]
+- Filter (vendor_id#0 = VTS)
   +- Project [vendor_id#0, pickup_datetime#1, dropoff_datetime#2, passenger_count#3, trip_distance#4, pickup_longitude#5, pickup_latitude#6, rate_code_id#7, store_and_fwd_flag#8, dropoff_longitude#9, dropoff_latitude#10, payment_type#11, fare_amount#12, extra#13, mta_tax#14, tip_amount#15, tolls_amount#16, total_amount#17, surcharge_amount#72, is_long_trip#93, CASE WHEN (passenger_count#3 <= 2) THEN small group WHEN (passenger_count#3 <= 4) THEN medium group ELSE big group END AS trip_category#114]
      +- Project [vendor_id#0, pickup_datetime#1, dropoff_datetime#2, passenger_count#3, trip_distance#4, pickup_longitude#5, pickup_latitude#6, rate_code_id#7, store_and_fwd_flag#8, dropoff_longitude#9, dropoff_latitude#10, payment_type#11, fare_amount#12, extra#13, mta_tax#14, tip_amount#15, tolls_amount#16, total_

In [15]:
df6.show()

+---------+------------+------------------+-------------+------------+---------------+-------------+
|vendor_id|total_amount|  surcharge_amount|trip_distance|is_long_trip|passenger_count|trip_category|
+---------+------------+------------------+-------------+------------+---------------+-------------+
|      VTS|         5.0|               0.5|         0.74|       false|              2|  small group|
|      VTS|         6.7|              0.67|         1.04|       false|              1|  small group|
|      VTS|        12.7|              1.27|         4.05|       false|              1|  small group|
|      VTS|         3.8|              0.38|         0.33|       false|              1|  small group|
|      VTS|         5.0|               0.5|          0.6|       false|              1|  small group|
|      VTS|        11.0|               1.1|         1.37|       false|              2|  small group|
|      VTS|        11.0|               1.1|         3.03|       false|              1|  sma

In [16]:
ts = datetime.datetime.now()
output_file_path = 's3://dip-pyspark-training/output/dummy-output-01'
df6.write.mode('overwrite').format('parquet').save('s3://dip-pyspark-training/output/dummy-output-01')
pt = (datetime.datetime.now() - ts).seconds
print(f'The processing time was {pt} seconds')

The processing time was 123 seconds


In [29]:
c_non_partitioned = spark.read.format('parquet').load(output_file_path).count()
c_non_partitioned

513853109


### Using a partitioned source

In [20]:
p_df2 = p_df.withColumn('surcharge_amount', F.col('total_amount') * 0.1)
p_df3 = p_df2.withColumn('is_long_trip', F.col('trip_distance') > 10)
p_df4 = p_df3.withColumn('trip_category', F.when(F.col('passenger_count') <= 2, F.lit('small group')).when(F.col('passenger_count') <= 4, F.lit('medium group')).otherwise(F.lit('big group')))
p_df5 = p_df4.filter(F.col('vendor_id') == 'VTS')
p_df6 = p_df5.select(['vendor_id', 'total_amount', 'surcharge_amount', 'trip_distance', 'is_long_trip', 'passenger_count', 'trip_category'])




In [21]:
p_df6.explain(True)

== Parsed Logical Plan ==
'Project ['vendor_id, 'total_amount, 'surcharge_amount, 'trip_distance, 'is_long_trip, 'passenger_count, 'trip_category]
+- Filter (vendor_id#53 = VTS)
   +- Project [pickup_datetime#36, dropoff_datetime#37, passenger_count#38, trip_distance#39, pickup_longitude#40, pickup_latitude#41, rate_code_id#42, store_and_fwd_flag#43, dropoff_longitude#44, dropoff_latitude#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, vendor_id#53, surcharge_amount#378, is_long_trip#399, CASE WHEN (passenger_count#38 <= 2) THEN small group WHEN (passenger_count#38 <= 4) THEN medium group ELSE big group END AS trip_category#420]
      +- Project [pickup_datetime#36, dropoff_datetime#37, passenger_count#38, trip_distance#39, pickup_longitude#40, pickup_latitude#41, rate_code_id#42, store_and_fwd_flag#43, dropoff_longitude#44, dropoff_latitude#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount

In [33]:
ts = datetime.datetime.now()
output_file_path = 's3://dip-pyspark-training/output/dummy-output-02'
p_df6.write.mode('overwrite').format('parquet').save(output_file_path)
p_pt = (datetime.datetime.now() - ts).seconds
print(f'The processing time was {p_pt} seconds')

The processing time was 102 seconds


In [34]:
c_partitioned = spark.read.format('parquet').load(output_file_path).count()
c_partitioned

513853109


In [35]:
assert c_non_partitioned == c_partitioned




In [36]:
d = round(100 * (pt - p_pt)/p_pt, 2)
print(f'Using the partitioned source was {d}% faster than the non-partitioned one.')

Using the partitioned source was 20.59% faster than the non-partitioned one.


In [23]:
df.count()

1611611035


In [24]:
p_df.count()

1611611035
