# PySpark Training Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.



####  Run this cell to set up and start your interactive session.

In [5]:
%idle_timeout 60
%glue_version 5.0
%worker_type G.1X
%number_of_workers 4

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 3


In [7]:
%%configure
{
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://dip-pyspark-training/spark_ui_tmp/",
    "--enable-metrics": "true",
    "--enable-observability-metrics": "true",
    "--conf": "spark.sql.codegen.comments=true",
    "--conf": "spark.sql.codegen.fallback=true",
    "--conf": "spark.sql.codegen.wholeStage=true",
    "--conf": "spark.sql.ui.explainMode=extended",
    "--conf": "spark.sql.ui.retainedExecutions=100",
    "--conf": "spark.ui.retainedJobs=1000",
    "--conf": "spark.ui.retainedStages=1000",
    "--conf": "spark.ui.retainedTasks=10000",
    "--conf": "spark.ui.showAdditionalMetrics=true"
}

The following configurations have been updated: {'--enable-spark-ui': 'true', '--spark-event-logs-path': 's3://dip-pyspark-training/spark_ui_tmp/', '--enable-metrics': 'true', '--enable-observability-metrics': 'true', '--conf': 'spark.sql.codegen.comments=true'}


#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 3
Idle Timeout: 60
Session ID: 90bb883e-a75d-44fd-9fa3-384021c6ee2b
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--enable-spark-ui true
--spark-event-logs-path s3://dip-pyspark-training/spark_ui_tmp/
--enable-metrics true
--enable-observability-metrics true
--conf spark.sql.codegen.comments=true
Waiting for session 90bb883e-a75d-44fd-9fa3-384021c6ee2b to get into ready status...
Session 90bb883e-a75d-44fd-9fa3-384021c6ee2b has been created.



### Get spark configuration


In [2]:
dynamic_allocation_enabled = spark.sparkContext.getConf().get('spark.dynamicAllocation.enabled')
dynamic_min_executors = spark.sparkContext.getConf().get('spark.dynamicAllocation.minExecutors')
dynamic_max_executors = spark.sparkContext.getConf().get('spark.dynamicAllocation.maxExecutors')
dynamic_initial_executors = spark.sparkContext.getConf().get('spark.dynamicAllocation.initialExecutors')

executor_instances = spark.sparkContext.getConf().get('spark.executor.instances')
executor_cores = spark.sparkContext.getConf().get('spark.executor.cores')
executor_memory = spark.sparkContext.getConf().get('spark.executor.memory')

driver_cores = spark.sparkContext.getConf().get('spark.driver.cores')
driver_memory = spark.sparkContext.getConf().get('spark.driver.memory')

print(f'''
Dynamic allocation enabled: {dynamic_allocation_enabled}
Dynamic min executors: {dynamic_min_executors}
Dynamic max executors: {dynamic_max_executors}
Dynamic initial executors: {dynamic_initial_executors}
----------------------------------------
Executor instances: {executor_instances}
Executor cores: {executor_cores}
Executor memory: {executor_memory}
----------------------------------------
Driver cores: {driver_cores}
Driver memory: {driver_memory}
''')


Dynamic allocation enabled: false
Dynamic min executors: 1
Dynamic max executors: 2
Dynamic initial executors: 3
----------------------------------------
Executor instances: 2
Executor cores: 4
Executor memory: 10g
----------------------------------------
Driver cores: 4
Driver memory: 10g


### Import libraries

In [3]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import datetime




In [4]:
df = spark.read.format('parquet').load('s3://dip-pyspark-training/ny-taxi-dataset-partitioned/')
df.rdd.getNumPartitions()

432


In [8]:
#df.select('vendor_id').distinct().show()

In [9]:
#df.show()

In [5]:
# Define the data as lists
vendors = ['VTS', 'CMT', 'DDS', 'VTS', 'CMT', 'DDS']
payment_type = ['CASH', 'CASH', 'CASH', 'CREDIT', 'CREDIT', 'CREDIT']
extra_col = ['A', 'B', 'C', 'D', 'E', 'F']

# Define the schema of the dataframe
schema = T.StructType([
    T.StructField("vendor_id", T.StringType(), False),
    T.StructField("payment_type", T.StringType(), False),
    T.StructField("extra_col_from_m", T.StringType(), False)
])

# Create a list of tuples
data = [(vendors[i], payment_type[i], extra_col[i]) for i in range(len(vendors))]

# Create a PySpark dataframe
m_df = spark.createDataFrame(data, schema)
#m_df.show()




In [6]:
#joined_df = df.join(other=m_df.hint('broadcast'), how='left', on = ['vendor_id', 'payment_type']).filter(F.col('extra_col_from_m').isNotNull())
joined_df = df.join(other=m_df.hint('broadcast'), how='left', on = ['vendor_id', 'payment_type']).filter(F.col('extra_col_from_m').isNotNull())
joined_df.explain(True)

== Parsed Logical Plan ==
'Filter isnotnull('extra_col_from_m)
+- Project [vendor_id#17, payment_type#10, pickup_datetime#0, dropoff_datetime#1, passenger_count#2, trip_distance#3, pickup_longitude#4, pickup_latitude#5, rate_code_id#6, store_and_fwd_flag#7, dropoff_longitude#8, dropoff_latitude#9, fare_amount#11, extra#12, mta_tax#13, tip_amount#14, tolls_amount#15, total_amount#16, extra_col_from_m#38]
   +- Join LeftOuter, ((vendor_id#17 = vendor_id#36) AND (payment_type#10 = payment_type#37))
      :- Relation [pickup_datetime#0,dropoff_datetime#1,passenger_count#2,trip_distance#3,pickup_longitude#4,pickup_latitude#5,rate_code_id#6,store_and_fwd_flag#7,dropoff_longitude#8,dropoff_latitude#9,payment_type#10,fare_amount#11,extra#12,mta_tax#13,tip_amount#14,tolls_amount#15,total_amount#16,vendor_id#17] parquet
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [vendor_id#36, payment_type#37, extra_col_from_m#38], false

== Analyzed Logical Plan ==
vendor_id: string, paym

In [None]:
merged_df.write.format('parquet').mode('overwrite').save('s3://dip-pyspark-training/dummy-output-03/')