# PySpark Training Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


####  Run these cells to configure your interactive session

In [None]:
%idle_timeout 30
%glue_version 5.0
%worker_type G.1X
%number_of_workers 4

In [None]:
%%configure
{
    "--enable-spark-ui": "true",
    "--spark-event-logs-path": "s3://dip-pyspark-training/spark_ui_tmp/",
    "--enable-metrics": "true",
    "--enable-observability-metrics": "true",
    "--conf": "spark.sql.ui.retainedExecutions=100",
    "--conf": "spark.sql.ui.retainedStages=100",
    "--conf": "spark.sql.codegen.comments=true"
}

### Start the Spark session

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain the SparkContext for this session (an existing one is reused).
sc = SparkContext.getOrCreate()

# Wrap it in a GlueContext; the Job handle and the SparkSession used by the
# rest of the notebook are both derived from it.
glueContext = GlueContext(sc)
job = Job(glueContext)
spark = glueContext.spark_session

### Get Spark's configuration

In [None]:
# Fetch the session's SparkConf once and read every setting from it.
spark_conf = spark.sparkContext.getConf()

# Dynamic allocation settings.
dynamic_allocation_enabled = spark_conf.get('spark.dynamicAllocation.enabled')
dynamic_min_executors = spark_conf.get('spark.dynamicAllocation.minExecutors')
dynamic_max_executors = spark_conf.get('spark.dynamicAllocation.maxExecutors')
dynamic_initial_executors = spark_conf.get('spark.dynamicAllocation.initialExecutors')

# Executor sizing.
executor_instances = spark_conf.get('spark.executor.instances')
executor_cores = spark_conf.get('spark.executor.cores')
executor_memory = spark_conf.get('spark.executor.memory')

# Driver sizing.
driver_cores = spark_conf.get('spark.driver.cores')
driver_memory = spark_conf.get('spark.driver.memory')

# Print one setting per line, grouped by section; the empty first and last
# entries produce the leading and trailing blank lines of the report.
report_lines = [
    '',
    f'Dynamic allocation enabled: {dynamic_allocation_enabled}',
    f'Dynamic min executors: {dynamic_min_executors}',
    f'Dynamic max executors: {dynamic_max_executors}',
    f'Dynamic initial executors: {dynamic_initial_executors}',
    '----------------------------------------',
    f'Executor instances: {executor_instances}',
    f'Executor cores: {executor_cores}',
    f'Executor memory: {executor_memory}',
    '----------------------------------------',
    f'Driver cores: {driver_cores}',
    f'Driver memory: {driver_memory}',
    '',
]
print('\n'.join(report_lines))

### Import libraries

In [None]:
import pyspark.sql.functions as F
import datetime

### Loading the New York taxi dataset

In [None]:
# Load the taxi data from two Parquet locations: `df` reads the plain copy and
# `p_df` reads the copy stored under the "-partitioned" prefix. The rest of the
# notebook runs the same pipeline on both to compare them.
df = spark.read.format('parquet').load('s3://dip-pyspark-training/data/big/ny-taxi-dataset/')
# The bucket name is followed by a single '/', matching the path used for `df`
# (the previous value had a stray double slash).
p_df = spark.read.format('parquet').load('s3://dip-pyspark-training/data/big/ny-taxi-dataset-partitioned/')
#df.rdd.getNumPartitions()

In [None]:
#df.count()

In [None]:
#p_df.count()

In [None]:
# Display the schema Spark resolved for the non-partitioned dataset.
df.schema

In [None]:
#df.show()

In [None]:
#df.filter(F.col('vendor_id') == 'VTS').explain(True)

In [None]:
# Add `surcharge_amount`, computed as 10% of `total_amount`.
df2 = df.withColumn(
    'surcharge_amount',
    F.col('total_amount') * 0.1,
)

In [None]:
# Add a boolean `is_long_trip` flag: true when `trip_distance` is greater than 10.
df3 = df2.withColumn(
    'is_long_trip',
    F.col('trip_distance') > 10,
)

In [None]:
# Label each trip by group size based on `passenger_count`:
#   <= 2 -> 'small group', <= 4 -> 'medium group', everything else -> 'big group'.
# NOTE(review): rows matching neither comparison (presumably including a NULL
# passenger_count) end up in 'big group' -- confirm that this is intended.
df4 = df3.withColumn(
    'trip_category',
    F.when(F.col('passenger_count') <= 2, F.lit('small group'))
     .when(F.col('passenger_count') <= 4, F.lit('medium group'))
     .otherwise(F.lit('big group')),
)

In [None]:
# Keep only the rows whose `vendor_id` equals 'VTS'.
df5 = df4.filter(F.col('vendor_id') == 'VTS')

In [None]:
# Project the output down to the source columns used above plus the three
# derived columns.
df6 = df5.select([
    'vendor_id',
    'total_amount',
    'surcharge_amount',
    'trip_distance',
    'is_long_trip',
    'passenger_count',
    'trip_category',
])

In [None]:
# count() is an action: it executes the transformations defined for df6 and
# returns the number of resulting rows.
df6.count()

In [None]:
# Print the extended query plan for df6 (logical plans as well as the physical plan).
df6.explain(True)

In [None]:
# Preview the first rows of the result (show() prints 20 rows by default).
df6.show()

In [None]:
# Write the pipeline result for the non-partitioned source to S3 as Parquet
# (overwriting any previous output) and measure how long the write takes.
# The write is what triggers the actual Spark job.
output_file_path_non_partitioned = 's3://dip-pyspark-training/output/dummy-output-01'

ts = datetime.datetime.now()
df6.write.mode('overwrite').format('parquet').save(output_file_path_non_partitioned)
# total_seconds() keeps fractions of a second and any whole days of the
# elapsed time; the `.seconds` attribute would drop both.
pt = (datetime.datetime.now() - ts).total_seconds()
print(f'The processing time was {pt:.2f} seconds')

In [None]:
# Read back the Parquet output written for the non-partitioned source and
# count its rows; the count is kept for the comparison made later on.
c_non_partitioned = (
    spark.read
    .format('parquet')
    .load(output_file_path_non_partitioned)
    .count()
)
c_non_partitioned

### Using a partitioned source

In [None]:
# Same transformation steps as for `df`, applied to the partitioned source.

# 1. `surcharge_amount` = 10% of `total_amount`.
p_df2 = p_df.withColumn(
    'surcharge_amount',
    F.col('total_amount') * 0.1,
)

# 2. `is_long_trip` is true when `trip_distance` is greater than 10.
p_df3 = p_df2.withColumn(
    'is_long_trip',
    F.col('trip_distance') > 10,
)

# 3. `trip_category` from `passenger_count`:
#    <= 2 -> 'small group', <= 4 -> 'medium group', everything else -> 'big group'.
p_df4 = p_df3.withColumn(
    'trip_category',
    F.when(F.col('passenger_count') <= 2, F.lit('small group'))
     .when(F.col('passenger_count') <= 4, F.lit('medium group'))
     .otherwise(F.lit('big group')),
)

# 4. Keep only the rows whose `vendor_id` equals 'VTS'.
p_df5 = p_df4.filter(F.col('vendor_id') == 'VTS')

# 5. Keep only the columns needed for the output.
p_df6 = p_df5.select([
    'vendor_id',
    'total_amount',
    'surcharge_amount',
    'trip_distance',
    'is_long_trip',
    'passenger_count',
    'trip_category',
])

In [None]:
# Print the extended query plan for the partitioned-source pipeline, so it can
# be compared with the plan printed for df6.
p_df6.explain(True)

In [None]:
# Write the pipeline result for the partitioned source to S3 as Parquet
# (overwriting any previous output) and measure how long the write takes.
output_file_path_partitioned = 's3://dip-pyspark-training/output/dummy-output-02'

ts = datetime.datetime.now()
p_df6.write.mode('overwrite').format('parquet').save(output_file_path_partitioned)
# total_seconds() keeps fractions of a second and any whole days of the
# elapsed time; the `.seconds` attribute would drop both, and could report 0
# for a sub-second write, which would break the later division by p_pt.
p_pt = (datetime.datetime.now() - ts).total_seconds()
print(f'The processing time was {p_pt:.2f} seconds')

In [None]:
# Read back the Parquet output written for the partitioned source and count
# its rows; the count is kept for the comparison with the other output.
c_partitioned = (
    spark.read
    .format('parquet')
    .load(output_file_path_partitioned)
    .count()
)
c_partitioned

In [None]:
# Both pipelines are expected to write the same number of rows; report both
# counts if they differ so the mismatch can be diagnosed from the error alone.
assert c_non_partitioned == c_partitioned, (
    f'Row counts differ: non-partitioned={c_non_partitioned}, partitioned={c_partitioned}'
)

In [None]:
# Relative difference between the two write times, expressed as a percentage
# of the partitioned run's time (positive means the partitioned run was faster).
if p_pt > 0:
    d = round(100 * (pt - p_pt) / p_pt, 2)
    if d >= 0:
        print(f'Using the partitioned source was {d}% faster than the non-partitioned one.')
    else:
        # A negative value means the partitioned run took longer; say so
        # instead of printing a negative "faster" percentage.
        print(f'Using the partitioned source was {abs(d)}% slower than the non-partitioned one.')
else:
    # A measured time of 0 would make the ratio undefined (division by zero).
    d = None
    print('The partitioned processing time was measured as 0 seconds; the relative speed-up cannot be computed.')

In [None]:
# Total number of rows in the non-partitioned source dataset.
df.count()

In [None]:
# Total number of rows in the partitioned source dataset.
p_df.count()