# PySpark Training Notebook
##### A refresher on basic concepts

####  Run these cells to configure your interactive session

In [None]:
%idle_timeout 60
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2

In [None]:
%%configure
{
    "--spark-event-logs-path": "s3://dip-pyspark-training/spark_ui_tmp/"
}

### Start the Spark session

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the active SparkContext if there is one; otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession handle used by the cells that follow.
spark = glueContext.spark_session
# Glue Job object built on the same context (not referenced again in the cells visible here).
job = Job(glueContext)

### Spark's Core components

In [None]:
# Read the Spark configuration once and look up each sizing property from it.
conf = spark.sparkContext.getConf()

executor_instances = conf.get('spark.executor.instances')
executor_cores = conf.get('spark.executor.cores')
executor_memory = conf.get('spark.executor.memory')

driver_cores = conf.get('spark.driver.cores')
driver_memory = conf.get('spark.driver.memory')

# Build the report line by line; the leading and trailing empty strings
# reproduce the blank line before and after the framed block.
rule = '----------------------------------------'
report_lines = [
    '',
    rule,
    f'Executor instances: {executor_instances}',
    f'Executor cores: {executor_cores}',
    f'Executor memory: {executor_memory}',
    rule,
    f'Driver cores: {driver_cores}',
    f'Driver memory: {driver_memory}',
    rule,
    '',
]
print('\n'.join(report_lines))

In [None]:
# List every (key, value) pair in the active Spark configuration.
# NOTE(review): the dump may contain environment-specific settings — consider clearing the output before sharing.
spark.sparkContext.getConf().getAll()

### Import libraries

In [None]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import datetime

### Spark's Unified Framework

In [None]:
# Sample (name, age) records reused by the DataFrame and RDD examples.
data = [
    ('Alice', 34),
    ('Bob', 45),
    ('Cathy', 29),
    ('David', 50),
    ('Eve', 28),
    ('Frank', 20),
    ('Grace', 42),
    ('Hank', 21),
    ('Ivy', 26),
    ('Jack', 40),
    ('Karen', 19),
    ('Leo', 29),
    ('Mona', 35),
    ('Nina', 48),
    ('Javier', 38),
]
# Column labels, in the same order as the tuple fields above.
columns = ['Name', 'Age']

In [None]:
# DataFrame API: build a DataFrame from the sample rows and preview it.
df = spark.createDataFrame(data, schema=columns)
df.show()

In [None]:
# Optional: uncomment to see how the DataFrame rows are spread across partitions.
# df.rdd.glom().collect()

In [None]:
# Keep only the rows whose Age is above 30, then print them.
age_over_30 = F.col('Age') > 30
df.filter(age_over_30).show()

In [None]:
# SQL API: register df as the temp view 'friends' so it can be queried through spark.sql.
df.createOrReplaceTempView('friends')

In [None]:
# Same Age > 30 filter as the DataFrame example, expressed as SQL on the 'friends' view.
friends_over_30_sql = '''
    SELECT *
    FROM friends
    WHERE Age > 30
    '''
spark.sql(friends_over_30_sql).show()

In [None]:
# RDD API: distribute the same Python list as an RDD, then collect it back to the driver.
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

In [None]:
# Transformation only: keep the (name, age) tuples whose age (index 1) is above 30.
filtered_rdd = rdd.filter(lambda person: person[1] > 30)

In [None]:
# Action: run the filter and return the matching tuples to the driver.
filtered_rdd.collect()

### RDDs - Example 1

In [None]:
# Build the integers 0 .. 999,999 on the driver. list(range(...)) produces the
# same list as the pass-through comprehension without the per-item Python loop.
this_is_a_variable = list(range(10**6))
# Preview the first ten values only.
this_is_a_variable[:10]

In [None]:
# Distribute the driver-side list as an RDD.
rdd2 = spark.sparkContext.parallelize(this_is_a_variable)

In [None]:
# How many partitions did parallelize() split the list into?
rdd2.getNumPartitions()

In [None]:
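# Optional: uncomment to view the per-partition contents of rdd2.
# Caution: this pulls all 10**6 elements back to the driver.
# l = rdd2.glom().collect()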

In [None]:
# Keep only the values greater than 10**6 - 100 (the tail of the range).
# Note: this rebinds filtered_rdd, which previously held the Age > 30 filter.
filtered_rdd = rdd2.filter(lambda value: value > 10**6 - 100)

In [None]:
# Action: run the filter and fetch the surviving values.
filtered_rdd.collect()

In [None]:
# Partition count of the filtered RDD (compare with rdd2.getNumPartitions()).
filtered_rdd.getNumPartitions()

In [None]:
# glom() turns each partition into a list, so the result shows which partitions hold the surviving values.
filtered_rdd.glom().collect()

### RDDs - Example 2

In [None]:
# Lazy transformation: for each (name, age) tuple, compute the years left until 67
# (the retirement age this example assumes). Nothing runs until an action is called.
time_to_retirement_rdd = rdd.map(lambda person: 67 - person[1])

In [None]:
# Action: triggers the map and returns the computed values to the driver.
time_to_retirement_rdd.collect()

In [None]:
# Partition count of the source RDD.
rdd.getNumPartitions()

In [None]:
# Source tuples grouped by the partition that holds them.
rdd.glom().collect()

In [None]:
# Partition count after the map (compare with rdd.getNumPartitions()).
time_to_retirement_rdd.getNumPartitions()

In [None]:
# Mapped values grouped by partition (compare with rdd.glom().collect()).
time_to_retirement_rdd.glom().collect()

### Lazy transformations vs. eager actions

In [None]:
# Action: print the rows of the original DataFrame.
df.show()

In [None]:
# Partition count of the RDD backing df.
df.rdd.getNumPartitions()

In [None]:
# Lazy: add 'retirement_in' = 67 - Age. Only the plan is extended; nothing executes yet.
df2 = df.withColumn('retirement_in', F.lit(67) - F.col('Age'))

In [None]:
# Lazy: flag each row with True when Age is above 30 and False otherwise.
is_older_than_30 = F.when(F.col('Age') > 30, F.lit(True)).otherwise(F.lit(False))
df3 = df2.withColumn('older_than_30', is_older_than_30)

In [None]:
# Action: show() triggers execution of the withColumn steps that built df3.
df3.show()

In [None]:
# Print the physical plan Spark uses for df3.
df3.explain()

In [None]:
# Lazy: group by the older_than_30 flag and compute the head-count plus the
# mean and standard deviation of Age for each group.
age_stats = [
    F.count('Name').alias('total_people'),
    F.mean('Age').alias('mean_age'),
    F.stddev('Age').alias('stddev_age'),
]
df4 = df3.groupBy('older_than_30').agg(*age_stats)

In [None]:
# Action: executes the aggregation and prints one row per older_than_30 value.
df4.show()

In [None]:
# Partition count of the RDD backing df2.
df2.rdd.getNumPartitions()

In [None]:
# Extended explain: prints the logical plans as well as the physical plan for df4.
df4.explain(True)