# Spark training

## Get Spark's configuration

In [0]:
# Report the resource-related settings of the active Spark session.
# The configuration is fetched once and reused for every lookup.
# SparkConf.get() returns None for a key that is not set, so unset
# settings are printed as "None" rather than raising.
spark_conf = spark.sparkContext.getConf()

# Dynamic allocation settings.
dynamic_allocation_enabled = spark_conf.get('spark.dynamicAllocation.enabled')
dynamic_min_executors = spark_conf.get('spark.dynamicAllocation.minExecutors')
dynamic_max_executors = spark_conf.get('spark.dynamicAllocation.maxExecutors')
dynamic_initial_executors = spark_conf.get('spark.dynamicAllocation.initialExecutors')

# Executor sizing.
executor_instances = spark_conf.get('spark.executor.instances')
executor_cores = spark_conf.get('spark.executor.cores')
executor_memory = spark_conf.get('spark.executor.memory')

# Driver sizing.
driver_cores = spark_conf.get('spark.driver.cores')
driver_memory = spark_conf.get('spark.driver.memory')

print(f'''
Dynamic allocation enabled: {dynamic_allocation_enabled}
Dynamic min executors: {dynamic_min_executors}
Dynamic max executors: {dynamic_max_executors}
Dynamic initial executors: {dynamic_initial_executors}
----------------------------------------
Executor instances: {executor_instances}
Executor cores: {executor_cores}
Executor memory: {executor_memory}
----------------------------------------
Driver cores: {driver_cores}
Driver memory: {driver_memory}
''')

### Import libraries

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import datetime

In [0]:
# Load the customers dataset (Parquet) into a DataFrame.
c_df = spark.read.parquet('/mnt/dls/data/small/customers')

In [0]:
# NOTE(review): disabled write of c_df to the same path it is loaded from —
# presumably a one-off used to create the sample data; confirm before re-enabling.
#c_df.write.format('parquet').save('/mnt/dls/data/small/customers')

In [0]:
# Show the schema of the customers DataFrame as this cell's output.
c_df.schema

In [0]:
# Render the customers DataFrame in the notebook output.
# NOTE(review): DataFrame.display() is not part of open-source PySpark —
# this assumes a Databricks-style runtime; confirm.
c_df.display()

In [0]:
# Load the transactions dataset (Parquet) into a DataFrame.
t_df = spark.read.parquet('/mnt/dls/data/small/transactions')

# Cell output: number of partitions backing the loaded DataFrame.
t_df.rdd.getNumPartitions()

In [0]:
# Render the transactions DataFrame in the notebook output.
# NOTE(review): DataFrame.display() is not part of open-source PySpark —
# this assumes a Databricks-style runtime; confirm.
t_df.display()

### Examples of narrow transformations

In [0]:
# Add first_name: the first space-separated token of the name column.
tmp_01_df = c_df.withColumn(
    'first_name',
    F.split(F.col('name'), ' ').getItem(0),
)

In [0]:
# Add last_name: the second space-separated token (index 1) of the name column.
# NOTE(review): for names with more than two tokens this is not the final
# token — confirm the data only contains "first last" names.
tmp_02_df = tmp_01_df.withColumn(
    'last_name',
    F.split(F.col('name'), ' ').getItem(1),
)

In [0]:
# Keep only the columns needed downstream; the original name column is dropped.
tmp_03_df = tmp_02_df.select(
    'cust_id',
    'first_name',
    'last_name',
    'city',
    'gender',
    'birthday',
)

In [0]:
# Keep only rows whose city equals 'chicago' (exact, case-sensitive match).
tmp_04_df = tmp_03_df.where(F.col('city') == 'chicago')

In [0]:
# Render the filtered result in the notebook output.
# NOTE(review): DataFrame.display() is not part of open-source PySpark —
# this assumes a Databricks-style runtime; confirm.
tmp_04_df.display()

In [0]:
# Print the extended query plan (logical and physical) for the chain of
# transformations that produced tmp_04_df.
tmp_04_df.explain(extended=True)

### Examples of wide transformations

#### Repartition

In [0]:
# Cell output: current number of partitions of the transactions DataFrame,
# shown before repartitioning for comparison.
t_df.rdd.getNumPartitions()

In [0]:
# Request 20 partitions and print the extended plan of the result.
# Nothing is assigned, so t_df itself is left unchanged.
t_df.repartition(numPartitions=20).explain(extended=True)