# Spark training

## 1. Spark's core concepts

In [0]:
# Fetch the active Spark configuration once and read the executor/driver
# sizing from it. SparkConf.get returns None for a key that is not set.
spark_conf = spark.sparkContext.getConf()

(
    executor_instances,
    executor_cores,
    executor_memory,
    driver_cores,
    driver_memory,
) = (
    spark_conf.get(setting)
    for setting in (
        'spark.executor.instances',
        'spark.executor.cores',
        'spark.executor.memory',
        'spark.driver.cores',
        'spark.driver.memory',
    )
)

# Print the cluster sizing as a small report.
print(f'''
----------------------------------------
Executor instances: {executor_instances}
Executor cores: {executor_cores}
Executor memory: {executor_memory}
----------------------------------------
Driver cores: {driver_cores}
Driver memory: {driver_memory}
----------------------------------------
''')

## 2. Spark's unified framework

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import datetime

In [0]:
# Sample records used throughout the notebook: one (name, age) tuple per person.
data = [
    ('Alice', 34),
    ('Bob', 45),
    ('Cathy', 29),
    ('David', 50),
    ('Eve', 28),
    ('Frank', 20),
    ('Grace', 42),
    ('Hank', 21),
    ('Ivy', 26),
    ('Jack', 40),
    ('Karen', 19),
    ('Leo', 29),
    ('Mona', 35),
    ('Nina', 48),
    ('Javier', 38),
]
# Column names matching the tuple positions above.
columns = ['Name', 'Age']

In [0]:
# DataFrames (PySpark API): build a DataFrame from the local records, using
# `columns` as the column names, then render it in the notebook.
# NOTE(review): display() is not part of open-source PySpark; presumably it is
# provided by the Databricks runtime -- confirm before running elsewhere.
df = spark.createDataFrame(data, schema=columns)
df.display()

In [0]:
# df.rdd.glom().collect()

In [0]:
# Keep only the rows whose Age is strictly greater than 30 and render them.
age_over_30 = F.col('Age') > 30
df.filter(age_over_30).display()

In [0]:
# DataFrames SQL: register df as a temporary view named 'friends' so it can be
# queried with spark.sql. An existing view with the same name is replaced.
df.createOrReplaceTempView('friends')

In [0]:
# Same "older than 30" filter as the DataFrame API version, written as SQL
# against the 'friends' temporary view.
friends_over_30_query = '''
    SELECT *
    FROM friends
    WHERE Age > 30
    '''
spark.sql(friends_over_30_query).display()

In [0]:
# RDDs: distribute the local `data` list as an RDD of (name, age) tuples, then
# collect() it back to the driver as a Python list to inspect it.
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

In [0]:
# Transformation only: keep the (name, age) tuples whose age exceeds 30.
# Nothing is computed until an action is called on filtered_rdd.
filtered_rdd = rdd.filter(lambda person: person[1] > 30)

In [0]:
# Action: runs the pending filter and returns the matching elements to the
# driver as a Python list.
filtered_rdd.collect()

## 3. RDDs: Resilient Distributed Datasets 

### RDDs - Example 1

In [0]:
# Build a plain driver-side Python list holding the integers 0 .. 10**6 - 1.
# list(range(...)) materialises the range directly; a comprehension that only
# copies each element adds nothing.
this_is_a_variable = list(range(10**6))
# Preview the first 20 values (shown as the cell output in a notebook).
this_is_a_variable[:20]

In [0]:
# Distribute the driver-side list as an RDD. No numSlices argument is passed,
# so Spark picks the number of partitions itself.
rdd2 = spark.sparkContext.parallelize(this_is_a_variable)

In [0]:
# How many partitions the data was split into.
rdd2.getNumPartitions()

In [0]:
# glom() groups the elements of each partition into one list, so collect()
# returns a list of per-partition lists.
# NOTE: this pulls every element back to the driver -- fine for a demo of this
# size, but avoid it on large RDDs.
rdd2_detailed_partitions = rdd2.glom().collect()

In [0]:
# Summarise every partition: its index, its size, and its first and last five
# elements.
for i, partition_i in enumerate(rdd2_detailed_partitions):
    size = len(partition_i)
    head, tail = partition_i[:5], partition_i[-5:]
    print(f'Partition {i} has {size} items: {head} ->  {tail}')

In [0]:
# Transformation only: keep the values strictly greater than 10**6 - 100.
# Note that this rebinds the name filtered_rdd to a new RDD derived from rdd2.
filtered_rdd = rdd2.filter(lambda value: value > 10**6 - 100)

In [0]:
# Action: evaluates the filter and returns the surviving values to the driver.
filtered_rdd.collect()

In [0]:
# Partition count after filtering. filter() does not repartition, so this is
# expected to equal the partition count of the RDD it was derived from.
filtered_rdd.getNumPartitions()

In [0]:
# Collect the filtered RDD as one list per partition (via glom()) to see where
# the surviving elements live.
filtered_rdd_detailed_partitions = filtered_rdd.glom().collect()

In [0]:
# Summarise every partition of the filtered RDD: index, size, and the first
# and last five elements (empty partitions print as [] ->  []).
for i, partition_i in enumerate(filtered_rdd_detailed_partitions):
    size = len(partition_i)
    head, tail = partition_i[:5], partition_i[-5:]
    print(f'Partition {i} has {size} items: {head} ->  {tail}')

### RDDs - Example 2

In [0]:
# Show the (name, age) records held by rdd before transforming them.
rdd.collect()

In [0]:
# Lazy transformation: map() only records the computation. Each (name, age)
# tuple becomes (name, 'retires in <67 - age> years') once an action runs.
time_to_retirement_rdd = rdd.map(
    lambda person: (person[0], f'retires in {67 - person[1]} years')
)

In [0]:
# Action: this is the point where the map above is actually executed; the
# results are returned to the driver.
time_to_retirement_rdd.collect()

## 4. Lazy vs eager evaluation 

In [0]:
# Action: prints the DataFrame's rows as a text table (show() prints at most
# 20 rows by default).
df.show()

In [0]:
# Lazy: add a 'retirement_in' column holding 67 minus the person's age.
# withColumn returns a new DataFrame; no job runs yet.
retirement_age = 67
df2 = df.withColumn('retirement_in', retirement_age - F.col('Age'))

In [0]:
# Lazy: add a boolean 'older_than_30' flag. The explicit when/otherwise maps
# every row where the comparison is not true to False.
is_over_30 = F.col('Age') > 30
df3 = df2.withColumn(
    'older_than_30',
    F.when(is_over_30, F.lit(True)).otherwise(F.lit(False)),
)

In [0]:
# Action: only now are the two withColumn transformations evaluated, so the
# rows can be printed.
df3.show()

In [0]:
# Print the physical plan Spark built for df3 (no data is processed).
df3.explain()

In [0]:
# Lazy: per 'older_than_30' group, compute the head count plus the mean and
# standard deviation of Age.
age_group_aggregations = [
    F.count('Name').alias('total_people'),
    F.mean('Age').alias('mean_age'),
    F.stddev('Age').alias('stddev_age'),
]
df4 = df3.groupBy('older_than_30').agg(*age_group_aggregations)

In [0]:
# Action: runs the grouped aggregation and prints one row per group.
df4.show()

In [0]:
# Extended explain: prints the logical plans (parsed, analyzed, optimized) in
# addition to the physical plan.
df4.explain(True)