In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession

# Entry point to the PySpark application: a session with a "local" master
# and the application name below, obtained through getOrCreate().
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType

# Create the schema using StructField(Name, Type, Nullable)
schema = StructType([
    StructField("Pizza Name", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("Ingredients", ArrayType(StringType()), True),
])

# One tuple per menu item: (pizza name, price, list of ingredients)
df_data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Diavola", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("Prosciutto", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Speck & Brie", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("Tonno & Cipolle", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
    ("Fries", 3.95, ["Potatoes"]),
]

# Build the DataFrame from the rows and the explicit schema, then inspect it
df = spark.createDataFrame(df_data, schema)
df.printSchema()
df.show(truncate=False)

<h4>Grouping using groupBy</h4>

In [None]:
# Count: number of rows for each distinct price
price_counts = df.groupBy("Price").count()
price_counts.show(truncate=False)

In [None]:
# Minimum: groupBy() with no keys aggregates over the whole DataFrame
lowest_price = df.groupBy().min("Price")
lowest_price.show(truncate=False)

In [None]:
# Average: groupBy() with no keys aggregates over the whole DataFrame
mean_price = df.groupBy().avg("Price")
mean_price.show(truncate=False)

<h4>Grouping Multiple Columns</h4>

In [None]:
# Let's explode our array to perform more interesting operations
from pyspark.sql.functions import explode, col

# One output row per element of the Ingredients array; the exploded column
# is named "Ingredient" directly through alias().
exploded_df = df.select(
    col("Pizza Name"),
    col("Price"),
    explode(col("Ingredients")).alias("Ingredient"),
)
exploded_df.printSchema()
exploded_df.show(truncate=False)

In [None]:
# Counting rows for every (Ingredient, Price) combination
ingredient_price_counts = exploded_df.groupBy("Ingredient", "Price").count()
ingredient_price_counts.show(truncate=False)

<h4>Multiple Aggregations</h4>

In [None]:
# sum and max are imported under aliases: a bare `import sum, max` would rebind
# Python's built-in sum() and max() for every cell executed after this one.
# avg and count keep their plain names.
from pyspark.sql.functions import avg, count, max as spark_max, sum as spark_sum

# Several aggregates per pizza, computed in a single agg() call.
# exploded_df holds one row per ingredient, so "Sum Price" adds the pizza's
# price once for each of its ingredient rows.
exploded_df.groupBy("Pizza Name").agg(
    spark_sum("Price").alias("Sum Price"),
    avg("Price").alias("Average Price"),
    count("Ingredient").alias("Number of Ingredients"),
    spark_max("Price").alias("Price"),
).show(truncate=False)

In [None]:
# Let's keep only the pizzas with at least four ingredients
ingredients_per_pizza = exploded_df.groupBy("Pizza Name").agg(
    count("Ingredient").alias("Number of Ingredients")
)
ingredients_per_pizza.where(col("Number of Ingredients") >= 4).show(truncate=False)

<h4>Aggregate Functions</h4>

In [None]:
from pyspark.sql.functions import approx_count_distinct

# Count the number of unique values in a field (approximate distinct count)
distinct_ingredients_row = exploded_df.select(approx_count_distinct("Ingredient")).first()
print("Number of different ingredients", distinct_ingredients_row[0])

In [None]:
from pyspark.sql.functions import avg

# Compute the average price for a Pizza
average_price_row = df.select(avg("Price")).first()
print("Average Price: ", average_price_row[0])

In [None]:
from pyspark.sql.functions import collect_list

# Return all the values from a column (with duplicates)
all_ingredients_row = exploded_df.select(collect_list("Ingredient")).first()
all_ingredients_row[0]

In [None]:
# Same collect_list aggregation, displayed as a table without truncation
ingredient_list_df = exploded_df.select(collect_list("Ingredient"))
ingredient_list_df.show(truncate=False)

In [None]:
from pyspark.sql.functions import collect_set

# Return all the values from a column (without duplicates)
unique_ingredients_row = exploded_df.select(collect_set("Ingredient")).first()
unique_ingredients_row[0]

In [None]:
from pyspark.sql.functions import countDistinct

# Count the distinct (Ingredient, Price) combinations.
# The result column is named with alias() instead of renaming Spark's
# auto-generated "count(DISTINCT Ingredient, Price)" header afterwards:
# withColumnRenamed is a no-op when the given name is not in the schema, so a
# differently generated header would silently leave the column un-renamed.
exploded_df.select(
    countDistinct("Ingredient", "Price").alias("count")
).show(truncate=False)

In [None]:
from pyspark.sql.functions import first, last

# Select the first non-null element of the column.
# ignorenulls=True is what makes this "non-null": by default first() returns
# the first value it sees, even when that value is null.
df.select(first("Ingredients", ignorenulls=True)).show(truncate=False)

# Select the last non-null element of the column (last() has the same
# ignorenulls default, so the flag is needed here too).
df.select(last("Ingredients", ignorenulls=True)).show(truncate=False)

# A wide variety of functions is also available, like avg(), sum(), mean(), variance(), stddev(), etc.