<h4>Operations</h4>
<p>Spark supports two different types of operations:</p>
<ul>
    <li><b>Transformations</b> on RDDs return another RDD as a result (e.g., filter()). They are called lazy operations because they are not executed immediately: Spark only computes them when an action needs their result.</li>
    <li><b>Actions</b> return values from RDDs</li>
</ul>

In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession

# Build the SparkSession, the entry point to the PySpark application.
# getOrCreate() hands back the session configured by the builder calls below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType

# Create the schema using StructField(name, type, nullable)
schema = StructType([
    StructField("Pizza Name", StringType(), True),
    StructField("Price", FloatType(), True),
    StructField("Ingredients", ArrayType(StringType()), True),
])

# One tuple per row, in schema order: (pizza name, price, list of ingredients)
df_data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Diavola", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("Prosciutto", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Speck & Brie", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("Tonno & Cipolle", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
    ("Fries", 3.95, ["Potatoes"]),
]

# Build the DataFrame from the rows and the explicit schema, then inspect it
df = spark.createDataFrame(data=df_data, schema=schema)
df.printSchema()
df.show(truncate=False)

<h4>Filtering operations (i.e., WHERE conditions)</h4>

In [None]:
# Equality filter: keep only the rows whose Price equals 7.95
# NOTE(review): the literal is a string while Price is declared as FloatType;
# presumably Spark casts it for the comparison - confirm before replacing it
# with a numeric literal.
price_matches = df.Price == "7.95"
df.filter(price_matches).show(truncate=False)

In [None]:
# Inequality filter: keep only the rows whose Price differs from 7.95
(
    df
    .filter(df.Price != "7.95")
    .show(truncate=False)
)

In [None]:
# Same equality filter, but the column is referenced through col()
from pyspark.sql.functions import col

price = col("Price")
df.filter(price == "7.95").show(truncate=False)

In [None]:
# The filter condition can also be given as a SQL expression string
sql_condition = "Price == '7.95'"
df.filter(sql_condition).show(truncate=False)

In [None]:
# Several conditions are combined with & (logical AND).
# N.B. The parentheses around each condition are essential: in Python, &
# binds more tightly than ==, so without them the expression is parsed differently.
df.filter(
    (df.Price == "7.95")
    & (col("Pizza Name") == "Calzone")
).show(truncate=False)

In [None]:
# Filtering w.r.t. a list of elements
favourite_pizzas = ["Speck & Brie", "Tonno & Cipolle"]

# "is in" filtering: keep the rows whose name appears in the list
df.filter(col("Pizza Name").isin(favourite_pizzas)).show(truncate=False)

# "is not in" filtering: negate the condition with the ~ operator
# (the idiomatic Column negation, instead of comparing the result to False)
df.filter(~col("Pizza Name").isin(favourite_pizzas)).show(truncate=False)

In [None]:
# Filtering w.r.t. a list of elements computed from the DataFrame itself

# collect() -> bring the rows of the filtered, single-column DataFrame back
# to the driver as a Python list of Row objects
expensive_rows = df.filter(col("Price") == "7.95").select("Pizza Name").collect()

# Extract the value of the only selected field ("Pizza Name") from each row
expensive_pizzas = [row[0] for row in expensive_rows]

# "is in" filtering: keep the rows whose name appears in the list
df.filter(col("Pizza Name").isin(expensive_pizzas)).show(truncate=False)

# "is not in" filtering: negate the condition with the ~ operator
# (the idiomatic Column negation, instead of comparing the result to False)
df.filter(~col("Pizza Name").isin(expensive_pizzas)).show(truncate=False)

In [None]:
# Filtering based on the content of a string column
pizza_name = col("Pizza Name")

# Names that begin with the given letter(s)
df.filter(pizza_name.startswith("To")).show(truncate=False)

# Names that end with the given letter(s)
df.filter(pizza_name.endswith("one")).show(truncate=False)

# Names that contain the given substring
df.filter(pizza_name.contains("&")).show(truncate=False)

In [None]:
# Filtering using like (i.e., SQL LIKE): % stands for any sequence of characters
df.filter(col("Pizza Name").like("%on%")).show(truncate=False)

# Filtering using rlike (i.e., REGEX LIKE)
# [A-Za-z] matches ASCII letters only; the range A-z would also cover the
# punctuation characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
df.filter(col("Pizza Name").rlike(r"[A-Za-z]*&[A-Za-z]*")).show(truncate=False)

In [None]:
# Filtering on array columns
from pyspark.sql.functions import array_contains

# One condition per ingredient that must appear in the Ingredients array
has_tomato_sauce = array_contains(df.Ingredients, "Tomato Sauce")
has_basil = array_contains(df.Ingredients, "Basil")

# Filtering on a single value
df.filter(has_tomato_sauce).show(truncate=False)

# Filtering on multiple values: both ingredients must be present
df.filter(has_tomato_sauce & has_basil).show(truncate=False)

In [None]:
# Restrict the DataFrame to at most 5 rows, then display them
first_five = df.limit(5)
first_five.show(truncate=False)