In [None]:
# Import the basic spark library
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build the SparkSession (or reuse the one that already exists); it is the
# entry point to the PySpark application.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType

# Schema of the pizza table; every column is declared as
# StructField(name, type, nullable).
schema = StructType([
    StructField("Pizza Name", StringType(), nullable=True),
    StructField("Price", FloatType(), nullable=True),
    StructField("Ingredients_ID", StringType(), nullable=True),
])

# One tuple per pizza: (name, price, ingredients key).
# Note: IG_7 and IG_8 have no entry in the ingredients table, so the last two
# pizzas have no match when the two tables are joined.
df_data = [
    ("Margherita", 5.95, "IG_1"),
    ("Calzone", 7.95, "IG_2"),
    ("Diavola", 5.95, "IG_3"),
    ("Prosciutto", 7.95, "IG_4"),
    ("Speck & Brie", 7.95, "IG_7"),
    ("Tonno & Cipolle", 7.95, "IG_8"),
]

df = spark.createDataFrame(df_data, schema=schema)
df.printSchema()
df.show(truncate=False)

In [None]:
# Schema of the ingredients table: a key plus the list of ingredient names.
ingredient_schema = StructType([
    StructField("Ingredients_ID", StringType(), nullable=True),
    StructField("Ingredients", ArrayType(StringType()), nullable=True),
])

# One tuple per ingredient list: (ingredients key, [ingredient, ...]).
# Note: IG_5 and IG_6 are not referenced by any row of the pizza table.
ingredient_df_data = [
    ("IG_1", ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("IG_2", ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("IG_3", ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("IG_4", ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("IG_5", ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("IG_6", ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
]

ingredient_df = spark.createDataFrame(ingredient_df_data, schema=ingredient_schema)
ingredient_df.printSchema()
ingredient_df.show(truncate=False)

<h4>Join Operations</h4>

In [None]:
# Inner join - keeps only the rows whose Ingredients_ID matches in both tables
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="inner",
).show(truncate=False)

In [None]:
# Outer join - returns all the rows from both tables; where no match is found,
# the columns of the other table are filled with null values
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="outer",
).show(truncate=False)

# Outer join, spelled "full"
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="full",
).show(truncate=False)

# Outer join, spelled "fullouter"
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="fullouter",
).show(truncate=False)

In [None]:
# Left join - keeps every row of the left table (df)
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="left",
).show(truncate=False)

# Left outer join, spelled "leftouter"
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="leftouter",
).show(truncate=False)

In [None]:
# Right join - keeps every row of the right table (ingredient_df)
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="right",
).show(truncate=False)

# Right outer join, spelled "rightouter"
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="rightouter",
).show(truncate=False)

In [None]:
# Left semi join - returns the rows of the left table that have a match in the
# right table, keeping only the left table's columns
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="leftsemi",
).show(truncate=False)

In [None]:
# Left anti join - returns the rows of the left table that have no match in
# the right table
df.join(
    ingredient_df,
    on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
    how="leftanti",
).show(truncate=False)

In [None]:
# Self join - the table is joined with itself.
# Both sides are the same DataFrame, so `df.Ingredients_ID == df.Ingredients_ID`
# would compare a column with itself. Give each side its own alias and build
# the condition from alias-qualified column names, so that it is explicit
# which side each column belongs to.
df.alias("left_pizza").join(
    df.alias("right_pizza"),
    on=col("left_pizza.Ingredients_ID") == col("right_pizza.Ingredients_ID"),
    how="inner",
).show(truncate=False)

In [None]:
# It is also possible to concatenate multiple joins one after another.
# Step 1: inner join with the ingredients, then drop the duplicated key column.
# Step 2: right join the result with the ingredients table again.
# ingredient_df is already part of the left side in step 2, so referring to
# ingredient_df.Ingredients_ID there would be ambiguous. The second occurrence
# is aliased and its key is referenced through the alias instead.
(
    df.join(
        ingredient_df,
        on=df["Ingredients_ID"] == ingredient_df["Ingredients_ID"],
        how="inner",
    )
    .drop(ingredient_df["Ingredients_ID"])
    .join(
        ingredient_df.alias("all_ingredients"),
        on=df["Ingredients_ID"] == col("all_ingredients.Ingredients_ID"),
        how="right",
    )
    .show(truncate=False)
)