# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **Examples on unions and joins** </center>

---
**Professor**: Dr. Pablo Camarillo Ramirez

In [1]:
import findspark
# Must run before any `pyspark` import: findspark locates the Spark installation
# and makes the pyspark package importable in this kernel.
findspark.init()

#### Connection with the Spark cluster

In [None]:
from pyspark.sql import SparkSession

# Create the session for this notebook (or reuse one that already exists).
# The builder chain is wrapped in parentheses so each setting sits on its own
# line without backslash continuations.
spark = (
    SparkSession.builder
    .appName("SparkSQL-Unions-and-Joins")
    .master("spark://078b2e28e517:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle to the underlying SparkContext; it is stopped in the last cell.
sc = spark.sparkContext

### Data frame unions

In [None]:
# Two DataFrames sharing the same (id, name) columns and no rows in common.
df_a = spark.createDataFrame(
    [(1, "Alice"), (2, "Bob")],
    schema=["id", "name"],
)
df_b = spark.createDataFrame(
    [(3, "Charlie"), (4, "David")],
    schema=["id", "name"],
)

# Display both inputs before combining them.
df_a.show()
df_b.show()

# union() appends the rows of df_b to df_a, matching columns by position.
result = df_a.union(df_b)
result.show()

#### Union without duplicates

In [None]:
# Left input.
df_a = spark.createDataFrame(
    [(1, "Alice"), (2, "Bob")],
    schema=["id", "name"],
)
df_a.show()

# Right input: the (1, "Alice") row also appears in df_a.
df_b = spark.createDataFrame(
    [(1, "Alice"), (4, "David")],
    schema=["id", "name"],
)
df_b.show()

# union() keeps duplicates, so distinct() is chained to drop the repeated row.
combined = df_a.union(df_b)
result = combined.distinct()
result.show()

##### Union with Mismatched Schemas

In [None]:
# Same two columns on both sides, but declared in a different order.
df_a = spark.createDataFrame(
    [(1, "Alice")],
    schema=["id", "name"],
)
df_a.show()

df_b = spark.createDataFrame(
    [("Bob", 2)],
    schema=["name", "id"],
)
df_b.show()

# unionByName() lines columns up by their names rather than by position.
result = df_a.unionByName(df_b)
result.show()

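You can also pass `allowMissingColumns=True` to `unionByName` to union DataFrames whose sets of columns differ; a column that is missing from one DataFrame is filled with nulls for that DataFrame's rows.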

In [None]:
# df_a has an extra "city" column that df_b does not have.
df_a = spark.createDataFrame(
    [(1, "Alice", "NY")],
    schema=["id", "name", "city"],
)
df_a.show()

df_b = spark.createDataFrame(
    [(2, "Bob")],
    schema=["id", "name"],
)
df_b.show()

# allowMissingColumns=True lets the union proceed even though the column sets differ.
result = df_a.unionByName(
    df_b,
    allowMissingColumns=True,
)
result.show()

### Joins

#### Left Join

In [None]:
# Books: (book_name, cost, writer_id). writer_id 5 has no matching writer below.
book_data = [
    ("Scala", 400, 1),
    ("Spark", 500, 2),
    ("Kafka", 300, 3),
    ("Java", 350, 5),
]
df_books = spark.createDataFrame(
    book_data,
    schema=["book_name", "cost", "writer_id"],
)

# Writers: (writer_name, writer_id). writer_id 4 has no matching book above.
writer_data = [
    ("Martin", 1),
    ("Zaharia", 2),
    ("Neha", 3),
    ("James", 4),
]
df_writers = spark.createDataFrame(
    writer_data,
    schema=["writer_name", "writer_id"],
)

# Left join on writer_id, with books as the left side.
result = df_books.join(
    df_writers,
    on=df_books.writer_id == df_writers.writer_id,
    how="left",
)
result.show()

#### Right join

In [None]:
# Right join on writer_id, with writers as the right side.
# Reuses df_books and df_writers from the left-join cell.
result = df_books.join(
    df_writers,
    on=df_books.writer_id == df_writers.writer_id,
    how="right",
)
result.show()

#### Inner join

In [None]:
# Inner join on writer_id between books and writers.
# Reuses df_books and df_writers from the left-join cell.
result = df_books.join(
    df_writers,
    on=df_books.writer_id == df_writers.writer_id,
    how="inner",
)
result.show()

#### Full outer

In [None]:
# Full outer join on writer_id between books and writers.
# Reuses df_books and df_writers from the left-join cell.
result = df_books.join(
    df_writers,
    on=df_books.writer_id == df_writers.writer_id,
    how="fullouter",
)
result.show()

#### Handle null values

In [None]:
# `result` is the full outer join from the previous cell, so a book whose
# writer_id has no match in df_writers has a null writer_name.
# fillna() returns a new DataFrame and leaves `result` untouched, so the filled
# frame is shown explicitly; otherwise the cell would only display the DataFrame
# repr. The placeholder spelling is also corrected ("Uknown" -> "Unknown").
result.fillna({"writer_name": "Unknown"}).show()

### Class activity: Car rental service (Big Data pipeline part I)

In [None]:
from team_name.spark_utils import SparkUtils

# Declare the expected columns through the project's SparkUtils helper;
# both agency columns are declared as strings.
agencies_schema = SparkUtils.generate_schema(
    [
        ("agency_id", "string"),
        ("agency_info", "string"),
    ]
)

# Read the agencies CSV with the explicit schema and the header option enabled.
agencies_df = (
    spark.read
    .schema(agencies_schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/rentals_dataset/agencies.csv")
)

agencies_df.printSchema()

# Preview the first five rows without truncating long cell values.
agencies_df.show(5, truncate=False)

In [None]:
# Brands: integer id plus a string info column.
# SparkUtils is imported in the agencies cell.
brands_schema = SparkUtils.generate_schema(
    [
        ("brand_id", "integer"),
        ("brand_info", "string"),
    ]
)

brands_df = (
    spark.read
    .option("header", "true")
    .schema(brands_schema)
    .csv("/home/jovyan/notebooks/data/rentals_dataset/brands.csv")
)
brands_df.printSchema()

In [None]:
# Cars: integer id plus a string info column.
# SparkUtils is imported in the agencies cell.
cars_schema = SparkUtils.generate_schema(
    [
        ("car_id", "integer"),
        ("car_info", "string"),
    ]
)

cars_df = (
    spark.read
    .option("header", "true")
    .schema(cars_schema)
    .csv("/home/jovyan/notebooks/data/rentals_dataset/cars.csv")
)
cars_df.printSchema()

In [None]:
# Customers: integer id plus a string info column.
# SparkUtils is imported in the agencies cell.
customers_schema = SparkUtils.generate_schema(
    [
        ("customer_id", "integer"),
        ("customer_info", "string"),
    ]
)

customers_df = (
    spark.read
    .option("header", "true")
    .schema(customers_schema)
    .csv("/home/jovyan/notebooks/data/rentals_dataset/customers.csv")
)
customers_df.printSchema()

In [None]:
# Rentals: integer id plus a string info column.
# SparkUtils is imported in the agencies cell.
rentals_schema = SparkUtils.generate_schema(
    [
        ("rental_id", "integer"),
        ("rental_info", "string"),
    ]
)

# NOTE(review): unlike the other loads, this path ends in "/" and so looks like
# a directory rather than a single file - confirm it holds only rental CSVs.
rentals_df = (
    spark.read
    .option("header", "true")
    .schema(rentals_schema)
    .csv("/home/jovyan/notebooks/data/rentals_dataset/rentals/")
)
rentals_df.printSchema()

In [76]:
# Stop the SparkContext obtained from the session at the top of the notebook.
# Cells that use `spark` or `sc` will not work after this until the session is
# created again.
sc.stop()