# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Spark SQL: Uniones con dataframes** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SparkSQL-Unions-and-Joins") \
    .master("spark://078b2e28e517:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/27 23:05:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Data frame unions

In [47]:
df_a = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df_b = spark.createDataFrame([(3, "Charlie"), (4, "David")], ["id", "name"])
df_a.show()
df_b.show()
result = df_a.union(df_b)
result.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+

+---+-------+
| id|   name|
+---+-------+
|  3|Charlie|
|  4|  David|
+---+-------+

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
+---+-------+



#### Union without duplicates

In [48]:
df_a = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
df_a.show()
df_b = spark.createDataFrame([(1, "Alice"), (4, "David")], ["id", "name"])
df_b.show()
result = df_a.union(df_b).distinct()
result.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  4|David|
+---+-----+

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  4|David|
|  2|  Bob|
+---+-----+



##### Union with Mismatched Schemas

In [49]:
df_a = spark.createDataFrame([(1, "Alice")], ["id", "name"])
df_a.show()
df_b = spark.createDataFrame([("Bob", 2)], ["name", "id"])
df_b.show()
result = df_a.unionByName(df_b)
result.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
+---+-----+

+----+---+
|name| id|
+----+---+
| Bob|  2|
+----+---+

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



You can also use `allowMissingColumns=True` to allow union of DataFrames with missing columns.

In [52]:
df_a = spark.createDataFrame([(1, "Alice", "NY")], ["id", "name", "city"])
df_a.show()
df_b = spark.createDataFrame([(2, "Bob")], ["id", "name"])
df_b.show()
result = df_a.unionByName(df_b, allowMissingColumns=True)
result.show()

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
+---+-----+----+

+---+----+
| id|name|
+---+----+
|  2| Bob|
+---+----+

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
|  2|  Bob|NULL|
+---+-----+----+



### Joins

#### Left Join

In [54]:
book_data = [
    ("Scala", 400, 1),
    ("Spark", 500, 2),
    ("Kafka", 300, 3),
    ("Java", 350, 5)
]
df_books = spark.createDataFrame(book_data, ["book_name", "cost", "writer_id"])

writer_data = [
    ("Martin", 1),
    ("Zaharia", 2),
    ("Neha", 3),
    ("James", 4)
]
df_writers = spark.createDataFrame(writer_data, ["writer_name", "writer_id"])

result = df_books.join(df_writers, 
      df_books["writer_id"] == df_writers["writer_id"], 
      "left")
result.show()

+---------+----+---------+-----------+---------+
|book_name|cost|writer_id|writer_name|writer_id|
+---------+----+---------+-----------+---------+
|    Scala| 400|        1|     Martin|        1|
|    Spark| 500|        2|    Zaharia|        2|
|     Java| 350|        5|       NULL|     NULL|
|    Kafka| 300|        3|       Neha|        3|
+---------+----+---------+-----------+---------+



#### Right join

In [55]:
result = df_books.join(df_writers, 
      df_books["writer_id"] == df_writers["writer_id"], 
      "right")
result.show()

+---------+----+---------+-----------+---------+
|book_name|cost|writer_id|writer_name|writer_id|
+---------+----+---------+-----------+---------+
|    Scala| 400|        1|     Martin|        1|
|    Spark| 500|        2|    Zaharia|        2|
|    Kafka| 300|        3|       Neha|        3|
|     NULL|NULL|     NULL|      James|        4|
+---------+----+---------+-----------+---------+



#### Inner join

In [56]:
result = df_books.join(df_writers, 
      df_books["writer_id"] == df_writers["writer_id"], 
      "inner")
result.show()

+---------+----+---------+-----------+---------+
|book_name|cost|writer_id|writer_name|writer_id|
+---------+----+---------+-----------+---------+
|    Scala| 400|        1|     Martin|        1|
|    Spark| 500|        2|    Zaharia|        2|
|    Kafka| 300|        3|       Neha|        3|
+---------+----+---------+-----------+---------+



#### Full outer

In [58]:
result = df_books.join(df_writers, 
      df_books["writer_id"] == df_writers["writer_id"], 
      "fullouter")
result.show()

+---------+----+---------+-----------+---------+
|book_name|cost|writer_id|writer_name|writer_id|
+---------+----+---------+-----------+---------+
|    Scala| 400|        1|     Martin|        1|
|    Spark| 500|        2|    Zaharia|        2|
|    Kafka| 300|        3|       Neha|        3|
|     NULL|NULL|     NULL|      James|        4|
|     Java| 350|        5|       NULL|     NULL|
+---------+----+---------+-----------+---------+



In [8]:
from team_name.spark_utils import SparkUtils
agencies_schema = SparkUtils.generate_schema([("agency_id", "string"), ("agency_info", "string")])

agencies_df = spark.read \
                .schema(agencies_schema) \
                .option("header", "true") \
                .csv("/home/jovyan/notebooks/data/rentals_dataset/agencies.csv")

agencies_df.printSchema()

agencies_df.show(5, truncate=False)

root
 |-- agency_id: string (nullable = true)
 |-- agency_info: string (nullable = true)

+---------+-----------------------------------------------------+
|agency_id|agency_info                                          |
+---------+-----------------------------------------------------+
|1        |{'agency_name': 'NYC Rentals', 'city': 'New York'}   |
|2        |{'agency_name': 'LA Car Rental', 'city': 'Londres'}  |
|3        |{'agency_name': 'Zapopan Auto', 'city': 'Zapopan'}   |
|4        |{'agency_name': 'SF Cars', 'city': 'San Francisco'}  |
|5        |{'agency_name': 'Mexico Cars', 'city': 'Mexico City'}|
+---------+-----------------------------------------------------+



In [None]:
brands_schema = SparkUtils.generate_schema([("brand_id", "integer"), ("brand_info", "string")])
brands_df = spark.read.option("header", "true").csv("/home/jovyan/notebooks/data/rentals_dataset/brands.csv")
brands_df.printSchema()

root
 |-- brand_id: string (nullable = true)
 |-- brand_info: string (nullable = true)

+--------+------------------------------------------------------+
|brand_id|brand_info                                            |
+--------+------------------------------------------------------+
|1       |{'brand_name': 'Mercedes-Benz', 'country': 'Tanzania'}|
|2       |{'brand_name': 'BMW', 'country': 'Hungary'}           |
|3       |{'brand_name': 'Audi', 'country': 'Senegal'}          |
|4       |{'brand_name': 'Ford', 'country': 'Tuvalu'}           |
|5       |{'brand_name': 'BYD', 'country': 'Italy'}             |
+--------+------------------------------------------------------+
only showing top 5 rows

+--------+--------------------+-------------+
|brand_id|          brand_info|   brand_name|
+--------+--------------------+-------------+
|       1|{'brand_name': 'M...|Mercedes-Benz|
|       2|{'brand_name': 'B...|          BMW|
|       3|{'brand_name': 'A...|         Audi|
|       4|{'brand

In [None]:
cars_schema = SparkUtils.generate_schema([("car_id", "integer"), ("car_info", "string")])
cars_df = spark.read.option("header", "true").csv("/home/jovyan/notebooks/data/rentals_dataset/cars.csv")
cars_df.printSchema()

root
 |-- car_id: string (nullable = true)
 |-- car_info: string (nullable = true)

+------+--------------------+
|car_id|            car_info|
+------+--------------------+
|     1|{'car_name': 'Tuc...|
|     2|{'car_name': 'How...|
|     3|{'car_name': 'Wag...|
|     4|{'car_name': 'Cam...|
|     5|{'car_name': 'Arc...|
+------+--------------------+
only showing top 5 rows

+------+-------------------------------------------------------------------------------------+---------------------------------+
|car_id|car_info                                                                             |car_name                         |
+------+-------------------------------------------------------------------------------------+---------------------------------+
|1     |{'car_name': 'Tucker, Hull and Gallegos Model 1', 'brand_id': 5, 'price_per_day': 68}|Tucker, Hull and Gallegos Model 1|
|2     |{'car_name': 'Howard-Snow Model 7', 'brand_id': 5, 'price_per_day': 55}              |Howard-Snow

In [None]:
customers_schema = SparkUtils.generate_schema([("customer_id", "integer"), ("customer_info", "string")])
customers_df = spark.read.option("header", "true").csv("/home/jovyan/notebooks/data/rentals_dataset/customers.csv")
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_info: string (nullable = true)

+-----------+--------------------+-------------------+
|customer_id|       customer_info|      customer_name|
+-----------+--------------------+-------------------+
|          1|{'customer_name':...|  Martin Graves DVM|
|          2|{'customer_name':...|   Frederick Wilson|
|          3|{'customer_name':...|       Gabriela Lee|
|          4|{'customer_name':...|     Devin Thornton|
|          5|{'customer_name':...|Christopher Simmons|
+-----------+--------------------+-------------------+
only showing top 5 rows



In [41]:
rentals_schema = SparkUtils.generate_schema([("rental_id", "integer"), ("rental_info", "string")])
rentals_df = spark.read.option("header", "true").csv("/home/jovyan/notebooks/data/rentals_dataset/rentals/")
rentals_df.printSchema()

root
 |-- rental_id: string (nullable = true)
 |-- rental_info: string (nullable = true)

+---------+--------------------------------------------------+----------+---------------+-------------+
|rental_id|rental_info                                       |car_rental|customer_rental|agency_rental|
+---------+--------------------------------------------------+----------+---------------+-------------+
|12740    |{'car_id': 23, 'customer_id': 42, 'agency_id': 1} |23        |42             |1            |
|12741    |{'car_id': 19, 'customer_id': 146, 'agency_id': 2}|19        |146            |2            |
|12742    |{'car_id': 24, 'customer_id': 143, 'agency_id': 3}|24        |143            |3            |
|12743    |{'car_id': 22, 'customer_id': 90, 'agency_id': 4} |22        |90             |4            |
|12744    |{'car_id': 9, 'customer_id': 115, 'agency_id': 3} |9         |115            |3            |
+---------+--------------------------------------------------+----------+-----

In [76]:
# Stop the SparkContext
sc.stop()