# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Data Joins and JSON columns** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [38]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession

# The default master URL points at a host name that looks like a Docker
# container ID, so it is likely to differ between environments. Setting the
# SPARK_MASTER_URL environment variable overrides it without editing this cell;
# when the variable is unset the original URL is used unchanged.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://9835fefe4923:7077")

# The application name is what identifies this notebook in the Spark UI, so it
# is kept in line with the notebook title.
spark = (
    SparkSession.builder
    .appName("Examples on Data Joins and JSON columns")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and silence everything below ERROR so the
# cell outputs only contain the DataFrame tables.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Manipulating JSON Columns

In [39]:
from pcamarillor.spark_utils import SparkUtils

# Each record pairs an integer id with a raw JSON document stored as a plain
# string. Only the first document carries a "payments" array.
json_data = [
    (1, '{"name": "Alice", "age": 25, "payments": [34, 433, 54], "address": {"city": "New York", "zip": "10001"}}'),
    (2, '{"name": "Bob", "age": 30, "address": {"city": "Los Angeles", "zip": "90001"}}'),
    (3, '{"name": "Charlie", "age": 35, "address": {"city": "Chicago", "zip": "60601"}}'),
]

# Schema of the outer DataFrame: the JSON payload is still just a string here.
json_columns = [("id", "int"), ("json_col", "string")]
json_schema = SparkUtils.generate_schema(json_columns)

df_json = spark.createDataFrame(json_data, json_schema)
df_json.show()


+---+--------------------+
| id|            json_col|
+---+--------------------+
|  1|{"name": "Alice",...|
|  2|{"name": "Bob", "...|
|  3|{"name": "Charlie...|
+---+--------------------+



### Extract a JSON column with get_json_object function

In [40]:
from pyspark.sql.functions import get_json_object

# Read the top-level "name" field straight out of the JSON string using a
# JSONPath expression, and expose it as a new "name" column.
name_from_json = get_json_object(df_json["json_col"], "$.name")
df_json.withColumn("name", name_from_json).show()

+---+--------------------+-------+
| id|            json_col|   name|
+---+--------------------+-------+
|  1|{"name": "Alice",...|  Alice|
|  2|{"name": "Bob", "...|    Bob|
|  3|{"name": "Charlie...|Charlie|
+---+--------------------+-------+



### Extract a JSON column with from_json

In [41]:
from pyspark.sql.functions import from_json

# Define the schema of the JSON object held in json_col.
# NOTE(review): the "struct" entry appears to produce a struct with no fields
# (the printed schema and the parsed rows in this notebook show an empty
# address), so city/zip are not available after parsing. Confirm how
# SparkUtils expects nested fields to be declared if they are needed.
people_fields = [
    ("name", "string"),
    ("age", "int"),
    ("payments", "array_int"),
    ("address", "struct"),
]
people_schema = SparkUtils.generate_schema(people_fields)

# Parse the JSON string into a typed struct column named "parsed".
parsed_col = from_json(df_json["json_col"], people_schema)
df_parsed = df_json.withColumn("parsed", parsed_col)
df_parsed.show()

+---+--------------------+--------------------+
| id|            json_col|              parsed|
+---+--------------------+--------------------+
|  1|{"name": "Alice",...|{Alice, 25, [34, ...|
|  2|{"name": "Bob", "...| {Bob, 30, NULL, {}}|
|  3|{"name": "Charlie...|{Charlie, 35, NUL...|
+---+--------------------+--------------------+



In [42]:
# Print the column tree of df_parsed to inspect the nested "parsed" struct
# created by from_json.
df_parsed.printSchema()

root
 |-- id: integer (nullable = true)
 |-- json_col: string (nullable = true)
 |-- parsed: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- payments: array (nullable = true)
 |    |    |-- element: integer (containsNull = true)
 |    |-- address: struct (nullable = true)



In [43]:
from pyspark.sql.functions import col

# Nested struct fields are addressed with dot notation; getItem(0) picks the
# first element of the payments array.
name_col = col("parsed.name")
first_payment_col = col("parsed.payments").getItem(0)
df_parsed.select(name_col, first_payment_col).show()

+-------+------------------+
|   name|parsed.payments[0]|
+-------+------------------+
|  Alice|                34|
|    Bob|              NULL|
|Charlie|              NULL|
+-------+------------------+



# Data Joins & Unions

## Data Union

In [44]:
# Two DataFrames with the same column layout and no rows in common.
people_columns = ["id", "name"]
df_a = spark.createDataFrame([(1, "Alice"), (2, "Bob")], people_columns)
df_b = spark.createDataFrame([(3, "Charlie"), (4, "David")], people_columns)

# union() appends the rows of df_b to those of df_a.
result = df_a.union(df_b)
result.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
+---+-------+



                                                                                

### Union w/o duplicates

In [45]:
# Both DataFrames contain the row (1, "Alice").
people_columns = ["id", "name"]
df_a = spark.createDataFrame([(1, "Alice"), (2, "Bob")], people_columns)
df_b = spark.createDataFrame([(1, "Alice"), (4, "David")], people_columns)

# Chaining distinct() after union() removes the repeated row.
unioned = df_a.union(df_b)
unioned.distinct().show()



+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
|  4|David|
+---+-----+



                                                                                

### Union with Mismatched Schemas

In [46]:
# The two DataFrames declare the same columns in a different order.
df_a = spark.createDataFrame([(1, "Alice")], ["id", "name"])
df_b = spark.createDataFrame([("Bob", 2)], ["name", "id"])

# unionByName() lines columns up by their names rather than their positions.
result = df_a.unionByName(df_b)
result.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



                                                                                

### Union by Name with missing columns

In [47]:
# df_a has an extra "city" column that df_b lacks.
df_a = spark.createDataFrame([(1, "Alice", "NY")], ["id", "name", "city"])
df_a.show()

df_b = spark.createDataFrame([(2, "Bob")], ["id", "name"])
df_b.show()

# allowMissingColumns=True lets the union proceed even though df_b has no
# "city" column.
result = df_a.unionByName(df_b, allowMissingColumns=True)
result.show()

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
+---+-----+----+

+---+----+
| id|name|
+---+----+
|  2| Bob|
+---+----+

+---+-----+----+
| id| name|city|
+---+-----+----+
|  1|Alice|  NY|
|  2|  Bob|NULL|
+---+-----+----+



## Left Join

### Datasets

In [48]:
# Books reference their author through writer_id. "Java" points at writer 5,
# which has no entry in the writers data below.
book_columns = ["book_name", "cost", "writer_id"]
book_data = [
    ("Game of thrones", 400, 1),
    ("Spark", 500, 2),
    ("Kafka", 300, 3),
    ("Java", 350, 5),
]
df_books = spark.createDataFrame(book_data, book_columns)

# Writers keyed by writer_id. "James" (writer 4) has no book in the books data.
writer_columns = ["writer_name", "writer_id"]
writer_data = [
    ("George R.R. Martin", 1),
    ("Zaharia", 2),
    ("Neha", 3),
    ("James", 4),
]
df_writers = spark.createDataFrame(writer_data, writer_columns)

df_books.show()
df_writers.show()

+---------------+----+---------+
|      book_name|cost|writer_id|
+---------------+----+---------+
|Game of thrones| 400|        1|
|          Spark| 500|        2|
|          Kafka| 300|        3|
|           Java| 350|        5|
+---------------+----+---------+

+------------------+---------+
|       writer_name|writer_id|
+------------------+---------+
|George R.R. Martin|        1|
|           Zaharia|        2|
|              Neha|        3|
|             James|        4|
+------------------+---------+



In [49]:
# Left join driven by an explicit equality condition between the two
# writer_id columns.
same_writer = df_books.writer_id == df_writers.writer_id
result = df_books.join(df_writers, on=same_writer, how="left")
result.show()



+---------------+----+---------+------------------+---------+
|      book_name|cost|writer_id|       writer_name|writer_id|
+---------------+----+---------+------------------+---------+
|Game of thrones| 400|        1|George R.R. Martin|        1|
|          Spark| 500|        2|           Zaharia|        2|
|           Java| 350|        5|              NULL|     NULL|
|          Kafka| 300|        3|              Neha|        3|
+---------------+----+---------+------------------+---------+



                                                                                

In [50]:
# Left join keyed by the shared column name instead of an equality expression.
join_key = "writer_id"
result = df_books.join(df_writers, join_key, "left")
result.show()



+---------+---------------+----+------------------+
|writer_id|      book_name|cost|       writer_name|
+---------+---------------+----+------------------+
|        1|Game of thrones| 400|George R.R. Martin|
|        2|          Spark| 500|           Zaharia|
|        5|           Java| 350|              NULL|
|        3|          Kafka| 300|              Neha|
+---------+---------------+----+------------------+



                                                                                

## Right Join

In [51]:
# Right join driven by an explicit equality condition between the two
# writer_id columns.
same_writer = df_books.writer_id == df_writers.writer_id
result = df_books.join(df_writers, on=same_writer, how="right")
result.show()



+---------------+----+---------+------------------+---------+
|      book_name|cost|writer_id|       writer_name|writer_id|
+---------------+----+---------+------------------+---------+
|Game of thrones| 400|        1|George R.R. Martin|        1|
|          Spark| 500|        2|           Zaharia|        2|
|          Kafka| 300|        3|              Neha|        3|
|           NULL|NULL|     NULL|             James|        4|
+---------------+----+---------+------------------+---------+



                                                                                

In [52]:
# Right join keyed by the shared column name instead of an equality expression.
join_key = "writer_id"
result = df_books.join(df_writers, join_key, "right")
result.show()



+---------+---------------+----+------------------+
|writer_id|      book_name|cost|       writer_name|
+---------+---------------+----+------------------+
|        1|Game of thrones| 400|George R.R. Martin|
|        2|          Spark| 500|           Zaharia|
|        3|          Kafka| 300|              Neha|
|        4|           NULL|NULL|             James|
+---------+---------------+----+------------------+



                                                                                

## Inner Join

In [53]:
# Inner join on writer_id; "inner" is the join type used when `how` is omitted,
# spelled out here for clarity.
inner_joined = df_books.join(df_writers, on="writer_id", how="inner")
inner_joined.show()



+---------+---------------+----+------------------+
|writer_id|      book_name|cost|       writer_name|
+---------+---------------+----+------------------+
|        1|Game of thrones| 400|George R.R. Martin|
|        2|          Spark| 500|           Zaharia|
|        3|          Kafka| 300|              Neha|
+---------+---------------+----+------------------+



                                                                                

In [None]:

# Read the agencies CSV files with an explicit schema; agency_info is loaded
# as a plain string column.
agencies_columns = [("agency_id", "int"), ("agency_info","string")]
agencies_schema = SparkUtils.generate_schema(agencies_columns)

agencies_reader = spark.read.option("header", True).schema(agencies_schema)
agencies_df = agencies_reader.csv("/opt/spark/work-dir/data/car_service/agencies")
agencies_df.show()

+---------+--------------------+
|agency_id|         agency_info|
+---------+--------------------+
|        1|{'agency_name': '...|
|        2|{'agency_name': '...|
|        3|{'agency_name': '...|
|        4|{'agency_name': '...|
|        5|{'agency_name': '...|
+---------+--------------------+



In [55]:
# Shell escape: print the kernel's current working directory.
!pwd

/opt/spark/work-dir


In [56]:
# The call that stops the SparkContext is left disabled; uncomment it to shut
# the context down once you have finished with the notebook.
#sc.stop()