# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **Examples on interactive querying with PySpark** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
# Bootstrap step: findspark locates the local Spark installation so that
# `pyspark` can be imported by the cells below. Keep this cell first --
# it has to run before any `pyspark` import.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [None]:
import os

from pyspark.sql import SparkSession

# The default master hostname looks like a Docker container ID, which changes
# whenever the container is recreated. Export SPARK_MASTER_URL to point the
# notebook at a different master without editing this cell; when the variable
# is unset the original URL is used.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://078b2e28e517:7077")

# Create (or reuse) the session that every cell below relies on.
spark = (
    SparkSession.builder
    .appName("SparkSQL-Interactive-Querying")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

### Car rental service

In [None]:
from team_name.spark_utils import SparkUtils

# Both columns are loaded as strings; agency_info is parsed with
# get_json_object in the next cell.
agency_fields = [("agency_id", "string"), ("agency_info", "string")]
agencies_schema = SparkUtils.generate_schema(agency_fields)

# Read the agencies CSV (first row is the header) with the explicit schema.
agencies_df = (
    spark.read
    .option("header", "true")
    .schema(agencies_schema)
    .csv("/home/jovyan/notebooks/data/rentals_dataset/agencies.csv")
)

# Inspect the column types, then the first five rows without truncation.
agencies_df.printSchema()
agencies_df.show(5, truncate=False)

In [None]:
from pyspark.sql.functions import get_json_object

# Pull the "agency_name" field out of the JSON string stored in agency_info
# and expose it as its own column.
agency_name_col = get_json_object(agencies_df["agency_info"], "$.agency_name")
agencies_df = agencies_df.withColumn("agency_name", agency_name_col)

agencies_df.show(truncate=False)

In [None]:
# Brands: integer id plus a JSON string with the brand details.
brand_fields = [("brand_id", "integer"), ("brand_info", "string")]
brands_schema = SparkUtils.generate_schema(brand_fields)

brands_df = (
    spark.read
    .schema(brands_schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/rentals_dataset/brands.csv")
)
brands_df.printSchema()  # schema as loaded, before brand_name is derived

# Extract the brand name from the JSON payload into its own column.
brand_name_col = get_json_object(brands_df["brand_info"], "$.brand_name")
brands_df = brands_df.withColumn("brand_name", brand_name_col)
brands_df.show(5, truncate=False)

In [None]:
# Cars: integer id plus a JSON string with the car details.
car_fields = [("car_id", "integer"), ("car_info", "string")]
cars_schema = SparkUtils.generate_schema(car_fields)

cars_df = (
    spark.read
    .schema(cars_schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/rentals_dataset/cars.csv")
)
cars_df.printSchema()  # schema as loaded, before car_name is derived

# Extract the car name from the JSON payload into its own column.
car_name_col = get_json_object(cars_df["car_info"], "$.car_name")
cars_df = cars_df.withColumn("car_name", car_name_col)
cars_df.show(5, truncate=False)

In [None]:
# Customers: integer id plus a JSON string with the customer details.
customer_fields = [("customer_id", "integer"), ("customer_info", "string")]
customers_schema = SparkUtils.generate_schema(customer_fields)

customers_df = (
    spark.read
    .schema(customers_schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/rentals_dataset/customers.csv")
)
customers_df.printSchema()  # schema as loaded, before customer_name is derived

# Extract the customer name from the JSON payload into its own column.
customer_name_col = get_json_object(customers_df["customer_info"], "$.customer_name")
customers_df = customers_df.withColumn("customer_name", customer_name_col)

# Unlike the other previews, long values are truncated here.
customers_df.show(5, truncate=True)

In [None]:
# Rentals: integer id plus a JSON string describing the rental. The path is a
# directory, so every CSV file inside it is read.
rental_fields = [("rental_id", "integer"), ("rental_info", "string")]
rental_cars_schema = SparkUtils.generate_schema(rental_fields)

rental_cars_df = (
    spark.read
    .schema(rental_cars_schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/rentals_dataset/rentals/")
)

rental_cars_df.printSchema()
rental_cars_df.show(5, truncate=False)

In [None]:
# Extract the foreign keys from the rental_info JSON payload.
# get_json_object always returns a string, while car_id and customer_id are
# declared as integers in the cars and customers schemas, so cast them here:
# the joins then compare like with like instead of relying on implicit
# coercion. agency_id stays a string, matching the agencies schema.
rental_info = rental_cars_df["rental_info"]
rental_cars_df = (
    rental_cars_df
    .withColumn("car_id", get_json_object(rental_info, "$.car_id").cast("integer"))
    .withColumn("customer_id", get_json_object(rental_info, "$.customer_id").cast("integer"))
    .withColumn("agency_id", get_json_object(rental_info, "$.agency_id"))
)

rental_cars_df.show(5, truncate=False)

In [None]:
# Enrich each rental with its car, agency and customer.
# Joining on the column *name* (instead of `left.col == right.col`) keeps a
# single copy of each key column in the result; the expression form leaves two
# car_id / agency_id / customer_id columns, which makes any later reference to
# those names ambiguous.
# NOTE: this cell rebinds rental_cars_df, so re-run the loading cells above
# before executing it a second time.
rental_cars_df = (
    rental_cars_df
    .join(cars_df, on="car_id", how="inner")
    .join(agencies_df, on="agency_id", how="inner")
    .join(customers_df, on="customer_id", how="inner")
)

rental_cars_df.show(5, truncate=False)

In [104]:
# Final result: keep only the rental id and the three human-readable names.
final_columns = ["rental_id", "car_name", "agency_name", "customer_name"]
rental_cars_df = rental_cars_df.select(*final_columns)

In [None]:
# Preview the first five rows of the final result, without truncating values.
rental_cars_df.show(n=5, truncate=False)

### Register the rentals DF as a temporary view

In [106]:
# Expose the DataFrame to Spark SQL under the name used by the queries below.
rentals_view_name = "rentals"
rental_cars_df.createOrReplaceTempView(rentals_view_name)

In [None]:
# Project two columns from the "rentals" view and show the first five rows.
rentals_preview_sql = "SELECT rental_id, customer_name FROM rentals"
spark.sql(rentals_preview_sql).show(5)

In [None]:
# Count the rentals of each agency and show the first five groups.
rentals_per_agency_sql = "SELECT agency_name, count(*) as rentals_number FROM rentals GROUP BY agency_name"
spark.sql(rentals_per_agency_sql).show(5)

In [109]:
# Two small in-memory tables for the SQL join example.
# Rows are (book_name, cost, writer_id).
book_columns = ["book_name", "cost", "writer_id"]
book_data = [("Scala", 400, 1), ("Spark", 500, 2), ("Kafka", 300, 3), ("Java", 350, 5)]
df_books = spark.createDataFrame(book_data, book_columns)
df_books.createOrReplaceTempView("books")

# Rows are (writer_name, writer_id).
writer_columns = ["writer_name", "writer_id"]
writer_data = [("Martin", 1), ("Zaharia", 2), ("Neha", 3), ("James", 4)]
df_writers = spark.createDataFrame(writer_data, writer_columns)
df_writers.createOrReplaceTempView("writers")

In [None]:
# Inner join between the "books" and "writers" views on writer_id. Rows with
# no counterpart on the other side (the book with writer_id 5, the writer with
# writer_id 4) are dropped, so only the matching pairs are counted.
books_with_writers_sql = (
    "SELECT b.book_name, w.writer_name "
    "FROM books b "
    "JOIN writers w "
    "ON b.writer_id = w.writer_id"
)
spark.sql(books_with_writers_sql).count()

#### UDF

#### Define the UDF

In [114]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

@udf(returnType=IntegerType())
def str_len_return(s):
    """Return the length of a string column value.

    Args:
        s: The value of the input column for one row; ``None`` when the
            column is NULL.

    Returns:
        The number of characters in ``s``, or ``None`` (SQL NULL) when ``s``
        is ``None``.
    """
    # Spark hands NULL column values to Python UDFs as None; len(None) would
    # raise a TypeError and fail the whole job, so propagate NULL instead.
    if s is None:
        return None
    return len(s)


#### Register the UDF

In [None]:
# Make the Python UDF callable from Spark SQL under the name "str_len_return".
spark.udf.register(name="str_len_return", f=str_len_return)

#### Call the UDF

In [None]:
# Apply the registered UDF to agency_name for every row of the "rentals" view.
agency_name_len_sql = "SELECT agency_name, str_len_return(agency_name) AS agency_name_len FROM rentals"
spark.sql(agency_name_len_sql).show(5)

In [3]:
# Stop the SparkSession. This also stops the underlying SparkContext (`sc`)
# and resets the cached session, so a later getOrCreate() starts from a clean
# state instead of picking up a session whose context is already stopped.
spark.stop()