# <center> <img src="../../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
### <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
**Primer Examen**

**Fecha**: 14 de Marzo del 2025

**Nombre del estudiante**: Arturo Benjamin Vergara Romo

**Professor**: Pablo Camarillo Ramirez

In [86]:
import findspark
findspark.init()

In [87]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SparkSQL-Exam-1-BENJAMIN-VERGARA") \
    .master("spark://b33dcc1265b4:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext

In [88]:
from gatubelxs.spark_utils import SparkUtils
from pyspark.sql.functions import col, get_json_object, when, year, current_date, max

In [89]:
department_schema = SparkUtils.generate_schema(
    [("department_id", "integer"),
     ("department_name", "string"),
     ("location", "string")
    ])

employees_schema = SparkUtils.generate_schema(
    [
        ("name", "string"),
        ("department_id", "integer"),
        ("salary", "double"),
        ("hire_date", "date")
    ]
)

In [90]:
department_df = spark \
                .read \
                .schema(department_schema) \
                .option("header", "true") \
                .csv("/home/jovyan/notebooks/data/exam_P2025_ESI3914B/departments.csv")

In [91]:
employees_df = spark.read.option("header", "true").csv("/home/jovyan/notebooks/data/exam_P2025_ESI3914B/employees.csv")

employees_df = employees_df \
    .withColumn("name", get_json_object(col("employee_info"), "$.name")) \
    .withColumn("department_id", get_json_object(col("employee_info"), "$.department_id")) \
    .withColumn("salary", get_json_object(col("employee_info"), "$.salary")) \
    .withColumn("hire_date", get_json_object(col("employee_info"), "$.hire_date"))

In [92]:
employees_df = employees_df.drop("employee_info")

for field in employees_schema.fields:
    employees_df = employees_df.withColumn(field.name, col(field.name).cast(field.dataType))

employees_df.show(n=10)

+-----------+---------+-------------+---------+----------+
|employee_id|     name|department_id|   salary| hire_date|
+-----------+---------+-------------+---------+----------+
|          1|  Caitlyn|          103|115959.78|2002-06-10|
|          2|   Rachel|          104|100820.16|2009-07-01|
|          3|   Carrie|          105|114421.44|1998-12-10|
|          4|    Renee|          104| 54688.13|1995-03-17|
|          5|Gabriella|          109|106267.03|1995-02-09|
|          6|  Kristen|          101| 88237.54|2010-11-15|
|          7| Jonathan|          102| 39323.42|2012-06-30|
|          8| Michelle|          101| 64262.85|2005-10-30|
|          9| Michelle|          105|103521.88|1991-07-10|
|         10|     Lisa|          110| 55435.93|2016-03-25|
+-----------+---------+-------------+---------+----------+
only showing top 10 rows



In [93]:
joined_df = employees_df.join(department_df, 
      employees_df["department_id"] == department_df["department_id"], 
      "inner")

In [94]:
df_enriched = joined_df.withColumn(
    "salary_category", when(col("salary") >= 55000, "High").otherwise("Low")
)

df_enriched.show(n=10)

+-----------+---------+-------------+---------+----------+-------------+--------------------+-------------+---------------+
|employee_id|     name|department_id|   salary| hire_date|department_id|     department_name|     location|salary_category|
+-----------+---------+-------------+---------+----------+-------------+--------------------+-------------+---------------+
|          1|  Caitlyn|          103|115959.78|2002-06-10|          103| Sales and Marketing|      Chicago|           High|
|          2|   Rachel|          104|100820.16|2009-07-01|          104|    Data Engineering|      Zapopan|           High|
|          3|   Carrie|          105|114421.44|1998-12-10|          105|        Data Science|      Seattle|           High|
|          4|    Renee|          104| 54688.13|1995-03-17|          104|    Data Engineering|      Zapopan|            Low|
|          5|Gabriella|          109|106267.03|1995-02-09|          109|    Customer Service|San Francisco|           High|
|       

In [95]:
df_high_salary = df_enriched.filter(col("salary_category") == "High")
df_high_salary.show(n=10)

+-----------+---------+-------------+---------+----------+-------------+--------------------+-------------+---------------+
|employee_id|     name|department_id|   salary| hire_date|department_id|     department_name|     location|salary_category|
+-----------+---------+-------------+---------+----------+-------------+--------------------+-------------+---------------+
|          1|  Caitlyn|          103|115959.78|2002-06-10|          103| Sales and Marketing|      Chicago|           High|
|          2|   Rachel|          104|100820.16|2009-07-01|          104|    Data Engineering|      Zapopan|           High|
|          3|   Carrie|          105|114421.44|1998-12-10|          105|        Data Science|      Seattle|           High|
|          5|Gabriella|          109|106267.03|1995-02-09|          109|    Customer Service|San Francisco|           High|
|          6|  Kristen|          101| 88237.54|2010-11-15|          101|     Human Resources|    San Diego|           High|
|       

In [96]:
df_low_salary = df_enriched.filter(col("salary_category") == "Low")
df_low_salary.show(n=10)

+-----------+--------+-------------+--------+----------+-------------+--------------------+-------------+---------------+
|employee_id|    name|department_id|  salary| hire_date|department_id|     department_name|     location|salary_category|
+-----------+--------+-------------+--------+----------+-------------+--------------------+-------------+---------------+
|          4|   Renee|          104|54688.13|1995-03-17|          104|    Data Engineering|      Zapopan|            Low|
|          7|Jonathan|          102|39323.42|2012-06-30|          102|Finance and Accou...|     New York|            Low|
|         13|    Lisa|          104|36032.49|2019-05-16|          104|    Data Engineering|      Zapopan|            Low|
|         26|    John|          109|44836.57|2004-11-13|          109|    Customer Service|San Francisco|            Low|
|         38|  Rachel|          109|43269.85|1992-07-13|          109|    Customer Service|San Francisco|            Low|
|         42|    Juan|  

In [97]:
df_high_salary.createOrReplaceTempView("high_salary_employees")
df_low_salary.createOrReplaceTempView("low_salary_employees")


df_high_avg_salary = spark.sql("""
    SELECT department_name, AVG(salary) AS avg_salary
    FROM high_salary_employees
    GROUP BY department_name
""")

df_low_avg_salary = spark.sql("""
    SELECT department_name, AVG(salary) AS avg_salary
    FROM low_salary_employees
    GROUP BY department_name
""")

In [98]:
df_high_avg_salary.show(n=10)
df_low_avg_salary.show(n=10)

+--------------------+------------------+
|     department_name|        avg_salary|
+--------------------+------------------+
|Corporate Strateg...|102741.38324414717|
| Sales and Marketing|100839.65275449108|
|    Data Engineering|101626.29492163012|
|Research and Deve...|  98714.3003086419|
|Finance and Accou...|100731.07877887784|
|    Customer Service|101585.01600000002|
|               Legal|  99366.3129102167|
|        Data Science|101903.63710344829|
|          Operations|100169.65621722837|
|     Human Resources|104999.43191489363|
+--------------------+------------------+

+--------------------+------------------+
|     department_name|        avg_salary|
+--------------------+------------------+
|Corporate Strateg...|41590.741833333326|
| Sales and Marketing| 41150.40277777778|
|    Data Engineering| 41358.50794117647|
|Finance and Accou...|42740.952888888874|
|Research and Deve...| 41426.43521126761|
|    Customer Service|42644.472021276604|
|               Legal| 41160.2661

In [99]:
df_top_high_salary = spark.sql("""
    SELECT name, department_name, salary
    FROM high_salary_employees
    ORDER BY salary DESC
    LIMIT 5
""")

df_top_low_salary = spark.sql("""
    SELECT name, department_name, salary
    FROM low_salary_employees
    ORDER BY salary DESC
    LIMIT 5
""")

In [100]:
df_top_high_salary.show()
df_top_low_salary.show()

+---------+--------------------+---------+
|     name|     department_name|   salary|
+---------+--------------------+---------+
|Gabriella|     Human Resources|149989.73|
|Katherine|     Human Resources| 149979.3|
|     Ryan|Corporate Strateg...| 149963.1|
|  Caitlyn|               Legal|149956.54|
|     Mark|               Legal|149915.56|
+---------+--------------------+---------+

+-----+--------------------+--------+
| name|     department_name|  salary|
+-----+--------------------+--------+
|Linda|Corporate Strateg...|54993.53|
|Tammy|    Data Engineering|54991.71|
|Aaron|Finance and Accou...|54989.45|
|Craig|     Human Resources| 54945.2|
|Aaron|    Customer Service| 54937.3|
+-----+--------------------+--------+



In [101]:
df_enriched = df_enriched.withColumn(
    "years_in_company", year(current_date()) - year(col("hire_date"))
)

df_enriched.show(n=10)

+-----------+---------+-------------+---------+----------+-------------+--------------------+-------------+---------------+----------------+
|employee_id|     name|department_id|   salary| hire_date|department_id|     department_name|     location|salary_category|years_in_company|
+-----------+---------+-------------+---------+----------+-------------+--------------------+-------------+---------------+----------------+
|          1|  Caitlyn|          103|115959.78|2002-06-10|          103| Sales and Marketing|      Chicago|           High|              23|
|          2|   Rachel|          104|100820.16|2009-07-01|          104|    Data Engineering|      Zapopan|           High|              16|
|          3|   Carrie|          105|114421.44|1998-12-10|          105|        Data Science|      Seattle|           High|              27|
|          4|    Renee|          104| 54688.13|1995-03-17|          104|    Data Engineering|      Zapopan|            Low|              30|
|          5|

In [102]:
df_enriched.createOrReplaceTempView("employees")

most_experienced_employees = spark.sql("""
    SELECT name, years_in_company 
    FROM employees
    WHERE years_in_company = (SELECT MAX(years_in_company) FROM employees)
""")

most_experienced_employees.show(n=10)

+--------+----------------+
|    name|years_in_company|
+--------+----------------+
|     Ana|              37|
|    Seth|              37|
|   Megan|              37|
|   Sarah|              37|
|    Mark|              37|
|    Luke|              37|
|  Carrie|              37|
|   James|              37|
|  Brandy|              37|
|Jonathan|              37|
+--------+----------------+
only showing top 10 rows



In [103]:
num_most_experienced = most_experienced_employees.count()

num_most_experienced

88

![DAG Screenshot](../../labs/img/BenjaminVergara.png)

In [104]:
sc.stop()