# <center> <img src="../../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
### <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
**Primer Examen**

**Fecha**: 14 de Marzo del 2025

**Nombre del estudiante**:

**Professor**: Pablo Camarillo Ramirez

In [85]:
import findspark
findspark.init()

In [86]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SparkSQL-Exam-1-AndresBlanco") \
    .master("spark://dc612074df78:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext

In [87]:
employees_schema = SparkUtils.generate_schema([("employee_id", "string"), ("employee_info", "string")])
employees_df = spark.read \
                .schema(employees_schema) \
                .option("header", "true") \
                .csv("/home/jovyan/notebooks/data/Exam_CSV/employees.csv")
employees_df.show(truncate=False)


departments_schema = SparkUtils.generate_schema([("department_id", "string"), ("department_name", "string"), ("location", "string")])
departments_df = spark.read \
                .schema(departments_schema) \
                .option("header", "true") \
                .csv("/home/jovyan/notebooks/data/EXAM_CSV/departments.csv")
departments_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----------+-------------------------------------------------------------------------------------------+
|employee_id|employee_info                                                                              |
+-----------+-------------------------------------------------------------------------------------------+
|1          |{'name': 'Caitlyn', 'department_id': 103, 'salary': 115959.78, 'hire_date': '2002-06-10'}  |
|2          |{'name': 'Rachel', 'department_id': 104, 'salary': 100820.16, 'hire_date': '2009-07-01'}   |
|3          |{'name': 'Carrie', 'department_id': 105, 'salary': 114421.44, 'hire_date': '1998-12-10'}   |
|4          |{'name': 'Renee', 'department_id': 104, 'salary': 54688.13, 'hire_date': '1995-03-17'}     |
|5          |{'name': 'Gabriella', 'department_id': 109, 'salary': 106267.03, 'hire_date': '1995-02-09'}|
|6          |{'name': 'Kristen', 'department_id': 101, 'salary': 88237.54, 'hire_date': '2010-11-15'}   |
|7          |{'name': 'Jonathan', 'department_

                                                                                

In [88]:
from pyspark.sql.functions import get_json_object, col

employees_info = employees_df \
    .withColumn("name", get_json_object(employees_df.employee_info, "$.name")) \
    .withColumn("department_id", get_json_object(employees_df.employee_info, "$.department_id")) \
    .withColumn("salary", get_json_object(employees_df.employee_info, "$.salary")) \
    .withColumn("hire_date", get_json_object(employees_df.employee_info, "$.hire_date"))

employees_info = employees_info \
    .withColumn("department_id", col("department_id").cast("integer")) \
    .withColumn("salary", col("salary").cast("double")) \
    .withColumn("hire_date", col("hire_date").cast("date"))

employees_info.printSchema()  
employees_info.show(truncate=False)

root
 |-- employee_id: string (nullable = true)
 |-- employee_info: string (nullable = true)
 |-- name: string (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- salary: double (nullable = true)
 |-- hire_date: date (nullable = true)

+-----------+-------------------------------------------------------------------------------------------+---------+-------------+---------+----------+
|employee_id|employee_info                                                                              |name     |department_id|salary   |hire_date |
+-----------+-------------------------------------------------------------------------------------------+---------+-------------+---------+----------+
|1          |{'name': 'Caitlyn', 'department_id': 103, 'salary': 115959.78, 'hire_date': '2002-06-10'}  |Caitlyn  |103          |115959.78|2002-06-10|
|2          |{'name': 'Rachel', 'department_id': 104, 'salary': 100820.16, 'hire_date': '2009-07-01'}   |Rachel   |104          |100820.16|2009

In [89]:
from pyspark.sql.functions import when
employees_info_with_category = employees_info.withColumn(
    "salary_category", 
    when(col("salary") >= 55000, "High")
    .otherwise("Low")
)

employees_info_with_category.select("employee_id", "name", "salary", "salary_category").show()

+-----------+---------+---------+---------------+
|employee_id|     name|   salary|salary_category|
+-----------+---------+---------+---------------+
|          1|  Caitlyn|115959.78|           High|
|          2|   Rachel|100820.16|           High|
|          3|   Carrie|114421.44|           High|
|          4|    Renee| 54688.13|            Low|
|          5|Gabriella|106267.03|           High|
|          6|  Kristen| 88237.54|           High|
|          7| Jonathan| 39323.42|            Low|
|          8| Michelle| 64262.85|           High|
|          9| Michelle|103521.88|           High|
|         10|     Lisa| 55435.93|           High|
|         11|   Cheryl| 88073.75|           High|
|         12|  Mikayla| 95192.05|           High|
|         13|     Lisa| 36032.49|            Low|
|         14|     Sean| 64904.69|           High|
|         15|   Monica| 92589.97|           High|
|         16|  Katelyn|147225.58|           High|
|         17|    Linda|146632.64|           High|


In [90]:
from pyspark.sql.functions import avg, round

# Filter employees by salary category
high_salary_employees = employees_info_with_category.filter(col("salary_category") == "High")
low_salary_employees = employees_info_with_category.filter(col("salary_category") == "Low")

# Join with departments dataframe to get department names
high_salary_with_dept = high_salary_employees.join(
    departments_df,
    high_salary_employees.department_id == departments_df.department_id,
    "inner"
)

low_salary_with_dept = low_salary_employees.join(
    departments_df,
    low_salary_employees.department_id == departments_df.department_id,
    "inner"
)

# Calculate average salary per department for high salary employees
high_salary_avg_by_dept = high_salary_with_dept.groupBy("department_name") \
    .agg(round(avg("salary"), 2).alias("avg_salary")) \
    .select("department_name", "avg_salary")

low_salary_avg_by_dept = low_salary_with_dept.groupBy("department_name") \
    .agg(round(avg("salary"), 2).alias("avg_salary")) \
    .select("department_name", "avg_salary")

print("Average salary per department for High salary category:")
high_salary_avg_by_dept.show()

print("Average salary per department for Low salary category:")
low_salary_avg_by_dept.show()

Average salary per department for High salary category:
+--------------------+----------+
|     department_name|avg_salary|
+--------------------+----------+
|Corporate Strateg...| 102741.38|
| Sales and Marketing| 100839.65|
|    Data Engineering| 101626.29|
|Research and Deve...|   98714.3|
|Finance and Accou...| 100731.08|
|    Customer Service| 101585.02|
|               Legal|  99366.31|
|        Data Science| 101903.64|
|          Operations| 100169.66|
|     Human Resources| 104999.43|
+--------------------+----------+

Average salary per department for Low salary category:
+--------------------+----------+
|     department_name|avg_salary|
+--------------------+----------+
|Corporate Strateg...|  41590.74|
| Sales and Marketing|   41150.4|
|    Data Engineering|  41358.51|
|Finance and Accou...|  42740.95|
|Research and Deve...|  41426.44|
|    Customer Service|  42644.47|
|               Legal|  41160.27|
|        Data Science|  41974.19|
|          Operations|   40646.1|
|   

In [91]:
from pyspark.sql.functions import desc

# Top 5 employees with highest salaries from "High" category
print("Top 5 employees with highest salaries from 'High' category:")
high_salary_employees.select("employee_id", "name", "salary", "department_id") \
    .orderBy(desc("salary")) \
    .limit(5) \
    .show()

# Top 5 employees with highest salaries from "Low" category
print("Top 5 employees with highest salaries from 'Low' category:")
low_salary_employees.select("employee_id", "name", "salary", "department_id") \
    .orderBy(desc("salary")) \
    .limit(5) \
    .show()

# Optional: join with departments to show department names as well
print("Top 5 'High' salary employees with department names:")
high_salary_with_dept.select("employee_id", "name", "salary", "department_name") \
    .orderBy(desc("salary")) \
    .limit(5) \
    .show()

print("Top 5 'Low' salary employees with department names:")
low_salary_with_dept.select("employee_id", "name", "salary", "department_name") \
    .orderBy(desc("salary")) \
    .limit(5) \
    .show(truncate=False)

Top 5 employees with highest salaries from 'High' category:
+-----------+---------+---------+-------------+
|employee_id|     name|   salary|department_id|
+-----------+---------+---------+-------------+
|       1778|Gabriella|149989.73|          101|
|       3621|Katherine| 149979.3|          101|
|        346|     Ryan| 149963.1|          110|
|       3807|  Caitlyn|149956.54|          107|
|       3050|     Mark|149915.56|          107|
+-----------+---------+---------+-------------+

Top 5 employees with highest salaries from 'Low' category:
+-----------+-----+--------+-------------+
|employee_id| name|  salary|department_id|
+-----------+-----+--------+-------------+
|       3472|Linda|54993.53|          110|
|       2545|Tammy|54991.71|          104|
|        382|Aaron|54989.45|          102|
|       2153|Craig| 54945.2|          101|
|       3024|Aaron| 54937.3|          109|
+-----------+-----+--------+-------------+

Top 5 'High' salary employees with department names:
+------

In [92]:
from pyspark.sql.functions import current_date, datediff, col
from pyspark.sql.functions import when

employees_years = employees_info.withColumn(
    "years_in_company", 
    (datediff(current_date(), col("hire_date")) / 365).cast("int")
)

print("Empleados con sus años en la empresa:")
employees_years.select("name", "hire_date", "years_in_company") \
    .orderBy("years_in_company", ascending=False) \
    .show(5)

# Empleados más antiguos
max_años = employees_years.agg({"years_in_company": "max"}).collect()[0][0]
print(f"Antigüedad máxima: {max_años} años")

print("Empleados con mayor antigüedad:")
employees_years.filter(col("years_in_company") == max_años) \
    .select("name", "hire_date", "years_in_company") \
    .show()

Empleados con sus años en la empresa:
+------+----------+----------------+
|  name| hire_date|years_in_company|
+------+----------+----------------+
|Brandy|1988-03-17|              37|
|Monica|1988-03-15|              37|
|Daniel|1989-01-15|              36|
|  Mark|1988-11-20|              36|
| Sarah|1988-04-14|              36|
+------+----------+----------------+
only showing top 5 rows

Antigüedad máxima: 37 años
Empleados con mayor antigüedad:
+------+----------+----------------+
|  name| hire_date|years_in_company|
+------+----------+----------------+
|Brandy|1988-03-17|              37|
|Monica|1988-03-15|              37|
+------+----------+----------------+



In [93]:
sc.stop()

![Alt Text](SS_Andres.png)