In [0]:
"""
You are tasked with managing project budgets at a company. Each project has a fixed budget, and multiple employees work on these projects. The company's payroll is based on annual salaries, and each employee works for a specific duration on a project.

A project is over budget when the salaries of its employees (annual salaries allocated on a per-day basis over the project's duration) exceed the project's budget. For example, if Ankit and Rohit have a combined annual income of 200K and work on a project with a budget of 50K that takes half a year, then the project is over budget because 0.5 * 200K = 100K > 50K.

Write a query to forecast the budget for all projects and return a label of "overbudget" if it is over budget and "within budget" otherwise. Order the result by project title.

Note: Assume that employees only work on one project at a time.

Table: employees 
+-------------+----------+
| COLUMN_NAME | DATA_TYPE|
+-------------+----------+
| id          | int      |
| name        | varchar  |
| salary      | int      |
+-------------+----------+
Table: projects 
+-------------+----------+
| COLUMN_NAME | DATA_TYPE|
+-------------+----------+
| id          | int      |
| title       | varchar  |
| start_date  | date     |
| end_date    | date     |
| budget      | int      |
+-------------+----------+
Table: project_employees 
+-------------+----------+
| COLUMN_NAME | DATA_TYPE|
+-------------+----------+
| project_id  | int      |
| employee_id | int      |
+-------------+----------+

+------+---------+--------+
| id   | name    | salary |
+------+---------+--------+
|    1 | Alice   | 100000 |
|    2 | Bob     | 120000 |
|    3 | Charlie |  90000 |
|    4 | David   | 110000 |
|    5 | Eva     |  95000 |
|    6 | Frank   | 105000 |
|    7 | Grace   |  98000 |
|    8 | Helen   | 115000 |
+------+---------+--------+
+----+--------------------+------------+------------+--------+
| id | title              | start_date | end_date   | budget |
+----+--------------------+------------+------------+--------+
|  1 | Website Redesign   | 2024-01-15 | 2024-07-15 |  50000 |
|  2 | App Development    | 2024-02-01 | 2024-05-31 | 100000 |
|  3 | Cloud Migration    | 2024-03-01 | 2024-04-30 |  20000 |
|  4 | Analytics Platform | 2024-05-05 | 2024-08-05 |  80000 |
+----+--------------------+------------+------------+--------+
+------------+-------------+
| project_id | employee_id |
+------------+-------------+
|          1 |           1 |
|          2 |           2 |
|          2 |           3 |
|          2 |           4 |
|          3 |           5 |
|          3 |           6 |
|          3 |           7 |
|          3 |           8 |
|          4 |           6 |
|          4 |           7 |
+------------+-------------+


Output
+--------------------+--------+---------------+
| title              | budget | label         |
+--------------------+--------+---------------+
| Analytics Platform |  80000 | within budget |
| App Development    | 100000 | overbudget    |
| Cloud Migration    |  20000 | overbudget    |
| Website Redesign   |  50000 | within budget |
+--------------------+--------+---------------+
"""

# --- Sample data -----------------------------------------------------------
# Build the three input tables as Spark DataFrames. Rows and column names are
# kept in separate lists so the data reads like the tables in the problem
# statement. Dates stay as 'YYYY-MM-DD' strings.

# employees: one row per employee with their annual salary.
employee_columns = ["id", "name", "salary"]
employee_rows = [
    (1, 'Alice', 100000),
    (2, 'Bob', 120000),
    (3, 'Charlie', 90000),
    (4, 'David', 110000),
    (5, 'Eva', 95000),
    (6, 'Frank', 105000),
    (7, 'Grace', 98000),
    (8, 'Helen', 115000),
]
employees_df = spark.createDataFrame(employee_rows, employee_columns)

# projects: one row per project with its date range and fixed budget.
project_columns = ["id", "title", "start_date", "end_date", "budget"]
project_rows = [
    (1, 'Website Redesign', '2024-01-15', '2024-07-15', 50000),
    (2, 'App Development', '2024-02-01', '2024-05-31', 100000),
    (3, 'Cloud Migration', '2024-03-01', '2024-04-30', 20000),
    (4, 'Analytics Platform', '2024-05-05', '2024-08-05', 80000),
]
projects_df = spark.createDataFrame(project_rows, project_columns)

# project_employees: which employee is assigned to which project.
assignment_columns = ["project_id", "employee_id"]
assignment_rows = [
    (1, 1),
    (2, 2), (2, 3), (2, 4),
    (3, 5), (3, 6), (3, 7), (3, 8),
    (4, 6), (4, 7),
]
project_employees_df = spark.createDataFrame(assignment_rows, assignment_columns)

# Print every table, in the same order as they were defined.
for sample_df in (employees_df, projects_df, project_employees_df):
    sample_df.show()

+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  1|  Alice|100000|
|  2|    Bob|120000|
|  3|Charlie| 90000|
|  4|  David|110000|
|  5|    Eva| 95000|
|  6|  Frank|105000|
|  7|  Grace| 98000|
|  8|  Helen|115000|
+---+-------+------+

+---+------------------+----------+----------+------+
| id|             title|start_date|  end_date|budget|
+---+------------------+----------+----------+------+
|  1|  Website Redesign|2024-01-15|2024-07-15| 50000|
|  2|   App Development|2024-02-01|2024-05-31|100000|
|  3|   Cloud Migration|2024-03-01|2024-04-30| 20000|
|  4|Analytics Platform|2024-05-05|2024-08-05| 80000|
+---+------------------+----------+----------+------+

+----------+-----------+
|project_id|employee_id|
+----------+-----------+
|         1|          1|
|         2|          2|
|         2|          3|
|         2|          4|
|         3|          5|
|         3|          6|
|         3|          7|
|         3|          8|
|         4|          6|
|         4|          7|
+----------+-----------+

In [0]:

from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Forecast every project's salary cost and label it against its budget.
#
# Cost model: (sum of the assigned employees' annual salaries)
#             * (project duration in days) / DAYS_PER_YEAR
# A project is "overbudget" only when that cost strictly exceeds its budget.
DAYS_PER_YEAR = 365  # salaries are annual; the forecast prorates them per day

# One row per project with the combined annual salary of its team.
project_costs_df = (
    projects_df.alias("p")
    # Left joins keep projects that have no employees assigned (cost 0), so
    # the result really covers all projects.
    .join(
        project_employees_df.alias("pe"),
        F.col("p.id") == F.col("pe.project_id"),
        "left",
    )
    .join(
        employees_df.alias("e"),
        F.col("e.id") == F.col("pe.employee_id"),
        "left",
    )
    # Group on the project id, not only the title: two different projects
    # that happen to share a title must not be merged into one row.
    .groupBy(
        F.col("p.id"),
        F.col("p.title"),
        F.col("p.start_date"),
        F.col("p.end_date"),
        F.col("p.budget"),
    )
    # sum() over no matching employees is NULL; treat that as a zero cost.
    .agg(F.coalesce(F.sum(F.col("e.salary")), F.lit(0)).alias("total_annual_salary"))
)

forecast_df = (
    project_costs_df
    .withColumn("total_days", F.datediff(F.col("end_date"), F.col("start_date")))
    .withColumn(
        "salary_for_total_days",
        F.col("total_days") * F.col("total_annual_salary") / DAYS_PER_YEAR,
    )
    .withColumn(
        "label",
        # budget == cost still counts as within budget ("exceed" is strict).
        F.when(
            F.col("budget") >= F.col("salary_for_total_days"),
            F.lit("within budget"),
        ).otherwise(F.lit("overbudget")),
    )
    # id is a deterministic tie-breaker for projects with the same title.
    .orderBy("title", "id")
    .select("title", "budget", "label")
)

forecast_df.show()

+------------------+------+-------------+
|             title|budget|        label|
+------------------+------+-------------+
|Analytics Platform| 80000|within budget|
|   App Development|100000|   overbudget|
|   Cloud Migration| 20000|   overbudget|
|  Website Redesign| 50000|within budget|
+------------------+------+-------------+



In [0]:
# Expose the DataFrames to Spark SQL under the table names used in the
# problem statement.
employees_df.createOrReplaceTempView("employees")
projects_df.createOrReplaceTempView("projects")
project_employees_df.createOrReplaceTempView("project_employees")

# Same forecast as the DataFrame cell, written in SQL.
# - project_costs: one row per project (grouped by project id, so projects
#   sharing a title are not merged). Left joins keep projects without any
#   assigned employee; their salary total is coalesced to 0.
# - A project is 'overbudget' only when the prorated cost strictly exceeds
#   the budget, hence `budget >= cost` means 'within budget'.
# - datediff is used instead of date_diff to match the DataFrame cell.
spark.sql("""
          with project_costs as (
            select
                p.id,
                p.title,
                p.budget,
                datediff(p.end_date, p.start_date) as total_days,
                coalesce(sum(e.salary), 0) as total_annual_salary
            from projects p
            left join project_employees pe on p.id = pe.project_id
            left join employees e on e.id = pe.employee_id
            group by p.id, p.title, p.budget, p.start_date, p.end_date
          )
          select
            title,
            budget,
            case
                when budget >= ((total_days * total_annual_salary) / 365) then 'within budget'
                else 'overbudget'
            end as label
          from project_costs
          order by title, id
          """).show()


+------------------+------+-------------+
|             title|budget|        label|
+------------------+------+-------------+
|Analytics Platform| 80000|within budget|
|   App Development|100000|   overbudget|
|   Cloud Migration| 20000|   overbudget|
|  Website Redesign| 50000|within budget|
+------------------+------+-------------+

