In [0]:
"""
In your organization, each employee has a fixed joining salary recorded at the time they start. Over time, employees may receive one or more promotions, each offering a certain percentage increase to their current salary.

You're given two datasets:
employees :  contains each employeeâ€™s name and joining salary.
promotions:  lists all promotions that have occurred, including the promotion date and the percent increase granted during that promotion.

Your task is to write a SQL query to compute the current salary of every employee by applying each of their promotions increase round to 1 decimal places.
If an employee has no promotions, their current salary remains equal to the joining salary. Order the result by emp id.

Table: employees
+--------------+----------+
| COLUMN_NAME  | DATA_TYPE|
+--------------+----------+
| id            | INT     |
| name          | VARCHAR |  
|joining_salary | INT     |  
+--------------+----------+
Table: promotions
+----------------+----------+
| COLUMN_NAME    | DATA_TYPE|
+----------------+----------+
|emp_id          | INT      |
|promotion_date  | DATE     | 
|percent_increase| INT      |   
+--------------+------------+

+------+---------+----------------+
| id   | name    | joining_salary |
+------+---------+----------------+
|    1 | Alice   |          50000 |
|    2 | Bob     |          60000 |
|    3 | Charlie |          70000 |
|    4 | David   |          55000 |
|    5 | Eva     |          65000 |
|    6 | Frank   |          48000 |
|    7 | Grace   |          72000 |
|    8 | Henry   |          51000 |
+------+---------+----------------+
+--------+----------------+------------------+
| emp_id | promotion_date | percent_increase |
+--------+----------------+------------------+
|      1 | 2021-01-15     |               10 |
|      1 | 2022-03-20     |               20 |
|      2 | 2023-01-01     |                5 |
|      2 | 2024-01-01     |               10 |
|      3 | 2022-05-10     |                5 |
|      3 | 2023-07-01     |               10 |
|      3 | 2024-10-10     |                5 |
|      4 | 2021-09-21     |               15 |
|      4 | 2022-09-25     |               15 |
|      4 | 2023-09-01     |               15 |
|      4 | 2024-09-30     |               15 |
|      5 | 2023-02-01     |               10 |
|      5 | 2023-12-01     |               10 |
|      6 | 2022-06-15     |                5 |
|      6 | 2023-11-11     |               10 |
|      7 | 2022-01-01     |                7 |
+--------+----------------+------------------+


Output
+------+---------+----------------+----------------+
| id   | name    | initial_salary | current_salary |
+------+---------+----------------+----------------+
|    1 | Alice   |          50000 |          66000 |
|    2 | Bob     |          60000 |          69300 |
|    3 | Charlie |          70000 |        84892.5 |
|    4 | David   |          55000 |        96195.3 |
|    5 | Eva     |          65000 |          78650 |
|    6 | Frank   |          48000 |          55440 |
|    7 | Grace   |          72000 |          77040 |
|    8 | Henry   |          51000 |          51000 |
+------+---------+----------------+----------------+
"""

employees_df = spark.createDataFrame([
    (1,'Alice',50000),
    (2,'Bob',60000),
    (3,'Charlie',70000),
    (4,'David',55000),
    (5,'Eva',65000),
    (6,'Frank',48000),
    (7,'Grace',72000),
    (8,'Henry',51000)
 ], ["id", "name", "joining_salary"])

promotions_df = spark.createDataFrame([
    (1,'2021-01-15',10),
    (1,'2022-03-20',20),
    (2,'2023-01-01',5),
    (2,'2024-01-01',10),
    (3,'2022-05-10',5),
    (3,'2023-07-01',10),
    (3,'2024-10-10',5),
    (4,'2021-09-21',15),
    (4,'2022-09-25',15),
    (4,'2023-09-01',15),
    (4,'2024-09-30',15),
    (5,'2023-02-01',10),
    (5,'2023-12-01',10),
    (6,'2022-06-15',5),
    (6,'2023-11-11',10),
    (7,'2022-01-01',7)
], ["emp_id", "promotion_date", "percent_increase"])

employees_df.show()
promotions_df.show()

+---+-------+--------------+
| id|   name|joining_salary|
+---+-------+--------------+
|  1|  Alice|         50000|
|  2|    Bob|         60000|
|  3|Charlie|         70000|
|  4|  David|         55000|
|  5|    Eva|         65000|
|  6|  Frank|         48000|
|  7|  Grace|         72000|
|  8|  Henry|         51000|
+---+-------+--------------+

+------+--------------+----------------+
|emp_id|promotion_date|percent_increase|
+------+--------------+----------------+
|     1|    2021-01-15|              10|
|     1|    2022-03-20|              20|
|     2|    2023-01-01|               5|
|     2|    2024-01-01|              10|
|     3|    2022-05-10|               5|
|     3|    2023-07-01|              10|
|     3|    2024-10-10|               5|
|     4|    2021-09-21|              15|
|     4|    2022-09-25|              15|
|     4|    2023-09-01|              15|
|     4|    2024-09-30|              15|
|     5|    2023-02-01|              10|
|     5|    2023-12-01|             

In [0]:
from pyspark.sql.functions import  *
from pyspark.sql.window import *

employees_df.join(
    promotions_df.withColumn("log_component", log(1+col("percent_increase")/100.0)).groupBy("emp_id").agg(exp(sum("log_component")).alias("total_multiplier"))
    , employees_df.id == promotions_df.emp_id, "left") \
    .withColumn("total_multiplier", when(col("total_multiplier").isNull(), lit(1.0)).otherwise(col("total_multiplier"))) \
    .withColumn("current_salary", (col("joining_salary")*col("total_multiplier")).cast("decimal(6,1)") ) \
    .select("id","name",col("joining_salary").alias("initial_salary"),"current_salary").orderBy("id") \
    .show() 


+---+-------+--------------+--------------+
| id|   name|initial_salary|current_salary|
+---+-------+--------------+--------------+
|  1|  Alice|         50000|       66000.0|
|  2|    Bob|         60000|       69300.0|
|  3|Charlie|         70000|       84892.5|
|  4|  David|         55000|       96195.3|
|  5|    Eva|         65000|       78650.0|
|  6|  Frank|         48000|       55440.0|
|  7|  Grace|         72000|       77040.0|
|  8|  Henry|         51000|       51000.0|
+---+-------+--------------+--------------+



In [0]:
%sql


drop table if exists employees ;
create table employees (
  id INT,
  name VARCHAR(50),
  joining_salary INT  
);
insert into employees values
(1,'Alice',50000),
(2,'Bob',60000),
(3,'Charlie',70000),
(4,'David',55000),
(5,'Eva',65000),
(6,'Frank',48000),
(7,'Grace',72000),
(8,'Henry',51000) ;


drop table if exists promotions;
create table promotions (
  emp_id INT,
  promotion_date DATE,
  percent_increase INT    
);
insert into promotions values
(1,'2021-01-15',10),
(1,'2022-03-20',20),
(2,'2023-01-01',5),
(2,'2024-01-01',10),
(3,'2022-05-10',5),
(3,'2023-07-01',10),
(3,'2024-10-10',5),
(4,'2021-09-21',15),
(4,'2022-09-25',15),
(4,'2023-09-01',15),
(4,'2024-09-30',15),
(5,'2023-02-01',10),
(5,'2023-12-01',10),
(6,'2022-06-15',5),
(6,'2023-11-11',10),
(7,'2022-01-01',7);


num_affected_rows,num_inserted_rows
16,16


In [0]:
%sql
select * from employees ;

id,name,joining_salary
1,Alice,50000
2,Bob,60000
3,Charlie,70000
4,David,55000
5,Eva,65000
6,Frank,48000
7,Grace,72000
8,Henry,51000


In [0]:
%sql
select * from promotions ;

emp_id,promotion_date,percent_increase
1,2021-01-15,10
1,2022-03-20,20
2,2023-01-01,5
2,2024-01-01,10
3,2022-05-10,5
3,2023-07-01,10
3,2024-10-10,5
4,2021-09-21,15
4,2022-09-25,15
4,2023-09-01,15


In [0]:
"""
Databricks SQL does not support recursive CTEs (the recursive keyword or UNION ALL recursion). 
You need to implement recursion using PySpark DataFrame operations or iterative logic in Python. Here is a PySpark approach to calculate the salary after each promotion step by step.
"""

# Prepare promos DataFrame with promo_id
promos = (
  promotions_df
  .withColumn(
    "promo_id",
    row_number().over(
      Window.partitionBy("emp_id").orderBy("promotion_date")
    )
  )
)

# Start with employees DataFrame
result = (
  employees_df
  .withColumn("promo_id", lit(0))
  .withColumnRenamed("joining_salary", "current_salary")
)

# Iteratively apply promotions
max_promos = promos.agg(max("promo_id")).collect()[0][0]
for i in range(1, max_promos + 1):
    promo_step = (
      promos
      .filter(col("promo_id") == i)
      .select(
        col("emp_id").alias("id"),
        col("percent_increase"),
        col("promo_id")
      )
    )
    result = (
      result
      .join(promo_step, ["id"], "left")
      .withColumn(
        "current_salary",
        when(
          col("percent_increase").isNotNull(),
          col("current_salary") * (100 + col("percent_increase")) / 100.0
        ).otherwise(col("current_salary"))
      )
      .drop("percent_increase")
    )

display(result)

4

id,name,current_salary,promo_id,promo_id.1,promo_id.2,promo_id.3,promo_id.4
1,Alice,66000.0,0,1.0,2.0,,
2,Bob,69300.0,0,1.0,2.0,,
3,Charlie,84892.5,0,1.0,2.0,3.0,
4,David,96195.34375,0,1.0,2.0,3.0,4.0
5,Eva,78650.0,0,1.0,2.0,,
6,Frank,55440.0,0,1.0,2.0,,
7,Grace,77040.0,0,1.0,,,
8,Henry,51000.0,0,,,,
