In [16]:
// This question was asked in a JPMorgan Chase & Co. interview for a Senior Data Engineer role.

// 📊 Question: Create a Comprehensive Fact Table

// ➡ Given two input files:
// 👉 employee.csv with columns: employee_id, department, salary
// 👉 person.csv with columns: employee_id, first_name, last_name, DOB, state, country

// ➡ Write transformations to create employee_fact with columns:
//  -- employee_id
//  -- employee_full_name
//  -- department
//  -- salary
//  -- Salary_Diff_to_reach_highest_sal (the company-wide highest salary minus this employee's salary)
//  -- DOB
//  -- state
//  -- country
//  -- age

// ✅ 𝗘𝘅𝗽𝗹𝗮𝗻𝗮𝘁𝗶𝗼𝗻:
// 1️⃣ Fetch the highest salary at the company level
// 2️⃣ Join two files and prepare the required columns


// Employee input records: one (employee_id, department, salary) tuple per employee.
// The rows are built inside a block so only the DataFrame name is defined in the session.
val employee_fact_df = {
  val employeeRows = Seq(
    (1, "HR", 15000),
    (2, "IT", 18000),
    (3, "HR", 20000),
    (4, "IT", 25000),
    (5, "ADMIN", 12000)
  )
  employeeRows.toDF("employee_id", "department", "salary")
}

// Print the full (untruncated) contents and the inferred schema.
employee_fact_df.show(truncate = false)
employee_fact_df.printSchema()

// Person input records keyed by employee_id; DOB is supplied as a yyyy-MM-dd string.
val person_dim_df = {
  val personRows = Seq(
    (1, "Rohit", "Khanna", "1995-12-10", "Delhi", "IN"),
    (2, "Arjun", "Rao", "1993-10-10", "Chennai", "IN"),
    (3, "Kuldeep", "Nair", "1994-02-20", "Delhi", "IN"),
    (4, "Viraj", "Khaskar", "1995-03-19", "Bengalore", "IN"),
    (5, "Aditya", "Paul", "1996-06-12", "Mumbai", "IN")
  )
  personRows.toDF("employee_id", "first_name", "last_name", "DOB", "state", "country")
}

person_dim_df.show(truncate = false)

+-----------+----------+------+
|employee_id|department|salary|
+-----------+----------+------+
|1          |HR        |15000 |
|2          |IT        |18000 |
|3          |HR        |20000 |
|4          |IT        |25000 |
|5          |ADMIN     |12000 |
+-----------+----------+------+

root
 |-- employee_id: integer (nullable = false)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = false)

+-----------+----------+---------+----------+---------+-------+
|employee_id|first_name|last_name|DOB       |state    |country|
+-----------+----------+---------+----------+---------+-------+
|1          |Rohit     |Khanna   |1995-12-10|Delhi    |IN     |
|2          |Arjun     |Rao      |1993-10-10|Chennai  |IN     |
|3          |Kuldeep   |Nair     |1994-02-20|Delhi    |IN     |
|4          |Viraj     |Khaskar  |1995-03-19|Bengalore|IN     |
|5          |Aditya    |Paul     |1996-06-12|Mumbai   |IN     |
+-----------+----------+---------+----------+---------+-------+



employee_fact_df: org.apache.spark.sql.DataFrame = [employee_id: int, department: string ... 1 more field]
person_dim_df: org.apache.spark.sql.DataFrame = [employee_id: int, first_name: string ... 4 more fields]


In [27]:
import org.apache.spark.sql.expressions.Window

// Build the employee fact table: employee rows enriched with person attributes,
// the company-wide highest salary, the gap to that salary, and the current age.
//
// Output columns of result_df, in order:
//   employee_id, department, salary, first_name, last_name, DOB, state, country,
//   employee_full_name, max_salary, salary_diff_to_reach_max_salary, age
val result_df = {
  // One-row DataFrame holding the highest salary across all employees.
  // Computing it as an aggregate (instead of max(...).over() with an empty
  // window spec) avoids pulling every row into a single partition.
  val companyMaxSalary = employee_fact_df.agg(max($"salary").as("max_salary"))

  employee_fact_df
    // Joining on the column name keeps a single employee_id column in the
    // output, so no follow-up drop of the right-hand key is needed.
    .join(person_dim_df, Seq("employee_id"), "inner")
    .withColumn("employee_full_name", concat_ws(" ", $"first_name", $"last_name"))
    // Attach the single max_salary value to every row.
    .crossJoin(companyMaxSalary)
    .withColumn("salary_diff_to_reach_max_salary", $"max_salary" - $"salary")
    // Age in completed years. Subtracting calendar years alone over-counts by
    // one for anyone whose birthday has not yet occurred this year, so count
    // whole months elapsed since DOB and convert to whole years instead.
    .withColumn(
      "age",
      floor(months_between(current_date(), to_date($"DOB")) / 12).cast("int")
    )
}

// Project the columns required for the fact table, in the requested order.
result_df
  .select(
    "employee_id",
    "employee_full_name",
    "department",
    "salary",
    "salary_diff_to_reach_max_salary",
    "DOB",
    "state",
    "country",
    "age"
  )
  .show(false)

+-----------+------------------+----------+------+-------------------------------+----------+---------+-------+---+
|employee_id|employee_full_name|department|salary|salary_diff_to_reach_max_salary|DOB       |state    |country|age|
+-----------+------------------+----------+------+-------------------------------+----------+---------+-------+---+
|1          |Rohit Khanna      |HR        |15000 |10000                          |1995-12-10|Delhi    |IN     |29 |
|2          |Arjun Rao         |IT        |18000 |7000                           |1993-10-10|Chennai  |IN     |31 |
|3          |Kuldeep Nair      |HR        |20000 |5000                           |1994-02-20|Delhi    |IN     |30 |
|4          |Viraj Khaskar     |IT        |25000 |0                              |1995-03-19|Bengalore|IN     |29 |
|5          |Aditya Paul       |ADMIN     |12000 |13000                          |1996-06-12|Mumbai   |IN     |28 |
+-----------+------------------+----------+------+-------------------------------+----------+---------+-------+---+

import org.apache.spark.sql.expressions.Window
result_df: org.apache.spark.sql.DataFrame = [employee_id: int, department: string ... 10 more fields]
