In [4]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._


// Employees: one row per employee with the base salary that the percentage
// calculations further down are applied to.
val salary_df = List(
  (1, "Rohan", 5000),
  (2, "Alex", 6000),
  (3, "Maryam", 7000)
).toDF(Seq("emp_id", "emp_name", "base_salary"): _*)

// Income components, each expressed as a percentage.
val income_df = List(
  (1, "Basic", 100),
  (2, "Allowance", 4),
  (3, "Others", 6)
).toDF(Seq("id", "income", "percentage"): _*)

// Deduction components, laid out like income_df except that the name column
// is called "deduction".
val deduction_df = List(
  (1, "Insurance", 5),
  (2, "Health", 6),
  (3, "House", 4)
).toDF(Seq("id", "deduction", "percentage"): _*)

// Print the three inputs without truncating cell values, in definition order.
Seq(salary_df, income_df, deduction_df).foreach(_.show(false))



+------+--------+-----------+
|emp_id|emp_name|base_salary|
+------+--------+-----------+
|1     |Rohan   |5000       |
|2     |Alex    |6000       |
|3     |Maryam  |7000       |
+------+--------+-----------+

+---+---------+----------+
|id |income   |percentage|
+---+---------+----------+
|1  |Basic    |100       |
|2  |Allowance|4         |
|3  |Others   |6         |
+---+---------+----------+

+---+---------+----------+
|id |deduction|percentage|
+---+---------+----------+
|1  |Insurance|5         |
|2  |Health   |6         |
|3  |House    |4         |
+---+---------+----------+



import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
salary_df: org.apache.spark.sql.DataFrame = [emp_id: int, emp_name: string ... 1 more field]
income_df: org.apache.spark.sql.DataFrame = [id: int, income: string ... 1 more field]
deduction_df: org.apache.spark.sql.DataFrame = [id: int, deduction: string ... 1 more field]


In [40]:
// Pair every employee with every income component and, separately, with every
// deduction component. crossJoin states the cartesian product explicitly instead
// of relying on an always-true predicate (lit(1) === lit(1)): an inner join whose
// condition never references both sides is treated as an implicit cartesian
// product, which Spark rejects when spark.sql.crossJoin.enabled is false.
val joined_df1 = salary_df.crossJoin(income_df)
val joined_df2 = salary_df.crossJoin(deduction_df)

// union matches columns by position, not by name, so the result keeps the
// left-hand column names: deduction names end up in the "income" column.
val joined_df = joined_df1.union(joined_df2)
joined_df.show(false)
joined_df.count()

+------+--------+-----------+---+---------+----------+
|emp_id|emp_name|base_salary|id |income   |percentage|
+------+--------+-----------+---+---------+----------+
|1     |Rohan   |5000       |1  |Basic    |100       |
|1     |Rohan   |5000       |2  |Allowance|4         |
|1     |Rohan   |5000       |3  |Others   |6         |
|2     |Alex    |6000       |1  |Basic    |100       |
|2     |Alex    |6000       |2  |Allowance|4         |
|2     |Alex    |6000       |3  |Others   |6         |
|3     |Maryam  |7000       |1  |Basic    |100       |
|3     |Maryam  |7000       |2  |Allowance|4         |
|3     |Maryam  |7000       |3  |Others   |6         |
|1     |Rohan   |5000       |1  |Insurance|5         |
|1     |Rohan   |5000       |2  |Health   |6         |
|1     |Rohan   |5000       |3  |House    |4         |
|2     |Alex    |6000       |1  |Insurance|5         |
|2     |Alex    |6000       |2  |Health   |6         |
|2     |Alex    |6000       |3  |House    |4         |
|3     |Ma

joined_df1: org.apache.spark.sql.DataFrame = [emp_id: int, emp_name: string ... 4 more fields]
joined_df2: org.apache.spark.sql.DataFrame = [emp_id: int, emp_name: string ... 4 more fields]
joined_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [emp_id: int, emp_name: string ... 4 more fields]
res38: Long = 18


In [41]:
// Add an "amount" column computed as base_salary * percentage / 100, then sort
// the rows by employee id and component name.
val amount_df = {
  val componentAmount = col("base_salary").multiply(col("percentage")).divide(100)
  joined_df
    .withColumn("amount", componentAmount)
    .orderBy("emp_id", "income")
}
amount_df.show(false)

+------+--------+-----------+---+---------+----------+------+
|emp_id|emp_name|base_salary|id |income   |percentage|amount|
+------+--------+-----------+---+---------+----------+------+
|1     |Rohan   |5000       |2  |Allowance|4         |200.0 |
|1     |Rohan   |5000       |1  |Basic    |100       |5000.0|
|1     |Rohan   |5000       |2  |Health   |6         |300.0 |
|1     |Rohan   |5000       |3  |House    |4         |200.0 |
|1     |Rohan   |5000       |1  |Insurance|5         |250.0 |
|1     |Rohan   |5000       |3  |Others   |6         |300.0 |
|2     |Alex    |6000       |2  |Allowance|4         |240.0 |
|2     |Alex    |6000       |1  |Basic    |100       |6000.0|
|2     |Alex    |6000       |2  |Health   |6         |360.0 |
|2     |Alex    |6000       |3  |House    |4         |240.0 |
|2     |Alex    |6000       |1  |Insurance|5         |300.0 |
|2     |Alex    |6000       |3  |Others   |6         |360.0 |
|3     |Maryam  |7000       |2  |Allowance|4         |280.0 |
|3     |

amount_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [emp_id: int, emp_name: string ... 5 more fields]


In [42]:
// Turn the long component rows into one row per employee with one column per
// component. Rows are grouped on emp_id as well as emp_name so that two employees
// who happen to share a name are not summed into a single row; emp_id is dropped
// afterwards so the report keeps exactly the columns it had before.
val salary_report_df = {
  // Component names as they appear in the "income" column; each becomes an
  // output column of the same name.
  val components = Seq("Basic", "Allowance", "Others", "Insurance", "Health", "House")
  // when(...) without otherwise(...) yields null for non-matching rows, and sum
  // skips nulls, so each column only adds up the amounts of its own component.
  val totals = components.map { name =>
    sum(when($"income" === name, $"amount")).alias(name)
  }
  amount_df
    .groupBy($"emp_id", $"emp_name")
    .agg(totals.head, totals.tail: _*)
    .drop("emp_id")
}

salary_report_df.show(false)

+--------+------+---------+------+---------+------+-----+
|emp_name|Basic |Allowance|Others|Insurance|Health|House|
+--------+------+---------+------+---------+------+-----+
|Alex    |6000.0|240.0    |360.0 |300.0    |360.0 |240.0|
|Rohan   |5000.0|200.0    |300.0 |250.0    |300.0 |200.0|
|Maryam  |7000.0|280.0    |420.0 |350.0    |420.0 |280.0|
+--------+------+---------+------+---------+------+-----+



salary_report_df: org.apache.spark.sql.DataFrame = [emp_name: string, Basic: double ... 5 more fields]


In [43]:
// Derive gross pay, total deductions and net pay per employee and print the result.
// Columns are referenced with the same capitalisation they were aliased with
// ("Basic", not "basic") so the lookups do not depend on Spark's case-insensitive
// column resolution (spark.sql.caseSensitive = false).
salary_report_df
  .withColumn("gross", $"Basic" + $"Allowance" + $"Others")
  .withColumn("total_deductions", $"Insurance" + $"Health" + $"House")
  // gross and total_deductions were added by the two steps above.
  .withColumn("net_pay", $"gross" - $"total_deductions")
  .show(false)


+--------+------+---------+------+---------+------+-----+------+----------------+-------+
|emp_name|Basic |Allowance|Others|Insurance|Health|House|gross |total_deductions|net_pay|
+--------+------+---------+------+---------+------+-----+------+----------------+-------+
|Alex    |6000.0|240.0    |360.0 |300.0    |360.0 |240.0|6600.0|900.0           |5700.0 |
|Rohan   |5000.0|200.0    |300.0 |250.0    |300.0 |200.0|5500.0|750.0           |4750.0 |
|Maryam  |7000.0|280.0    |420.0 |350.0    |420.0 |280.0|7700.0|1050.0          |6650.0 |
+--------+------+---------+------+---------+------+-----+------+----------------+-------+

