# **Week -5**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder.appName("ETL").getOrCreate()
spark

#**Loading data**

In [None]:
dfAtt = spark.read.csv(r"C:\Stack overflow\Data-Engineering-Training\CapStone Project\Employee Attendance and Productivity Tracker\Week-5\Datasets\attendance.csv", header=True, inferSchema=True)
dfEmp = spark.read.csv(r"C:\Stack overflow\Data-Engineering-Training\CapStone Project\Employee Attendance and Productivity Tracker\Week-5\Datasets\employees.csv", header=True, inferSchema=True)
dfTas = spark.read.csv(r"C:\Stack overflow\Data-Engineering-Training\CapStone Project\Employee Attendance and Productivity Tracker\Week-5\Datasets\tasks.csv", header=True, inferSchema=True)

In [5]:
dfAtt.printSchema()

root
 |-- attendanceID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- clockIN: string (nullable = true)
 |-- clockOUT: string (nullable = true)
 |-- isLate: integer (nullable = true)
 |-- isAbscent: integer (nullable = true)



In [6]:
dfEmp.printSchema()

root
 |-- employeeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- email: string (nullable = true)
 |-- hireDate: date (nullable = true)
 |-- status: string (nullable = true)



In [7]:
dfTas.printSchema()

root
 |-- taskID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- taskName: string (nullable = true)
 |-- taskDate: date (nullable = true)
 |-- tasksCompeleted: integer (nullable = true)



# **Cleaning**
- Unable to drop Null from dfAtt(Attendance) because clockin and clockout abscent data reperesented as Null values

In [8]:
dfEmp = dfEmp.dropna()
dfTas = dfTas.dropna()

# **Top 5 abscentees**

In [None]:
dfAbs = dfEmp.join(dfAtt, on="employeeID", how="inner") \
             .groupBy("employeeID") \
             .agg(
                 F.sum("isAbscent").alias("AbscentCount")
             ) \
             .join(dfEmp.select(["employeeID", "name"]), how="inner", on="employeeID") \
             .sort("AbscentCount", ascending=False) \
             .limit(5) \
             .select(["employeeID", "name", "AbscentCount"])

dfAbs.write.mode("overwrite").csv("abscentees_top_5")

+----------+-------------+------------+
|employeeID|         name|AbscentCount|
+----------+-------------+------------+
|         3|Alice Johnson|           2|
|         5|    Eva Green|           2|
|         4|    Bob Brown|           1|
|         2|   Jane Smith|           1|
|         1|     John Doe|           0|
+----------+-------------+------------+



# **Lowest performing departments**

In [24]:
dfJoined = dfEmp.join(dfAtt, on="employeeID", how="inner") \
                .join(dfTas, on="employeeID", how="inner") \
                .groupBy("department") \
                .agg(
                    F.sum("tasksCompeleted").alias("TasksProductivityScore")
                ) \
                .sort("TasksProductivityScore", ascending=True) \
                .limit(2)

dfJoined.write.mode("overwrite").csv("lowest_performing_departments")

+----------+----------------------+
|department|TasksProductivityScore|
+----------+----------------------+
|        HR|                    18|
| Marketing|                    90|
+----------+----------------------+

