# **Week-3 Pyspark**

In [37]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# **Initalizing spark session**

In [38]:
spark = SparkSession.builder.appName("week-3").getOrCreate()
spark

# **Loading large datasets**
---
Same as `week-2` datasets but contains more records with little alterations




In [39]:
dfAtt = spark.read.csv(r"/content/attendance.csv", header=True, inferSchema=True)
dfEmp = spark.read.csv(r"/content/employees.csv", header=True, inferSchema=True)
dfTas = spark.read.csv(r"/content/tasks.csv", header=True, inferSchema=True)

# **Printing Schema**

In [52]:
dfAtt.printSchema()

root
 |-- attendanceID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- clockIN: string (nullable = true)
 |-- clockOUT: string (nullable = true)
 |-- isLate: integer (nullable = true)
 |-- isAbscent: integer (nullable = true)



In [41]:
dfEmp.printSchema()

root
 |-- employeeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- email: string (nullable = true)
 |-- hireDate: date (nullable = true)
 |-- status: string (nullable = true)



In [42]:
dfTas.printSchema()

root
 |-- taskID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- taskName: string (nullable = true)
 |-- taskDate: date (nullable = true)
 |-- tasksCompeleted: integer (nullable = true)



# **Filtering late login and abscences**

In [43]:
dfAtt.filter((dfAtt.isLate == 1) | (dfAtt.isAbscent == 1)) \
  .join(dfEmp.select(["name", "employeeID"]), on="employeeID", how="inner") \
  .withColumn("Attendance", F.when(F.col("islate") == 1, "Late Login").otherwise("Abscent")) \
  .select(["name", "Attendance", "date"]) \
  .show()

+-------------+----------+----------+
|         name|Attendance|      date|
+-------------+----------+----------+
|     John Doe|Late Login|01-06-2024|
|Alice Johnson|Late Login|01-06-2024|
|    Eva Green|   Abscent|01-06-2024|
|   Jane Smith|Late Login|02-06-2024|
|Alice Johnson|   Abscent|02-06-2024|
|    Eva Green|Late Login|02-06-2024|
|   Jane Smith|   Abscent|03-06-2024|
|    Bob Brown|Late Login|03-06-2024|
|     John Doe|Late Login|04-06-2024|
|Alice Johnson|Late Login|04-06-2024|
|    Bob Brown|   Abscent|04-06-2024|
|   Jane Smith|Late Login|05-06-2024|
|Alice Johnson|   Abscent|05-06-2024|
|    Eva Green|Late Login|05-06-2024|
|   Jane Smith|Late Login|06-06-2024|
|    Bob Brown|Late Login|06-06-2024|
|    Eva Green|   Abscent|06-06-2024|
+-------------+----------+----------+



# **Average work hours and productivity**

In [46]:
dfAtt_cleaned = dfAtt.filter(F.col("clockIN") != "NULL")

In [67]:
dfJoined = dfAtt_cleaned.join(dfEmp, on="employeeID", how="inner").join(dfTas, on="employeeID", how="inner")
dfJoined = dfJoined \
    .withColumn(
     "workHours",
    F.round(
        (F.unix_timestamp(F.col("clockOUT"), "dd-MM-yyyy HH:mm") - F.unix_timestamp(F.col("clockIN"), "dd-MM-yyyy HH:mm")) / 3600,
        2
    )) \
    .withColumn("productivityScore", F.round(F.col("tasksCompeleted") / F.col("workHours"), 4))

In [74]:
dfJoined.groupBy("department").agg(
    F.round(F.mean("workHours"), 2).alias("averageWorkHours"),
    F.round(F.mean("productivityScore"), 2).alias("averageProductivityScore")
).show()

+-----------+----------------+------------------------+
| department|averageWorkHours|averageProductivityScore|
+-----------+----------------+------------------------+
|Engineering|            8.02|                    0.52|
|         HR|            8.08|                    0.06|
|    Finance|            7.87|                    0.38|
|  Marketing|            8.07|                    0.31|
+-----------+----------------+------------------------+



# **Deliverables**

1.   Pyspark script with filtering, groupby aggregations
2.   Showing attendance issuses by department



In [75]:
# 1. pyspark script has attached in .ipynb format in git repo

In [80]:
# 2. attendance issues by department
dfJoined_2 = dfAtt.join(dfEmp, on="employeeID", how="inner")

dfJoined_2.select(["department", "isLate", "isAbscent"]) \
  .filter((F.col("isLate") == 1) | (F.col("isAbscent") == 1)) \
  .groupby("department") \
  .agg(
      F.sum("isLate").alias("lateCount"),
      F.sum("isAbscent").alias("abscentCount")
  ) \
  .withColumn("issuesCount", F.col("lateCount") + F.col("abscentCount")) \
  .show()

+-----------+---------+------------+-----------+
| department|lateCount|abscentCount|issuesCount|
+-----------+---------+------------+-----------+
|Engineering|        4|           1|          5|
|         HR|        2|           2|          4|
|    Finance|        2|           2|          4|
|  Marketing|        3|           1|          4|
+-----------+---------+------------+-----------+

