In [1]:
import pandas as pd
import numpy as np

# **Loading Dataset**

In [20]:
# Read the three raw CSV exports (attendance log, task log, employee master)
# into DataFrames, in that order.
dfAttendance, dfTasks, dfEmpl = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"/content/attendance.csv",
        r"/content/tasks.csv",
        r"/content/employees.csv",
    )
)


# **Check dataframes by printing**

In [21]:
# Preview the first five attendance rows to sanity-check the load.
dfAttendance.head(n=5)

Unnamed: 0,attendenceID,employeeID,date,clockIN,clockOUT,isLate,isAbscent
0,1,1,2024-06-01,2024-06-01 09:02:00.000,2024-06-01 17:00:00.000,1,0
1,2,2,2024-06-01,2024-06-01 08:55:00.000,2024-06-01 17:10:00.000,0,0
2,3,3,2024-06-01,,,0,1
3,4,4,2024-06-01,2024-06-01 09:10:00.000,2024-06-01 17:05:00.000,1,0
4,5,5,2024-06-01,2024-06-01 08:50:00.000,2024-06-01 16:45:00.000,0,0


In [22]:
# Preview the first five task rows to sanity-check the load.
dfTasks.head(n=5)

Unnamed: 0,taskID,employeeID,taskName,taskDate,tasksCompeleted
0,1,1,API Integration,01-06-2024,5
1,2,2,Content Calendar Creation,01-06-2024,3
2,3,3,Policy Review,01-06-2024,0
3,4,4,CI/CD Setup,01-06-2024,4
4,5,5,Invoice Auditing,01-06-2024,6


In [23]:
# Preview the first five employee rows to sanity-check the load.
dfEmpl.head(n=5)

Unnamed: 0,employeeID,name,department,role,email,hireDate,status
0,1,John Doe,Engineering,Software Developer,john.doe@example.com,2023-01-15 00:00:00.000,Active
1,2,Jane Smith,Marketing,Content Strategist,jane.smith@example.com,2022-11-20 00:00:00.000,Active
2,3,Alice Johnson,HR,HR Manager,alice.johnson@example.com,2021-09-10 00:00:00.000,Active
3,4,Bob Brown,Engineering,DevOps Engineer,bob.brown@example.com,2023-05-01 00:00:00.000,Active
4,5,Eva Green,Finance,Accountant,eva.green@example.com,2022-06-30 00:00:00.000,Resigned


# **Clean missing entries**

*  The only null values are in the attendance data; we treat those rows as absences and drop them.



In [24]:
# Remove every row that has at least one missing value, in both frames.
# In the attendance preview the missing values are the clockIN/clockOUT of a
# row flagged isAbscent == 1.
# NOTE(review): if every absence row lacks clock times, no isAbscent == 1 row
# survives this step, so any later sum of `isAbscent` over dfAttendance will
# be 0 — confirm that is intended.
dfAttendance = dfAttendance.dropna(axis="index", how="any")
dfTasks = dfTasks.dropna(axis="index", how="any")

# **Calculating working hours, break times and productivity score**

In [25]:
# Build the analysis frame: one row per (attendance record, task record) for
# the same employee on the same calendar day, with the employee's profile
# columns attached.
#
# Joining attendance to tasks on `employeeID` alone pairs every attendance day
# of an employee with every task day of that employee, so tasks from one day
# would be divided by the hours of another and per-employee sums (e.g. of
# `isAbscent`) would be multiplied by the number of task rows. A temporary
# normalised date key restricts the join to matching days.
#
# `date` is ISO formatted (YYYY-MM-DD). `taskDate` appears to be day-first
# (DD-MM-YYYY) in the preview — TODO confirm against the full file; a value in
# any other format raises here instead of silently mis-joining.
attendance_keyed = dfAttendance.assign(
    _workDate=pd.to_datetime(dfAttendance["date"]).dt.normalize()
)
tasks_keyed = dfTasks.assign(
    _workDate=pd.to_datetime(dfTasks["taskDate"], format="%d-%m-%Y").dt.normalize()
)

df = (
    attendance_keyed
    .merge(tasks_keyed, how="inner", on=["employeeID", "_workDate"])
    # The helper key is dropped so the resulting columns (and their order)
    # are the same as a plain employeeID join would produce.
    .drop(columns="_workDate")
    .merge(dfEmpl, how="inner", on="employeeID")
)


In [30]:
# Parse the clock timestamps once instead of inside the arithmetic expression.
clock_in = pd.to_datetime(df["clockIN"])
clock_out = pd.to_datetime(df["clockOUT"])

# Shift length in hours, rounded to 2 decimals. The absolute value is kept so
# the result stays non-negative even if the two timestamps are swapped.
df["workingHours"] = (
    ((clock_out - clock_in).dt.total_seconds() / 3600).abs().round(2)
)

# Tasks completed per hour worked. A shift that rounds to 0 hours would divide
# by zero and yield inf, which then sorts ahead of every real score when the
# performers are ranked. Zero hours are masked to NaN so such rows are simply
# skipped by later mean aggregations.
hours_worked = df["workingHours"].where(df["workingHours"] > 0)
df["productivityScore"] = (df["tasksCompeleted"] / hours_worked).round(2)

# One break per 4 hours worked, rounded to the nearest whole number.
df["breakTimes"] = (df["workingHours"] / 4).round()

In [31]:
# Preview the first five rows of the merged frame, including the derived
# workingHours / productivityScore / breakTimes columns.
df.head(n=5)

Unnamed: 0,attendenceID,employeeID,date,clockIN,clockOUT,isLate,isAbscent,taskID,taskName,taskDate,tasksCompeleted,name,department,role,email,hireDate,status,workingHours,productivityScore,breakTimes
0,1,1,2024-06-01,2024-06-01 09:02:00.000,2024-06-01 17:00:00.000,1,0,1,API Integration,01-06-2024,5,John Doe,Engineering,Software Developer,john.doe@example.com,2023-01-15 00:00:00.000,Active,7.97,0.63,2.0
1,2,2,2024-06-01,2024-06-01 08:55:00.000,2024-06-01 17:10:00.000,0,0,2,Content Calendar Creation,01-06-2024,3,Jane Smith,Marketing,Content Strategist,jane.smith@example.com,2022-11-20 00:00:00.000,Active,8.25,0.36,2.0
2,4,4,2024-06-01,2024-06-01 09:10:00.000,2024-06-01 17:05:00.000,1,0,4,CI/CD Setup,01-06-2024,4,Bob Brown,Engineering,DevOps Engineer,bob.brown@example.com,2023-05-01 00:00:00.000,Active,7.92,0.51,2.0
3,5,5,2024-06-01,2024-06-01 08:50:00.000,2024-06-01 16:45:00.000,0,0,5,Invoice Auditing,01-06-2024,6,Eva Green,Finance,Accountant,eva.green@example.com,2022-06-30 00:00:00.000,Resigned,7.92,0.76,2.0


# **Finding top and bottom performer**

In [34]:
# Per-employee roll-up of the merged frame (indexed by employeeID):
#   hoursSpent        - mean of workingHours
#   productivityScore - mean of productivityScore
#   abscentCount      - sum of the isAbscent flag
summary = (
    df.groupby("employeeID")
    .agg(
        hoursSpent=("workingHours", "mean"),
        productivityScore=("productivityScore", "mean"),
        abscentCount=("isAbscent", "sum"),
    )
)

In [68]:
# Attach employee names to the per-employee summary. The left join keeps every
# employee listed in dfEmpl; those with no row in `summary` get NaN metrics.
employee_names = dfEmpl[["employeeID", "name"]]
summary_final = pd.merge(employee_names, summary, how="left", on="employeeID")

# Top performer: the row with the highest mean productivity score
# (sort_values places NaN rows last by default).
ranked_best_first = summary_final.sort_values(by="productivityScore", ascending=False)
topPerformer = ranked_best_first.iloc[0].rename("TopPerformer")

# Bottom performer: most absences first, ties broken by the lowest
# productivity score.
ranked_worst_first = summary_final.sort_values(
    by=["abscentCount", "productivityScore"],
    ascending=[False, True],
)
bottomPerformer = ranked_worst_first.iloc[0].rename("BottomPerformer")

# **Deliverables:-**
- Cleaned attendance and task dataset  
- Report of top and bottom performer

In [None]:
# 1. cleaned dataset
# index=False: the DataFrame index is only a row label (and has gaps after
# dropna), so writing it would add an extra unnamed column that the source
# files do not have.
dfAttendance.to_csv("cleaned_attendance.csv", index=False)
dfTasks.to_csv("cleaned_tasks.csv", index=False)

In [70]:
# 2. report of top and bottom performer
def _print_performer_report(banner, label, performer):
    """Print one performer's summary row as a small text report.

    Parameters
    ----------
    banner : str
        Header line printed before the report body.
    label : str
        Prefix for the name line, e.g. "Top performer".
    performer : pd.Series
        A single summary row containing a "name" entry. Every other entry is
        printed as "<field>: <value>", in the row's own order.
    """
    print(banner)
    # Look the name up by label rather than by position, so the report stays
    # correct if the column order of the summary ever changes.
    print(f"{label}: {performer['name']}")
    for field, value in performer.items():
        if field != "name":
            print(f"{field}: {value}")
    print("-----------------------------------------------------------\n")


_print_performer_report(
    "-----------------Top performer report----------------------",
    "Top performer",
    topPerformer,
)
_print_performer_report(
    "-----------------Bottom performer report-------------------",
    "Bottom performer",
    bottomPerformer,
)

-----------------Top performer report----------------------
Top performer: Eva Green
employeeID: 5
hoursSpent: 7.92
productivityScore: 0.76
abscentCount: 0.0
-----------------------------------------------------------

-----------------Bottom performer report-------------------
Bottom performer: Jane Smith
employeeID: 2
hoursSpent: 8.25
productivityScore: 0.36
abscentCount: 0.0
-----------------------------------------------------------

