### A dataset containing employee attendance records has missing values, duplicate rows, wrong date formats, and inconsistent entries. You must do the following:

a) Load the dataset and identify missing, duplicated, or invalid data using info(),
isna(), duplicated().

In [9]:
import pandas as pd
import numpy as np

# Load the raw attendance export, keeping only the three columns this
# exercise works with (the selection is applied directly to the freshly
# read frame).
attendance_columns = ["Date", "EmployeeID", "Attendance"]
data = pd.read_csv("employee_data.csv")[attendance_columns]

# Preview the first rows as a quick sanity check of the raw values.
data.head()

Unnamed: 0,Date,EmployeeID,Attendance
0,2024-02-30,E010,Y
1,01/14/2024,e007,Y
2,30-01-2024,E016,0
3,2024-02-11,E026,Absent
4,15-02-2024,E005,1


In [None]:

# Data cleaning steps
#
# Takes the raw `data` frame (columns Date, EmployeeID, Attendance), reports
# missing/duplicate/invalid entries, normalises each column, removes
# duplicates and unusable rows, imputes missing Attendance, and writes the
# result to employee_data_cleaned.csv.

# 1) Quick diagnostics
print("Info:")
data.info()  # info() prints its own report and returns None; wrapping it in print() adds a stray "None"
print("\nMissing counts:")
print(data.isna().sum())
print("\nDuplicate rows:", data.duplicated().sum())

# 2) Normalize Date -> coerce invalid formats to NaT so we can detect them
# The file mixes several layouts (e.g. 2024-02-11, 01/14/2024, 30-01-2024).
# A single format-less to_datetime call either infers one layout and turns
# the others into NaT, or guesses day/month order per value, so each known
# layout is parsed explicitly and the results are combined.
# NOTE(review): the format list is based on the values visible in head();
# extend it if the full file contains other layouts.
date_formats = ['%Y-%m-%d', '%m/%d/%Y', '%d-%m-%Y']
date_text = data['Date'].astype(str).str.strip()
parsed_dates = pd.to_datetime(date_text, format=date_formats[0], errors='coerce')
for fmt in date_formats[1:]:
    # Only fill positions that no earlier format managed to parse.
    parsed_dates = parsed_dates.fillna(
        pd.to_datetime(date_text, format=fmt, errors='coerce')
    )
data['Date'] = parsed_dates
# Impossible calendar dates such as 2024-02-30 match no format and stay NaT.
print("Invalid dates (NaT) after coercion:", data['Date'].isna().sum())

# 3) Normalize EmployeeID (trim, upper-case) so 'e007' and 'E007' are the
#    same employee; .where(notna) keeps genuinely missing IDs as NaN instead
#    of the string 'NAN'.
raw_ids = data['EmployeeID']
data['EmployeeID'] = raw_ids.astype(str).str.strip().str.upper().where(raw_ids.notna())

# 4) Normalize Attendance values (trim, lower, map common variants)
raw_att = data['Attendance']
# Keep missing values as real NaN: astype(str) alone would turn them into the
# literal text 'nan', which would then survive as a bogus label and never be
# imputed in step 7.
att_text = raw_att.astype(str).str.strip().str.lower().where(raw_att.notna())
# Blank cells and a literal 'nan' label are treated as missing too.
att_text = att_text.mask(att_text.isin(['', 'nan']))
att_map = {
    'p': 'present', 'present': 'present', 'pd': 'present',
    'a': 'absent', 'absent': 'absent',
    'na': 'absent',
    # Variants seen in the raw sample (Y / 0 / 1).
    # NOTE(review): assumes y/1 mean present and n/0 mean absent - confirm.
    'y': 'present', 'yes': 'present', '1': 'present',
    'n': 'absent', 'no': 'absent', '0': 'absent',
}
mapped_att = att_text.map(att_map)
# Labels that are not in the map are kept unchanged (so they can be reviewed);
# missing values stay NaN because att_text is NaN there as well.
data['Attendance'] = mapped_att.where(mapped_att.notna(), att_text)
unrecognised = sorted(set(data['Attendance'].dropna()) - {'present', 'absent'})
print("Unrecognised Attendance labels kept as-is:", unrecognised)

# 5) Remove exact duplicate rows (keeps first occurrence)
#    Done after normalisation so rows that differed only by case, whitespace
#    or date layout are recognised as duplicates.
data = data.drop_duplicates()

# 6) Drop rows missing critical identifiers (cannot recover EmployeeID or Date reliably)
data = data.dropna(subset=['EmployeeID', 'Date'])

# 7) Impute non-critical missing values (Attendance) using mode (most common)
if data['Attendance'].dropna().empty:
    # No observed value to take a mode from; fall back to a fixed label.
    fill_att = 'absent'
else:
    fill_att = data['Attendance'].mode().iloc[0]
data['Attendance'] = data['Attendance'].fillna(fill_att)

# 8) Final checks and save cleaned file
print("\nAfter cleaning - missing counts:")
print(data.isna().sum())
data.to_csv('employee_data_cleaned.csv', index=False)
print("Cleaned data written to employee_data_cleaned.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2809 entries, 0 to 2808
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Date        2809 non-null   object
 1   EmployeeID  2809 non-null   object
 2   Attendance  2753 non-null   object
dtypes: object(3)
memory usage: 66.0+ KB
None


In [11]:
# Element-wise missing-value mask: True where a cell is NaN/NaT, False otherwise.
pd.isna(data)

Unnamed: 0,Date,EmployeeID,Attendance
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
2804,False,False,False
2805,False,False,False
2806,False,False,False
2807,False,False,False


b) Clean the dataset using dropna(), fillna(), and drop_duplicates() (justify which
method is appropriate for each case).