In [2]:
import pandas as pd

# Reference date against which EmployeeAge is measured.
AGE_REFERENCE_DATE = pd.Timestamp(2015, 12, 31)


def uppercase_text_columns(frame):
    """Return a copy of *frame* with every text value upper-cased.

    Object-dtype columns are converted element by element so that non-string
    values sharing a column with strings (numbers, missing values) are kept
    as they are; ``Series.str.upper`` would replace them with NaN.  Columns
    stored with a dedicated pandas string dtype are upper-cased through the
    ``.str`` accessor.  All other columns are returned unchanged.
    """
    def _upper(column):
        if pd.api.types.is_object_dtype(column):
            return column.map(
                lambda value: value.upper() if isinstance(value, str) else value
            )
        if pd.api.types.is_string_dtype(column):
            return column.str.upper()
        return column

    return frame.apply(_upper)


def completed_years(dates, as_of):
    """Return the number of whole calendar years from *dates* up to *as_of*.

    Args:
        dates: datetime Series; missing entries (NaT) are allowed.
        as_of: reference date, anything ``pd.Timestamp`` accepts.

    Returns:
        A Series aligned with *dates*.  Entries for NaT are NaN, which makes
        the result floating point whenever a date is missing.

    A plain ``days // 365`` ignores leap days and reports one year too many
    shortly before an anniversary, so the year difference is computed from
    the calendar fields instead.
    """
    as_of = pd.Timestamp(as_of)
    parts = dates.dt
    # True where the anniversary in the reference year has not happened yet.
    # Comparisons against NaN are False, so missing dates subtract nothing
    # and stay NaN through the year subtraction below.
    anniversary_pending = (parts.month > as_of.month) | (
        (parts.month == as_of.month) & (parts.day > as_of.day)
    )
    return as_of.year - parts.year - anniversary_pending.astype(int)


def clean_employees(source="data/Employee.csv",
                    destination="data/Cleaned_Employee.csv"):
    """Clean the employee CSV and write the result.

    Args:
        source: path or file-like object passed to ``pd.read_csv``.
        destination: path or file-like object passed to ``DataFrame.to_csv``.

    Returns:
        The cleaned DataFrame that was written to *destination*.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(source)

    # 01. Print records count
    print("01. Records Count:", len(df))

    # 02. Remove duplicate records
    df = df.drop_duplicates()

    # 03. Convert all text data to uppercase
    df = uppercase_text_columns(df)

    # 04. For EmpCode, print the count of duplicate EmpCode
    # duplicated() flags every occurrence after the first, so an EmpCode that
    # appears three times contributes two to the count and twice to the list.
    repeated = df['EmpCode'].duplicated()
    print("04. Duplicate EmpCode Count:")
    print(repeated.sum())

    # 05. Print duplicate EmpCode list
    print("05. Duplicate EmpCode List:")
    print(df.loc[repeated, 'EmpCode'].tolist())

    # 06. Remove all duplicate EmpCode with the record
    # keep=False drops every row whose EmpCode occurs more than once,
    # including the first occurrence.
    df = df.drop_duplicates(subset='EmpCode', keep=False).copy()

    # 07. If there is data for DateofJoin, convert the format
    # Values that do not match month/day/year become NaT (errors='coerce').
    df['DateofJoin'] = pd.to_datetime(df['DateofJoin'], errors='coerce', format='%m/%d/%Y')

    # 08. Create EmployeeAge column
    # NOTE: the value is derived from DateofJoin, i.e. whole years between the
    # join date and the reference date; the column name is kept as it was.
    df['EmployeeAge'] = completed_years(df['DateofJoin'], AGE_REFERENCE_DATE)

    # 09. Add an Index column as the first column
    df.insert(0, "Index", range(1, 1 + len(df)))

    # 10. Save the output to Cleaned_Employee.csv
    df.to_csv(destination, index=False)

    # 11. Print output records count
    print("11. Output Records Count:", len(df))

    return df


if __name__ == "__main__":
    # Keep the cleaned frame available as the module-level ``df``.
    df = clean_employees()


01. Records Count: 2015
04. Duplicate EmpCode Count:
83
05. Duplicate EmpCode List:
[463994, 2020808, 2020909, 2021010, 42723, 46763, 65852, 66256, 68579, 69589, 69993, 71205, 71710, 72821, 72922, 79487, 81709, 84032, 85446, 85951, 88072, 88173, 89284, 89587, 90193, 92112, 92314, 93425, 93829, 95041, 95647, 96354, 96859, 97566, 98071, 98273, 99182, 99283, 99586, 99990, 100091, 100192, 102818, 103626, 104838, 104939, 106252, 106757, 107161, 107666, 108676, 109888, 109989, 110797, 111100, 111201, 111504, 111605, 111908, 112110, 112413, 112514, 112615, 112716, 113120, 113221, 113322, 113423, 113625, 113726, 113827, 113928, 114029, 95647, 109383, 110797, 111908, 352793, 439552, 512272, 528533, 536209, 555197]
11. Output Records Count: 1852
