# Importing Libraries and Loading Dataset

In [1]:
# importing the required libraries

import pandas as pd
import re
import numpy as np
from word2number import w2n
from datetime import datetime

# Load the raw HR export.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will only run where this exact file location exists.
hr = pd.read_csv(r"C:\Users\DELL\Desktop\Regonet_project\messy_HR_data.csv")

# Keep `hr` untouched as the raw reference and do all cleaning on `hr_1`,
# a copy whose space-separated column labels are replaced with
# underscore-separated ones so they are easier to reference in code.
hr_1 = hr.copy().rename(
    columns={
        'Joining Date': 'Joining_date',
        'Performance Score': 'Performance_score',
        'Phone Number': 'Phone_number',
    }
)


# Resolving Inconsistent Formatting in Age and Salary Columns

In [2]:
def _to_number(value):
    """Convert one raw 'Salary'/'Age' cell to a number, or NaN if it cannot be read.

    Accepted inputs:
      * real numbers (int / float / NumPy scalars) -- returned unchanged, so a
        column that is already numeric (or this cell being re-run) is not
        wiped to NaN;
      * digit strings, with or without surrounding whitespace ("25", " 25 ");
      * spelled-out numbers in any letter case, with words separated by
        spaces and/or hyphens ("thirty", "SIXTY THOUSAND", "twenty-five").

    Anything else (missing values, mixed text, words that are not numbers)
    yields ``np.nan``.
    """
    if isinstance(value, str):
        text = value.strip()
        if text.isdigit():
            return int(text)

        # Treat hyphens like spaces and collapse whitespace runs so that
        # multi-word numerals are recognised; checking isalpha() on the raw
        # string would reject every value that contains a space.
        words = ' '.join(text.replace('-', ' ').split()).lower()
        if words and words.replace(' ', '').isalpha():
            try:
                return w2n.word_to_num(words)
            except ValueError:
                # Alphabetic text that is not a number word.
                return np.nan
        return np.nan

    # bool is a subclass of int but is not a meaningful salary/age value.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return value

    return np.nan


# Converting text entries in the 'Salary' and 'Age' columns to numeric values
for _numeric_col in ('Salary', 'Age'):
    hr_1[_numeric_col] = hr_1[_numeric_col].apply(_to_number)

def _collapse_whitespace(value):
    """Squeeze whitespace runs to one space and trim the ends; non-strings pass through."""
    if not isinstance(value, str):
        return value
    return re.sub(r'\s+', ' ', value).strip()


# Normalising whitespace in every text-based (object dtype) column:
# leading/trailing whitespace is removed and internal runs become one space
for col in hr_1.select_dtypes(include='object').columns:
    hr_1[col] = hr_1[col].map(_collapse_whitespace)


# Handling Incorrect Data Types

In [3]:
# Converting dates stored as strings with varying formats to a proper date format

# Candidate strptime layouts for the Joining_date strings.
# They are tried in the order listed; the first one that parses is used.
date_formats = [
    "%Y.%m.%d",   # e.g. 2020.01.15
    "%m-%d-%Y",   # e.g. 01-15-2020
    "%B %d, %Y",  # e.g. January 15, 2020
    "%Y/%m/%d",   # e.g. 2020/01/15
    "%m/%d/%Y",   # e.g. 01/15/2020
]

# Helper function to test and parse
def _can_parse(date_str, fmt):
    try:
        datetime.strptime(date_str, fmt)
        return True
    except:
        return False

def _first_matching_date(raw_value):
    """Parse ``raw_value`` with the first layout in ``date_formats`` that accepts it.

    Returns the resulting ``datetime``, or None when no layout matches.
    """
    for layout in date_formats:
        if _can_parse(raw_value, layout):
            return datetime.strptime(raw_value, layout)
    return None


# Overwrite the Joining_date column with the parsed values
hr_1['Joining_date'] = hr_1['Joining_date'].apply(_first_matching_date)

print(hr_1)

       Name   Age   Salary  Gender Department   Position Joining_date  \
0     grace  25.0  50000.0    Male         HR    Manager   2018-04-05   
1     david   NaN  65000.0  Female    Finance   Director   2020-02-20   
2    hannah  35.0      NaN  Female      Sales   Director   2020-01-15   
3       eve   NaN  50000.0  Female         IT    Manager   2018-04-05   
4     grace   NaN      NaN  Female    Finance    Manager   2020-01-15   
..      ...   ...      ...     ...        ...        ...          ...   
995    jack  50.0  65000.0  Female         HR    Manager   2020-02-20   
996    jack  30.0  50000.0    Male    Finance    Analyst   2018-04-05   
997  hannah  30.0  70000.0    Male         IT  Assistant   2020-01-15   
998     bob  25.0  65000.0   Other  Marketing    Manager   2018-04-05   
999     ivy  30.0      NaN    Male    Finance    Manager   2020-02-20   

    Performance_score              Email  Phone_number  
0                   D  email@example.com           NaN  
1        