In [1]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).

import pandas as pd
import re
from datetime import datetime

# Create a sample DataFrame with various data types
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'BirthDate': ['2025-03-12', '03/15/1990', '12-25-1985', '1992-07-10', '2000/06/22'],  # Mixed date formats
    'Age': [25, -30, 35, 40, 45],  # Invalid negative age
    'Email': ['alice@example.com', 'bob@company', 'charlie_at_example.com', 'david@example.com', 'eva@domain.com']  # Invalid email
}

df = pd.DataFrame(data)

# Task 13: Date Format Standardization
# Convert all date entries into a uniform format (YYYY-MM-DD)
def standardize_date_format(date):
    try:
        return datetime.strptime(date, '%Y-%m-%d').strftime('%Y-%m-%d')  # Try YYYY-MM-DD format first
    except ValueError:
        try:
            return datetime.strptime(date, '%m/%d/%Y').strftime('%Y-%m-%d')  # Try MM/DD/YYYY format
        except ValueError:
            return datetime.strptime(date, '%d-%m-%Y').strftime('%Y-%m-%d')  # Try DD-MM-YYYY format

df['BirthDate'] = df['BirthDate'].apply(standardize_date_format)

print("DataFrame after Date Format Standardization:")
print(df)

# Task 14: Numeric Constraints Enforcement
# Enforce numeric constraints: Age must be > 0
df['Age'] = df['Age'].apply(lambda x: x if x > 0 else None)  # Set invalid ages to None

print("\nDataFrame after enforcing Numeric Constraints (Age > 0):")
print(df)

# Task 15: String Format Checks
# Ensure email fields follow a valid email format using regular expressions
def is_valid_email(email):
    # Regular expression for basic email validation
    email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    return bool(re.match(email_regex, email))

df['Email'] = df['Email'].apply(lambda x: x if is_valid_email(x) else None)  # Set invalid emails to None

print("\nDataFrame after String Format Check (Email Validation):")
print(df)



# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

ValueError: time data '12-25-1985' does not match format '%d-%m-%Y'

In [2]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).





import pandas as pd
import re
from datetime import datetime

# Sample DataFrame with inconsistent representations
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'bob', 'CHARLIE', 'David', 'Eva'],
    'Phone': ['123-456-7890', '(123) 456-7890', '123.456.7890', '1234567890', '(123) 456-7890'],
    'HireDate': ['2025-03-12', '03/15/1990', '12-25-1985', '1992-07-10', '2000/06/22']  # Mixed date formats
}

df = pd.DataFrame(data)

# Task 16: Standardizing Date Formats
# Convert all date entries into a uniform format (YYYY-MM-DD)
def standardize_date_format(date):
    try:
        return datetime.strptime(date, '%Y-%m-%d').strftime('%Y-%m-%d')  # Try YYYY-MM-DD format first
    except ValueError:
        try:
            return datetime.strptime(date, '%m/%d/%Y').strftime('%Y-%m-%d')  # Try MM/DD/YYYY format
        except ValueError:
            return datetime.strptime(date, '%d-%m-%Y').strftime('%Y-%m-%d')  # Try DD-MM-YYYY format

df['HireDate'] = df['HireDate'].apply(standardize_date_format)

print("DataFrame after Standardizing Date Formats:")
print(df)

# Task 17: Pattern Matching for Consistency (Phone Numbers)
# Standardize phone numbers to the format (XXX) XXX-XXXX
def standardize_phone_number(phone):
    # Remove all non-numeric characters
    digits = re.sub(r'\D', '', phone)
    if len(digits) == 10:  # Check if it has 10 digits
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    return None  # Return None if the phone number does not match the pattern

df['Phone'] = df['Phone'].apply(standardize_phone_number)

print("\nDataFrame after Pattern Matching for Phone Numbers:")
print(df)

# Task 18: Handling Mixed Case Text
# Convert all text entries to uppercase
df['Name'] = df['Name'].apply(lambda x: x.upper())

print("\nDataFrame after Handling Mixed Case Text:")
print(df)





ValueError: time data '12-25-1985' does not match format '%d-%m-%Y'