In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ----------------------------------------
# Step 1: Load or Simulate Extended Dataset
# ----------------------------------------
# Sample employee records, written one row per tuple for readability.
# Age and Salary are stored as strings, and the final row repeats the one
# before it exactly.
columns = ('ID', 'Name', 'Age', 'JoiningDate', 'Salary')
records = [
    (101, 'Alice', '25', '2022-01-10', '50000'),
    (102, 'Bob', '30', '2022-03-15', '60000'),
    (103, 'Charlie', '35', '2022-03-15', '70000'),
    (104, 'David', '40', '2022-05-20', '80000'),
    (105, 'Eve', '45', '2022-07-25', '90000'),
    (105, 'Eve', '45', '2022-07-25', '90000'),
]

# Transpose the rows into a column-name -> list-of-values mapping.
data = {
    column: list(values)
    for column, values in zip(columns, zip(*records))
}

df = pd.DataFrame(data)
print("🔹 Original Data:\n", df)

# ----------------------------------------
# Step 2: Remove Duplicate Records
# ----------------------------------------
# Flag every row that is an exact repeat of an earlier row (all columns
# equal); the first occurrence of each row is not flagged.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\n🔍 Duplicate Rows: {duplicates}")

# Keep only the rows that were not flagged as repeats.
df = df[~duplicate_mask]
print("✅ Data After Removing Duplicates:\n", df)

# ----------------------------------------
# Step 3: Fix Data Types
# ----------------------------------------
# Show the column dtypes as loaded, before any conversion.
print("\n📌 Before Type Fix:\n", df.dtypes)

# Age and Salary hold digits as text; parse them into numeric columns.
for numeric_col in ('Age', 'Salary'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

# JoiningDate holds date strings; parse them into datetime values.
df['JoiningDate'] = pd.to_datetime(df['JoiningDate'])

# Show the dtypes again to confirm the conversions took effect.
print("\n✅ After Type Fix:\n", df.dtypes)

# ----------------------------------------
# Step 4: Visual Summary
# ----------------------------------------
# One 10x4 figure holding two side-by-side panels.
fig, (age_ax, salary_ax) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: histogram of Age (5 bins) with a KDE curve overlaid.
sns.histplot(df['Age'], bins=5, kde=True, color='skyblue', ax=age_ax)
age_ax.set_title('Age Distribution')

# Right panel: Salary plotted against JoiningDate, with point markers.
sns.lineplot(
    x='JoiningDate',
    y='Salary',
    data=df,
    marker='o',
    color='green',
    ax=salary_ax,
)
salary_ax.set_title('Salary Over Time')

# Adjust spacing so titles and axis labels do not overlap, then render.
fig.tight_layout()
plt.show()

# ----------------------------------------
# Step 5: Final Summary
# ----------------------------------------
# Gather the figures for the report first, then print them.
row_count = df.shape[0]
missing_per_column = df.isnull().sum()  # null count for each column

print("\n📋 Final Summary Report:")
print(f"Rows After Cleaning: {row_count}")
print(f"Missing Values:\n{missing_per_column}")