In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Simulate some data with errors
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', None, 'Grace', 'Alice'],
    'age': [25, 30, 35, -5, 40, 28, 29, 200, 30],  # -5 and 200 are unrealistic ages
    'salary': [50000, 55000, None, 58000, 'Unknown', 60000, 65000, 70000, 70000],  # 'Unknown' is invalid for salary
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'Chicago', 'Phoenix', 'Los Angeles']
}

df = pd.DataFrame(data)

# 1. Identify Missing Values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

# 2. Identify Duplicates
duplicates = df[df.duplicated()]
print("\nDuplicates:\n", duplicates)

# 3. Check Data Types
print("\nData Types:\n", df.dtypes)

# 4. Identify Outliers in 'age' and 'salary' (using IQR method)
def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

outliers_age = detect_outliers_iqr(df, 'age')
outliers_salary = detect_outliers_iqr(df, 'salary')

print("\nOutliers in 'age':\n", outliers_age)
print("\nOutliers in 'salary':\n", outliers_salary)

# 5. Invalid Values Check
# Example: Check if salary is numeric (it should be)
invalid_salary = df[~df['salary'].apply(pd.to_numeric, errors='coerce').notnull()]
print("\nInvalid salary values:\n", invalid_salary)

# 6. Visualizations: Boxplot for outlier visualization
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['age'], color='lightblue')
plt.title('Boxplot for Age to Detect Outliers')
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x=df['salary'], color='lightgreen')
plt.title('Boxplot for Salary to Detect Outliers')
plt.show()

Missing Values:
 name      1
age       0
salary    1
city      0
dtype: int64

Duplicates:
 Empty DataFrame
Columns: [name, age, salary, city]
Index: []

Data Types:
 name      object
age        int64
salary    object
city      object
dtype: object


TypeError: '<' not supported between instances of 'str' and 'int'