# Setup

In [2]:
import pandas as pd
#Create a messy dataset
data = {
    "Name":["Alice","Bob","Charlie","Alice","Eve",None],
    "Age":[25,None, 35, 25, 29, 22],
    "Salary":[50000, 60000, None, 50000, 58000, 62000],
    "JoinDate":["01-01-2021","14-02-2021","not available","01-01-2021","20-03-2021","10-04-2021"],
    "Department":["HR", "Finance", "IT", "HR", "Finance", "finance"]

}

df = pd.DataFrame(data) #constructor for creating a DataFrame
df

Unnamed: 0,Name,Age,Salary,JoinDate,Department
0,Alice,25.0,50000.0,01-01-2021,HR
1,Bob,,60000.0,14-02-2021,Finance
2,Charlie,35.0,,not available,IT
3,Alice,25.0,50000.0,01-01-2021,HR
4,Eve,29.0,58000.0,20-03-2021,Finance
5,,22.0,62000.0,10-04-2021,finance


# 2.Handling Missing Data

In [3]:
#Check for missing values
df.isnull().sum()

Name          1
Age           1
Salary        1
JoinDate      0
Department    0
dtype: int64

In [4]:
#Drop missing values
df_dropna = df.dropna()
df_dropna

Unnamed: 0,Name,Age,Salary,JoinDate,Department
0,Alice,25.0,50000.0,01-01-2021,HR
3,Alice,25.0,50000.0,01-01-2021,HR
4,Eve,29.0,58000.0,20-03-2021,Finance


In [5]:
#Fill missing values with mean/mode
df_fill = df.copy() # Make a copy of the original DataFrame so we donâ€™t overwrite it
df_fill["Age"] = df_fill["Age"].fillna(df_fill["Age"].mean())
df_fill["Salary"] = df_fill["Salary"].fillna(df_fill["Salary"].median())
df_fill["Name"] = df_fill["Name"].fillna("Unknown")
df_fill


Unnamed: 0,Name,Age,Salary,JoinDate,Department
0,Alice,25.0,50000.0,01-01-2021,HR
1,Bob,27.2,60000.0,14-02-2021,Finance
2,Charlie,35.0,58000.0,not available,IT
3,Alice,25.0,50000.0,01-01-2021,HR
4,Eve,29.0,58000.0,20-03-2021,Finance
5,Unknown,22.0,62000.0,10-04-2021,finance


# Outlier Detection

In [6]:
import numpy as np

# Example: detect outliers in Age using IQR
Q1 = df["Age"].quantile(0.25)
Q3 = df["Age"].quantile(0.75)
IQR = Q3 - Q1

outliers = df[(df["Age"] < Q1 - 1.5*IQR) | (df["Age"] > Q3 + 1.5*IQR)]
outliers

Unnamed: 0,Name,Age,Salary,JoinDate,Department


## 4. Data Type Conversions

In [7]:
# Convert JoinDate to datetime, handling errors
df["JoinDate"] = pd.to_datetime(df["JoinDate"], errors="coerce")
df.dtypes

Name                  object
Age                  float64
Salary               float64
JoinDate      datetime64[ns]
Department            object
dtype: object

## 5. String Operations

In [8]:
# Clean up Department column
df["Department"] = df["Department"].str.strip().str.lower()

# Replace text
df["Department"] = df["Department"].replace({"finance": "Finance", "hr": "HR", "it": "IT"})
df

Unnamed: 0,Name,Age,Salary,JoinDate,Department
0,Alice,25.0,50000.0,2021-01-01,HR
1,Bob,,60000.0,NaT,Finance
2,Charlie,35.0,,NaT,IT
3,Alice,25.0,50000.0,2021-01-01,HR
4,Eve,29.0,58000.0,NaT,Finance
5,,22.0,62000.0,2021-10-04,Finance


## 6. Duplicate Handling

In [11]:
# Detect duplicates
df.duplicated()

# Drop duplicates
df_nodup = df.drop_duplicates()
df_nodup

Unnamed: 0,Name,Age,Salary,JoinDate,Department
0,Alice,25.0,50000.0,2021-01-01,HR
1,Bob,,60000.0,NaT,Finance
2,Charlie,35.0,,NaT,IT
4,Eve,29.0,58000.0,NaT,Finance
5,,22.0,62000.0,2021-10-04,Finance


In [None]:
import numpy as np

# --- IQR Method ---
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df["Age"].quantile(0.25)
Q3 = df["Age"].quantile(0.75)

# Interquartile Range
IQR = Q3 - Q1

# Find outliers using IQR rule (values below Q1-1.5*IQR or above Q3+1.5*IQR)
outliers_iqr = df[(df["Age"] < Q1 - 1.5*IQR) | (df["Age"] > Q3 + 1.5*IQR)]
print("Outliers detected using IQR:")
print(outliers_iqr)


# --- Z-score Method ---
# Compute the mean and standard deviation of Age
mean_age = df["Age"].mean()
std_age = df["Age"].std()

# Calculate Z-scores
df["z_score_age"] = (df["Age"] - mean_age) / std_age

# Mark rows where Z-score > 3 or < -3 as outliers
outliers_z = df[(df["z_score_age"] > 3) | (df["z_score_age"] < -3)]
print("\nOutliers detected using Z-score:")
print(outliers_z)
