# 5 Examples of Data Cleaning with Pandas and Visualizations
This notebook demonstrates five common data cleaning techniques using pandas, with generated data and visualizations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1. Handling Missing Values

In [None]:
# Generate data with missing values.
# A locally seeded generator keeps the NaN pattern (and therefore both charts
# and the fill value) identical on every Restart & Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'A': rng.standard_normal(100),
    'B': rng.choice([np.nan, 1, 2, 3], size=100)
})

# Visualize missing values: one bar per column, height = number of NaNs
missing_before = df.isna().sum()
# Both charts share this y-limit so the drop to zero is visible after cleaning
# instead of each chart autoscaling on its own.
y_top = max(int(missing_before.max()), 1)

fig, ax = plt.subplots(figsize=(6, 2))
ax.set_title("Missing Values Before Cleaning")
ax.bar(df.columns.tolist(), missing_before.tolist())
ax.set_ylim(0, y_top)
ax.set_ylabel("NaN count")
plt.show()

# Fill missing values with the column mean (Series.mean skips NaN by default)
df['B'] = df['B'].fillna(df['B'].mean())
assert df['B'].notna().all(), "column B still contains missing values"

# Visualize after cleaning, on the same scale as the first chart
missing_after = df.isna().sum()
fig, ax = plt.subplots(figsize=(6, 2))
ax.set_title("Missing Values After Cleaning")
ax.bar(df.columns.tolist(), missing_after.tolist())
ax.set_ylim(0, y_top)
ax.set_ylabel("NaN count")
plt.show()

## 2. Removing Duplicates

In [None]:
# Generate data with duplicates.
# Seeded locally so the row counts shown below are reproducible.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'A': rng.integers(0, 10, 100),  # upper bound is exclusive: values 0..9
    'B': rng.integers(0, 10, 100)
})
# Add duplicates: append copies of the first 10 rows. Random (A, B) pairs may
# also repeat by chance, so more than 10 rows can be dropped below.
df = pd.concat([df, df.iloc[:10]], ignore_index=True)

# Visualize duplicates
rows_before = len(df)
# Shared y-limit: with independent autoscaling both single-bar charts would
# look the same height even though the row count changed.
y_top = rows_before * 1.1

fig, ax = plt.subplots(figsize=(6, 2))
ax.set_title("Number of Rows Before Removing Duplicates")
ax.bar(['Rows'], [rows_before])
ax.set_ylim(0, y_top)
plt.show()

# Remove duplicates (keeps the first occurrence of each repeated row)
df_clean = df.drop_duplicates()
assert not df_clean.duplicated().any(), "duplicates remain after drop_duplicates"

fig, ax = plt.subplots(figsize=(6, 2))
ax.set_title("Number of Rows After Removing Duplicates")
ax.bar(['Rows'], [len(df_clean)])
ax.set_ylim(0, y_top)
plt.show()

## 3. Converting Data Types

In [None]:
# Generate data with wrong types: both columns hold strings, and each contains
# one entry that cannot be parsed ('not_a_date', 'three').
df = pd.DataFrame({
    'date': ['2025-08-26', '2025-08-27', '2025-08-28', 'not_a_date'],
    'value': ['1', '2', 'three', '4']
})


def plot_text_columns(frame, title):
    """Draw one bar per column: height 1 if the column holds text, else 0.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame whose column dtypes are inspected.
    title : str
        Title of the figure.

    Notes
    -----
    ``pd.api.types.is_string_dtype`` is used rather than ``dtype == 'object'``
    so that pandas' dedicated string dtypes are recognised as text as well.
    The y-axis is fixed and every column is annotated with its dtype name, so
    a chart where no column is text is still readable instead of blank.
    """
    columns = list(frame.columns)
    flags = [int(pd.api.types.is_string_dtype(frame[col])) for col in columns]

    fig, ax = plt.subplots(figsize=(6, 2))
    ax.set_title(title)
    bars = ax.bar(columns, flags)
    ax.set_ylim(0, 1.4)  # headroom above 1 for the dtype annotations
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['typed', 'text'])
    for bar, col in zip(bars, columns):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.05,
            str(frame[col].dtype),
            ha='center',
            va='bottom',
        )
    plt.show()


# Before conversion
plot_text_columns(df, "Data Types Before Cleaning")

# Convert types; errors='coerce' turns unparseable entries into NaT / NaN
# instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['value'] = pd.to_numeric(df['value'], errors='coerce')

# After conversion
plot_text_columns(df, "Data Types After Cleaning")

## 4. Handling Outliers

In [None]:
# Generate data with outliers.
# Seeded locally so the boxplots are reproducible.
rng = np.random.default_rng(42)
data = rng.normal(50, 10, 100)
data[::10] = 200  # Add outliers: every 10th point, i.e. 10% of the sample
df = pd.DataFrame({'value': data})

# Visualize before cleaning
fig, ax = plt.subplots(figsize=(6, 2))
ax.set_title("Boxplot Before Removing Outliers")
ax.boxplot(df['value'])
plt.show()

# Remove outliers with the 1.5 * IQR rule.
# A 1%/99% percentile trim is not reliable here: the outliers are 10% of the
# data, so the 99th percentile equals the outlier value itself, and a strict
# lower cut always discards the smallest legitimate observation. Fences
# derived from the quartiles are not affected by how many points sit at 200.
q1 = df['value'].quantile(0.25)
q3 = df['value'].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
df_clean = df[df['value'].between(lower_fence, upper_fence)]
assert (df_clean['value'] < 200).all(), "injected outliers were not removed"

# Visualize after cleaning
fig, ax = plt.subplots(figsize=(6, 2))
ax.set_title("Boxplot After Removing Outliers")
ax.boxplot(df_clean['value'])
plt.show()

## 5. Standardizing Text Data

In [None]:
# Build a one-column frame in which two city names each appear in three
# different casings.
df = pd.DataFrame({
    'city': ['New York', 'new york', 'NEW YORK', 'Los Angeles', 'los angeles', 'LOS ANGELES']
})

# Frequency of each raw spelling before any normalisation
plt.figure(figsize=(6, 2))
df['city'].value_counts().plot.bar(title="City Value Counts Before Cleaning")
plt.show()

# Normalise: lower-case, trim surrounding whitespace, then title-case
df = df.assign(city=df['city'].str.lower().str.strip().str.title())

# Frequency of each spelling once the variants have been collapsed
plt.figure(figsize=(6, 2))
df['city'].value_counts().plot.bar(title="City Value Counts After Cleaning")
plt.show()