# Data Quality & Validation - Implementation

This notebook demonstrates the implementation of data quality methods using simple toy examples. Each section shows the key Python methods and their usage.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fix the seed of NumPy's global random generator so any random draws are repeatable.
np.random.seed(42)

## Part 1: Type Validation

### Converting to Numeric

**Method:** `pd.to_numeric(series, errors='coerce')`

- `errors='coerce'`: Invalid values become NaN
- `errors='raise'`: Throw error on invalid values
- `errors='ignore'`: Return original if conversion fails (deprecated since pandas 2.2; catch the exception explicitly instead)

In [None]:
# Build a string Series: four numeric-looking entries and one ('unknown') that is not a number
data = pd.Series(['25', '30.5', 'unknown', '45', '12.3'])
print("Original data:", data, f"Type: {data.dtype}", sep="\n")

In [None]:
# Parse the strings as numbers; errors='coerce' turns anything unparseable
# into NaN instead of raising.
data_numeric = pd.to_numeric(data, errors='coerce')
print("\nAfter conversion:", data_numeric, sep="\n")

### Converting to Datetime

**Method:** `pd.to_datetime(series, errors='coerce')`

In [None]:
# Three ISO-formatted date strings and one entry that is not a date at all
dates = pd.Series(['2024-01-15', '2024-02-20', 'not-a-date', '2024-03-10'])

# errors='coerce' maps the unparseable entry to NaT rather than raising
dates_converted = pd.to_datetime(dates, errors='coerce')

print("Original:", dates, sep="\n")
print("\nConverted:", dates_converted, sep="\n")

## Part 2: Range Validation

### Checking Value Ranges

**Methods:**

- Boolean masking: `df[condition]`
- `.between(lower, upper)`: Cleaner syntax for ranges

In [None]:
# Sample ages, two of which are impossible (-5 and 150)
ages = pd.Series([25, 30, -5, 45, 150, 28, 35])
print("Ages:", ages.tolist())

# Flag anything outside the accepted 0-120 range
invalid = ages.lt(0) | ages.gt(120)
print(
    f"\nInvalid ages: {invalid.sum()}",
    f"Invalid values: {ages[invalid].tolist()}",
    sep="\n",
)

In [None]:
# .between(0, 120) yields True for every age inside the accepted range
# (both endpoints are included by default)
valid = ages.between(0, 120)
print(
    f"Valid ages: {valid.sum()}",
    f"Valid values: {ages[valid].tolist()}",
    sep="\n",
)

In [None]:
# Replace out-of-range ages with NaN.
# Series.where keeps each value where the mask is True and puts NaN elsewhere,
# returning a new float Series and leaving `ages` untouched. Assigning np.nan
# into a copy of the integer Series would rely on silent upcasting, which
# pandas >= 2.1 deprecates (FutureWarning) and plans to turn into an error.
ages_fixed = ages.where(ages.between(0, 120))
print("\nFixed ages:")
print(ages_fixed)

## Part 3: Duplicate Detection

### Finding Duplicates

**Method:** `.duplicated(keep='first'/'last'/False)`

- `keep='first'`: Mark duplicates as True except first occurrence
- `keep='last'`: Mark duplicates as True except last occurrence
- `keep=False`: Mark all duplicates including first

In [None]:
# Toy frame with repeated rows: id 2 appears twice, id 3 three times
df = pd.DataFrame(dict(
    id=[1, 2, 2, 3, 3, 3, 4],
    value=['a', 'b', 'b', 'c', 'c', 'c', 'd'],
))
print("Original data:", df, sep="\n")

In [None]:
# keep='first' is the default: the first occurrence of a row is not flagged,
# every later repeat is
dups = df.duplicated(keep='first')
print("\nDuplicates (keep='first'):", df.loc[dups], sep="\n")

In [None]:
# keep=False flags every member of a duplicated group, the first one included
all_dups = df.duplicated(subset=None, keep=False)
print("\nAll duplicate instances (keep=False):", df.loc[all_dups], sep="\n")

### Removing Duplicates

**Method:** `.drop_duplicates(subset=None, keep='first')`

By default checks all columns for duplicates (`subset=None`) keeping the first instance (same notation as above).

In [None]:
# Compare whole rows (subset=None) and keep the first row of each duplicated group
df_clean = df.drop_duplicates(subset=None, keep='first')
print("After removing duplicates:", df_clean, sep="\n")

In [None]:
# 'Alice' appears twice with different ages, so whole-row comparison would not
# treat those rows as duplicates; comparing on 'name' alone does.
df2 = pd.DataFrame(dict(
    name=['Alice', 'Bob', 'Alice', 'Charlie'],
    age=[25, 30, 26, 35],
    city=['NYC', 'LA', 'NYC', 'Chicago'],
))
print("Original:", df2, sep="\n")

# Only the 'name' column decides what counts as a duplicate; the first row wins
df2_clean = df2.drop_duplicates(subset='name')
print("\nDuplicates removed (based on 'name'):", df2_clean, sep="\n")

## Part 4: Detecting Missing Data

### Identifying Missing Values

**Methods:**

- `.isna()` / `.isnull()`: Returns Boolean Series (True = missing)
- `.notna()` / `.notnull()`: Returns Boolean Series (True = not missing)

`isna` and `isnull` are equivalent, as are `notna` and `notnull`, but the `na` forms are strongly preferred now. Use them. The `null` forms are included here as they show up in older code.

In [None]:
# Three columns: A has one gap, B has two, C is complete
data = pd.DataFrame(dict(
    A=[1, 2, np.nan, 4, 5],
    B=[10, np.nan, np.nan, 40, 50],
    C=[100, 200, 300, 400, 500],
))

print("Data:", data, sep="\n")

In [None]:
# Element-wise mask with the same shape as `data`
print("\nMissing values (True = missing):", data.isna(), sep="\n")

In [None]:
# Summing the boolean mask down the rows (axis=0) gives one count per column
print("\nMissing count by column:", data.isna().sum(axis=0), sep="\n")

In [None]:
# Summing the boolean mask across the columns gives one count per row
print("\nMissing count by row:", data.isna().sum(axis='columns'), sep="\n")

## Part 5: Handling Missing Data - Drop

### Dropping Missing Values

**Method:** `.dropna(axis=0, how='any', subset=None, thresh=None)`

- `axis=0`: Drop rows; `axis=1`: Drop columns
- `how='any'`: Drop if any value is missing
- `how='all'`: Drop only if all values are missing
- `subset`: Check only specific columns
- `thresh=N`: Require at least N non-null values

In [None]:
# Same toy frame as in Part 4, rebuilt so this section runs on its own
data = pd.DataFrame(dict(
    A=[1, 2, np.nan, 4, 5],
    B=[10, np.nan, np.nan, 40, 50],
    C=[100, 200, 300, 400, 500],
))

print("Original data:", data, f"Shape: {data.shape}", sep="\n")

In [None]:
# Defaults spelled out: work on rows (axis=0) and drop a row as soon as
# any one of its values is missing
dropped_any = data.dropna(axis=0, how='any')
print(
    "\nAfter dropna() - any missing:",
    dropped_any,
    f"Shape: {dropped_any.shape}",
    sep="\n",
)

In [None]:
# Only column A is inspected; rows missing B alone are kept
dropped_subset = data.dropna(axis=0, subset=['A'])
print(
    "\nAfter dropna(subset=['A']):",
    dropped_subset,
    f"Shape: {dropped_subset.shape}",
    sep="\n",
)

In [None]:
# thresh=2 KEEPS rows that have at least 2 non-null values;
# rows with fewer than 2 non-null values are dropped
dropped_thresh = data.dropna(axis=0, thresh=2)
print(
    "\nAfter dropna(thresh=2) - keep rows with at least 2 non-null:",
    dropped_thresh,
    f"Shape: {dropped_thresh.shape}",
    sep="\n",
)

## Part 6: Handling Missing Data - Fill

### Filling Missing Values

**Methods:**

- `.fillna(value)` - replace missing with specified value
  - `value`: Constant, Series, DataFrame, or dict specifying replacements for each column
- `.ffill()` - forward fill (use previous value)
- `.bfill()` - backward fill (use next value)

In [None]:
# Five-element Series with gaps at positions 1 and 3
data = pd.Series([1, np.nan, 3, np.nan, 5])
print("Original:", data, sep="\n")

In [None]:
# Every missing entry receives the same constant
filled_constant = data.fillna(value=0)
print("\nFilled with 0:", filled_constant, sep="\n")

In [None]:
# .mean() skips NaN, so the fill value is the mean of the observed entries
series_mean = data.mean()
filled_mean = data.fillna(series_mean)
print("\nFilled with mean:", filled_mean, sep="\n")

In [None]:
# A dict passed to fillna maps column name -> replacement value;
# columns not named in the dict (C here) are left alone
data_df = pd.DataFrame(dict(
    A=[1, 2, np.nan, 4, 5],
    B=[10, np.nan, np.nan, 40, 50],
    C=[100, 200, 300, 400, 500],
))

filled_dict = data_df.fillna(value={'A': 0, 'B': 999})
print("\nOriginal:", data_df, sep="\n")
print("\nFilled with dict (A=0, B=999):", filled_dict, sep="\n")

In [None]:
# Each gap takes the most recent non-missing value that precedes it
filled_ffill = data.ffill(axis=0)
print("\nForward fill:", filled_ffill, sep="\n")

### Group-Based Filling

In [None]:
# Two groups of three rows, each with one missing 'value'
df = pd.DataFrame(dict(
    group=['A', 'A', 'A', 'B', 'B', 'B'],
    value=[10, np.nan, 12, 20, np.nan, 22],
))

print("Original:", df, sep="\n")

In [None]:
# Broadcast each group's mean back onto its rows (the mean skips NaN) ...
group_means = df.groupby('group')['value'].transform('mean')
# ... and use it only where 'value' is missing; observed values are kept as-is
df['value_filled'] = df['value'].fillna(group_means)

print("\nFilled with group means:", df, sep="\n")

## Part 7: Handling Missing Data - Interpolate

### Interpolation

**Method:** `.interpolate(method='linear')`

- `method='linear'`: Straight line between points (default)
- `method='time'`: Uses actual time intervals
- Only use for ordered data (time series, spatial)

In [None]:
# Ordered values with a two-point gap (positions 1-2) and a one-point gap (position 5)
series = pd.Series([1, np.nan, np.nan, 4, 5, np.nan, 7])
print("Original:", series, sep="\n")

In [None]:
# method='linear' (the default) fills each gap on a straight line
# between the neighbouring observed values
interpolated = series.interpolate(method='linear')
print("\nAfter interpolation:", interpolated, sep="\n")

In [None]:
# Overlay the gappy series and its interpolated version on one axis
fig, ax = plt.subplots(figsize=(10, 4))

# Plot options kept next to each other so the two traces are easy to compare
gap_style = dict(style='o-', label='Original (with gaps)', markersize=8, linewidth=2, alpha=0.6)
fill_style = dict(style='s-', label='Interpolated', markersize=6, linewidth=2)

series.plot(ax=ax, **gap_style)
interpolated.plot(ax=ax, **fill_style)

ax.set_title('Linear Interpolation')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## Part 8: Outlier Detection - IQR Method

### IQR Method

**Formula:**

- IQR = Q3 - Q1
- Lower bound = Q1 - 1.5 × IQR
- Upper bound = Q3 + 1.5 × IQR

**Method:** `.quantile(q)`

In [None]:
# Nine values between 10 and 22 plus one far-away value (100)
data = pd.Series([10, 12, 13, 14, 15, 16, 18, 20, 22, 100])
print("Data:", data.tolist())

In [None]:
# Quartiles and the interquartile range
Q1 = data.quantile(q=0.25)
Q3 = data.quantile(q=0.75)
IQR = Q3 - Q1

# Fences sit 1.5 IQRs outside the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Q1: {Q1}", f"Q3: {Q3}", f"IQR: {IQR}", sep="\n")
print(f"\nBounds: [{lower_bound:.2f}, {upper_bound:.2f}]")

In [None]:
# A point is an outlier when it falls below the lower fence or above the upper one
outliers = data.lt(lower_bound) | data.gt(upper_bound)
print(
    f"\nOutliers detected: {outliers.sum()}",
    f"Outlier values: {data[outliers].tolist()}",
    sep="\n",
)

## Part 9: Outlier Detection - Z-Score Method

### Z-Score Method

**Formula:** z = (x - mean) / std

**Rule:** |z| > 3 indicates outlier

In [None]:
# Same sample as in the IQR section
data = pd.Series([10, 12, 13, 14, 15, 16, 18, 20, 22, 100])

# Standardise: distance from the mean measured in standard deviations
# (Series.std() uses the sample standard deviation, ddof=1)
mean = data.mean()
std = data.std()
z_scores = data.sub(mean).div(std)

print("Data:", data.tolist())
print(f"\nMean: {mean:.2f}, Std: {std:.2f}")
print("\nZ-scores:", z_scores.round(2).tolist())

In [None]:
# Flag points whose |z| exceeds 3.
# Note: in this 10-value sample the largest |z| (for the value 100) is about
# 2.82, because the extreme value itself inflates the mean and the standard
# deviation. The rule therefore flags nothing here, while the IQR fences
# computed on the same data do flag 100.
outliers_z = z_scores.abs() > 3
print(
    f"\nOutliers (|z| > 3): {outliers_z.sum()}",
    f"Outlier values: {data[outliers_z].tolist()}",
    sep="\n",
)

## Part 10: Handling Outliers

### Three Approaches

1. Remove outliers
2. Cap outliers (winsorize)
3. Keep and flag

In [None]:
# Sample with a single extreme value (100)
data = pd.Series([10, 12, 13, 14, 15, 16, 18, 20, 22, 100])

# IQR fences: 1.5 interquartile ranges outside the quartiles
Q1 = data.quantile(q=0.25)
Q3 = data.quantile(q=0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

print(f"Original data: {data.tolist()}", f"Upper bound: {upper:.2f}", sep="\n")

In [None]:
# Approach 1: Remove outliers
# Keep only values inside the IQR fences [lower, upper]. Filtering on the upper
# fence alone would silently keep low-side outliers; for this sample the lower
# fence removes nothing, so the printed result is unchanged.
data_removed = data[data.between(lower, upper)].copy()
print(f"\nRemoved outliers: {data_removed.tolist()}")

In [None]:
# Approach 2: Cap outliers (winsorize)
# Clip to both IQR fences so low-side outliers are capped as well. For this
# sample no value lies below `lower`, so only the upper cap has an effect and
# the printed result is unchanged.
data_capped = data.clip(lower=lower, upper=upper)
print(f"\nCapped at {upper:.2f}: {data_capped.tolist()}")

In [None]:
# Approach 3: Flag outliers
# Keep every value and add a boolean column marking points outside either
# IQR fence (same outlier definition as the IQR detection section). For this
# sample only the upper fence is exceeded, so the flags are unchanged.
df_flagged = pd.DataFrame({
    'value': data,
    'is_outlier': (data < lower) | (data > upper)
})
print("\nFlagged outliers:")
print(df_flagged)

## Part 11: End-to-End Example with Titanic

Now let's apply these methods to real data. We'll use the Titanic dataset as-is and perform a systematic data quality check.

In [None]:
# Fetch seaborn's bundled 'titanic' example dataset as a DataFrame
titanic = sns.load_dataset('titanic')
print(f"Dataset shape: {titanic.shape}")
print("\nFirst few rows:")
# Last expression of the cell: rendered as a rich table by the notebook
titanic.head()

In [None]:
# Step 1: Inspect the dtype pandas assigned to each column
print("Data types:", titanic.dtypes, sep="\n")

In [None]:
# Step 2: Missing values per column, as a count and as a percentage of rows
missing = titanic.isna().sum()
missing_pct = (missing / len(titanic) * 100).round(1)

# Report only the columns that actually have gaps, worst first
has_missing = missing > 0
missing_summary = pd.DataFrame({
    'Missing': missing[has_missing],
    'Percent': missing_pct[has_missing],
}).sort_values('Missing', ascending=False)

print("\nMissing data:", missing_summary, sep="\n")

In [None]:
# Step 3: Count rows that exactly repeat an earlier row (all columns compared)
dups = titanic.duplicated(keep='first')
print(f"\nDuplicate rows: {dups.sum()}")

In [None]:
# Step 4: Summaries used to eyeball whether the values fall in plausible ranges
range_checks = (
    ("\nAge statistics:", titanic['age'].describe()),
    ("\nPassenger class values:", titanic['pclass'].value_counts().sort_index()),
    ("\nFare statistics:", titanic['fare'].describe()),
)
for heading, summary in range_checks:
    print(heading)
    print(summary)

In [None]:
# Step 5: IQR fences for 'fare' and the rows that fall outside them
fare = titanic['fare']
Q1 = fare.quantile(q=0.25)
Q3 = fare.quantile(q=0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

fare_outliers = fare.lt(lower) | fare.gt(upper)
print(f"\nFare outliers: {fare_outliers.sum()} ({fare_outliers.sum()/len(titanic)*100:.1f}%)")
print(f"Upper bound: ${upper:.2f}")
print(f"Max fare: ${fare.max():.2f}")

In [None]:
# Step 6: Apply the cleaning decisions to a copy, leaving `titanic` untouched

# Decision 1: rows without the outcome variable 'survived' are unusable, so drop them
titanic_clean = titanic.dropna(subset=['survived']).copy()
print(f"After dropping rows with missing 'survived': {len(titanic_clean)} rows")

# Decision 2: fill each missing 'age' with the median age of that passenger's class
class_median_age = titanic_clean.groupby('pclass')['age'].transform('median')
titanic_clean['age'] = titanic_clean['age'].fillna(class_median_age)

# Decision 3: fill missing 'embarked' with the most frequent value
mode_embarked = titanic_clean['embarked'].mode().iloc[0]
titanic_clean['embarked'] = titanic_clean['embarked'].fillna(mode_embarked)

# Decision 4: fare outliers are kept (legitimate high fares exist), so nothing to do
# NOTE(review): only 'age' and 'embarked' are filled above; any other column
# with gaps is left as-is, so the total printed below can be non-zero.
print(f"\nFinal dataset: {len(titanic_clean)} rows")
print(f"Remaining missing values:\n{titanic_clean.isna().sum().sum()} total")

In [None]:
# Final quality check: re-run the shape, missing-value and duplicate checks
# on the cleaned frame
missing_by_column = titanic_clean.isna().sum()
n_duplicates = titanic_clean.duplicated().sum()

print("Final Data Quality Report:")
print(f"Shape: {titanic_clean.shape}")
print("\nMissing values by column:", missing_by_column, sep="\n")
print(f"\nDuplicates: {n_duplicates}")

## Summary

**Key methods learned:**

| Task | Method | Key Parameter |
|------|--------|---------------|
| Convert to numeric | `pd.to_numeric()` | `errors='coerce'` |
| Check range | `.between()` | `lower`, `upper` |
| Find duplicates | `.duplicated()` | `keep='first'/'last'/False` |
| Remove duplicates | `.drop_duplicates()` | `subset=['col']` |
| Drop missing | `.dropna()` | `subset=['col']`, `thresh=N` |
| Fill missing | `.fillna()`, `.ffill()`, `.bfill()` | `value` |
| Interpolate | `.interpolate()` | `method='linear'` |
| Get quantiles | `.quantile()` | `q=0.25` |
| Cap values | `.clip()` | `lower`, `upper` |
