# 02 - Data Validation

This notebook checks the collected data for the following quality issues:
- Missing values
- Logical constraints (e.g., revenue <= fees)
- Negative values in fields where they are not expected

In [None]:
import os
import sys
from collections import Counter

sys.path.insert(0, os.path.dirname(os.getcwd()))

import pandas as pd

from src import validation as val

In [None]:
# Read the raw panel CSV into a DataFrame and report its dimensions
_raw_panel_path = '../data/processed/panel_raw.csv'
panel = pd.read_csv(_raw_panel_path)
print(f"Loaded panel with shape: {panel.shape}")

In [None]:
# Summarise missing values: per-column null count and percentage of rows
print("Missing values per column:")
_n_rows = len(panel)
missing = panel.isna().sum()
_missing_share = missing / _n_rows
missing_pct = (_missing_share * 100).round(2)
missing_df = pd.DataFrame({'count': missing, 'percent': missing_pct})
# Display only the columns that actually contain nulls
missing_df.loc[missing_df['count'].gt(0)]

In [None]:
# Run the row-level validator over every row of the panel
print("Applying validation rules...")
validated = panel.apply(val.validate_row, axis=1)

# Pull out the QA notes; fall back to all-empty notes when the validated
# frame has no 'qa_notes' column
if 'qa_notes' in validated.columns:
    issues = validated['qa_notes'].fillna('')
else:
    issues = pd.Series([''] * len(validated))

# A row counts as flagged when its note is non-empty
flags = issues.ne('')
if len(flags):
    frac_flagged = float(flags.mean())
else:
    # No rows at all: report 0.0 instead of taking the mean of nothing
    frac_flagged = 0.0
print(f"Fraction of rows flagged: {frac_flagged:.2%}")

In [None]:
# Tally how often each issue label occurs across the flagged rows
print("Issue counts:")
_nonempty_notes = issues.loc[issues != '']
issue_counts = Counter(
    label
    for note in _nonempty_notes
    for label in note.split(';')  # a note holds ';'-separated labels
    if label  # drop empty fragments left by stray separators
)
for issue, count in issue_counts.most_common():
    print(f"  {issue}: {count}")

In [None]:
# Assemble the summary table: one row per issue label plus a final row
# holding the overall fraction of flagged rows
_metric_names = [*issue_counts.keys(), '_any_flagged_fraction']
_metric_values = [*issue_counts.values(), frac_flagged]
summary_df = pd.DataFrame({
    'metric': _metric_names,
    'value': _metric_values,
})

# Write the summary to disk, creating the output directory if needed
_tables_dir = '../results/tables'
os.makedirs(_tables_dir, exist_ok=True)
summary_df.to_csv(f"{_tables_dir}/validation_summary.csv", index=False)
print("Validation summary saved to results/tables/validation_summary.csv")
summary_df