# De-Center: Exploratory Data Analysis (EDA)
## Analyzing Policy Claims vs. Community Impact Contradictions

This notebook explores the contradiction dataset, which pairs claims from official policy documents with evidence from news articles and local filings about the environmental and water impacts of AI data centers in California.

## 1. Load and Inspect the Data

In [None]:
import json
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Set style for visualizations
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Load ground truth contradictions.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON files are conventionally UTF-8).
with open('../data/ground_truth.json', 'r', encoding='utf-8') as f:
    ground_truth = json.load(f)

print(f"Loaded {len(ground_truth)} contradiction pairs\n")
print("Sample contradiction:")
# Guard the preview so an empty dataset yields a readable message instead of an IndexError
if ground_truth:
    print(json.dumps(ground_truth[0], indent=2))
else:
    print("(dataset is empty - nothing to preview)")

## 2. Summary Statistics

In [None]:
# Build a tabular view of the ground-truth records; later cells query this frame
df = pd.DataFrame(ground_truth)

# Assemble the overview as a list of lines and emit it in one go
overview_lines = [
    "=== DATASET OVERVIEW ===",
    f"Total contradictions: {len(df)}",
    f"Columns: {list(df.columns)}",
    f"\nDataFrame shape: {df.shape}",
    f"\nData types:\n{df.dtypes}",
]
print("\n".join(overview_lines))

## 3. Analyze Contradiction Issues

In [None]:
# Issue distribution: how many contradiction pairs carry each `issue` label
issue_counts = df['issue'].value_counts()
print("=== CONTRADICTIONS BY ISSUE ===")
print(issue_counts)
print(f"\nTotal unique issues: {len(issue_counts)}")

# Visualize issue distribution
fig, ax = plt.subplots(figsize=(10, 6))
issue_counts.plot(kind='barh', ax=ax, color='steelblue')
ax.set_xlabel('Number of Contradictions')
ax.set_ylabel('Issue Type')
ax.set_title('Contradiction Distribution by Issue')
plt.tight_layout()

# savefig does not create missing directories, so make sure the output
# folder exists before writing (it may be absent on a fresh checkout).
chart_path = Path('../data/processed/issue_distribution.png')
chart_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()
print("\n✓ Chart saved to data/processed/issue_distribution.png")

## 4. Analyze Source Patterns

In [None]:
# Empty tallies of source categories, one per side of a contradiction pair
source_a_counts, source_b_counts = Counter(), Counter()

def categorize_source(url):
    """Categorize a source URL by the publisher keyword it contains.

    Matching is a case-insensitive substring test. Rules are checked in the
    order listed below and the first match wins, so a URL containing both
    'legistar' and 'sanjose' is labelled 'Government (Local)'.

    Args:
        url: Source URL string. ``None`` or any other non-string value
            (e.g. a NaN from a missing field) is treated as unrecognised.

    Returns:
        The category label of the first matching rule, or 'Other' when no
        rule matches or ``url`` is not a string.
    """
    if not isinstance(url, str):
        # `'x' in None` / `'x' in nan` would raise TypeError; report as uncategorised instead
        return 'Other'

    # Normalise case so e.g. "CalMatters.org" is recognised like "calmatters.org"
    normalized = url.lower()

    # (keywords, label) pairs; order is significant and mirrors the original if/elif chain
    rules = (
        (('calmatters',), 'News (CalMatters)'),
        (('gov.ca.gov', 'cpuc'), 'Government (CA)'),
        (('next10.org',), 'Research (Next10/UCR)'),
        (('legistar',), 'Government (Local)'),
        (('siliconvalleypower',), 'Utility (SVP)'),
        (('sanjose',), 'Government (San Jose)'),
    )
    for keywords, label in rules:
        if any(keyword in normalized for keyword in keywords):
            return label
    return 'Other'

# Tally the category of each pair's two sources, side A first, then side B
for pair in ground_truth:
    source_a_counts.update([categorize_source(pair['source_a'])])
    source_b_counts.update([categorize_source(pair['source_b'])])

# Print both tallies, most frequent category first
tally_sections = (
    ("=== SOURCE A (Official Claims) ===", source_a_counts),
    ("\n=== SOURCE B (Community Impact Data) ===", source_b_counts),
)
for heading, tally in tally_sections:
    print(heading)
    for category, n_pairs in tally.most_common():
        print(f"  {category}: {n_pairs}")

## 5. Contradiction Claims Text Analysis

In [None]:
# Analyze claim lengths (in whitespace-delimited words) and preview a sample pair
def _word_counts(records, field):
    """Return the word count of `field` (split on whitespace) for each record."""
    return [len(record[field].split()) for record in records]


def _print_length_stats(label, lengths):
    """Print the mean and min/max of `lengths`, or a notice when the list is empty."""
    if not lengths:
        # sum()/len() and min()/max() both fail on an empty list
        print(f"No {label} entries to analyze")
        return
    print(f"Average words per {label}: {sum(lengths) / len(lengths):.1f}")
    print(f"Min/Max: {min(lengths)}/{max(lengths)} words")


def _preview(text, limit=100):
    """Return `text` cut to `limit` characters; '...' is added only if text was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


print("=== OFFICIAL CLAIM ANALYSIS ===")
official_lengths = _word_counts(ground_truth, 'official_claim')
_print_length_stats('official claim', official_lengths)

print("\n=== CONTRADICTORY DATA ANALYSIS ===")
contrary_lengths = _word_counts(ground_truth, 'contradictory_data')
_print_length_stats('contradictory data', contrary_lengths)

# Show sample claims (only the first pair is previewed)
print("\n=== SAMPLE CLAIMS ===")
for i, row in enumerate(ground_truth[:1], 1):
    print(f"\nContradiction {i}: {row['issue']}")
    print(f"Official: {_preview(row['official_claim'])}")
    print(f"Counter-data: {_preview(row['contradictory_data'])}")

## 6. Key Metrics for Model Evaluation

In [None]:
# Describe the metrics that later model evaluation will report against this ground truth
n_pairs = len(ground_truth)

print("=== BENCHMARK METRICS ===")
print(f"Total Contradictions (Ground Truth): {n_pairs}")
print("\nMetrics for model evaluation:")
print(f"  - Recall: (Detected Contradictions) / (Actual {n_pairs})")
print("  - Precision: (True Contradictions) / (Total Detected)")
print("  - F1-Score: 2 * (Precision * Recall) / (Precision + Recall)")
print("\nBaseline (random):")
print("  - Expected Recall: 0.5")
print("  - Expected Precision: ~(Contradictions / All Pairs)")

# Data coverage: the first three lines are fixed descriptive text;
# only the unique-source count is computed from the data
print("\n=== DATA COVERAGE ===")
print("Topics covered: water usage, oversight delays, emissions reporting")
print("Geographic scope: California (state-level policy + local impacts)")
print("Timeframe: 2019-2028 (historical + projected)")
distinct_sources = {row['source_a'] for row in ground_truth} | {row['source_b'] for row in ground_truth}
print(f"Source diversity: {len(distinct_sources)} unique sources")

## 7. Data Gaps and Next Steps

In [None]:
# Derive the dataset size from the loaded data rather than hard-coding it,
# so the message stays accurate as ground_truth.json grows
pair_count = len(ground_truth)
pair_noun = "pair" if pair_count == 1 else "pairs"

print("=== IDENTIFIED GAPS ===")
print("1. Local filing evidence: Limited 2026 San Jose/Santa Clara rate-hike or water-restriction language")
print(f"2. Scope of contradictions: Currently {pair_count} {pair_noun} - need 10-20 for robust model training")
print("3. Annotation quality: Needs multi-annotator review and uncertainty scoring")
print("\n=== NEXT STEPS ===")
print("1. Run web scraper to expand ground_truth.json with additional contradictions")
print("2. Add severity and confidence scores to each contradiction pair")
print("3. Create training/test splits for model evaluation")
print("4. Develop contradiction detection baseline (keyword matching)")

## 8. Export Summary Report

In [None]:
# Create summary report
def _rounded_mean(values):
    """Return the mean of `values` rounded to 1 decimal, or None when `values` is empty."""
    if not values:
        # None serialises to JSON null instead of raising ZeroDivisionError
        return None
    return round(sum(values) / len(values), 1)


# Distinct URLs appearing on either side of any contradiction pair
unique_sources = {row['source_a'] for row in ground_truth} | {row['source_b'] for row in ground_truth}

summary_report = {
    'dataset_name': 'De-Center: California Data Center Policy Contradictions',
    'total_contradictions': len(ground_truth),
    # .tolist() converts any numpy scalars to plain Python values that json can serialise
    'unique_issues': df['issue'].unique().tolist(),
    'data_collection_date': '2026-02-21',
    'key_statistics': {
        'avg_official_claim_length': _rounded_mean(official_lengths),
        'avg_contrary_data_length': _rounded_mean(contrary_lengths),
        'unique_sources': len(unique_sources)
    },
    'source_distribution': {
        'source_a': dict(source_a_counts),
        'source_b': dict(source_b_counts)
    },
    'next_steps': [
        'Expand dataset via web scraper',
        'Add severity and confidence scores',
        'Create train/test splits',
        'Build contradiction detection baseline'
    ]
}

# open(..., 'w') does not create missing directories, so ensure the output folder exists
summary_path = Path('../data/processed/eda_summary.json')
summary_path.parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2)

print("✓ EDA summary saved to data/processed/eda_summary.json")
print(json.dumps(summary_report, indent=2))