## Data Exploration and Enrichment Code

In [None]:
# -*- coding: utf-8 -*-
"""
Task 1: Data Exploration and Enrichment
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style: apply the matplotlib style sheet, then select the seaborn palette.
# NOTE(review): the palette is set after the style sheet; presumably this
# order is deliberate so the palette is not overridden -- confirm before
# reordering these two calls.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# ============================================================================
# 1. LOAD AND EXPLORE DATA
# ============================================================================

# Read the unified dataset, then report its dimensions and a short preview.
df = pd.read_csv('../_data/_raw/ethiopia_fi_unified_data.csv')
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:", df.head(), sep="\n")

# Read the reference-code table and report its dimensions.
ref_codes = pd.read_csv('../_data/_raw/reference_codes.csv')
print(f"\nReference codes shape: {ref_codes.shape}")

# ============================================================================
# 2. UNDERSTAND THE SCHEMA
# ============================================================================

# Frequency tables for the main categorical columns.
# record_type is counted without missing values; the other three columns
# keep NaN as its own bucket (dropna=False).
record_type_counts = df['record_type'].value_counts()
pillar_counts = df['pillar'].value_counts(dropna=False)
source_counts = df['source_type'].value_counts(dropna=False)
confidence_counts = df['confidence'].value_counts(dropna=False)

# Print each table under its heading, in the same order as computed above.
for heading, counts in (
    ("\n=== Record Type Distribution ===", record_type_counts),
    ("\n=== Pillar Distribution ===", pillar_counts),
    ("\n=== Source Type Distribution ===", source_counts),
    ("\n=== Confidence Level Distribution ===", confidence_counts),
):
    print(heading)
    print(counts)

# ============================================================================
# 3. EXPLORE OBSERVATIONS
# ============================================================================

# Keep only the observation rows (as an independent copy) and parse their
# dates so that min/max and the .dt accessor work on real timestamps.
is_observation = df['record_type'] == 'observation'
observations = df[is_observation].copy()
observations['observation_date'] = pd.to_datetime(observations['observation_date'])

print(f"\nTotal observations: {len(observations)}")
first_date = observations['observation_date'].min()
last_date = observations['observation_date'].max()
print(f"Date range: {first_date} to {last_date}")

# Report how many observation rows exist per indicator code, in order of
# first appearance.
print("\n=== Unique Indicators ===")
unique_indicators = observations['indicator_code'].unique()
for indicator in unique_indicators:
    count = int((observations['indicator_code'] == indicator).sum())
    print(f"{indicator}: {count} records")

# ============================================================================
# 4. EXPLORE EVENTS
# ============================================================================

# Keep only the event rows (as an independent copy) and summarise them.
events = df.loc[df['record_type'] == 'event'].copy()
print(f"\nTotal events: {len(events)}")
print("\nEvent categories:", events['category'].value_counts(), sep="\n")

# Full, untruncated listing of the descriptive event columns.
print("\n=== Key Events ===")
key_event_columns = ['event_name', 'event_date', 'category', 'description']
key_events = events[key_event_columns]
print(key_events.to_string())

# ============================================================================
# 5. EXPLORE IMPACT LINKS
# ============================================================================

# Keep only the impact-link rows (as an independent copy).
impact_links = df[df['record_type'] == 'impact_link'].copy()
print(f"\nTotal impact links: {len(impact_links)}")

# Attach each link's parent event by matching parent_id to event_id.
# impact_links and events are both slices of df, so event_id, event_name and
# category exist on BOTH sides of the merge. With the default suffixes pandas
# renames them to *_x / *_y and row['event_name'] below raises KeyError.
# Giving only the link-side duplicates a suffix keeps the event columns under
# their plain names.
merged_impacts = pd.merge(
    impact_links,
    events[['event_id', 'event_name', 'category']],
    left_on='parent_id',
    right_on='event_id',
    how='left',  # keep links whose parent event is not found
    suffixes=('_link', ''),
)

# One short paragraph per link: event -> indicator, then the impact details.
print("\n=== Event-Impact Relationships ===")
for _, row in merged_impacts.iterrows():
    print(f"{row['event_name']} -> {row['related_indicator']}")
    print(f"  Direction: {row['impact_direction']}, Magnitude: {row['impact_magnitude']}")
    print(f"  Lag: {row['lag_months']} months, Evidence: {row['evidence_basis']}")
    print()

# ============================================================================
# 6. DATA ENRICHMENT - ADD NEW RECORDS
# ============================================================================

def create_new_record(record_type, **kwargs):
    """Build a new record dict with generated bookkeeping fields.

    The record always starts with ``record_id``, ``record_type``,
    ``created_date`` and ``created_by``; every keyword argument is then
    copied in as an additional field.

    Args:
        record_type: Value stored under ``'record_type'``.
        **kwargs: Extra fields for the record. A key that matches one of the
            generated fields replaces the generated value.

    Returns:
        dict: The new record. Keys appear in the order: generated fields,
        then ``kwargs`` in the order they were passed.
    """
    # Function-scope import so the definition does not depend on a new
    # top-of-file import.
    import uuid

    # Read the clock once so the id timestamp and created_date always agree.
    now = datetime.now()

    # A 4-digit random suffix on a one-second timestamp can repeat for records
    # created in the same second; a uuid4-derived suffix makes the id
    # effectively unique, which matters because other records reference it
    # (e.g. as parent_id).
    unique_suffix = uuid.uuid4().hex[:12]

    new_record = {
        'record_id': f"custom_{now.strftime('%Y%m%d%H%M%S')}_{unique_suffix}",
        'record_type': record_type,
        'created_date': now.strftime('%Y-%m-%d'),
        'created_by': 'data_scientist',
    }

    # Add provided fields (may override the generated ones).
    new_record.update(kwargs)

    return new_record

# Example enrichment records. Each record's fields are declared as a dict and
# expanded into create_new_record; field order is significant because it
# determines the column order of new_records_df.

# Example 1: 4G Coverage data (hypothetical)
fields_4g = {
    'pillar': 'enabler',
    'indicator': '4G Network Coverage',
    'indicator_code': 'ENB_4G_COVERAGE',
    'value_numeric': 45.2,  # percentage
    'value_text': None,
    'observation_date': '2024-12-01',
    'source_name': 'GSMA Mobile Connectivity Index',
    'source_url': 'https://www.gsma.com/mobileconnectivityindex/',
    'source_type': 'report',
    'confidence': 'medium',
    'notes': 'Estimated 4G population coverage in Ethiopia, 2024',
}
new_obs_4g = create_new_record('observation', **fields_4g)

# Example 2: Smartphone penetration
fields_smartphone = {
    'pillar': 'enabler',
    'indicator': 'Smartphone Penetration',
    'indicator_code': 'ENB_SMARTPHONE_PEN',
    'value_numeric': 38.7,
    'observation_date': '2024-12-01',
    'source_name': 'GSMA Intelligence',
    'source_url': 'https://www.gsmaintelligence.com/',
    'source_type': 'report',
    'confidence': 'medium',
    'notes': 'Percentage of population with smartphones',
}
new_obs_smartphone = create_new_record('observation', **fields_smartphone)

# Example 3: a new event - Fayda Digital ID Rollout
fields_event = {
    'event_name': 'Fayda Digital ID National Rollout',
    'event_date': '2025-03-15',
    'category': 'infrastructure',
    'description': 'National rollout of Fayda digital ID system, expected to boost KYC efficiency',
    'source_name': 'National Bank of Ethiopia',
    'source_url': 'https://id.gov.et/',
    'confidence': 'high',
}
new_event = create_new_record('event', **fields_event)

# Example 4: impact link for the Fayda Digital ID event.
# NOTE(review): the link points at the event's record_id, and the new event
# carries no event_id field, whereas the impact-link exploration joins
# parent_id to event_id -- confirm which identifier parent_id should hold.
fields_impact = {
    'parent_id': new_event['record_id'],  # reference the event created above
    'pillar': 'access',
    'related_indicator': 'ACC_OWNERSHIP',
    'impact_direction': 'positive',
    'impact_magnitude': 2.5,  # percentage point increase
    'lag_months': 6,
    'evidence_basis': 'comparable_country',
    'evidence_description': 'Digital ID systems in India (Aadhaar) increased account ownership by 2-3% annually',
    'confidence': 'medium',
}
new_impact = create_new_record('impact_link', **fields_impact)

# Despite its name, this list holds every new record (observations, the
# event and the impact link), in creation order.
new_observations = [new_obs_4g, new_obs_smartphone, new_event, new_impact]

# Tabulate the new records and append them below the original rows.
new_records_df = pd.DataFrame(new_observations)
enriched_df = pd.concat([df, new_records_df], ignore_index=True)

# Save enriched dataset
enriched_df.to_csv('../_data/processed/ethiopia_fi_enriched.csv', index=False)

# ============================================================================
# 7. CREATE ENRICHMENT LOG
# ============================================================================

def _first_present(row, keys, default):
    """Return the first non-missing value of ``row`` among ``keys``.

    A DataFrame row contains every column, with NaN where a record did not
    supply the field, so ``row.get(key, default)`` never falls back to its
    default. This helper treats NaN/None as absent.

    Args:
        row: A row (pandas Series) of the new-records table.
        keys: Column names to try, in priority order.
        default: Value returned when every candidate is absent or missing.

    Returns:
        The first non-missing value found, otherwise ``default``.
    """
    for key in keys:
        value = row.get(key)
        if pd.notna(value):
            return value
    return default


# One log entry per newly added record; the date is read once so that all
# entries of a run share the same added_date.
added_date = datetime.now().strftime('%Y-%m-%d')
log_entries = [
    {
        'record_id': row['record_id'],
        'record_type': row['record_type'],
        'added_date': added_date,
        'collected_by': 'Your Name',
        'source_url': _first_present(row, ['source_url'], 'N/A'),
        # Prefer the description, fall back to the notes, then to 'N/A'.
        'original_text': _first_present(row, ['description', 'notes'], 'N/A'),
        'confidence': _first_present(row, ['confidence'], 'medium'),
        'notes': _first_present(
            row, ['notes'], 'Why added: To improve forecasting model coverage'
        ),
        'usefulness': 'Provides additional context for infrastructure enablers'
    }
    for _, row in new_records_df.iterrows()
]

# Write the log as a markdown table.
log_df = pd.DataFrame(log_entries)
log_df.to_markdown('../_data/processed/data_enrichment_log.md', index=False)

print(f"\n✅ Added {len(new_records_df)} new records")
print("✅ Saved enriched dataset to: ../_data/processed/ethiopia_fi_enriched.csv")
print("✅ Created enrichment log: ../_data/processed/data_enrichment_log.md")

# ============================================================================
# 8. VISUALIZE DATA COVERAGE
# ============================================================================

# 2x2 overview figure: record types, confidence levels, observations per
# year, and event categories.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Record type distribution
axes[0, 0].bar(record_type_counts.index, record_type_counts.values)
axes[0, 0].set_title('Record Type Distribution')
axes[0, 0].set_ylabel('Count')

# Confidence levels
# confidence_counts was built with dropna=False, so its index can contain
# NaN. A categorical bar axis only accepts string labels, so a NaN label
# would raise; show missing confidence as an explicit 'missing' bar instead.
confidence_labels = [
    'missing' if pd.isna(level) else str(level)
    for level in confidence_counts.index
]
axes[0, 1].bar(confidence_labels, confidence_counts.values)
axes[0, 1].set_title('Confidence Level Distribution')
axes[0, 1].tick_params(axis='x', rotation=45)

# Temporal coverage: number of observations per calendar year, in
# chronological order.
obs_dates = observations['observation_date'].dt.year.value_counts().sort_index()
axes[1, 0].plot(obs_dates.index, obs_dates.values, marker='o')
axes[1, 0].set_title('Observations by Year')
axes[1, 0].set_xlabel('Year')
axes[1, 0].set_ylabel('Count')
axes[1, 0].grid(True, alpha=0.3)

# Event categories
event_cats = events['category'].value_counts()
axes[1, 1].pie(event_cats.values, labels=event_cats.index, autopct='%1.1f%%')
axes[1, 1].set_title('Event Categories')

# Save before show() so the written file contains the rendered figure.
plt.tight_layout()
plt.savefig('../_data/processed/data_coverage_visualization.png', dpi=300, bbox_inches='tight')
plt.show()