# Data Import Workflow with Nepal Entity Service

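This notebook demonstrates a complete data import workflow using the Nepal Entity Service (`nes`). We'll prepare a dataset of Nepal's seven provinces, validate it, check which entities already exist, and then create or update the entities in the database. Creating relationships between the imported entities is left as a next step.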

## Topics Covered

1. Prepare import data
2. Validate data before import
3. Check for duplicates
4. Import entities in batch
5. Create relationships between imported entities (not implemented in this notebook; listed under Next Steps)
6. Verify imported data
7. Handle import errors
8. Generate import report

## 1. Setup

In [None]:
from pathlib import Path
from datetime import date
from typing import List, Dict, Any

from nes.database.file_database import FileDatabase
from nes.services.publication import PublicationService
from nes.services.search import SearchService
from nes.core.identifiers import build_entity_id

In [None]:
# Build the FileDatabase rooted at db_path, then the two services that share it
db_path = Path("..") / "nes-db" / "v2"
db = FileDatabase(base_path=str(db_path))
pub_service, search_service = (
    PublicationService(database=db),
    SearchService(database=db),
)

print("✓ Services initialized")

## 2. Prepare Import Data

Let's prepare a dataset of Nepali provinces to import.

In [None]:
def _province_record(
    slug: str,
    name_en: str,
    name_ne: str,
    number: int,
    capital: str,
    area_km2: int,
    districts: int,
) -> Dict[str, Any]:
    """Build one province import record.

    Every province shares the same type/sub_type and carries a single
    PRIMARY name with English and Nepali full forms, plus an attributes
    mapping. A fresh dict is returned on each call, so records never share
    nested objects.
    """
    return {
        "slug": slug,
        "type": "location",
        "sub_type": "province",
        "names": [
            {
                "kind": "PRIMARY",
                "en": {"full": name_en},
                "ne": {"full": name_ne},
            }
        ],
        "attributes": {
            "number": number,
            "capital": capital,
            "area_km2": area_km2,
            "districts": districts,
        },
    }


# Nepali provinces data, one row per province:
# (slug, English name, Nepali name, province number, capital, area_km2, districts)
_PROVINCE_ROWS = [
    ("koshi-province", "Koshi Province", "कोशी प्रदेश", 1, "Biratnagar", 25905, 14),
    ("madhesh-province", "Madhesh Province", "मधेश प्रदेश", 2, "Janakpur", 9661, 8),
    ("bagmati-province", "Bagmati Province", "बागमती प्रदेश", 3, "Hetauda", 20300, 13),
    ("gandaki-province", "Gandaki Province", "गण्डकी प्रदेश", 4, "Pokhara", 21504, 11),
    ("lumbini-province", "Lumbini Province", "लुम्बिनी प्रदेश", 5, "Deukhuri", 22288, 12),
    ("karnali-province", "Karnali Province", "कर्णाली प्रदेश", 6, "Birendranagar", 27984, 10),
    (
        "sudurpashchim-province",
        "Sudurpashchim Province",
        "सुदूरपश्चिम प्रदेश",
        7,
        "Godawari",
        19539,
        9,
    ),
]

provinces_data = [_province_record(*row) for row in _PROVINCE_ROWS]

print(f"Prepared {len(provinces_data)} provinces for import")
for province in provinces_data:
    english_name = province["names"][0]["en"]["full"]
    province_number = province["attributes"]["number"]
    print(f"  - {english_name} (Province {province_number})")

## 3. Validate Data Before Import

In [None]:
def validate_entity_data(entity_data: Dict[str, Any]) -> tuple[bool, List[str]]:
    """Validate entity data before import.

    Checks that the 'slug', 'type' and 'names' keys are present, that
    'names' is non-empty, and that at least one name entry has
    kind == "PRIMARY". Malformed 'names' values (None, a non-sequence, or
    entries that are not dicts) are reported as validation errors instead of
    raising, so one bad record cannot abort a whole validation pass.

    Args:
        entity_data: Raw entity record to check.

    Returns:
        (is_valid, errors): is_valid is True when no problems were found;
        errors lists a human-readable message for each problem.
    """
    errors: List[str] = []

    # Check required fields
    for field in ("slug", "type"):
        if field not in entity_data:
            errors.append(f"Missing '{field}' field")

    names = entity_data.get("names")
    if not names:
        errors.append("Missing or empty 'names' field")

    # Check for PRIMARY name (only when the key exists at all, so a missing
    # 'names' key is reported once rather than twice)
    if "names" in entity_data:
        # Anything that is not a list/tuple cannot hold name entries
        candidates = names if isinstance(names, (list, tuple)) else []
        has_primary = any(
            isinstance(name, dict) and name.get("kind") == "PRIMARY"
            for name in candidates
        )
        if not has_primary:
            errors.append("No PRIMARY name found")

    return not errors, errors


# Validate all entities
print("Validating import data...\n")
validation_results = []

for entity_data in provinces_data:
    is_valid, errors = validate_entity_data(entity_data)

    # A record may be missing its slug (that is one of the things being
    # validated), so never index it directly when labelling the result.
    slug = entity_data.get("slug", "unknown")
    validation_results.append((slug, is_valid, errors))

    if is_valid:
        print(f"✓ {slug}: Valid")
    else:
        print(f"❌ {slug}: Invalid")
        for error in errors:
            print(f"   - {error}")

valid_count = sum(1 for _, is_valid, _ in validation_results if is_valid)
print(f"\nValidation complete: {valid_count}/{len(provinces_data)} valid")

## 4. Check for Duplicates

In [None]:
# Check if entities already exist
print("Checking for existing entities...\n")

duplicate_check = []

for entity_data in provinces_data:
    entity_id = build_entity_id(
        entity_data["type"], entity_data.get("sub_type"), entity_data["slug"]
    )

    existing = await pub_service.get_entity(entity_id)

    if existing:
        print(
            f"⚠ {entity_data['slug']}: Already exists (version {existing.version_summary.version_number})"
        )
        duplicate_check.append((entity_data["slug"], True, existing))
    else:
        print(f"✓ {entity_data['slug']}: New entity")
        duplicate_check.append((entity_data["slug"], False, None))

new_count = sum(1 for _, is_duplicate, _ in duplicate_check if not is_duplicate)
print(f"\n{new_count} new entities, {len(duplicate_check) - new_count} existing")

## 5. Import Entities in Batch

In [None]:
# Import entities
print("Importing entities...\n")

import_stats = {
    "total": len(provinces_data),
    "created": 0,
    "updated": 0,
    "skipped": 0,
    "failed": 0,
    "errors": [],
}

for entity_data in provinces_data:
    try:
        entity_id = build_entity_id(
            entity_data["type"], entity_data.get("sub_type"), entity_data["slug"]
        )

        existing = await pub_service.get_entity(entity_id)

        if existing:
            # Update existing entity
            if not existing.attributes:
                existing.attributes = {}
            existing.attributes.update(entity_data.get("attributes", {}))

            await pub_service.update_entity(
                entity=existing,
                author_id="author:system:import-workflow",
                change_description="Updated via import workflow",
            )
            import_stats["updated"] += 1
            print(f"✓ Updated: {entity_data['slug']}")
        else:
            # Create new entity
            await pub_service.create_entity(
                entity_data=entity_data,
                author_id="author:system:import-workflow",
                change_description="Imported via import workflow",
            )
            import_stats["created"] += 1
            print(f"✓ Created: {entity_data['slug']}")

    except Exception as e:
        import_stats["failed"] += 1
        import_stats["errors"].append(
            {"slug": entity_data.get("slug", "unknown"), "error": str(e)}
        )
        print(f"❌ Failed: {entity_data.get('slug', 'unknown')} - {e}")

print(f"\nImport complete!")
print(f"  Created: {import_stats['created']}")
print(f"  Updated: {import_stats['updated']}")
print(f"  Failed: {import_stats['failed']}")

## 6. Verify Imported Data

In [None]:
# Verify all provinces were imported
print("Verifying imported entities...\n")

for entity_data in provinces_data:
    entity_id = build_entity_id(
        entity_data["type"], entity_data.get("sub_type"), entity_data["slug"]
    )

    entity = await pub_service.get_entity(entity_id)

    if entity:
        print(f"✓ {entity.names[0].en.full}")
        print(f"  ID: {entity.id}")
        print(f"  Version: {entity.version_summary.version_number}")
        print(f"  Capital: {entity.attributes.get('capital', 'Unknown')}")
        print(f"  Districts: {entity.attributes.get('districts', 'Unknown')}")
        print()
    else:
        print(f"❌ Not found: {entity_data['slug']}\n")

## 7. Generate Import Report

In [None]:
# Generate comprehensive import report from import_stats
report_rule = "=" * 70

print(report_rule)
print("IMPORT REPORT")
print(report_rule)

print("\nDataset: Nepal Provinces")
print(f"Import Date: {date.today()}")
print("Author: author:system:import-workflow")

total = import_stats["total"]
succeeded = import_stats["created"] + import_stats["updated"]
# Avoid ZeroDivisionError when the import ran over an empty dataset
success_rate = f"{succeeded / total * 100:.1f}%" if total else "n/a"

print("\nStatistics:")
print(f"  Total entities: {total}")
print(f"  Created: {import_stats['created']}")
print(f"  Updated: {import_stats['updated']}")
print(f"  Skipped: {import_stats.get('skipped', 0)}")
print(f"  Failed: {import_stats['failed']}")
print(f"  Success rate: {success_rate}")

if import_stats["errors"]:
    print("\nErrors:")
    for error in import_stats["errors"]:
        print(f"  - {error['slug']}: {error['error']}")

# Only list records that were actually imported: anything with an entry in
# the error list was not created or updated.
not_imported_slugs = {entry["slug"] for entry in import_stats["errors"]}
imported_entities = [
    record
    for record in provinces_data
    if record.get("slug", "unknown") not in not_imported_slugs
]

print("\nImported Entities:")
for entity_data in imported_entities:
    print(f"  - {entity_data['names'][0]['en']['full']} ({entity_data['slug']})")
if not imported_entities:
    print("  (none)")

print("\n" + report_rule)

## 8. Summary

In this notebook, we've completed a full data import workflow:

- ✓ Prepared import data with authentic Nepali provinces
- ✓ Validated data before import
- ✓ Checked for duplicates
- ✓ Imported entities in batch
- ✓ Handled errors gracefully
- ✓ Verified imported data
- ✓ Generated import report

### Next Steps

1. Perform **Data Quality Analysis** in `04_data_quality_analysis.ipynb`
2. Create relationships between imported entities
3. Build custom import workflows for your data sources