# Parallel Tools - Programmatic Usage

This notebook demonstrates how to use `parallel_web_tools` programmatically in your Python code, notebooks, or data pipelines.

## Setup

First, load environment variables and confirm that the API key is available:

In [None]:
import os

from dotenv import load_dotenv

# Read environment settings from ../.env.local
load_dotenv("../.env.local")

# Report whether PARALLEL_API_KEY is now visible in the environment
print(
    "✓ API key loaded"
    if os.getenv("PARALLEL_API_KEY")
    else "⚠ Warning: PARALLEL_API_KEY not found in .env.local"
)

## Method 1: Run Enrichment from YAML File

The simplest way to run enrichment - just point to a YAML config file:

In [None]:
from parallel_web_tools import run_enrichment

# Hand the YAML config to run_enrichment; a failure is printed, not re-raised
try:
    run_enrichment("example_csv_schema.yaml")
    print("\n✓ Enrichment completed successfully!")
except Exception as exc:
    print(f"✗ Error: {exc}")

## Method 2: Run from Configuration Dictionary

Build your configuration programmatically and run it directly:

In [None]:
from parallel_web_tools import run_enrichment_from_dict

# Describe the enrichment job as a plain dictionary.
# Each column entry is a {"name": ..., "description": ...} mapping.
config = {
    "source": "example_file.csv",
    "target": "../data/output_notebook.csv",
    "source_type": "csv",
    "source_columns": [
        {"name": name, "description": description}
        for name, description in (
            ("business_name", "The name of a business"),
            ("web_site", "The business's website URL"),
        )
    ],
    "enriched_columns": [
        {"name": name, "description": description}
        for name, description in (
            ("industry", "The primary industry or sector of the business"),
            ("employee_count_estimate", "Estimated number of employees (as a range like '50-100')"),
        )
    ],
}

# Execute the job; a failure is printed, not re-raised
try:
    run_enrichment_from_dict(config)
    print("\n✓ Enrichment completed successfully!")
except Exception as exc:
    print(f"✗ Error: {exc}")

## Method 3: Build Schema Objects Programmatically

Use the schema classes directly for full type safety:

In [None]:
from parallel_web_tools import Column, InputSchema, SourceType

# Assemble the schema from the library's classes instead of a raw dict.
# Column is given (name, description) positionally.
schema = InputSchema(
    source="example_file.csv",
    target="../data/output_schema_based.csv",
    source_type=SourceType.CSV,
    source_columns=[
        Column(name, description)
        for name, description in (
            ("business_name", "The name of a business"),
            ("web_site", "The business's website URL"),
        )
    ],
    enriched_columns=[
        Column(name, description)
        for name, description in (
            ("headquarters_location", "City and country of company headquarters"),
            ("year_founded", "Year the company was founded"),
        )
    ],
)

# Echo the schema back so the reader can confirm what was built
print("Schema created:")
print(f"  Source: {schema.source}")
print(f"  Target: {schema.target}")
print(f"  Type: {schema.source_type.value}")
print(f"  Source columns: {[c.name for c in schema.source_columns]}")
print(f"  Enriched columns: {[c.name for c in schema.enriched_columns]}")

In [None]:
# Now run the enrichment with our schema
from parallel_web_tools.processors import process_csv

# Feed the schema object built earlier to process_csv; a failure is printed,
# not re-raised
try:
    process_csv(schema)
    print("\n✓ Enrichment completed successfully!")
except Exception as exc:
    print(f"✗ Error: {exc}")

## Inspect Results

Let's look at the enriched data that Method 2 wrote to `../data/output_notebook.csv`:

In [None]:
import pandas as pd

# Load the enriched output file into a DataFrame
df = pd.read_csv("../data/output_notebook.csv")

# df.shape is (rows, columns)
print(f"\nEnriched dataset: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table
df.head()

## Example: Dynamic Configuration Based on Data

Build configurations dynamically based on your data:

In [None]:
import pandas as pd

# Load the source file and note which columns it contains
source_df = pd.read_csv("example_file.csv")
detected_columns = list(source_df.columns)
print(f"Source data columns: {detected_columns}")
print(f"Source data shape: {source_df.shape}")

# Build the config from the detected columns: every source column gets a
# generic description, and a single enriched column is requested
dynamic_config = {
    "source": "example_file.csv",
    "target": "../data/output_dynamic.csv",
    "source_type": "csv",
    "source_columns": [
        {"name": column, "description": f"Data from {column} column"} for column in detected_columns
    ],
    "enriched_columns": [
        {"name": "technology_stack", "description": "Primary technologies or platforms used by the company"}
    ],
}

# One source-column entry was generated per detected column
print(f"\nDynamic config created with {len(detected_columns)} source columns")

## Example: Batch Processing Multiple Files

Process multiple datasets in a loop:

In [None]:
from pathlib import Path

from parallel_web_tools import run_enrichment

# List of config files to process
config_files = [
    "example_csv_schema.yaml",
    # Add more configs here
]

# One record per entry in config_files. Missing files are recorded with
# status "skipped" (rather than dropped) so the summary below accounts for
# every config that was requested.
results = []

for config_file in config_files:
    config_path = Path(config_file)

    if not config_path.exists():
        results.append({"config": config_file, "status": "skipped"})
        print(f"⚠ Skipping {config_file} - file not found")
        continue

    print(f"\nProcessing: {config_file}")

    try:
        run_enrichment(config_file)
        results.append({"config": config_file, "status": "success"})
        print("  ✓ Completed")
    except Exception as e:
        # Record the failure and keep going so one bad config does not stop
        # the rest of the batch
        results.append({"config": config_file, "status": "failed", "error": str(e)})
        print(f"  ✗ Failed: {e}")

# Summary
print(f"\n{'=' * 60}")
print("Batch processing complete")
print(f"  Successful: {sum(1 for r in results if r['status'] == 'success')}")
print(f"  Failed: {sum(1 for r in results if r['status'] == 'failed')}")
print(f"  Skipped: {sum(1 for r in results if r['status'] == 'skipped')}")
print(f"{'=' * 60}")

## Example: Error Handling and Validation

Proper error handling for production use:

In [None]:
import logging

from parallel_web_tools import ParseError, run_enrichment_from_dict

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def safe_enrichment(config):
    """
    Run an enrichment from a config dict and report the outcome as a dict
    instead of raising.

    Args:
        config: Mapping passed through to ``run_enrichment_from_dict``. It must
            contain at least ``"source"`` and ``"target"``, which are used in
            the start-up log line.

    Returns:
        ``{"success": True}`` when the run completes, otherwise
        ``{"success": False, "error": <code>, "message": <text>}`` where
        ``<code>`` is one of:

        - ``"invalid_config"``: ``"source"``/``"target"`` is missing, or a
          ``ParseError`` was raised
        - ``"file_not_found"``: a ``FileNotFoundError`` was raised
        - ``"unknown"``: any other exception (logged with its traceback)
    """
    try:
        # The start-up log line reads these two keys. Report their absence as
        # a configuration problem instead of letting the KeyError fall through
        # to the "unknown" bucket.
        missing = [key for key in ("source", "target") if key not in config]
        if missing:
            message = f"missing required config key(s): {', '.join(missing)}"
            logger.error("✗ Invalid configuration: %s", message)
            return {"success": False, "error": "invalid_config", "message": message}

        logger.info("Starting enrichment: %s -> %s", config["source"], config["target"])
        run_enrichment_from_dict(config)
        logger.info("✓ Enrichment completed successfully")
        return {"success": True}

    except FileNotFoundError as e:
        logger.error("✗ File not found: %s", e)
        return {"success": False, "error": "file_not_found", "message": str(e)}

    except ParseError as e:
        logger.error("✗ Invalid configuration: %s", e)
        return {"success": False, "error": "invalid_config", "message": str(e)}

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which is needed to diagnose a failure nobody anticipated.
        logger.exception("✗ Unexpected error: %s", e)
        return {"success": False, "error": "unknown", "message": str(e)}


# Exercise the wrapper with a minimal one-column-in, one-column-out config
test_config = {
    "source": "example_file.csv",
    "target": "../data/output_safe.csv",
    "source_type": "csv",
    "source_columns": [
        {"name": "business_name", "description": "Company name"},
    ],
    "enriched_columns": [
        {"name": "ceo_name", "description": "Name of the CEO"},
    ],
}

# The wrapper returns a status dict rather than raising
result = safe_enrichment(test_config)
print(f"\nResult: {result}")

## Integration Example: Use in Data Pipeline

Example of how to integrate into a data pipeline:

In [None]:
def data_pipeline_with_enrichment():
    """
    Walk through a four-step pipeline: extract, transform, enrich, load.

    Only the enrichment step does real work; the other steps print a
    placeholder message. If enrichment fails, the error is printed and then
    re-raised so the pipeline stops instead of loading incomplete data.
    Suitable as a template for an Airflow DAG, cron job, etc.
    """
    # Placeholder: your ETL extraction logic goes here
    print("Step 1: Extract data from source...")

    # Placeholder: your data transformation logic goes here
    print("Step 2: Transform data...")

    print("Step 3: Enrich data with Parallel...")
    source_columns = [{"name": "business_name", "description": "Company name"}]
    enriched_columns = [{"name": "description", "description": "Brief company description"}]
    enrichment_config = {
        "source": "example_file.csv",
        "target": "../data/output_pipeline.csv",
        "source_type": "csv",
        "source_columns": source_columns,
        "enriched_columns": enriched_columns,
    }

    try:
        run_enrichment_from_dict(enrichment_config)
        print("  ✓ Enrichment successful")
    except Exception as exc:
        print(f"  ✗ Enrichment failed: {exc}")
        # Propagate so the caller (scheduler, notebook) sees the failure
        raise

    # Placeholder: load to data warehouse, database, etc.
    print("Step 4: Load enriched data to destination...")

    print("\n✓ Pipeline completed successfully!")


# Run the pipeline
# data_pipeline_with_enrichment()

## Summary

You've learned how to:

1. ✓ Run enrichment from YAML files
2. ✓ Run enrichment from dictionaries
3. ✓ Build schemas programmatically with type safety
4. ✓ Inspect and validate results
5. ✓ Handle errors gracefully
6. ✓ Integrate into data pipelines
7. ✓ Process multiple datasets in batch

For more information:
- See `API.md` for complete API reference
- See `programmatic_usage.py` for standalone Python script examples
- Run `parallel-cli --help` for CLI usage