In [9]:
import os
import json
import asyncio
import uuid
from pathlib import Path
from datetime import datetime

import nats
from nats.js.api import StreamConfig, RetentionPolicy, DiscardPolicy
from dotenv import load_dotenv
from IPython.display import display, JSON

# Pull any overrides from keys/.env before the settings below are read
load_dotenv(os.path.join("keys", ".env"))

# NATS connection settings; each one falls back to a local default when unset
NAT_URL = os.environ.get("NAT_URL", "nats://localhost:4222")
INPUT_STREAM = os.environ.get("INPUT_STREAM", "PII-TASKS")
INPUT_SUBJECT = os.environ.get("INPUT_SUBJECT", "pii.tasks.started")
LOCAL_ENV = os.environ.get("LOCAL_ENV", "1")

# Echo the effective configuration so each run records what it used
print(
    "NATS Producer Configuration:",
    f"NATS URL: {NAT_URL}",
    f"Input Stream: {INPUT_STREAM}",
    f"Input Subject: {INPUT_SUBJECT}",
    f"Local Environment: {LOCAL_ENV}",
    sep="\n",
)


NATS Producer Configuration:
NATS URL: nats://localhost:4222
Input Stream: PII-TASKS
Input Subject: pii.tasks.started.>
Local Environment: 1


In [10]:
# Folder that receives results; created on demand, left alone if already present
OUTPUT_DIR = "output"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [11]:
async def ensure_stream_exists(
    nats_url,
    stream_name,
    subjects
):
    """Create the JetStream stream ``stream_name`` unless it already exists.

    Opens a short-lived NATS connection, looks the stream up and, only when
    the server reports it as not found, creates it as a work-queue stream
    bound to ``subjects``. An existing stream is left untouched; its
    configured subjects are not compared with ``subjects``.

    Args:
        nats_url: URL of the NATS server to connect to.
        stream_name: Name of the stream to look up or create.
        subjects: List of subjects the stream captures when it is created.

    Raises:
        Exception: Connection failures, timeouts and JetStream API errors
            other than "stream not found" propagate to the caller instead of
            being mistaken for a missing stream.
    """
    # Local import: relies only on the `nats` package this notebook already uses.
    from nats.js.errors import NotFoundError

    nc = await nats.connect(nats_url)

    try:
        js = nc.jetstream()

        try:
            await js.stream_info(stream_name)
        except NotFoundError:
            # Only a genuine "not found" reply means the stream must be created.
            stream_config = StreamConfig(
                name=stream_name,
                subjects=subjects,
                retention=RetentionPolicy.WORK_QUEUE,
                max_age=30 * 24 * 60 * 60,  # 30 days
                duplicate_window=6 * 60,  # 6 minutes
                discard=DiscardPolicy.NEW,  # Discard new messages if the stream is full
            )

            await js.add_stream(config=stream_config)
            print(f"Stream '{stream_name}' created with subjects: {subjects}")
        else:
            print(f"Stream '{stream_name}' already exists.")
    finally:
        # Always release the connection, whether or not the lookup/creation succeeded.
        await nc.close()

def _build_task_message(file_path, local_env):
    """Build the task payload (a JSON-serialisable dict) announcing ``file_path``."""
    if local_env == "1":
        # Local runs reference the file by absolute path so the URI is unambiguous.
        target = os.path.abspath(file_path)
    else:
        # NOTE(review): the original comment says production might use an HTTP URL;
        # for now the path is embedded exactly as given.
        target = file_path

    return {
        "source": {
            "uri": f"file://{target}",
            "type": "json"
        },
        "state": {
            "status": "STARTED",
            "timestamp": datetime.now().isoformat()
        }
    }


# Main function to send files to NATS - simplified without batch IDs
async def publish_files_to_nats(
    folder_path, 
    nats_url=NAT_URL,
    input_stream=INPUT_STREAM,
    input_subject=INPUT_SUBJECT,
    local_env=LOCAL_ENV
):
    """
    Publish one task message per JSON file in a folder to the NATS input subject.

    The folder is validated and listed before any network activity. Entries
    are published in sorted name order so repeated runs behave the same, and
    only regular files whose name ends in ``.json`` are considered. A failure
    on one file is printed and does not stop the remaining files.

    Args:
        folder_path: Path to the folder containing JSON files to process
        nats_url: The NATS server URL
        input_stream: The input stream name (created if it does not exist)
        input_subject: The subject the stream is bound to and messages are published on
        local_env: Local environment flag; "1" makes file URIs absolute

    Returns:
        List of the filenames that were published successfully.

    Raises:
        FileNotFoundError: ``folder_path`` does not exist.
        NotADirectoryError: ``folder_path`` exists but is not a directory.
    """
    # Validate up front so a bad path fails before any connection is opened.
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder not found: {folder_path}")
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"Not a folder: {folder_path}")

    # os.listdir order is arbitrary; sort for a deterministic publish order and
    # skip directories that merely happen to be named *.json.
    json_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.json') and os.path.isfile(os.path.join(folder_path, name))
    )

    # First ensure the stream exists
    # NOTE(review): input_subject is used verbatim both as the stream subject and as
    # the publish subject — confirm that is intended when it contains a wildcard.
    await ensure_stream_exists(
        nats_url=nats_url,
        stream_name=input_stream,
        subjects=[input_subject]
    )

    # Connect to NATS
    nc = await nats.connect(nats_url)

    # Track files published
    files_published = []

    try:
        js = nc.jetstream()

        for filename in json_files:
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {filename}")

            try:
                message = _build_task_message(file_path, local_env)

                # The original filename travels as a message header
                headers = {
                    "filename": filename
                }

                await js.publish(
                    input_subject,
                    json.dumps(message).encode(),
                    headers=headers
                )

                print(f"Published file {filename} to {input_subject}")
                files_published.append(filename)

            except Exception as e:
                # Deliberate best-effort: report the failure and continue with the next file.
                print(f"Error processing file {filename}: {str(e)}")

    finally:
        # Close NATS connection
        await nc.close()

    return files_published

In [12]:
async def process_folder(folder_path):
    """Publish the folder's files to NATS, print a summary, and return the published names."""
    print(f"Publishing files from {folder_path} to NATS...")
    published = await publish_files_to_nats(folder_path)

    # Nothing went out: say so and hand back the (empty) result unchanged.
    if not published:
        print("No files were published to NATS")
        return published

    print(f"\nPublished {len(published)} files to NATS")
    print("Files published:")
    for name in published:
        print(f"- {name}")

    return published

In [14]:
# Publish every JSON file found in the input folder.
folder_path = "input"  # Replace with your input folder
# NOTE(review): despite its name, batch_ids_map receives the list of published
# filenames returned by process_folder, not a mapping of batch IDs — consider
# renaming once it is confirmed that no other cell depends on this name.
batch_ids_map = await process_folder(folder_path)

Publishing files from input to NATS...
Stream 'PII-TASKS' already exists.
Processing file: example1.json
Published file example1.json to pii.tasks.started.>
Processing file: example2.json
Published file example2.json to pii.tasks.started.>
Processing file: example3.json
Published file example3.json to pii.tasks.started.>
Processing file: example5.json
Published file example5.json to pii.tasks.started.>

Published 4 files to NATS
Files published:
- example1.json
- example2.json
- example3.json
- example5.json
