In [1]:
import os
import json
import asyncio
import uuid
from pathlib import Path
from datetime import datetime
import base64
from PIL import Image
import io
import matplotlib.pyplot as plt
import traceback

import nats
from nats.js.api import StreamConfig, ConsumerConfig, AckPolicy, RetentionPolicy, DiscardPolicy
from dotenv import load_dotenv
from IPython.display import display, JSON, HTML

# Load environment variables from .env file
load_dotenv(os.path.join("keys", ".env"))

# NATS connection settings - use original env variables with exact capitalization
NAT_URL = os.getenv("NAT_URL", "nats://localhost:4222")
OUTPUT_STREAM = os.getenv("OUTPUT_STREAM", "image-RESULTS")
OUTPUT_SUBJECT = os.getenv("OUTPUT_SUBJECT", "IMAGE.results.completed.>")
LOCAL_ENV = os.getenv("LOCAL_ENV", "1")

# Output directory for results
OUTPUT_DIR = "output_images"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Display current configuration
print("NATS Image Tagger Consumer Configuration:")
print(f"NATS URL: {NAT_URL}")
print(f"Output Stream: {OUTPUT_STREAM}")
print(f"Output Subject: {OUTPUT_SUBJECT}")
print(f"Local Environment: {LOCAL_ENV}")
print(f"Output Directory: {OUTPUT_DIR}")

NATS Image Tagger Consumer Configuration:
NATS URL: nats://localhost:4222
Output Stream: IMAGE-RESULTS
Output Subject: image.results.completed.>
Local Environment: 1
Output Directory: output_images


In [2]:
async def check_stream_exists(nats_url, stream_name):
    """Check if a stream exists but don't try to create it"""
    nc = await nats.connect(nats_url)
    js = nc.jetstream()
    
    try:
        try:
            await js.stream_info(stream_name)
            print(f"Stream '{stream_name}' exists.")
            return True
        except Exception as e:
            print(f"Stream '{stream_name}' does not exist: {str(e)}")
            return False
    finally:
        await nc.close()

In [3]:
async def list_all_streams(nats_url=NAT_URL):
    """List all streams in the NATS server"""
    nc = await nats.connect(nats_url)
    js = nc.jetstream()
    
    try:
        # Get streams info
        streams = await js.streams_info()
        print("\nStreams available on NATS server:")
        
        for stream in streams:
            print(f"- Name: {stream.config.name}")
            print(f"  Subjects: {stream.config.subjects}")
            if hasattr(stream.state, 'messages'):
                print(f"  Messages: {stream.state.messages}")
            print("")
            
        return streams
    except Exception as e:
        print(f"Error listing streams: {e}")
        traceback.print_exc()
        return []
    finally:
        await nc.close()

In [4]:
def analyze_image_tagging_results(data):
    """Analyze image tagging results from a message"""
    if not data:
        print("No data to analyze")
        return
    
    # Extract documents from the result
    documents_treated = data.get("number_documents_treated", 0)
    documents_not_treated = data.get("number_documents_non_treated", 0)
    tagging_data = data.get("data", [])
    
    print(f"Total images processed: {documents_treated}")
    if documents_not_treated > 0:
        print(f"Images that failed processing: {documents_not_treated}")
        
        # Show IDs of documents not treated
        not_treated_list = data.get("list_id_not_treated", [])
        for item in not_treated_list:
            if isinstance(item, dict):
                for doc_id, reason in item.items():
                    print(f"  - ID: {doc_id}, Reason: {reason}")
    
    # Display tagging results for each image
    for i, image_result in enumerate(tagging_data):
        print(f"\nImage #{i+1} ID: {image_result.get('id', 'N/A')}")
        
        # Handle different response structures
        source = image_result.get('source', {})
        file_name = source.get('file_name', 'Unknown')
        print(f"Filename: {file_name}")
        
        # Display tags - they might be directly in 'source.content' or in a 'result' field
        tags = []
        content = source.get('content')
        if isinstance(content, list):
            tags = content
        elif image_result.get("result"):
            tags = image_result.get("result", [])
        
        print("Tags:")
        for tag in tags:
            label = tag.get("label", "Unknown")
            score = tag.get("score", 0)
            print(f"  - {label}: {score:.4f}")


def save_images_with_tags(data, output_dir=OUTPUT_DIR):
    """Save processed images with their tags for visualization"""
    if not data:
        return
    
    tagging_data = data.get("data", [])
    if not tagging_data:
        print("No image tagging data found")
        return
    
    for image_result in tagging_data:
        try:
            # Get image info
            image_id = image_result.get("id", "unknown")
            
            # Extract filename from source if available
            source = image_result.get('source', {})
            filename = source.get('file_name', f"image_{image_id}")
            
            # Extract tags - they could be in different places based on response structure
            tags = []
            content = source.get('content')
            if isinstance(content, list):
                tags = content
            elif image_result.get("result"):
                tags = image_result.get("result", [])
            
            if not tags:
                print(f"No tags found for image {filename}")
                continue
            
            # Create a JSON file with tags for this image
            json_filename = f"{os.path.splitext(filename)[0]}_tags.json"
            json_path = os.path.join(output_dir, json_filename)
            
            with open(json_path, "w") as f:
                json.dump({
                    "image_id": image_id,
                    "filename": filename,
                    "tags": tags
                }, f, indent=2)
            
            print(f"Saved tags for {filename} to {json_path}")
            
        except Exception as e:
            print(f"Error saving results for image: {str(e)}")
            traceback.print_exc()
            continue

In [5]:
async def monitor_stream_for_new_messages(
    output_dir=OUTPUT_DIR,
    nats_url=NAT_URL,
    stream_name=OUTPUT_STREAM,
    run_time_seconds=3600,  # Default to run for 1 hour
    poll_interval=1  # Seconds between polling the stream
):
    """
    Monitor a NATS stream for new image tagging results.
    
    Args:
        output_dir: Directory to save output files
        nats_url: The NATS server URL
        stream_name: The stream name to read from
        run_time_seconds: How long to run the consumer (in seconds)
        poll_interval: Seconds between polling the stream
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Check if stream exists
    stream_exists = await check_stream_exists(nats_url, stream_name)
    if not stream_exists:
        print(f"ERROR: Stream {stream_name} does not exist!")
        print("This consumer cannot monitor a non-existent stream.")
        print("Please run the image_tagger_nats.py service first to create the streams.")
        return 0
    
    # List all streams for debugging
    await list_all_streams(nats_url)
    
    # Connect to NATS
    nc = await nats.connect(nats_url)
    js = nc.jetstream()
    
    # Track number of messages processed
    messages_processed = 0
    
    # Initialize start_time
    start_time = datetime.now()
    
    try:
        # Get stream info to find the current last sequence
        try:
            stream_info = await js.stream_info(stream_name)
            last_seq = stream_info.state.last_seq
            print(f"Connected to stream '{stream_name}'")
            print(f"Stream contains {stream_info.state.messages} messages")
            print(f"Last sequence is {last_seq}")
        except Exception as e:
            print(f"Error getting stream info: {e}")
            traceback.print_exc()
            return messages_processed
        
        print(f"Results monitor will run for {run_time_seconds} seconds or until interrupted")
        print(f"Results will be saved to {output_dir}")
        print("Monitoring for new image tagging results...")
        
        # Current sequence to start from
        current_seq = last_seq + 1
        
        # Poll the stream until the run time is reached
        while (datetime.now() - start_time).total_seconds() < run_time_seconds:
            try:
                # Get the current stream info
                stream_info = await js.stream_info(stream_name)
                new_last_seq = stream_info.state.last_seq
                
                # Check if there are new messages
                if new_last_seq >= current_seq:
                    print(f"Found {new_last_seq - current_seq + 1} new results")
                    
                    # Get messages from current_seq to new_last_seq
                    for seq in range(current_seq, new_last_seq + 1):
                        try:
                            # Get the message at this sequence
                            msg = await js.get_msg(stream_name, seq)
                            
                            # Parse message data
                            data = json.loads(msg.data.decode("utf-8"))
                            
                            # Extract filename from headers if available
                            if hasattr(msg, 'headers') and msg.headers and "filename" in msg.headers:
                                filename = msg.headers["filename"]
                                base_filename = os.path.splitext(filename)[0]
                            else:
                                # Generate a unique filename
                                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                                base_filename = f"result_{timestamp}_{seq}"
                            
                            # Save result to file
                            output_path = os.path.join(output_dir, f"{base_filename}_result.json")
                            with open(output_path, "w") as f:
                                json.dump(data, f, indent=2)
                            
                            # Increment counter and display info
                            messages_processed += 1
                            print(f"Received result #{messages_processed}, seq={seq}, saved to {output_path}")
                            
                            # Analyze the image tagging results
                            print("\nImage Tagging Analysis:")
                            analyze_image_tagging_results(data)
                            
                            # Save images with tags
                            save_images_with_tags(data, output_dir)
                            
                            print("-" * 50)
                            
                        except Exception as e:
                            print(f"Error processing message with seq={seq}: {str(e)}")
                            traceback.print_exc()
                            continue
                    
                    # Update current_seq
                    current_seq = new_last_seq + 1
                else:
                    # No new messages
                    print("No new results found. Waiting...")
                
                # Wait before polling again
                await asyncio.sleep(poll_interval)
                
            except Exception as e:
                print(f"Error polling stream: {str(e)}")
                traceback.print_exc()
                await asyncio.sleep(poll_interval)
                
    except KeyboardInterrupt:
        print("\nReceived interrupt signal, shutting down...")
    except Exception as e:
        print(f"Error in monitor: {str(e)}")
        traceback.print_exc()
    finally:
        # Close NATS connection
        await nc.close()
        print("NATS connection closed")
    
    # Print summary
    run_time = (datetime.now() - start_time).total_seconds()
    print(f"\nMonitor summary:")
    print(f"- Run time: {run_time:.2f} seconds")
    print(f"- Results processed: {messages_processed}")
    print(f"- Results saved to: {output_dir}")
    
    return messages_processed


async def process_and_monitor_stream(
    output_dir=OUTPUT_DIR,
    nats_url=NAT_URL,
    stream_name=OUTPUT_STREAM,
    run_time_seconds=3600,  # Default to run for 1 hour
    poll_interval=1,  # Seconds between polling the stream
    start_from_beginning=False  # Whether to process all existing messages
):
    """
    Process existing image tagging results and monitor for new ones.
    
    Args:
        output_dir: Directory to save output files
        nats_url: The NATS server URL
        stream_name: The stream name to read from
        run_time_seconds: How long to run the consumer (in seconds)
        poll_interval: Seconds between polling the stream
        start_from_beginning: Whether to process all existing messages
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Check if stream exists
    stream_exists = await check_stream_exists(nats_url, stream_name)
    if not stream_exists:
        print(f"ERROR: Stream {stream_name} does not exist!")
        print("This consumer cannot monitor a non-existent stream.")
        print("Please run the image_tagger_nats.py service first to create the streams.")
        return 0
    
    # List all streams for debugging
    await list_all_streams(nats_url)
    
    # Connect to NATS
    nc = await nats.connect(nats_url)
    js = nc.jetstream()
    
    # Track number of messages processed
    messages_processed = 0
    
    # Initialize start_time
    start_time = datetime.now()
    
    try:
        # Get stream info
        try:
            stream_info = await js.stream_info(stream_name)
            total_messages = stream_info.state.messages
            first_seq = stream_info.state.first_seq
            last_seq = stream_info.state.last_seq
            
            print(f"Connected to stream '{stream_name}'")
            print(f"Stream contains {total_messages} messages")
            print(f"Sequence range: {first_seq} to {last_seq}")
        except Exception as e:
            print(f"Error getting stream info: {e}")
            traceback.print_exc()
            return messages_processed
        
        # Determine starting sequence
        if start_from_beginning and total_messages > 0:
            current_seq = first_seq
            print(f"Will process all existing results starting from sequence {current_seq}")
        else:
            current_seq = last_seq + 1
            print(f"Will only process new results starting from sequence {current_seq}")
        
        print(f"Stream processor will run for {run_time_seconds} seconds or until interrupted")
        print(f"Results will be saved to {output_dir}")
        print("Processing image tagging results...")
        
        # Process messages until the run time is reached
        while (datetime.now() - start_time).total_seconds() < run_time_seconds:
            try:
                # Get the current stream info
                stream_info = await js.stream_info(stream_name)
                new_last_seq = stream_info.state.last_seq
                
                # Check if there are messages to process
                if new_last_seq >= current_seq:
                    if current_seq <= last_seq:
                        print(f"Processing {min(new_last_seq, last_seq) - current_seq + 1} existing results")
                    else:
                        print(f"Found {new_last_seq - current_seq + 1} new results")
                    
                    # Get messages from current_seq to new_last_seq
                    for seq in range(current_seq, new_last_seq + 1):
                        try:
                            # Get the message at this sequence
                            msg = await js.get_msg(stream_name, seq)
                            
                            # Parse message data
                            data = json.loads(msg.data.decode("utf-8"))
                            
                            # Extract filename from headers if available
                            if hasattr(msg, 'headers') and msg.headers and "filename" in msg.headers:
                                filename = msg.headers["filename"]
                                base_filename = os.path.splitext(filename)[0]
                            else:
                                # Generate a unique filename
                                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                                base_filename = f"result_{timestamp}_{seq}"
                            
                            # Save result to file
                            output_path = os.path.join(output_dir, f"{base_filename}_result.json")
                            with open(output_path, "w") as f:
                                json.dump(data, f, indent=2)
                            
                            # Increment counter and display info
                            messages_processed += 1
                            print(f"Processed result #{messages_processed}, seq={seq}, saved to {output_path}")
                            
                            # Analyze the image tagging results
                            print("\nImage Tagging Analysis:")
                            analyze_image_tagging_results(data)
                            
                            # Save images with tags
                            save_images_with_tags(data, output_dir)
                            
                            print("-" * 50)
                            
                        except Exception as e:
                            print(f"Error processing message with seq={seq}: {str(e)}")
                            traceback.print_exc()
                            continue
                    
                    # Update current_seq
                    current_seq = new_last_seq + 1
                else:
                    # No new messages
                    print("No new results found. Waiting...")
                
                # Wait before polling again
                await asyncio.sleep(poll_interval)
                
            except Exception as e:
                print(f"Error polling stream: {str(e)}")
                traceback.print_exc()
                await asyncio.sleep(poll_interval)
                
    except KeyboardInterrupt:
        print("\nReceived interrupt signal, shutting down...")
    except Exception as e:
        print(f"Error in processor: {str(e)}")
        traceback.print_exc()
    finally:
        # Close NATS connection
        await nc.close()
        print("NATS connection closed")
    
    # Print summary
    run_time = (datetime.now() - start_time).total_seconds()
    print(f"\nProcessor summary:")
    print(f"- Run time: {run_time:.2f} seconds")
    print(f"- Results processed: {messages_processed}")
    print(f"- Results saved to: {output_dir}")
    
    return messages_processed

In [6]:
def visualize_image_tags(output_dir=OUTPUT_DIR):
    """
    Visualize the tags for processed images.
    This function finds all the tag JSON files and creates a visual summary.
    """
    # Find all the tag JSON files in the output directory
    tag_files = [f for f in os.listdir(output_dir) if f.endswith('_tags.json')]
    
    if not tag_files:
        print("No tag files found in the output directory")
        return
    
    print(f"Found {len(tag_files)} image tag results")
    
    for tag_file in tag_files:
        try:
            # Load the tag data
            with open(os.path.join(output_dir, tag_file), 'r') as f:
                tag_data = json.load(f)
                
            filename = tag_data.get('filename')
            tags = tag_data.get('tags', [])
            
            if not tags:
                continue
                
            # Create a simple horizontal bar chart
            labels = [tag.get('label', 'Unknown') for tag in tags]
            scores = [tag.get('score', 0) for tag in tags]
            
            # Sort by score
            sorted_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
            labels = [labels[i] for i in sorted_indices]
            scores = [scores[i] for i in sorted_indices]
            
            plt.figure(figsize=(10, 5))
            plt.barh(labels, scores)
            plt.xlabel('Confidence Score')
            plt.title(f'Image Tags: {filename}')
            plt.xlim(0, 1.0)
            plt.tight_layout()
            
            # Save the chart
            chart_filename = f"{os.path.splitext(tag_file)[0]}_chart.png"
            plt.savefig(os.path.join(output_dir, chart_filename))
            plt.close()
            
            print(f"Created visualization for {filename}")
            
        except Exception as e:
            print(f"Error visualizing tags for {tag_file}: {str(e)}")
            traceback.print_exc()
            continue
    
    print("Visualization complete!")

In [7]:
# Set parameters
run_time = 3600  # 1 hour
output_directory = OUTPUT_DIR
poll_interval = 1
start_from_beginning = True

# First, check streams
await list_all_streams()


Streams available on NATS server:
- Name: IMAGE-RESULTS
  Subjects: ['IMAGE.RESULTS.COMPLETED.>', 'image.results.completed.>', 'IMAGE.results.completed.>']
  Messages: 1

- Name: IMAGE-TASKS
  Subjects: ['image.tasks.started.>']
  Messages: 0

- Name: IMAGE_STREAM
  Subjects: ['images.process']
  Messages: 0

- Name: TEXT_STREAM
  Subjects: ['text.results']
  Messages: 0



[StreamInfo(config=StreamConfig(name='IMAGE-RESULTS', description=None, subjects=['IMAGE.RESULTS.COMPLETED.>', 'image.results.completed.>', 'IMAGE.results.completed.>'], retention='workqueue', max_consumers=-1, max_msgs=-1, max_bytes=-1, discard='new', discard_new_per_subject=False, max_age=2592000.0, max_msgs_per_subject=-1, max_msg_size=-1, storage='file', num_replicas=1, no_ack=False, template_owner=None, duplicate_window=360.0, placement=None, mirror=None, sources=None, sealed=False, deny_delete=False, deny_purge=False, allow_rollup_hdrs=False, republish=None, subject_transform=None, allow_direct=False, mirror_direct=False, compression='none', metadata={'_nats.req.level': '0'}), state=StreamState(messages=1, bytes=516, first_seq=1, last_seq=1, consumer_count=0, deleted=None, num_deleted=None, lost=None, subjects=None), mirror=None, sources=None, cluster=None, did_create=None),
 StreamInfo(config=StreamConfig(name='IMAGE-TASKS', description=None, subjects=['image.tasks.started.>'], 

In [8]:
messages_processed = await monitor_stream_for_new_messages(
    output_dir=output_directory,
    run_time_seconds=run_time,
    poll_interval=poll_interval
)


Stream 'IMAGE-RESULTS' exists.

Streams available on NATS server:
- Name: IMAGE-RESULTS
  Subjects: ['IMAGE.RESULTS.COMPLETED.>', 'image.results.completed.>', 'IMAGE.results.completed.>']
  Messages: 1

- Name: IMAGE-TASKS
  Subjects: ['image.tasks.started.>']
  Messages: 0

- Name: IMAGE_STREAM
  Subjects: ['images.process']
  Messages: 0

- Name: TEXT_STREAM
  Subjects: ['text.results']
  Messages: 0

Connected to stream 'IMAGE-RESULTS'
Stream contains 1 messages
Last sequence is 1
Results monitor will run for 3600 seconds or until interrupted
Results will be saved to output_images
Monitoring for new image tagging results...
No new results found. Waiting...
No new results found. Waiting...
No new results found. Waiting...
No new results found. Waiting...
No new results found. Waiting...
No new results found. Waiting...
No new results found. Waiting...
NATS connection closed


CancelledError: 

In [None]:
# Uncomment this to use instead of Option 1
messages_processed = await process_and_monitor_stream(
    output_dir=output_directory,
    run_time_seconds=run_time,
    poll_interval=poll_interval,
    start_from_beginning=start_from_beginning
)

Stream 'IMAGE-RESULTS' exists.

Streams available on NATS server:
- Name: IMAGE-RESULTS
  Subjects: ['IMAGE.RESULTS.COMPLETED.>', 'image.results.completed.>', 'IMAGE.results.completed.>']
  Messages: 1

- Name: IMAGE-TASKS
  Subjects: ['image.tasks.started.>']
  Messages: 0

- Name: IMAGE_STREAM
  Subjects: ['images.process']
  Messages: 0

- Name: TEXT_STREAM
  Subjects: ['text.results']
  Messages: 0

Connected to stream 'IMAGE-RESULTS'
Stream contains 1 messages
Sequence range: 1 to 1
Will process all existing results starting from sequence 1
Stream processor will run for 3600 seconds or until interrupted
Results will be saved to output_images
Processing image tagging results...
Processing 1 existing results
Processed result #1, seq=1, saved to output_images\result_20250508_160934_1_result.json

Image Tagging Analysis:
Total images processed: 1

Image #1 ID: 3d40d6a2-9a27-4bfc-812b-9257c8d340c8
Filename: 2Persons.jpg
Tags:
  - Unknown: 0.0000
  - Unknown: 0.0000
  - Unknown: 0.0000
