In [16]:
import sys
import os
from pathlib import Path
import importlib

# Add src directory to path for imports
script_dir = Path().resolve()
src_dir = script_dir.parent.parent.parent.parent / "src"
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# 모듈 리로드 (변경사항 반영)
modules_to_reload = [
    'utils.vlm.vlm_processor',
    'utils.vlm.vlm_postprocessor',
    'utils.vlm.vlm_wrapper',
    'utils.vlm.handlers.gemini_handler'
]

for module_name in modules_to_reload:
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])

import numpy as np

# Import VLM components
from utils.vlm.vlm_wrapper import VLMWrapper
from utils.vlm.vlm_postprocessor import VLMResponsePostProcessor

# .env 파일에서 환경 변수를 불러오도록 구현
from dotenv import load_dotenv

# .env 파일이 상위경로(또는 프로젝트 루트)에 있다고 가정
load_dotenv(dotenv_path="../../../../.env")  # .env 파일 경로를 프로젝트 구조에 맞게 수정

credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
project_id = os.environ.get("GOOGLE_CLOUD_PROJECT")
location = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1")

print(f"Credentials path: {credentials_path}")
print(f"Project ID: {project_id}")
print(f"Location: {location}")

Credentials path: ../../trans-century-405100-c8827a8ffbf4.json
Project ID: trans-century-405100
Location: us-central1


In [None]:
# Example 2: Extract logprobs for action field in JSON response
print("\n" + "="*80)
print("Example 2: Extract logprobs for action field in JSON response")
print("="*80)

if not credentials_path or not project_id:
    print("[SKIP] Vertex AI credentials not configured.")
else:
    try:
        # Initialize wrapper
        wrapper = VLMWrapper(
            model="gemini-2.5-flash-vertex",
            logprobs=5,
            credentials=credentials_path,
            project_id=project_id,
            location=location,
            temperature=0.0,
            max_tokens=2000
        )
        
        # System prompt for robot control
        system_prompt = """You are a robot controller. 
Respond with JSON format containing:
- action: The action to take (e.g., "move up", "pickup", "drop")
- reasoning: Brief explanation of why this action was chosen
"""
        
        user_prompt = """Based on the current situation, what action should the robot take?
Respond in JSON format:
{
  "action": "move up",
  "reasoning": "The goal is to the north"
}
"""
        
        print("\n[1] Generating response with logprobs...")
        response, logprobs_metadata = wrapper.generate_with_logprobs(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            debug=False
        )
        
        print(f"\n[2] Response:\n{response}")
        
        # Process with postprocessor
        print("\n[3] Processing with postprocessor...")
        processor = VLMResponsePostProcessor(
            required_fields=["action", "reasoning"]
        )
        
        # Option A: Get clean JSON without logprobs
        print("\n[4] Option A: Clean JSON (without logprobs)")
        parsed_clean = processor.process_without_logprobs(
            response,
            logprobs_metadata
        )
        print(f"  Action: {parsed_clean.get('action')}")
        print(f"  Reasoning: {parsed_clean.get('reasoning')}")
        
        # Option B: Get JSON with action logprobs wrapped
        print("\n[5] Option B: JSON with action logprobs wrapped")
        parsed_with_logprobs = processor.process_with_action_logprobs(
            response,
            logprobs_metadata,
            action_field="action"
        )
        print(f"  Action: {parsed_with_logprobs.get('action')}")
        print(f"  Reasoning: {parsed_with_logprobs.get('reasoning')}")
        
        # Display action logprobs
        if 'action_logprobs' in parsed_with_logprobs:
            action_logprobs = parsed_with_logprobs['action_logprobs']
            print(f"\n[6] Action Logprobs:")
            print(f"  Action tokens: {action_logprobs.get('action_tokens', [])}")
            print(f"  Number of action tokens: {len(action_logprobs.get('action_tokens', []))}")
            if action_logprobs.get('action_entropies'):
                avg_entropy = np.mean(action_logprobs['action_entropies'])
                print(f"  Average entropy for action: {avg_entropy:.4f} bits")
        
        # Display remaining logprobs
        if 'remaining_logprobs' in parsed_with_logprobs:
            remaining = parsed_with_logprobs['remaining_logprobs']
            print(f"\n[7] Remaining Logprobs:")
            print(f"  Number of remaining tokens: {len(remaining.get('tokens', []))}")
            if remaining.get('entropies'):
                avg_entropy = np.mean(remaining['entropies'])
                print(f"  Average entropy for remaining: {avg_entropy:.4f} bits")
        
    except Exception as e:
        print(f"[ERROR] {e}")
        import traceback
        traceback.print_exc()

## Example 4: Using VLMProcessor with logprobs

This example shows how to:
- Use VLMProcessor with logprobs support
- Request and parse responses with logprobs
- Extract action-specific logprobs using VLMProcessor

In [11]:
# Example 4: Using VLMProcessor with logprobs
print("\n" + "="*80)
print("Example 4: Using VLMProcessor with logprobs")
print("="*80)


from utils.vlm.vlm_processor import VLMProcessor

# Initialize processor with logprobs
processor = VLMProcessor(
    model="gemini-2.5-flash-vertex",
    logprobs=5,
    credentials=credentials_path,
    project_id=project_id,
    location=location,
    temperature=0.0,
    max_tokens=2000,
    debug=False
)

# Create dummy image (or use real image)
dummy_image = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)

system_prompt = """You are a robot controller. 
Respond with JSON format containing:
- action: You must choose 3 component list, each component should be exactly one from ["up", "down", "left", or "right"]. (e.g., ["up"])
- format should be like this: "action": ["<action1>", "<action2>", "<action3>"]
- reasoning: Brief explanation of why select action components
"""

user_prompt = """What action should the robot take? Respond in JSON format."""

print("\n[1] Requesting with logprobs...")
response, logprobs_metadata = processor.requester_with_logprobs(
    image=dummy_image,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    debug=False
)

print(f"\n[2] Response:\n{response}")



Example 4: Using VLMProcessor with logprobs

[1] Requesting with logprobs...

[2] Response:
```json
{
  "action": ["up", "left", "right"],
  "reasoning": "The provided image appears to be entirely static or noise, offering no discernible features or context to inform a specific action. Therefore, a set of exploratory movements (up, left, right) is chosen as a default."
}
```


In [22]:
print("\n[3] Parsing with action logprobs...")
parsed = processor.parser_action_with_logprobs(
    response,
    logprobs_metadata,
    action_field="action",
    remove_logprobs=False
)

print(f"\n[4] Parsed result:")
print(f"  Action: {parsed.get('action')}")
print(f"  Reasoning: {parsed.get('reasoning')}")

# action_logprobs_info는 {'action_positions', 'action_logprobs', 'action_entropies'} 형태
info = parsed.get('action_logprobs_info', {})
action_positions = info.get('action_positions', [])
action_logprobs_list = info.get('action_logprobs', [])
action_entropies = info.get('action_entropies', [])

print(f"\n[5] Action logprobs info:")
print(f"  Positions: {action_positions}")
print(f"  Count: {len(action_logprobs_list)}")

for idx, entry in enumerate(action_logprobs_list):
    token_str, top_logs, entropy, pos = entry
    print(f"  - Action {idx+1} token: '{token_str}' (pos {pos})")
    if entropy is not None:
        print(f"    entropy: {entropy:.4f}")
    if top_logs:
        print(f"    top logprobs (first 3): {top_logs[:3]}")

if action_entropies:
    print(f"  Entropies list: {[round(e,4) if e is not None else None for e in action_entropies]}")


[3] Parsing with action logprobs...

[4] Parsed result:
  Action: ['up', 'left', 'right']
  Reasoning: The provided image appears to be entirely static or noise, offering no discernible features or context to inform a specific action. Therefore, a set of exploratory movements (up, left, right) is chosen as a default.

[5] Action logprobs info:
  Positions: []
  Count: 3
  - Action 1 token: 'up' (pos 10)
    entropy: 0.0132
    top logprobs (first 3): ['up:-0.0011', 'left:-7.0816', 'down:-8.8559']
  - Action 2 token: 'left' (pos 13)
    entropy: 0.0141
    top logprobs (first 3): ['left:-0.0012', 'down:-7.2783', 'right:-7.7593']
  - Action 3 token: 'right' (pos 16)
    entropy: 0.0009
    top logprobs (first 3): ['right:-0.0001', 'down:-9.8019', 'left:-14.5727']
  Entropies list: [np.float64(0.0132), np.float64(0.0141), np.float64(0.0009)]


In [20]:
parsed['action_logprobs_info']['action_logprobs']

[['up',
  ['up:-0.0011',
   'left:-7.0816',
   'down:-8.8559',
   'right:-9.4103',
   'forward:-11.9427'],
  np.float64(0.013176400540552765),
  10],
 ['left',
  ['left:-0.0012',
   'down:-7.2783',
   'right:-7.7593',
   'up:-10.5662',
   'turn:-13.6821'],
  np.float64(0.014091295248416316),
  13],
 ['right',
  ['right:-0.0001',
   'down:-9.8019',
   'left:-14.5727',
   'forward:-15.4860',
   'up:-15.7652'],
  np.float64(0.0008809247111316944),
  16]]