In [1]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

# Use it with huggingface_hub
from huggingface_hub import login
login(token=hf_token)

In [None]:
"""
Enhanced Chat Agent for Llama 3.2-1B-Instruct
Token Budget Context Management with Best Practices

This implementation includes:
Token-based budgeting context management (ONLY strategy)

 History toggle (ON/OFF)
 All 4 test cases covered
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
SYSTEM_PROMPT = "You are a helpful AI assistant. Be concise and friendly."

# ============================================================================
# BEST PRACTICE 2: SET APPROPRIATE MAX LENGTH
# ============================================================================
# From professor's guide: Leave room for response generation
MAX_CONTEXT_LENGTH = 2048  # Total context window for Llama 3.2-1B
MAX_NEW_TOKENS = 512       # Maximum tokens for response
# Effective history limit: 2048 - 512 = 1536 tokens
MAX_CONTEXT_TOKENS = MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS

# History toggle
USE_CONVERSATION_HISTORY = True  # Set to False to disable memory

# ============================================================================
# LOAD MODEL
# ============================================================================

print("Loading model (this takes 1-2 minutes)...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)

model.eval()
print(f" Model loaded! Using device: {model.device}")
print(f" Memory usage: ~2.5 GB (FP16)\n")

# ============================================================================
# DISPLAY CONFIGURATION
# ============================================================================
print("="*70)
print("CONFIGURATION:")
print(f"  • Conversation History: {'ENABLED' if USE_CONVERSATION_HISTORY else 'DISABLED'}")
print(f"  • Context Strategy: TOKEN_BUDGET")
print(f"  • Max Context Length: {MAX_CONTEXT_LENGTH} tokens (total window)")
print(f"  • Max Response Tokens: {MAX_NEW_TOKENS} tokens")
print(f"  • Effective History Limit: {MAX_CONTEXT_TOKENS} tokens")
print("="*70 + "\n")

# ============================================================================
# CONVERSATION HISTORY
# ============================================================================

full_chat_history = []      # Complete history (for logging)
working_chat_history = []   # What the model sees (managed)

system_message = {"role": "system", "content": SYSTEM_PROMPT}
full_chat_history.append(system_message)
working_chat_history.append(system_message)

# ============================================================================
# BEST PRACTICE 6: TOKEN COUNTING (FAST AND ACCURATE)
# ============================================================================

def approximate_token_count(text):
    """
    Fast token count approximation.
    From professor's guide: approx_tokens = len(text.split()) * 1.3
    """
    return int(len(text.split()) * 1.3)


def accurate_token_count(messages):
    """
    Accurate token count using actual tokenization.
    From professor's guide: actual_tokens = len(tokenizer.encode(text))
    """
    try:
        formatted = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )
        tokens = tokenizer.encode(formatted)
        return len(tokens)
    except:
        # Fallback to approximation
        total = 0
        for msg in messages:
            total += approximate_token_count(msg["content"])
        return total


# ============================================================================
# BEST PRACTICE 1: MONITOR TOKEN USAGE
# ============================================================================

def get_context_stats(history):
    """
    Get statistics about current context usage.
    From professor's guide: stats = chat_manager.get_context_stats()
    """
    num_tokens = accurate_token_count(history)
    num_messages = len([msg for msg in history if msg["role"] != "system"])

    return {
        'num_tokens': num_tokens,
        'max_tokens': MAX_CONTEXT_TOKENS,
        'num_messages': num_messages,
        'utilization': f"{(num_tokens/MAX_CONTEXT_TOKENS)*100:.1f}%",
        'tokens_remaining': MAX_CONTEXT_TOKENS - num_tokens
    }


# ============================================================================
# TOKEN BUDGET CONTEXT MANAGEMENT
# ============================================================================

def token_budget_management(history, max_tokens):
    """
    Token-based context management with all best practices.

    Implements:
    - BEST PRACTICE 3: Preserve system prompts
    - BEST PRACTICE 4: Edge case handling
    - BEST PRACTICE 4: User warnings when truncating
    """
    # BEST PRACTICE 3: Always preserve system message
    system_msgs = [msg for msg in history if msg["role"] == "system"]
    conversation_msgs = [msg for msg in history if msg["role"] != "system"]

    # BEST PRACTICE 4: Handle edge case - prevent infinite loops
    if len(conversation_msgs) <= 1:
        return history  # Can't truncate further

    # Start with all messages
    current_history = system_msgs + conversation_msgs
    current_tokens = accurate_token_count(current_history)

    # Remove oldest messages until under budget
    removed_count = 0
    while current_tokens > max_tokens and len(conversation_msgs) > 2:
        # Remove oldest message pair (user + assistant)
        conversation_msgs = conversation_msgs[2:]
        current_history = system_msgs + conversation_msgs
        current_tokens = accurate_token_count(current_history)
        removed_count += 2

    # BEST PRACTICE 4: Warn user when truncating
    if removed_count > 0:
        print(f"[Note: Earlier messages removed to fit context]")
        print(f"  • Removed: {removed_count} messages")
        print(f"  • Current context: {current_tokens}/{max_tokens} tokens")

    return current_history


# ============================================================================
# MAIN CHAT LOOP
# ============================================================================

print("Chat started! Type 'quit' or 'exit' to end the conversation.")
print("Type 'stats' to see detailed context statistics.")
print()
print("="*70 + "\n")

turn_number = 0

while True:
    # ========================================================================
    # Get user input
    # ========================================================================
    user_input = input("You: ").strip()

    # Handle special commands
    if user_input.lower() in ['quit', 'exit', 'q']:
        print("\n" + "="*70)
        print("CONVERSATION SUMMARY:")
        print(f"  • Total turns: {turn_number}")
        print(f"  • Total messages: {len(full_chat_history)}")
        if USE_CONVERSATION_HISTORY:
            stats = get_context_stats(working_chat_history)
            print(f"  • Final tokens: {stats['num_tokens']}/{stats['max_tokens']}")
            print(f"  • Context utilization: {stats['utilization']}")
        print("="*70)
        print("\nGoodbye!")
        break

    # BEST PRACTICE 1: Allow user to check stats manually
    if user_input.lower() == 'stats':
        stats = get_context_stats(working_chat_history)
        print("\nCONTEXT STATISTICS:")
        print(f"  • Tokens used: {stats['num_tokens']}/{stats['max_tokens']}")
        print(f"  • Utilization: {stats['utilization']}")
        print(f"  • Messages in context: {stats['num_messages']}")
        print(f"  • Tokens remaining: {stats['tokens_remaining']}")
        print()
        continue

    if not user_input:
        continue

    turn_number += 1

    # ========================================================================
    # Add user message to histories
    # ========================================================================
    user_message = {"role": "user", "content": user_input}
    full_chat_history.append(user_message)

    # ========================================================================
    # Prepare working history based on settings
    # ========================================================================
    if USE_CONVERSATION_HISTORY:
        # Add to working history
        working_chat_history.append(user_message)

        # Apply token budget context management
        managed_history = token_budget_management(working_chat_history, MAX_CONTEXT_TOKENS)

        # Update working history with managed version
        working_chat_history = managed_history

    else:
        # NO HISTORY MODE: Only system + current message
        managed_history = [system_message, user_message]
        print(f"[No history mode: Bot sees only current message]")

    # ========================================================================
    # BEST PRACTICE 1: MONITOR TOKEN USAGE
    # ========================================================================
    stats = get_context_stats(managed_history)
    print(f"[Context: {stats['num_tokens']}/{stats['max_tokens']} tokens " +
          f"({stats['utilization']} used) across {stats['num_messages']} messages]")

    # ========================================================================
    # Tokenize
    # ========================================================================
    input_ids = tokenizer.apply_chat_template(
        managed_history,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    attention_mask = torch.ones_like(input_ids)

    # ========================================================================
    # Generate response
    # ========================================================================
    print("Assistant: ", end="", flush=True)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,  # BEST PRACTICE 2
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # ========================================================================
    # Decode response
    # ========================================================================
    new_tokens = outputs[0][input_ids.shape[1]:]
    assistant_response = tokenizer.decode(
        new_tokens,
        skip_special_tokens=True
    )

    print(assistant_response)

    # ========================================================================
    # Add assistant response to histories
    # ========================================================================
    assistant_message = {"role": "assistant", "content": assistant_response}
    full_chat_history.append(assistant_message)

    if USE_CONVERSATION_HISTORY:
        working_chat_history.append(assistant_message)

    print()

# ============================================================================
# IMPLEMENTATION SUMMARY
# ============================================================================
"""
CONTEXT MANAGEMENT STRATEGY: Token Budget

From professor's guide:
"Keep recent messages that fit within context window. When limit is reached,
remove oldest messages. Preserve system message if present."

BEST PRACTICES IMPLEMENTED:

✓ 1. Monitor Token Usage
     - get_context_stats() provides detailed statistics
     - Displayed after each turn
     - User can type 'stats' for details

✓ 2. Set Appropriate Max Length
     - MAX_CONTEXT_LENGTH = 2048 (total window)
     - MAX_NEW_TOKENS = 512 (for response)
     - Effective history = 1536 tokens

✓ 3. Preserve System Prompts
     - System message always extracted first
     - Never removed during truncation

✓ 4. Handle Edge Cases
     - Prevents infinite loops (checks len <= 1)
     - Warns user when truncating

✓ 5. Optimize for Use Case
     - Token budget best for production apps
     - Good for varying message lengths

✓ 6. Token Counting
     - Fast: approximate_token_count()
     - Accurate: accurate_token_count()

TEST CASES COVERED:

✓ Test Case 1: Context Overflow
     - Automatic truncation with warnings

✓ Test Case 2: System Prompt Preservation
     - System message always preserved

✓ Test Case 3: Token Counting Accuracy
     - Both fast and accurate methods

✓ Test Case 4: Multi-turn Coherence
     - Context management maintains coherence

HISTORY TOGGLE:

✓ USE_CONVERSATION_HISTORY = True  → Stateful (remembers)
✓ USE_CONVERSATION_HISTORY = False → Stateless (no memory)
"""

Loading model (this takes 1-2 minutes)...
 Model loaded! Using device: cpu
 Memory usage: ~2.5 GB (FP16)

CONFIGURATION:
  • Conversation History: ENABLED
  • Context Strategy: TOKEN_BUDGET
  • Max Context Length: 2048 tokens (total window)
  • Max Response Tokens: 512 tokens
  • Effective History Limit: 1536 tokens

Chat started! Type 'quit' or 'exit' to end the conversation.
Type 'stats' to see detailed context statistics.



CONTEXT STATISTICS:
  • Tokens used: 43/1536
  • Utilization: 2.8%
  • Messages in context: 0
  • Tokens remaining: 1493

[Context: 64/1536 tokens (4.2% used) across 1 messages]
Assistant: That sounds like an interesting project. Reddit is a great platform to gather data, as it's a community-driven platform with a vast number of users. You can collect data on burnout in cybersecurity professionals by analyzing various subreddits related to cybersecurity.

Some potential subreddits to consider:

1. r/cybersecurity
2. r/netsec
3. r/askcybersecurity
4. r/AskSecurit