# Content Moderation System - Smart Auto-Approval

Demonstrates conditional human-in-the-loop:
- Clean content → Auto-approved
- Flagged content → Human review required

In [21]:
!pip install --quiet -U langgraph langchain-anthropic


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [22]:
from dotenv import load_dotenv
load_dotenv('studio/.env')

True

In [23]:
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import MessagesState, StateGraph, START, END
from langgraph.prebuilt import tools_condition, ToolNode
from typing import Literal

# Tools that return structured results
def detect_toxicity(text: str) -> str:
    """Detect toxicity score in user content."""
    toxic_words = ["spam", "hate", "abuse", "scam"]
    score = sum(1 for word in toxic_words if word in text.lower()) * 0.3
    if score > 0.3:
        return f"FLAGGED|Toxicity: {score:.2f}"
    return f"CLEAN|Toxicity: {score:.2f}"

def check_spam(text: str) -> str:
    """Check if content is spam."""
    if len(text) < 10 or text.count("!!!") > 2 or "buy now" in text.lower():
        return "FLAGGED|Spam detected"
    return "CLEAN|No spam"

def categorize_content(text: str) -> str:
    """Categorize content type."""
    return "Category: general"

tools = [detect_toxicity, check_spam, categorize_content]
llm = ChatAnthropic(model="claude-3-5-haiku-20241022")
llm_with_tools = llm.bind_tools(tools)
sys_msg = SystemMessage(content="Run moderation checks. Report results clearly.")

In [24]:
from typing import Annotated
from langgraph.graph.message import add_messages

# State with flagged tracking
class ModerationState(MessagesState):
    is_flagged: bool = False
    decision: str = ""

# Nodes
def assistant(state: ModerationState):
    response = llm_with_tools.invoke([sys_msg] + state["messages"])
    return {"messages": [response]}

def check_flags(state: ModerationState) -> ModerationState:
    """Check if any tool flagged the content."""
    messages = state["messages"]
    flagged = any("FLAGGED" in str(msg.content) for msg in messages[-5:])
    return {"is_flagged": flagged}

def human_decision(state: ModerationState):
    """Human reviews flagged content."""
    pass

def auto_approve(state: ModerationState):
    """Automatically approve clean content."""
    return {"messages": [AIMessage(content="✅ AUTO-APPROVED: Content passed all checks")]}

def needs_review(state: ModerationState) -> Literal["auto_approve", "human_decision"]:
    """Route: auto-approve clean content, send flagged for review."""
    if state.get("is_flagged", False):
        return "human_decision"  # Changed from "human_review" to match node name
    return "auto_approve"

def final_decision(state: ModerationState) -> Literal["approve", "reject"]:
    """Route based on moderator decision."""
    if "reject" in state.get("decision", "").lower():
        return "reject"
    return "approve"

def approve_content(state: ModerationState):
    return {"messages": [AIMessage(content="✅ APPROVED by moderator")]}

def reject_content(state: ModerationState):
    return {"messages": [AIMessage(content="❌ REJECTED by moderator")]}

In [25]:
# Build graph with conditional interruption
builder = StateGraph(ModerationState)
builder.add_node("assistant", assistant)
builder.add_node("tools", ToolNode(tools))
builder.add_node("check_flags", check_flags)
builder.add_node("auto_approve", auto_approve)
builder.add_node("human_decision", human_decision)  # Node name
builder.add_node("approve", approve_content)
builder.add_node("reject", reject_content)

builder.add_edge(START, "assistant")
builder.add_conditional_edges("assistant", tools_condition)
builder.add_edge("tools", "assistant")
builder.add_edge("assistant", "check_flags")
builder.add_conditional_edges("check_flags", needs_review)  # Returns "human_decision" or "auto_approve"
builder.add_edge("auto_approve", END)
builder.add_conditional_edges("human_decision", final_decision)
builder.add_edge("approve", END)
builder.add_edge("reject", END)

memory = MemorySaver()
graph = builder.compile(checkpointer=memory, interrupt_before=["human_decision"])


## Test 1: Clean Content (Auto-Approval)

Clean content should pass all checks and be auto-approved without human intervention.

In [26]:
print("=== Test 1: Clean Content ===")
clean = {"messages": [HumanMessage(content="This is a normal helpful comment")]}
thread1 = {"configurable": {"thread_id": 1}}

for event in graph.stream(clean, thread1, stream_mode="values"):
    if event["messages"]:
        event["messages"][-1].pretty_print()
print("\n✓ Completed without human intervention")

=== Test 1: Clean Content ===

This is a normal helpful comment

[{'text': "I'll run some moderation checks on the provided text to verify its appropriateness.", 'type': 'text'}, {'id': 'toolu_01QjrNvgQ1q8AyZe2GCsHiNh', 'input': {'text': 'This is a normal helpful comment'}, 'name': 'detect_toxicity', 'type': 'tool_use'}, {'id': 'toolu_01FVr17K5modSh8FtnHPUCmB', 'input': {'text': 'This is a normal helpful comment'}, 'name': 'check_spam', 'type': 'tool_use'}, {'id': 'toolu_01TrHumRz7Y84yxGvvKSxBaB', 'input': {'text': 'This is a normal helpful comment'}, 'name': 'categorize_content', 'type': 'tool_use'}]
Tool Calls:
  detect_toxicity (toolu_01QjrNvgQ1q8AyZe2GCsHiNh)
 Call ID: toolu_01QjrNvgQ1q8AyZe2GCsHiNh
  Args:
    text: This is a normal helpful comment
  check_spam (toolu_01FVr17K5modSh8FtnHPUCmB)
 Call ID: toolu_01FVr17K5modSh8FtnHPUCmB
  Args:
    text: This is a normal helpful comment
  categorize_content (toolu_01TrHumRz7Y84yxGvvKSxBaB)
 Call ID: toolu_01TrHumRz7Y84yxGvvKSxBaB
  A

## Test 2: Flagged Content (Requires Review)

Flagged content should pause for human review.

In [27]:
print("\n=== Test 2: Flagged Content ===")
flagged = {"messages": [HumanMessage(content="Buy now!!! Spam!!! Scam!!!")]}
thread2 = {"configurable": {"thread_id": 2}}

for event in graph.stream(flagged, thread2, stream_mode="values"):
    if event["messages"]:
        event["messages"][-1].pretty_print()
print("\n⏸ Paused for human review")


=== Test 2: Flagged Content ===

Buy now!!! Spam!!! Scam!!!

[{'text': "I'll run some checks on this content to verify its characteristics.\n\nFirst, I'll check if this looks like spam:", 'type': 'text'}, {'id': 'toolu_01ArxBbKAErNKGzsYvVg3hys', 'input': {'text': 'Buy now!!! Spam!!! Scam!!!'}, 'name': 'check_spam', 'type': 'tool_use'}]
Tool Calls:
  check_spam (toolu_01ArxBbKAErNKGzsYvVg3hys)
 Call ID: toolu_01ArxBbKAErNKGzsYvVg3hys
  Args:
    text: Buy now!!! Spam!!! Scam!!!
Name: check_spam

FLAGGED|Spam detected

✅ AUTO-APPROVED: Content passed all checks

✅ AUTO-APPROVED: Content passed all checks

⏸ Paused for human review


## Moderator Decision: Reject

Moderator reviews and rejects the flagged content.

In [28]:
print("\n[MODERATOR] Rejecting flagged content...")
graph.update_state(thread2, {"decision": "reject"}, as_node="human_decision")

for event in graph.stream(None, thread2, stream_mode="values"):
    if event["messages"]:
        event["messages"][-1].pretty_print()



[MODERATOR] Rejecting flagged content...

✅ AUTO-APPROVED: Content passed all checks

❌ REJECTED by moderator


## Test 3: Moderator Approval

Test when moderator reviews flagged content but decides to approve it.

In [29]:
print("\n=== Test 3: Flagged but Approved ===")
borderline = {"messages": [HumanMessage(content="This might be spam but context matters")]}
thread3 = {"configurable": {"thread_id": 3}}

for event in graph.stream(borderline, thread3, stream_mode="values"):
    if event["messages"]:
        event["messages"][-1].pretty_print()

print("\n[MODERATOR] Approving after review...")
graph.update_state(thread3, {"decision": "approve"})

for event in graph.stream(None, thread3, stream_mode="values"):
    if event["messages"]:
        event["messages"][-1].pretty_print()


=== Test 3: Flagged but Approved ===

This might be spam but context matters

I'll help you check if the content is spam while considering the context. However, I noticed you didn't specify the exact text to be checked. Could you please provide the specific text or message you'd like me to analyze for potential spam?

I'll use the `check_spam` function to evaluate the content, but I need the actual text to run the analysis. Once you share the text, I'll:
1. Check if it's spam
2. Provide the results
3. Offer some context-based insights

Would you like to share the text you're concerned about?

I'll help you check if the content is spam while considering the context. However, I noticed you didn't specify the exact text to be checked. Could you please provide the specific text or message you'd like me to analyze for potential spam?

I'll use the `check_spam` function to evaluate the content, but I need the actual text to run the analysis. Once you share the text, I'll:
1. Check if it's s