In [1]:
import getpass
import os

def _set_env(key: str):
    if key not in os.environ:
        os.environ[key] = getpass.getpass(f"{key}:")

_set_env("ANTHROPIC_API_KEY")

In [2]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
    "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "https://lilianweng.github.io/posts/2024-04-12-diffusion-video/",
]

docs = [WebBaseLoader(url).load() for url in urls]

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
docs

[[Document(metadata={'source': 'https://lilianweng.github.io/posts/2024-11-28-reward-hacking/', 'title': "Reward Hacking in Reinforcement Learning | Lil'Log", 'description': 'Reward hacking occurs when a reinforcement learning (RL) agent exploits flaws or ambiguities in the reward function to achieve high rewards, without genuinely learning or completing the intended task. Reward hacking exists because RL environments are often imperfect, and it is fundamentally challenging to accurately specify a reward function.\nWith the rise of language models generalizing to a broad spectrum of tasks and RLHF becomes a de facto method for alignment training, reward hacking in RL training of language models has become a critical practical challenge. Instances where the model learns to modify unit tests to pass coding tasks, or where responses contain biases that mimic a user’s preference, are pretty concerning and are likely one of the major blockers for real-world deployment of more autonomous use

In [4]:
file = "ML_spe_certif copy.pdf"

In [5]:
import fitz

def read_pdf(file):
    doc = fitz.open(file)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [6]:
text = read_pdf(file)

text

'3 Courses\nSupervised Machine\nLearning: Regression and\nClassiﬁcation\nAdvanced Learning\nAlgorithms\nUnsupervised Learning,\nRecommenders,\nReinforcement Learning\nAndrew Ng \nFounder,\nDeepLearning.AI and\nAdjunct Professor,\nStanford University\n \nAug 9, 2025\nLouis Salanon\nhas successfully completed the online, non-credit Specialization\nMachine Learning\nCongratulations on completing all three courses of the Machine\nLearning Specialization! You studied modern machine learning\nconcepts, including supervised learning (linear regression, logistic\nregression, neural networks, decision trees), unsupervised learning\n(clustering, anomaly detection), recommender systems, and\nreinforcement learning. You learned some of the best practices for\nbuilding machine learning models. You’ve also gained practical skills to\napply machine learning techniques to challenging real-world problems.\nNow #BreakIntoAI and start building your career in machine learning!\nThe online specialization n

In [7]:
docs[0][0].page_content.strip()[:500]

"Reward Hacking in Reinforcement Learning | Lil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\n\n\n\n\n\n      Reward Hacking in Reinforcement Learning\n    \nDate: November 28, 2024  |  Estimated Reading Time: 37 min  |  Author: Lilian Weng\n\n\n \n\n\nTable of Contents\n\n\n\nBackground\n\nReward Function in RL\n\nSpurious Correlation\n\n\nLet’s Define Reward Hacking\n\nList of Examples\n\nReward hacking examples in RL tasks\n\nReward hacking ex"

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=50
)
doc_splits = text_splitter.split_documents(docs_list)

In [10]:
from langchain.schema import Document

# Convert the text into a Document object
text_doc = [Document(page_content=text)]

# Use the text_splitter to split the text document
splits = text_splitter.split_documents(text_doc)

In [15]:
splits[3].page_content.strip()

'(clustering, anomaly detection), recommender systems, and\nreinforcement learning. You learned some of the best practices for\nbuilding machine learning models. You’ve also gained practical skills to\napply machine learning techniques to challenging real-world problems.\nNow #BreakIntoAI and start building your career in machine learning!\nThe online specialization named in this certiﬁcate may draw on material from courses taught on-campus, but the included'

In [34]:
doc_splits[0].page_content.strip()

"Reward Hacking in Reinforcement Learning | Lil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ"

In [35]:
from typing import List
from sentence_transformers import SentenceTransformer

MODEL_NAME = "all-MiniLM-L6-v2"

class SentenceTransformerEmbeddingFunction:
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode([text])[0].tolist()

embedding_function = SentenceTransformerEmbeddingFunction(MODEL_NAME)

In [36]:
from langchain_core.vectorstores import InMemoryVectorStore

vectorstore = InMemoryVectorStore.from_documents(
    documents=doc_splits, embedding=embedding_function
)
retriever = vectorstore.as_retriever()

In [37]:
from langchain.tools.retriever import create_retriever_tool

retriever_tool = create_retriever_tool(
    retriever,
    "retrieve_blog_posts",
    "Search and return information about Lilian Weng blog posts.",
)

In [38]:
retriever_tool.invoke({"query": "types of reward hacking"})

'Why does Reward Hacking Exist?#\n\nReward hacking examples in LLM tasks\n\nReward hacking examples in real life\n\n\nWhy does Reward Hacking Exist?\n\n\nHacking RL Environment\n\nHacking RLHF of LLMs\n\nHacking the Training Process\n\nHacking the Evaluator\n\nIn-Context Reward Hacking\n\n\nGeneralization of Hacking Skills\n\nPeek into Mitigations\n\nRL Algorithm Improvement\n\nDetecting Reward Hacking\n\nMost of the past work on this topic has been quite theoretical and focused on defining or demonstrating the existence of reward hacking. However, research into practical mitigations, especially in the context of RLHF and LLMs, remains limited. I especially want to call out for more research efforts directed toward understanding and developing mitigation for reward hacking in the future. Hope I will be able to cover the mitigation part in a dedicated post soon.\nBackground#\nReward Function in RL#\n\nPan et al. (2022) investigated reward hacking as a function of agent capabilities, inc

In [50]:
from langchain.chat_models import init_chat_model

# Initialize your model
response_model = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)

# Minimal state-like dict instead of importing MessagesState
state = {
    "messages": [
        {"role": "user", "content": "Hello, how are you?"}
    ]
}
def generate_query_or_respond(state):
    """
    Calls the model to respond based on the current messages.
    """
    response = response_model.invoke(state["messages"])
    return {"messages": [{"role": "assistant", "content": response}]}  # mimic MessagesState

In [51]:
input = {"messages": [{"role": "user", "content": "hello!"}]}
print(generate_query_or_respond(input)["messages"][-1]["content"])

content='Hi there! How can I help you today?' additional_kwargs={} response_metadata={'id': 'msg_0177VweoQByG7quKet1w9yfw', 'model': 'claude-3-5-sonnet-20241022', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation': {'ephemeral_1h_input_tokens': 0, 'ephemeral_5m_input_tokens': 0}, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 9, 'output_tokens': 13, 'server_tool_use': None, 'service_tier': 'standard'}, 'model_name': 'claude-3-5-sonnet-20241022'} id='run--661c00c5-b859-4186-ad07-0aca3dab5911-0' usage_metadata={'input_tokens': 9, 'output_tokens': 13, 'total_tokens': 22, 'input_token_details': {'cache_read': 0, 'cache_creation': 0, 'ephemeral_5m_input_tokens': 0, 'ephemeral_1h_input_tokens': 0}}


In [52]:
input = {
    "messages": [
        {
            "role": "user",
            "content": "What does Lilian Weng say about types of reward hacking?",
        }
    ]
}
generate_query_or_respond(input)["messages"][-1]["content"]

AIMessage(content="According to Lilian Weng's writing, there are several types of reward hacking in reinforcement learning:\n\n1. Reward Gaming: When an agent exploits loopholes in the reward function to achieve high rewards without fulfilling the intended objective.\n\n2. Reward Tampering: When an agent directly modifies the reward signal/function to maximize rewards.\n\n3. Reward Corruption: When the reward signal becomes corrupted or noisy, leading to unintended behavior.\n\n4. Reward Misspecification: When the specified reward function doesn't perfectly align with the true objective, leading to unexpected optimization behavior.\n\nHowever, I recommend double-checking these categories as I'm not completely certain I'm recalling Weng's exact categorization perfectly.", additional_kwargs={}, response_metadata={'id': 'msg_01X7Vm6TrY1rEoA94aA4NBCK', 'model': 'claude-3-5-sonnet-20241022', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation': {'ephemeral_1h_input_t

In [54]:
from pydantic import BaseModel, Field
from typing import Literal

GRADE_PROMPT = (
    "You are a grader assessing relevance of a retrieved document to a user question. \n "
    "Here is the retrieved document: \n\n {context} \n\n"
    "Here is the user question: {question} \n"
    "If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n"
    "Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."
)


class GradeDocuments(BaseModel):
    """Grade documents using a binary score for relevance check."""

    binary_score: str = Field(
        description="Relevance score: 'yes' if relevant, or 'no' if not relevant"
    )


grader_model = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)


def grade_documents(
    state: dict,  # Mimicking MessagesState with a simple dict
) -> Literal["generate_answer", "rewrite_question"]:
    """Determine whether the retrieved documents are relevant to the question."""
    question = state["messages"][0].content
    context = state["messages"][-1].content

    prompt = GRADE_PROMPT.format(question=question, context=context)
    response = (
        grader_model
        .with_structured_output(GradeDocuments).invoke(
            [{"role": "user", "content": prompt}]
        )
    )
    score = response.binary_score

    if score == "yes":
        return "generate_answer"
    else:
        return "rewrite_question"

In [55]:
from langchain_core.messages import convert_to_messages

input = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "What does Lilian Weng say about types of reward hacking?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_blog_posts",
                        "args": {"query": "types of reward hacking"},
                    }
                ],
            },
            {"role": "tool", "content": "meow", "tool_call_id": "1"},
        ]
    )
}
grade_documents(input)

'rewrite_question'

In [56]:
input = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "What does Lilian Weng say about types of reward hacking?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_blog_posts",
                        "args": {"query": "types of reward hacking"},
                    }
                ],
            },
            {
                "role": "tool",
                "content": "reward hacking can be categorized into two types: environment or goal misspecification, and reward tampering",
                "tool_call_id": "1",
            },
        ]
    )
}
grade_documents(input)

'generate_answer'

In [58]:
REWRITE_PROMPT = (
    "Look at the input and try to reason about the underlying semantic intent / meaning.\n"
    "Here is the initial question:"
    "\n ------- \n"
    "{question}"
    "\n ------- \n"
    "Formulate an improved question:"
)


def rewrite_question(state: dict) -> dict:
    """Rewrite the original user question."""
    messages = state["messages"]
    question = messages[0].content
    prompt = REWRITE_PROMPT.format(question=question)
    response = response_model.invoke([{"role": "user", "content": prompt}])
    return {"messages": [{"role": "user", "content": response.content}]}

In [59]:
input = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "What does Lilian Weng say about types of reward hacking?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_blog_posts",
                        "args": {"query": "types of reward hacking"},
                    }
                ],
            },
            {"role": "tool", "content": "meow", "tool_call_id": "1"},
        ]
    )
}

response = rewrite_question(input)
print(response["messages"][-1]["content"])

Let me help improve this question by analyzing its semantic intent.

The original question asks about Lilian Weng's views on reward hacking types, but it could be more specific and provide context. Lilian Weng is known for her work in AI safety and has written about the challenges of reward specification in reinforcement learning.

Here's an improved version of the question:

"In Lilian Weng's blog post/writings about AI safety, what are the different ways she describes how AI systems can exploit or 'hack' their reward functions to achieve unintended outcomes?"

This improved version:
1. Provides context about the domain (AI safety)
2. Is more specific about what we're looking for (ways AI systems exploit reward functions)
3. Clarifies the concept of reward hacking for those who might be unfamiliar
4. Focuses on the actual problem rather than just classification of types


In [60]:
GENERATE_PROMPT = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context}"
)


def generate_answer(state: dict) -> dict:
    """Generate an answer."""
    question = state["messages"][0].content
    context = state["messages"][-1].content
    prompt = GENERATE_PROMPT.format(question=question, context=context)
    response = response_model.invoke([{"role": "user", "content": prompt}])
    return {"messages": [response]}

In [63]:
input = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "What does Lilian Weng say about types of reward hacking?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_blog_posts",
                        "args": {"query": "types of reward hacking"},
                    }
                ],
            },
            {
                "role": "tool",
                "content": "reward hacking can be categorized into two types: environment or goal misspecification, and reward tampering",
                "tool_call_id": "1",
            },
        ]
    )
}

response = generate_answer(input)
response["messages"][-1].pretty_print()


According to Lilian Weng, reward hacking can be divided into two main types: environment/goal misspecification and reward tampering.


In [None]:
# Replace MessagesState with a simple dict
class SimpleState:
    def __init__(self):
        self.messages = []

# Workflow nodes as functions
def generate_query_or_respond(state: SimpleState):
    """Decide whether to retrieve or respond directly."""
    response = response_model.bind_tools([retriever_tool]).invoke(state.messages)
    state.messages.append(response)
    # Decide next node based on LLM decision (mock tools_condition)
    if "tool:" in response:  # Example condition, adjust to your real logic
        return "retrieve"
    elif "rewrite:" in response:
        return "rewrite_question"
    else:
        return "generate_answer"

def retrieve(state: SimpleState):
    """Call retriever tool and append to messages."""
    retrieved_docs = retriever_tool.invoke(state.messages)
    state.messages.append(retrieved_docs)
    return "generate_answer"

def rewrite_question(state: SimpleState):
    """Rewrite the question if needed."""
    rewritten = "rewritten question"  # Replace with your logic
    state.messages.append(rewritten)
    return "generate_query_or_respond"

def generate_answer(state: SimpleState):
    """Generate final answer."""
    answer = response_model.invoke(state.messages)
    state.messages.append(answer)
    return "END"

# Simple runner for the workflow
def run_workflow():
    state = SimpleState()
    node = "generate_query_or_respond"
    
    while node != "END":
        if node == "generate_query_or_respond":
            node = generate_query_or_respond(state)
        elif node == "retrieve":
            node = retrieve(state)
        elif node == "rewrite_question":
            node = rewrite_question(state)
        elif node == "generate_answer":
            node = generate_answer(state)
    
    return state.messages

# Run it
state = SimpleState()
state.messages.append({"role": "user", "content": "Hi!"})  # initial message
final_messages = run_workflow()
print(final_messages)


  _handle_anthropic_bad_request(e)


BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'messages: at least one message is required'}, 'request_id': 'req_011CSHw1Dgbr5TLsnHLJNZd8'}