In [1]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

### Overview

In [2]:
import weave
from weave import Evaluation
import asyncio

In [3]:
# Collect your examples
examples = [
    {"question": "What is the capital of France?", "expected": "Paris"},
    {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
    {"question": "What is the square root of 64?", "expected": "8"},
]

In [4]:
# Define any custom scoring function
@weave.op()
def match_score1(expected: str, model_output: dict) -> dict:
    # Here is where you'd define the logic to score the model output
    return {'match': expected == model_output['generated_text']}

In [5]:
@weave.op()
def function_to_evaluate(question: str):
    # here's where you would add your LLM call and return the output
    return  {'generated_text': 'Paris'}

In [6]:
# Score your examples using scoring functions
evaluation = Evaluation(
    dataset=examples, scorers=[match_score1]
)

Using 'model_output' key for compatibility with older scorers. Please update scorers to use 'output' parameter.


In [7]:
# Start tracking the evaluation
weave.init('scratchpad-example')
# Run the evaluation
await evaluation.evaluate(function_to_evaluate)

Logged in as Weights & Biases user: paul-mriganka.
View Weave data at https://wandb.ai/paul-mriganka-personal/scratchpad-example/weave


{'match_score1': {'match': {'true_count': 1,
   'true_fraction': 0.3333333333333333}},
 'model_latency': {'mean': 0.005079587300618489}}

🍩 https://wandb.ai/paul-mriganka-personal/scratchpad-example/r/call/019682d2-f888-7d70-856d-2b0651d80bc3


### All in one

In [12]:
from weave import Evaluation, Model
import weave
import asyncio
weave.init('intro-example')
examples = [
    {"question": "What is the capital of France?", "expected": "Paris"},
    {"question": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
    {"question": "What is the square root of 64?", "expected": "8"},
]

@weave.op()
def match_score1(expected: str, model_output: dict) -> dict:
    return {'match': expected == model_output['generated_text']}

@weave.op()
def match_score2(expected: dict, model_output: dict) -> dict:
    return {'match': expected == model_output['generated_text']}

class MyModel(Model):
    prompt: str

    @weave.op()
    def predict(self, question: str):
        # here's where you would add your LLM call and return the output
        return {'generated_text': 'Hello, ' + question + self.prompt}

model = MyModel(prompt='World')
evaluation = Evaluation(dataset=examples, scorers=[match_score1, match_score2])

await evaluation.evaluate(model)

@weave.op()
def function_to_evaluate(question: str):
    # here's where you would add your LLM call and return the output
    return  {'generated_text': 'some response' + question}

await evaluation.evaluate(function_to_evaluate)

{'match_score1': {'match': {'true_count': 0, 'true_fraction': 0.0}},
 'match_score2': {'match': {'true_count': 0, 'true_fraction': 0.0}},
 'model_latency': {'mean': 0.003944238026936849}}

### Preprocess model input

In [15]:
import weave
from weave import Evaluation

# Our dataset has "input_text" but our model expects "question"
examples = [
    {"input_text": "What is the capital of France?", "expected": "Paris"},
    {"input_text": "Who wrote 'To Kill a Mockingbird'?", "expected": "Harper Lee"},
    {"input_text": "What is the square root of 64?", "expected": "8"},
]

@weave.op()
def preprocess_example(example):
    # Rename input_text to question
    return {
        "question": example["input_text"]
    }

@weave.op()
def match_score(expected: str, model_output: dict) -> dict:
    return {'match': expected == model_output['generated_text']}

@weave.op()
def function_to_evaluate(question: str):
    return {'generated_text': f'Answer to: {question}'}

# Create evaluation with preprocessing
evaluation = Evaluation(
    dataset=examples,
    scorers=[match_score],
    preprocess_model_input=preprocess_example
)

# Run the evaluation
weave.init('preprocessing-example')
await evaluation.evaluate(function_to_evaluate)

{'match_score': {'match': {'true_count': 0, 'true_fraction': 0.0}},
 'model_latency': {'mean': 0.005599260330200195}}

### Datasets

In [24]:
import weave
from weave import Dataset
# Initialize Weave
weave.init('intro-example')

# Create a dataset
dataset = Dataset(
    name='grammar',
    rows=[
        {'id': '0', 'sentence': "He no likes ice cream.", 'correction': "He doesn't like ice cream."},
        {'id': '1', 'sentence': "She goed to the store.", 'correction': "She went to the store."},
        {'id': '2', 'sentence': "They plays video games all day.", 'correction': "They play video games all day."}
    ]
)

# Publish the dataset
weave.publish(dataset)

# Retrieve the dataset
dataset_ref = weave.ref('grammar').get()

# Access a specific example
example_label = dataset_ref.rows[2]['sentence']

Logged in as Weights & Biases user: paul-mriganka.
View Weave data at https://wandb.ai/paul-mriganka-personal/intro-example/weave
📦 Published to https://wandb.ai/paul-mriganka-personal/intro-example/weave/objects/grammar/versions/ozZFHzO1w2hjwzUPPwdacGRwC9YH9Q6TgstzGbRiK44
🍩 https://wandb.ai/paul-mriganka-personal/intro-example/r/call/019682e5-7657-7e22-83c2-3a766b30e3b0


In [25]:
example_label

'They plays video games all day.'

In [26]:
@weave.op
def model(task: str) -> str:
    return f"Now working on {task}"

res1, call1 = model.call(task="fetch")
res2, call2 = model.call(task="parse")

dataset = Dataset.from_calls([call1, call2])
# Now you can use the dataset to evaluate the model, etc.

🍩 https://wandb.ai/paul-mriganka-personal/intro-example/r/call/019682e5-765c-7902-9bf0-08409af8915c


### Pandas DataFrame

In [30]:
import pandas as pd

In [31]:
df = pd.DataFrame([
    {'id': '0', 'sentence': "He no likes ice cream.", 'correction': "He doesn't like ice cream."},
    {'id': '1', 'sentence': "She goed to the store.", 'correction': "She went to the store."},
    {'id': '2', 'sentence': "They plays video games all day.", 'correction': "They play video games all day."}
])

In [32]:
dataset = Dataset.from_pandas(df)

In [33]:
df2 = dataset.to_pandas()

In [34]:
assert df.equals(df2)

### Function based scoring

In [45]:
import weave

@weave.op
def evaluate_uppercase(text: str) -> dict:
    return {"text_is_uppercase": text.isupper()}

evaluation = weave.Evaluation(
    dataset=[{"text": "HELLO WORLD"}],
    scorers=[evaluate_uppercase]
)
evaluation

Evaluation(name=None, description=None, dataset=Dataset(name=None, description=None, rows=<weave.trace.table.Table object at 0x7f4f40265240>), scorers=[<function evaluate_uppercase at 0x7f4f400ca170>], preprocess_model_input=None, trials=1, evaluation_name=None)

### Class based scoring

In [42]:
import weave
from openai import OpenAI
from weave import Scorer

llm_client = OpenAI()

class SummarizationScorer(Scorer):
    model_id: str = "gpt-4o"
    system_prompt: str = "Evaluate whether the summary is good."

    @weave.op
    def some_complicated_preprocessing(self, text: str) -> str:
        processed_text = "Original text: \n" + text + "\n"
        return processed_text

    @weave.op
    def call_llm(self, summary: str, processed_text: str) -> dict:
        res = llm_client.chat.completions.create(
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": (
                    f"Analyse how good the summary is compared to the original text."
                    f"Summary: {summary}\n{processed_text}"
                )}])
        return {"summary_quality": res}

    @weave.op
    def score(self, output: str, text: str) -> dict:
        """Score the summary quality.

        Args:
            output: The summary generated by an AI system
            text: The original text being summarized
        """
        processed_text = self.some_complicated_preprocessing(text)
        eval_result = self.call_llm(summary=output, processed_text=processed_text)
        return {"summary_quality": eval_result}

evaluation = weave.Evaluation(
    dataset=[{"text": "The quick brown fox jumps over the lazy dog."}],
    scorers=[SummarizationScorer()])

In [43]:
evaluation

Evaluation(name=None, description=None, dataset=Dataset(name=None, description=None, rows=<weave.trace.table.Table object at 0x7f4f400fc6d0>), scorers=[SummarizationScorer(name=None, description=None, column_map=None, model_id='gpt-4o', system_prompt='Evaluate whether the summary is good.')], preprocess_model_input=None, trials=1, evaluation_name=None)

### Built-in scorers

In [48]:
# !pip install litellm

In [49]:
import asyncio
import weave
from weave.scorers import HallucinationFreeScorer

# Initialize scorer with a column mapping if needed.
hallucination_scorer = HallucinationFreeScorer(
    model_id="openai/gpt-4o", # or any other model supported by litellm
    column_map={"context": "input", "output": "other_col"}
)

# Create dataset
dataset = [
    {"input": "John likes various types of cheese."},
    {"input": "Pepe likes various types of cheese."},
]

@weave.op
def model(input: str) -> str:
    return "The person's favorite cheese is cheddar."

# Run evaluation
evaluation = weave.Evaluation(
    dataset=dataset,
    scorers=[hallucination_scorer],
)
print(await evaluation.evaluate(model))


Error getting code deps for <function HallucinationFreeScorer.score at 0x7f4f400e0550>: unmatched ')' (<unknown>, line 141)


{'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 0.011968851089477539}}


### Summarization Scorer

In [50]:
import asyncio
import weave
from weave.scorers import SummarizationScorer

class SummarizationModel(weave.Model):
    @weave.op()
    async def predict(self, input: str) -> str:
        return "This is a summary of the input text."

# Initialize scorer
summarization_scorer = SummarizationScorer(
    model_id="openai/gpt-4o"  # or any other model supported by litellm
)
# Create dataset
dataset = [
    {"input": "The quick brown fox jumps over the lazy dog."},
    {"input": "Artificial Intelligence is revolutionizing various industries."}
]
# Run evaluation
evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])
await evaluation.evaluate(SummarizationModel())


Error getting code deps for <function SummarizationScorer._evaluate_summary at 0x7f4f400e32e0>: unmatched ')' (<unknown>, line 141)
Error getting code deps for <function SummarizationScorer._extract_entities at 0x7f4f400e3250>: unmatched ')' (<unknown>, line 141)


{'SummarizationScorer': {'is_entity_dense': {'true_count': 0,
   'true_fraction': 0.0},
  'entity_density': {'mean': 0.0},
  'summarization_eval_score': {'mean': 0.0}},
 'model_latency': {'mean': 0.0007735490798950195}}

### OpenAIModerationScorer

In [53]:
import asyncio
import weave
from weave.scorers import OpenAIModerationScorer

class MyModel(weave.Model):
    @weave.op
    async def predict(self, input: str) -> str:
        return input

# Initialize scorer
moderation_scorer = OpenAIModerationScorer()

# Create dataset
dataset = [
    {"input": "I love puppies and kittens!"},
    {"input": "I hate everyone and want to hurt them."}
]

# Run evaluation
evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer])
print(await evaluation.evaluate(MyModel()))

{'OpenAIModerationScorer': {'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}, 'passed': {'true_count': 1, 'true_fraction': 0.5}}, 'model_latency': {'mean': 0.00043952465057373047}}


### EmbeddingSimilarityScorer

In [54]:
import asyncio
import weave
from weave.scorers import EmbeddingSimilarityScorer

# Initialize scorer
similarity_scorer = EmbeddingSimilarityScorer(
    model_id="openai/text-embedding-3-small",  # or any other model supported by litellm
    threshold=0.7
)
# Create dataset
dataset = [
    {
        "input": "He's name is John",
        "target": "John likes various types of cheese.",
    },
    {
        "input": "He's name is Pepe.",
        "target": "Pepe likes various types of cheese.",
    },
]
# Define model
@weave.op
def model(input: str) -> str:
    return "John likes various types of cheese."

# Run evaluation
evaluation = weave.Evaluation(
    dataset=dataset,
    scorers=[similarity_scorer],
)
print(await evaluation.evaluate(model))


{'EmbeddingSimilarityScorer': {'similarity_score': {'mean': 0.8449273654072489}, 'is_similar': {'true_count': 1, 'true_fraction': 0.5}}, 'model_latency': {'mean': 0.0044291019439697266}}


### ValidJSONScorer

In [55]:
import weave
from weave.scorers import ValidJSONScorer

class JSONModel(weave.Model):
    @weave.op()
    async def predict(self, input: str) -> str:
        # This is a placeholder.
        # In a real scenario, this would generate JSON.
        return '{"key": "value"}'

model = JSONModel()
json_scorer = ValidJSONScorer()

dataset = [
    {"input": "Generate a JSON object with a key and value"},
    {"input": "Create an invalid JSON"}
]

evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer])
print(await evaluation.evaluate(model))

{'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 0.0007185935974121094}}


### ValidXMLScorer

In [58]:
import weave
from weave.scorers import ValidXMLScorer

class XMLModel(weave.Model):
    @weave.op()
    async def predict(self, input: str) -> str:
        # This is a placeholder. In a real scenario, this would generate XML.
        return '<root><element>value</element></root>'

model = XMLModel()
xml_scorer = ValidXMLScorer()

dataset = [
    {"input": "Generate a valid XML with a root element"},
    {"input": "Create an invalid XML"}
]

evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer])
print(await evaluation.evaluate(model))


{'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 0.0003446340560913086}}


### RAGAS Scorer

In [59]:
from weave.scorers import ContextRelevancyScorer

relevancy_scorer = ContextRelevancyScorer(
    model_id="openai/gpt-4o",  # or any other model supported by litellm
    relevancy_prompt="""
Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1.

Question: {question}
Context: {context}
Relevancy Score (0-1):
"""
)

In [60]:
from textwrap import dedent
import weave
from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer

class RAGModel(weave.Model):
    @weave.op()
    async def predict(self, question: str) -> str:
        "Retrieve relevant context"
        return "Paris is the capital of France."

# Define prompts
relevancy_prompt: str = dedent("""
    Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1.

    Question: {question}
    Context: {context}
    Relevancy Score (0-1):
    """)
# Initialize scorers
entity_recall_scorer = ContextEntityRecallScorer()
relevancy_scorer = ContextRelevancyScorer(relevancy_prompt=relevancy_prompt)
# Create dataset
dataset = [
    {
        "question": "What is the capital of France?",
        "context": "Paris is the capital city of France."
    },
    {
        "question": "Who wrote Romeo and Juliet?",
        "context": "William Shakespeare wrote many famous plays."
    }
]
# Run evaluation
evaluation = weave.Evaluation(
    dataset=dataset,
    scorers=[entity_recall_scorer, relevancy_scorer]
)
print(await evaluation.evaluate(RAGModel()))


Error getting code deps for <function ContextRelevancyScorer.score at 0x7f4f400e2830>: unmatched ')' (<unknown>, line 141)


{'ContextEntityRecallScorer': {'recall': {'mean': 0.5}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 0.00036144256591796875}}


### Switching to other providers

In [61]:
from weave.scorers import SummarizationScorer

# Switch to Anthropic's Claude model
summarization_scorer = SummarizationScorer(
    model_id="anthropic/claude-3-5-sonnet-20240620"
)

## Weave local scorers

In [63]:
!pip install transformers torch

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
Installing collected packages: safetensors, transformers
Successfully installed safetensors-0.5.3 transformers-4.51.3


In [64]:
import weave
from weave.scorers import WeaveBiasScorerV1

bias_scorer = WeaveBiasScorerV1()
result = bias_scorer.score(output="Martian men are terrible at cleaning")

print(f"The text is biased: {not result.passed}")
print(result)

[34m[1mwandb[0m: Downloading large artifact bias_scorer:v0, 551.93MB. 28 files... 
[34m[1mwandb[0m:   28 of 28 files downloaded.  
Done. 0:0:5.7


The text is biased: True
passed=False metadata={'gender_bias_score': 0.604306161403656, 'gender_bias': True, 'racial_bias_score': 0.34831076860427856, 'racial_bias': False}


##### And more....

### Evaluation Logger

In [67]:
import weave
from openai import OpenAI
from weave.flow.eval_imperative import EvaluationLogger

weave.init("logger-example-project")

# Initialize the logger (model/dataset names are optional metadata)
eval_logger = EvaluationLogger(
    model="my_model",
    dataset="my_dataset"
)

# Example input data (this can be any data structure you want)
eval_samples = [
    {'inputs': {'a': 1, 'b': 2}, 'expected': 3},
    {'inputs': {'a': 2, 'b': 3}, 'expected': 5},
    {'inputs': {'a': 3, 'b': 4}, 'expected': 7},
]

# Example model logic.  This does not have to be decorated with @weave.op,
# but if you do, it will be traced and logged.
@weave.op
def user_model(a: int, b: int) -> int:
    oai = OpenAI()
    _ = oai.chat.completions.create(messages=[{"role": "user", "content": f"What is {a}+{b}?"}], model="gpt-4o-mini")
    return a + b

# Iterate through examples, predict, and log
for sample in eval_samples:
    inputs = sample["inputs"]
    model_output = user_model(**inputs) # Pass inputs as kwargs

    # Log the prediction input and output
    pred_logger = eval_logger.log_prediction(
        inputs=inputs,
        output=model_output
    )

    # Calculate and log a score for this prediction
    expected = sample["expected"]
    correctness_score = model_output == expected
    pred_logger.log_score(
        scorer="correctness", # Simple string name for the scorer
        score=correctness_score
    )

    # Finish logging for this specific prediction
    pred_logger.finish()

# Log a final summary for the entire evaluation.
# Weave auto-aggregates the 'correctness' scores logged above.
summary_stats = {"subjective_overall_score": 0.8}
eval_logger.log_summary(summary_stats)

print("Evaluation logging complete. View results in the Weave UI.")

Logged in as Weights & Biases user: paul-mriganka.
View Weave data at https://wandb.ai/paul-mriganka-personal/logger-example-project/weave
🍩 https://wandb.ai/paul-mriganka-personal/logger-example-project/r/call/01968389-7e1f-7e43-b700-53f910bbac90
Evaluation logging complete. View results in the Weave UI.
🍩 https://wandb.ai/paul-mriganka-personal/logger-example-project/r/call/0196838d-e451-76e1-a20b-fdd31a50714e
🍩 https://wandb.ai/paul-mriganka-personal/logger-example-project/r/call/01968390-06d1-72d3-9a97-fd9ad0714b80


### Log Rich media

In [68]:
import io
import wave
import struct
from PIL import Image
import random
from typing import Any
import weave

def generate_random_audio_wave_read(duration=2, sample_rate=44100):
    n_samples = duration * sample_rate
    amplitude = 32767  # 16-bit max amplitude

    buffer = io.BytesIO()

    # Write wave data to the buffer
    with wave.open(buffer, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(sample_rate)

        for _ in range(n_samples):
            sample = random.randint(-amplitude, amplitude)
            wf.writeframes(struct.pack('<h', sample))

    # Rewind the buffer to the beginning so we can read from it
    buffer.seek(0)

    # Return a Wave_read object
    return wave.open(buffer, 'rb')

rich_media_dataset = [
    {
        'image': Image.new(
            "RGB",
            (100, 100),
            color=(
                random.randint(0, 255),
                random.randint(0, 255),
                random.randint(0, 255),
            ),
        ),
        "audio": generate_random_audio_wave_read(),
    }
    for _ in range(5)
]

@weave.op
def your_output_generator(image: Image.Image, audio) -> dict[str, Any]:
    return {
        "result": random.randint(0, 10),
        "image": image,
        "audio": audio,
    }

ev = EvaluationLogger(model="example_model", dataset="example_dataset")

for inputs in rich_media_dataset:
    output = your_output_generator(**inputs)
    pred = ev.log_prediction(inputs, output)
    pred.log_score(scorer="greater_than_5_scorer", score=output["result"] > 5)
    pred.log_score(scorer="greater_than_7_scorer", score=output["result"] > 7)

ev.log_summary()