# Code Review Tool Evaluation

This notebook runs W&B Weave evaluations for the code review tool.

## Setup

In [None]:
import weave

# Name of the W&B Weave project passed to `weave.init` below.
PROJECT_NAME = "bugbug-code-review-eval"

# Initialize Weave for this project. The return value is bound to `_`,
# presumably so the notebook cell does not echo it as output.
_ = weave.init(PROJECT_NAME)

## Load Dataset

In [None]:
# Resolve the published dataset reference, then fetch the object behind it.
dataset_ref = weave.ref("code_review_eval_legacy")
dataset = dataset_ref.get()

# Quick sanity check on how many rows were loaded.
print(f"Dataset has {len(dataset.rows)} examples")

## Configure Model

In [None]:
from functools import cached_property

from bugbug.tools.code_review.agent import CodeReviewTool
from bugbug.tools.core.platforms.phabricator import PhabricatorPatch


class CodeReviewModel(weave.Model):
    """Expose ``CodeReviewTool`` through the Weave ``Model`` interface."""

    @cached_property
    def tool(self):
        """Create the review tool on first access and reuse it afterwards."""
        review_tool = CodeReviewTool.create()
        return review_tool

    @weave.op()
    def invoke(self, diff_id: int, patch_summary: str) -> dict:
        """Generate review comments for a single Phabricator diff.

        Args:
            diff_id: Identifier handed to ``PhabricatorPatch``.
            patch_summary: Summary text forwarded to the tool with the patch.

        Returns:
            A dict with a single ``"comments"`` key holding the value
            returned by ``generate_review_comments``.
        """
        comments = self.tool.generate_review_comments(
            PhabricatorPatch(diff_id=diff_id),
            patch_summary,
        )
        return {"comments": comments}


# Instance of the wrapper that is passed to `evaluation.evaluate(...)` below.
model = CodeReviewModel()

## Run Evaluation

In [None]:
from bugbug.tools.code_review.scorer import BasicMetricsScorer, LLMCommentMatchingScorer

evaluation = weave.Evaluation(
    dataset=dataset,
    scorers=[BasicMetricsScorer(), LLMCommentMatchingScorer()],
)

results = await evaluation.evaluate(model)

## Visualizations

In [None]:
import matplotlib.pyplot as plt

# Look up each scorer's summary once; a missing scorer yields an empty dict,
# so every metric below falls back to 0.
matching_summary = results.get("LLMCommentMatchingScorer", {})
basic_summary = results.get("BasicMetricsScorer", {})

# Headline rates shown in the chart, keyed by display label
metrics = {
    "Recall (Valid)": matching_summary.get("recall_valid", 0),
    "Recall (Invalid)": matching_summary.get("recall_invalid", 0),
    "Missed Valid Rate": matching_summary.get("missed_valid_rate", 0),
    "Error Rate": basic_summary.get("error_rate", 0),
}
metric_labels = list(metrics)
metric_values = list(metrics.values())

# One bar per metric on a fixed 0-1 axis
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(
    metric_labels,
    metric_values,
    color=["green", "orange", "red", "gray"],
)
ax.set_title("Code Review Evaluation Metrics")
ax.set_ylabel("Rate")
ax.set_ylim(0, 1)

# Print each rate as a percentage just above its bar
for rect, rate in zip(bars, metric_values):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.02
    ax.text(label_x, label_y, f"{rate:.2%}", ha="center", va="bottom")

plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Generated comment total next to the ground-truth totals
basic_metrics = results.get("BasicMetricsScorer", {})

# Display label -> count; a key absent from the summary is plotted as 0.
counts = {
    label: basic_metrics.get(key, 0)
    for label, key in (
        ("Generated", "total_generated_comments"),
        ("Ground Truth (Valid)", "total_ground_truth_valid"),
        ("Ground Truth (Invalid)", "total_ground_truth_invalid"),
    )
}
count_labels = list(counts)
count_values = list(counts.values())

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(count_labels, count_values, color=["blue", "green", "red"])
ax.set_title("Comment Counts")
ax.set_ylabel("Count")

# Write the integer count just above each bar
for rect, total in zip(bars, count_values):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.5
    ax.text(label_x, label_y, str(int(total)), ha="center", va="bottom")

plt.tight_layout()
plt.show()

## View in W&B

Visit [W&B Weave](https://wandb.ai) to see detailed traces, compare evaluations, and explore individual predictions.