# Create Dataset for Code Review Evaluation

## Setup

In [None]:
import weave

# Name of the dataset object that is read back and re-published below.
DATASET_NAME = "code_review_eval"
# Weave project this notebook is initialised against.
PROJECT_NAME = "bugbug-code-review-eval"

# Initialise Weave for the project; the return value is not used.
_ = weave.init(PROJECT_NAME)

## Prepare the Data

### Load the feedback data:

In [None]:
import pandas as pd

# CSV export (``output=csv``) of the Google Sheets document holding the feedback.
FEEDBACK_CSV_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTQLbFpQmzEv4zxaKJprosIt6RWs4jDAN6IDT6xB7EcelD0ZR8qilt2tpscUSGEwMNWVpHKDlPacA7b/pub?output=csv"

# Feedback data as a DataFrame; later cells read its ``diff_id`` and
# ``revision_id`` columns.
df = pd.read_csv(FEEDBACK_CSV_URL)

### Reuse summaries from the last dataset version:

In [None]:
# Map diff_id -> patch summary, seeded from the rows of the dataset that
# ``weave.ref(DATASET_NAME)`` resolves to. Rows with a missing or empty
# ``patch_summary`` are skipped so those diffs get summarized again later.
# NOTE(review): presumably this lookup fails when no version of the dataset has
# been published yet - confirm before running against a fresh project.
summaries = {}
for previous_row in weave.ref(DATASET_NAME).get():
    if not previous_row.get("patch_summary"):
        continue
    summaries[previous_row["diff_id"]] = previous_row["patch_summary"]

## Generate Summaries

Generate patch summaries for the diffs in the feedback data that do not have a summary from the previous dataset version yet.

In [None]:
from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData
from bugbug.tools.patch_summarization.agent import PatchSummarizationTool

# Tool whose ``run(patch)`` result is stored as the patch summary.
summarizer = PatchSummarizationTool.create()
# Source from which patches are looked up by diff ID (``get_patch_by_id``).
review_data = PhabricatorReviewData()


@weave.op()
def summarize(diff_id):
    """Summarize one diff and record the result in ``summaries``.

    Args:
        diff_id: Identifier passed to ``review_data.get_patch_by_id``.

    Returns:
        None. The output of ``summarizer.run`` is stored in the module-level
        ``summaries`` dict under ``diff_id``.
    """
    summaries[diff_id] = summarizer.run(review_data.get_patch_by_id(diff_id))


@weave.op()
def generate_summaries(diff_ids):
    """Summarize every diff in ``diff_ids`` concurrently.

    Each diff is handed to ``summarize`` on a ``weave.ThreadPoolExecutor``
    worker. The run is best-effort: a diff whose summarization raises is left
    out of ``summaries`` and the remaining diffs are still processed. Failures
    are reported in a single ``RuntimeWarning`` rather than being dropped
    silently.

    Args:
        diff_ids: Iterable of diff IDs to summarize.

    Returns:
        None. Results are written to ``summaries`` by ``summarize``.
    """
    import warnings

    with weave.ThreadPoolExecutor() as exc:
        # Keep the futures: an exception raised in a worker is only observable
        # through its future, so discarding them would hide every failure.
        pending = [(diff_id, exc.submit(summarize, diff_id)) for diff_id in diff_ids]

    # Leaving the ``with`` block waits for the workers, so ``exception()``
    # returns immediately for each future here.
    failures = []
    for diff_id, future in pending:
        error = future.exception()
        if error is not None:
            failures.append(f"{diff_id}: {error!r}")

    if failures:
        warnings.warn(
            f"Failed to summarize {len(failures)} of {len(pending)} diffs: "
            + "; ".join(failures),
            RuntimeWarning,
        )


# Feedback rows whose diff has no entry in ``summaries`` yet.
rows_missing_summary = df.query("diff_id not in @summaries")
# ``unique()`` ensures each diff is summarized once even if it spans several rows.
diff_ids_with_no_summaries = rows_missing_summary["diff_id"].unique().tolist()

generate_summaries(diff_ids_with_no_summaries)

## Save the Dataset

In [None]:
# Group the feedback rows per (diff_id, revision_id) and turn each group into a
# list of row dicts. All columns are selected explicitly after the groupby so
# that the grouping columns are still part of every record: relying on
# ``apply`` to pass them implicitly is deprecated in recent pandas releases and
# would otherwise drop ``diff_id`` / ``revision_id`` from the records.
comments_by_diff = (
    df.groupby(["diff_id", "revision_id"])[df.columns.tolist()]
    .apply(lambda group: group.to_dict(orient="records"))
    .to_dict()
)

# One example per (diff, revision). Diffs without a summary (for instance
# because summarization did not produce one) are left out of the dataset.
examples = [
    {
        "diff_id": diff_id,
        "revision_id": revision_id,
        "patch_summary": summaries[diff_id],
        "ground_truth_comments": ground_truth_comments,
    }
    for (diff_id, revision_id), ground_truth_comments in comments_by_diff.items()
    if diff_id in summaries
]

In [None]:
# Bundle the examples into a Weave dataset under the same name the previous
# version was read from.
dataset = weave.Dataset(
    rows=examples,
    name=DATASET_NAME,
    description=(
        "Code review evaluation dataset with ground truth comments and patch summaries."
    ),
)

# Publish the dataset; the return value is not used.
_ = weave.publish(dataset)