In [None]:
# Enrich 500+ stars PRs with inline review comments and task types
# - Input: aidev_pop_ge500_agent_prs.csv  (the existing PR-level file)
# - Extra tables: pr_review_comments_v2.parquet, pr_reviews.parquet, pr_task_type.parquet
# - Output: aidev_pop_ge500_pr_review_comments_with_task_type.csv

import pandas as pd
import numpy as np
import os

print("="*70)
print("AIDev-pop (>=500 stars) ‚Äì Reviews & Task Type Join")
print("Dataset: https://huggingface.co/datasets/hao-li/AIDev")
print("="*70)

# ---------------------------------------------------------------------
# Step 0: Install dependencies for hf:// parquet access (Colab-friendly)
# ---------------------------------------------------------------------
print("\nüì¶ Installing required packages (huggingface_hub, fsspec[http], pyarrow)...")
!pip install -q "huggingface_hub>=0.23.0" "fsspec[http]" pyarrow
print("‚úÖ Packages installed")

# ---------------------------------------------------------------------
# Step 1: Load your 500+ stars PRs CSV
# ---------------------------------------------------------------------
# Produces:
#   agent_prs_pop - PR-level DataFrame whose "id" column is nullable Int64
#   target_pr_ids - set of PR ids; later steps filter the other tables with it
input_pr_csv = "aidev_pop_ge500_agent_prs.csv"

if not os.path.exists(input_pr_csv):
    raise FileNotFoundError(
        f"Could not find {input_pr_csv}. "
        "Make sure it's uploaded to the Colab working directory."
    )

print(f"\n📥 Loading PR CSV: {input_pr_csv}")
agent_prs_pop = pd.read_csv(input_pr_csv)
print(f"   ✅ Loaded {len(agent_prs_pop):,} PR rows, {len(agent_prs_pop.columns)} columns")

if "id" not in agent_prs_pop.columns:
    raise ValueError("Expected a PR identifier column named 'id' in the PR CSV.")

# Normalize PR IDs to nullable Int64; errors="coerce" turns anything that is
# not numeric into <NA> instead of raising.
agent_prs_pop["id"] = pd.to_numeric(agent_prs_pop["id"], errors="coerce").astype("Int64")

# Coercion is silent, so report how many rows ended up without a usable id.
unparsable_id_rows = int(agent_prs_pop["id"].isna().sum())
if unparsable_id_rows:
    print(
        f"   ⚠️ {unparsable_id_rows:,} PR rows have a missing or non-numeric id "
        "and are excluded from the PR id set"
    )

target_pr_ids = set(agent_prs_pop["id"].dropna().astype("int64"))

print(f"   📊 Unique PR IDs in CSV: {len(target_pr_ids):,}")

# ---------------------------------------------------------------------
# Step 2: Load AIDev-pop tables from Hugging Face via hf:// URLs
# ---------------------------------------------------------------------
# Produces three DataFrames used by the following steps:
#   inline_all   - pr_review_comments_v2 (inline review comments)
#   pr_reviews   - review metadata
#   pr_task_type - task type classifications
print("\n" + "=" * 70)
print("STEP 2: Loading pr_review_comments_v2, pr_reviews, pr_task_type")
print("=" * 70)

pr_review_comments_v2_path = "hf://datasets/hao-li/AIDev/pr_review_comments_v2.parquet"
pr_reviews_path            = "hf://datasets/hao-li/AIDev/pr_reviews.parquet"
pr_task_type_path          = "hf://datasets/hao-li/AIDev/pr_task_type.parquet"

print(f"📥 Loading inline review comments from: {pr_review_comments_v2_path}")
inline_all = pd.read_parquet(pr_review_comments_v2_path)
print(f"   ✅ pr_review_comments_v2: {len(inline_all):,} rows, {len(inline_all.columns)} columns")

print(f"📥 Loading review metadata from: {pr_reviews_path}")
pr_reviews = pd.read_parquet(pr_reviews_path)
print(f"   ✅ pr_reviews: {len(pr_reviews):,} rows, {len(pr_reviews.columns)} columns")

print(f"📥 Loading task types from: {pr_task_type_path}")
pr_task_type = pd.read_parquet(pr_task_type_path)
print(f"   ✅ pr_task_type: {len(pr_task_type):,} rows, {len(pr_task_type.columns)} columns")

# Bring the PR-id columns to the same nullable Int64 dtype as agent_prs_pop["id"]
# so later filters and merges compare like with like. The guards keep this
# step from failing when a table does not have the expected column.
if "pr_id" in pr_reviews.columns:
    pr_reviews["pr_id"] = pd.to_numeric(pr_reviews["pr_id"], errors="coerce").astype("Int64")

if "id" in pr_task_type.columns:
    pr_task_type["id"] = pd.to_numeric(pr_task_type["id"], errors="coerce").astype("Int64")

# ---------------------------------------------------------------------
# Step 3: Attach pr_id to inline comments (if needed) and filter to our PRs
# ---------------------------------------------------------------------
# Produces inline_filtered: the inline comments whose pr_id is in target_pr_ids.
print("\n" + "=" * 70)
print("STEP 3: Linking inline comments to PR ids")
print("=" * 70)

if "pr_id" in inline_all.columns:
    # Easiest case: v2 already carries pr_id, so normalize it and filter directly
    inline_all["pr_id"] = pd.to_numeric(inline_all["pr_id"], errors="coerce").astype("Int64")
    print("✅ Detected 'pr_id' in pr_review_comments_v2 – using direct filter")
    inline_filtered = inline_all[inline_all["pr_id"].isin(target_pr_ids)].copy()
else:
    # Fallback: join through pr_reviews using pull_request_review_id -> id -> pr_id
    if "pull_request_review_id" not in inline_all.columns:
        raise ValueError(
            "pr_review_comments_v2 has no 'pr_id' or 'pull_request_review_id' column – "
            "cannot link to PRs."
        )
    if "id" not in pr_reviews.columns or "pr_id" not in pr_reviews.columns:
        raise ValueError(
            "pr_reviews must contain 'id' and 'pr_id' to resolve comments to PRs."
        )

    print("ℹ️ No 'pr_id' in pr_review_comments_v2 – resolving via pr_reviews...")

    # Build a review-id -> pr_id lookup with one row per review id. A repeated
    # (or missing) review id on the right side of a left merge would otherwise
    # emit the same comment several times.
    review_to_pr = pr_reviews[["id", "pr_id"]].dropna(subset=["id"])
    duplicate_review_rows = int(review_to_pr.duplicated(subset="id").sum())
    if duplicate_review_rows:
        print(
            f"⚠️ pr_reviews has {duplicate_review_rows:,} duplicate review id rows – "
            "keeping the first occurrence of each."
        )
        review_to_pr = review_to_pr.drop_duplicates(subset="id", keep="first")

    # Join inline comments with the lookup to get pr_id. The "_review" suffix
    # only applies when inline_all has its own "id" column; in that case the
    # review id from the lookup is emitted as "id_review".
    inline_with_pr = inline_all.merge(
        review_to_pr,
        left_on="pull_request_review_id",
        right_on="id",
        how="left",
        suffixes=("", "_review"),
        validate="many_to_one",  # guarantees the comment row count is preserved
    )

    inline_with_pr["pr_id"] = pd.to_numeric(inline_with_pr["pr_id"], errors="coerce").astype("Int64")
    inline_filtered = inline_with_pr[inline_with_pr["pr_id"].isin(target_pr_ids)].copy()

print(f"📊 Inline comments total: {len(inline_all):,}")
print(f"📊 Inline comments for 500+ stars PRs: {len(inline_filtered):,}")

if inline_filtered.empty:
    print("⚠️ No inline comments found for the selected PRs – result will be empty.")

# ---------------------------------------------------------------------
# Step 4: Attach task type to PR ids
# ---------------------------------------------------------------------
# Produces task_subset: columns (pr_id, task_type) restricted to target_pr_ids.
print("\n" + "=" * 70)
print("STEP 4: Joining task type classifications")
print("=" * 70)

expected_task_cols = {"agent", "id", "title", "reason", "type"}
missing_task_cols = expected_task_cols - set(pr_task_type.columns)

# 'id' and 'type' are the only columns actually used below. Fail with a clear
# message if either is absent instead of hitting a bare KeyError further down.
missing_required_cols = missing_task_cols & {"id", "type"}
if missing_required_cols:
    raise ValueError(
        f"pr_task_type is missing required column(s) {sorted(missing_required_cols)} – "
        "cannot attach task types."
    )

if missing_task_cols:
    # Only optional columns are missing at this point, so it is safe to continue.
    print(f"⚠️ pr_task_type is missing columns {missing_task_cols}, "
          "but we'll still use 'id' and 'type' if present.")

# We just need (id, type) → (pr_id, task_type); select the two columns before
# copying so the rest of the table is not duplicated in memory.
task_subset = pr_task_type[["id", "type"]].copy()
task_subset["id"] = pd.to_numeric(task_subset["id"], errors="coerce").astype("Int64")

task_subset = task_subset[task_subset["id"].isin(target_pr_ids)]
task_subset = task_subset.rename(columns={"id": "pr_id", "type": "task_type"})

print(f"📊 Task type rows for our PRs: {len(task_subset):,}")

# ---------------------------------------------------------------------
# Step 5: Merge everything into a comment-level dataset
# ---------------------------------------------------------------------
# Produces comments_full: inline_filtered enriched with task_type and with the
# PR-level columns from agent_prs_pop, one output row per inline comment.
print("\n" + "=" * 70)
print("STEP 5: Building final comment-level dataset")
print("=" * 70)

# Both merges below are left joins keyed on the PR id. If a right-hand table
# had more than one row for a PR, every comment of that PR would be emitted
# several times, so reduce each lookup to one row per PR id first.
task_lookup = task_subset.drop_duplicates(subset="pr_id", keep="first")
dropped_task_rows = len(task_subset) - len(task_lookup)
if dropped_task_rows:
    print(f"⚠️ Dropped {dropped_task_rows:,} duplicate task-type rows (kept first per PR)")

pr_lookup = agent_prs_pop.dropna(subset=["id"]).drop_duplicates(subset="id", keep="first")
dropped_pr_rows = len(agent_prs_pop) - len(pr_lookup)
if dropped_pr_rows:
    print(f"⚠️ Ignored {dropped_pr_rows:,} PR rows with a missing or duplicate id (kept first per id)")

# Merge inline comments with task_type (on pr_id)
comments_with_task = inline_filtered.merge(
    task_lookup,
    on="pr_id",
    how="left",
    validate="many_to_one",
)

# Merge in PR metadata from your CSV (agent_prs_pop)
# Note: agent_prs_pop.id is the PR id, matching pr_id.
# Columns present on both sides get the "_comment" / "_pr" suffixes.
comments_full = comments_with_task.merge(
    pr_lookup,
    left_on="pr_id",
    right_on="id",
    how="left",
    suffixes=("_comment", "_pr"),
    validate="many_to_one",
)

print(f"✅ Final rows (one per inline comment): {len(comments_full):,}")
print(f"   Columns: {len(comments_full.columns)}")

# ---------------------------------------------------------------------
# Step 6: Save to CSV and download
# ---------------------------------------------------------------------
# Writes comments_full to the working directory and, when running in Colab,
# asks the browser to download the file.
print("\n" + "=" * 70)
print("STEP 6: Saving and downloading CSV")
print("=" * 70)

output_csv = "aidev_pop_ge500_pr_review_comments_with_task_type.csv"
comments_full.to_csv(output_csv, index=False)

size_mb = os.path.getsize(output_csv) / (1024 * 1024)
print(f"✅ Saved: {output_csv}")
print(f"   Size: {size_mb:.2f} MB")

# Optional: trigger download in Colab.
# The broad `except Exception` is deliberate: the CSV is already on disk, so
# neither a missing google.colab module nor a failed download should abort
# the cell. The reason is printed rather than swallowed.
try:
    from google.colab import files
    print("\n🚀 Initiating file download...")
    files.download(output_csv)
    print("✅ File download triggered")
except Exception as e:
    print(f"ℹ️ Could not auto-download (likely not in Colab): {e}")
    print("   You can download the file manually from the working directory.")

print("\n" + "=" * 70)
print("✅ DONE – You now have all inline review comments for your 500+ star PRs,")
print("   enriched with PR metadata and task_type.")
print("=" * 70)


AIDev-pop (>=500 stars) – Reviews & Task Type Join
Dataset: https://huggingface.co/datasets/hao-li/AIDev

📦 Installing required packages (huggingface_hub, fsspec[http], pyarrow)...
✅ Packages installed

📥 Loading PR CSV: aidev_pop_ge500_agent_prs.csv
   ✅ Loaded 12,433 PR rows, 22 columns
   📊 Unique PR IDs in CSV: 12,433

STEP 2: Loading pr_review_comments_v2, pr_reviews, pr_task_type
📥 Loading inline review comments from: hf://datasets/hao-li/AIDev/pr_review_comments_v2.parquet


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


   ✅ pr_review_comments_v2: 26,868 rows, 15 columns
📥 Loading review metadata from: hf://datasets/hao-li/AIDev/pr_reviews.parquet
   ✅ pr_reviews: 28,875 rows, 7 columns
📥 Loading task types from: hf://datasets/hao-li/AIDev/pr_task_type.parquet
   ✅ pr_task_type: 33,596 rows, 6 columns

STEP 3: Linking inline comments to PR ids
ℹ️ No 'pr_id' in pr_review_comments_v2 – resolving via pr_reviews...
📊 Inline comments total: 26,868
📊 Inline comments for 500+ stars PRs: 18,383

STEP 4: Joining task type classifications
📊 Task type rows for our PRs: 12,367

STEP 5: Building final comment-level dataset
✅ Final rows (one per inline comment): 18,383
   Columns: 40

STEP 6: Saving and downloading CSV
✅ Saved: aidev_pop_ge500_pr_review_comments_with_task_type.csv
   Size: 85.14 MB

🚀 Initiating file download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ File download triggered

✅ DONE – You now have all inline review comments for your 500+ star PRs,
   enriched with PR metadata and task_type.
