## 1. Setup and Installation

In [None]:
# Install required packages
!pip install openai pandas numpy matplotlib seaborn plotly scikit-learn tqdm -q

In [None]:
# Import libraries
import json
import os
import re
import time
import warnings
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from tqdm import tqdm

# OpenAI API
from openai import OpenAI

warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" theme plus figure size and base font size
sns.set_style('whitegrid')
for rc_key, rc_value in {'figure.figsize': (12, 6), 'font.size': 10}.items():
    plt.rcParams[rc_key] = rc_value

## 2. Configure API Access

In [None]:
# Option 1: Read the API key from the environment so it is never stored in the
# notebook source (notebooks are routinely shared or committed).
API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()

# Option 2: Use Colab secrets (recommended on Colab) - uncomment the two lines below
#from google.colab import userdata
#API_KEY = userdata.get('OPENAI_API_KEY')

# Fail here with a clear message rather than on the first API request.
if not API_KEY:
    raise RuntimeError(
        "No OpenAI API key found. Set the OPENAI_API_KEY environment variable "
        "(or enable the Colab secrets option in this cell) and re-run."
    )

# Initialize OpenAI client
client = OpenAI(api_key=API_KEY)

# Model configuration
MODEL = "gpt-4o"  # or "gpt-4-turbo" or "gpt-4o-mini" for cost savings
print(f"✓ Using model: {MODEL}")

✓ Using model: gpt-4o


## 3. Define the Taxonomy


In [None]:
# Complete taxonomy with definitions.
# Maps each rejection-category code (the exact label the LLM is told to return)
# to a dict with:
#   "name"       - human-readable category title
#   "definition" - description of what the category covers
#   "examples"   - illustrative cases (the prompt builder includes the first three)
#   "keywords"   - short phrases associated with the category
#                  NOTE(review): no code visible in this section reads "keywords"
# "OTHER" is the catch-all; the classifier also falls back to it for unknown
# codes and failed API calls.
TAXONOMY = {
    "SPEC_MISMATCH": {
        "name": "Specification/Intent Mismatch",
        "definition": "The PR misunderstands or partially addresses the issue requirement. The solution doesn't match what was asked for, solves the wrong problem, or addresses only part of the requirement.",
        "examples": [
            "PR fixes symptom but not root cause",
            "Feature implemented doesn't match issue description",
            "PR scope doesn't align with ticket requirements",
            "Misinterpretation of user story or feature request"
        ],
        "keywords": ["wrong problem", "not what was asked", "misunderstood", "not the issue", "scope mismatch"]
    },

    "LOGIC_DEFECT": {
        "name": "Logic/Semantic Defects",
        "definition": "The implementation contains logical errors, violates invariants, has incorrect business logic, or fails to handle edge cases. Code may compile but produces wrong results.",
        "examples": [
            "Off-by-one errors",
            "Incorrect conditional logic",
            "Edge case not handled (null, empty, boundary values)",
            "Algorithm produces incorrect output",
            "Race conditions or concurrency bugs"
        ],
        "keywords": ["bug", "wrong", "incorrect", "fails", "error", "edge case", "logic error"]
    },

    "BUILD_CI_FAILURE": {
        "name": "Build/CI/Environment Failures",
        "definition": "PR fails CI checks, has build errors, dependency issues, platform incompatibilities, or environment configuration problems. Cannot be validated in the project's CI matrix.",
        "examples": [
            "Failing CI pipeline",
            "Dependency version conflicts",
            "Build errors in specific environments",
            "Platform-specific failures",
            "Docker/container configuration issues"
        ],
        "keywords": ["CI fail", "build", "dependency", "environment", "platform", "compilation error"]
    },

    "STYLE_CONVENTION": {
        "name": "Style/Convention Violations",
        "definition": "Code doesn't follow project formatting standards, naming conventions, code organization, or linting rules. Violates established style guides.",
        "examples": [
            "Formatting doesn't match project style",
            "Naming conventions violated",
            "Linter errors",
            "Missing or incorrect code comments",
            "File organization doesn't follow project structure"
        ],
        "keywords": ["lint", "style", "format", "convention", "naming", "whitespace"]
    },

    "TEST_INADEQUACY": {
        "name": "Testing Inadequacy",
        "definition": "Missing tests, weak test coverage, incorrect tests, or tests that don't adequately validate the changes. May pass existing tests but lacks proper validation.",
        "examples": [
            "No tests provided for new functionality",
            "Test coverage below threshold",
            "Tests don't actually test the changes",
            "Weak assertions or test quality",
            "Missing edge case tests"
        ],
        "keywords": ["test", "coverage", "untested", "no tests", "test quality"]
    },

    "DESIGN_MISFIT": {
        "name": "Architectural/Design Misfit",
        "definition": "Solution violates architectural principles, degrades maintainability, has poor design choices, introduces tight coupling, or doesn't fit the codebase architecture.",
        "examples": [
            "Violates separation of concerns",
            "Introduces tight coupling",
            "Performance degradation",
            "Not maintainable or extensible",
            "Doesn't follow existing patterns",
            "Overly complex solution"
        ],
        "keywords": ["design", "architecture", "maintainability", "complexity", "coupling", "pattern"]
    },

    "POLICY_VIOLATION": {
        "name": "Process/Policy Violations",
        "definition": "Violates project governance, contribution policies, or procedural requirements. Missing required documentation, CLA/DCO, changelog, or doesn't follow contribution guidelines.",
        "examples": [
            "Missing CLA/DCO sign-off",
            "No changelog entry",
            "Missing or inadequate documentation",
            "Doesn't follow contribution guidelines",
            "Security policy violations"
        ],
        "keywords": ["documentation", "docs", "CLA", "DCO", "policy", "guideline", "changelog"]
    },

    "TOOL_ERROR": {
        "name": "Tool-Use/Automation Errors",
        "definition": "Misuse of repository-specific tooling, incorrect command flags, code generation errors, or automation script failures.",
        "examples": [
            "Code generator misused",
            "Build script errors",
            "Incorrect tool configuration",
            "Automation pipeline failures"
        ],
        "keywords": ["tool", "script", "generator", "automation", "command"]
    },

    "ALTERNATIVE_SOLUTION": {
        "name": "Alternative/Better Solution Exists",
        "definition": "A better solution already exists, is being worked on, or was chosen instead. PR is redundant or superseded by another approach.",
        "examples": [
            "Duplicate PR",
            "Better solution already merged",
            "Maintainer chose different approach",
            "Obsolete due to other changes"
        ],
        "keywords": ["duplicate", "already", "alternative", "obsolete", "superseded"]
    },

    "PR_TOO_LARGE": {
        "name": "PR Scope Too Large",
        "definition": "PR is too large, touches too many files, mixes multiple concerns, or should be split into smaller PRs for easier review.",
        "examples": [
            "Too many files changed",
            "Multiple unrelated changes",
            "Should be split into smaller PRs",
            "Difficult to review due to size"
        ],
        "keywords": ["too large", "too big", "split", "too many changes", "scope"]
    },

    "MERGE_CONFLICT": {
        "name": "Merge Conflicts/Outdated",
        "definition": "PR has merge conflicts with main branch, is based on outdated code, or needs rebasing.",
        "examples": [
            "Merge conflicts",
            "Needs rebase",
            "Based on old commit",
            "Conflicts with recent changes"
        ],
        "keywords": ["conflict", "merge", "rebase", "outdated", "stale"]
    },

    "LACK_OF_CONFIDENCE": {
        "name": "Lack of Confidence in AI-Generated Code",
        "definition": "Reviewers express distrust or lack of confidence specifically because code is AI-generated. Concerns about AI limitations or reliability.",
        "examples": [
            "'AI-generated, needs human review'",
            "Distrust of automated solution",
            "Concerns about AI understanding context"
        ],
        "keywords": ["AI", "automated", "bot", "generated", "trust"]
    },

    "NOT_COMMUNITY_INTEREST": {
        "name": "Not in Community Interest",
        "definition": "Change is not wanted by the project maintainers or community. Outside project roadmap or doesn't align with project goals.",
        "examples": [
            "Feature not wanted",
            "Outside project scope",
            "Not aligned with roadmap",
            "Maintainers declined the change"
        ],
        "keywords": ["not needed", "won't fix", "out of scope", "not interested"]
    },

    "OTHER": {
        "name": "Other/Unclear",
        "definition": "Rejection reason doesn't fit other categories or is unclear from available information.",
        "examples": ["No clear reason stated", "Insufficient information"],
        "keywords": []
    }
}

# Display taxonomy summary: one entry per category code with a preview of its
# definition. Only definitions longer than 100 characters are cut and marked
# with "..."; shorter ones are printed in full so the ellipsis never suggests
# missing text that does not exist.
print("="*80)
print("REJECTION TAXONOMY FOR AGENT-GENERATED PRs")
print("="*80)
for code, info in TAXONOMY.items():
    definition = info['definition']
    preview = definition if len(definition) <= 100 else definition[:100] + "..."
    print(f"\n[{code}] {info['name']}")
    print(f"  Definition: {preview}")
print("\n" + "="*80)

REJECTION TAXONOMY FOR AGENT-GENERATED PRs

[SPEC_MISMATCH] Specification/Intent Mismatch
  Definition: The PR misunderstands or partially addresses the issue requirement. The solution doesn't match what ...

[LOGIC_DEFECT] Logic/Semantic Defects
  Definition: The implementation contains logical errors, violates invariants, has incorrect business logic, or fa...

[BUILD_CI_FAILURE] Build/CI/Environment Failures
  Definition: PR fails CI checks, has build errors, dependency issues, platform incompatibilities, or environment ...

[STYLE_CONVENTION] Style/Convention Violations
  Definition: Code doesn't follow project formatting standards, naming conventions, code organization, or linting ...

[TEST_INADEQUACY] Testing Inadequacy
  Definition: Missing tests, weak test coverage, incorrect tests, or tests that don't adequately validate the chan...

[DESIGN_MISFIT] Architectural/Design Misfit
  Definition: Solution violates architectural principles, degrades maintainability, has poor design 

## 4. Create the Classification Prompt

In [None]:
def create_classification_prompt(pr_body, review_comments, taxonomy=None):
    """
    Creates a structured prompt for the LLM to classify PR rejection reason.

    Args:
        pr_body: The PR description/body text. None, NaN, and empty or
            whitespace-only values are replaced by the placeholder
            "[No PR description provided]".
        review_comments: The review comments from maintainers. Missing values
            (same rules as pr_body) are replaced by
            "[No reviewer comments available]".
        taxonomy: Optional mapping of category code -> dict with 'name',
            'definition' and 'examples' keys. Defaults to the module-level
            TAXONOMY.

    Returns:
        Structured prompt string
    """

    def _text_or_placeholder(value, placeholder):
        # Missing values read from a DataFrame arrive as None or float NaN;
        # NaN is truthy, so a plain truthiness test would embed "nan".
        if value is None or (isinstance(value, float) and value != value):
            return placeholder
        text = value if isinstance(value, str) else str(value)
        # Text with real content is passed through unchanged (not stripped).
        return text if text.strip() else placeholder

    if taxonomy is None:
        taxonomy = TAXONOMY

    # Build taxonomy section for prompt (only the first three examples per
    # category are included to keep the prompt short)
    taxonomy_text = "".join(
        f"\n### {code}: {info['name']}\n"
        f"**Definition:** {info['definition']}\n"
        f"**Examples:** {', '.join(info['examples'][:3])}\n"
        for code, info in taxonomy.items()
    )

    pr_text = _text_or_placeholder(pr_body, "[No PR description provided]")
    comments_text = _text_or_placeholder(review_comments, "[No reviewer comments available]")

    prompt = f"""You are an expert software engineering researcher analyzing rejected pull requests (PRs) from autonomous coding agents. Your task is to classify the PRIMARY reason why this PR was rejected based on the reviewer comments and PR context.

# CLASSIFICATION TAXONOMY

Choose the SINGLE MOST IMPORTANT category that best explains the rejection. Here are the categories:
{taxonomy_text}

# PR INFORMATION TO ANALYZE

## PR Description:
```
{pr_text}
```

## Reviewer Comments:
```
{comments_text}
```

# INSTRUCTIONS

1. Read the PR description and reviewer comments carefully
2. Identify the PRIMARY reason for rejection (the most critical issue)
3. Select the SINGLE category code that best matches this reason
4. Provide a brief explanation (1-2 sentences) of why you chose this category

# OUTPUT FORMAT

Respond with ONLY valid JSON in this exact format:
{{
    "category": "CATEGORY_CODE",
    "confidence": "high|medium|low",
    "explanation": "Brief explanation of why this category was chosen",
    "secondary_category": "CATEGORY_CODE or null"
}}

CRITICAL RULES:
- Output ONLY the JSON object, no other text
- Use exact category codes from the taxonomy above
- If no clear reason is evident, use "OTHER"
- Be decisive - choose the MOST important reason
"""

    return prompt

# Smoke-test the prompt builder on a small hand-written example and report
# how long the resulting prompt is
test_prompt = create_classification_prompt(
    "Fix bug in user authentication",
    "This doesn't actually fix the root cause. You're handling the symptom but the issue is in the token validation logic.",
)
print(
    "✓ Prompt template created",
    f"\nPrompt length: {len(test_prompt)} characters",
    sep="\n",
)

✓ Prompt template created

Prompt length: 5629 characters


## 5. Load and Prepare Data

In [None]:
# Load the data
DATA_PATH = '/content/aidev_pop_ge500_pr_review_comments_with_task_type.csv'  # adjust when running outside Colab
TEXT_COLUMNS = ['body_pr', 'body_comment']  # PR description and reviewer comment text
MIN_TEXT_CHARS = 10  # rows with this many characters of combined text or fewer are dropped

df = pd.read_csv(DATA_PATH)

# Fail early with a readable message instead of a bare KeyError further down.
missing_columns = [col for col in TEXT_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input CSV is missing required columns: {missing_columns}")

print(f"✓ Loaded {len(df)} rejected PRs")
# NOTE(review): the preview shows one row per review comment (PR fields repeat
# across rows), so the row count overstates the number of PRs.
# 'id_pr' is presumably the PR identifier - confirm against the dataset schema.
if 'id_pr' in df.columns:
    print(f"  ({df['id_pr'].nunique()} distinct PRs by 'id_pr'; each row is one review comment)")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
display(df.head())

# Data cleaning: missing text becomes '' so the concatenation below never yields NaN
for text_column in TEXT_COLUMNS:
    df[text_column] = df[text_column].fillna('').astype(str)

# Create combined text for analysis
df['combined_text'] = df['body_pr'] + "\n\n" + df['body_comment']

# Remove rows with too little text (<= MIN_TEXT_CHARS characters after stripping)
has_enough_text = df['combined_text'].str.strip().str.len() > MIN_TEXT_CHARS
df_with_text = df[has_enough_text].copy()

print(f"\n✓ {len(df_with_text)} PRs have sufficient text for classification")
print(f"\nText length statistics:")
print(df_with_text['combined_text'].str.len().describe())

✓ Loaded 3754 rejected PRs

Columns: ['id_comment', 'pull_request_review_id', 'user_comment', 'user_type', 'diff_hunk', 'path', 'position', 'original_position', 'commit_id', 'original_commit_id', 'body_comment', 'pull_request_url', 'created_at_comment', 'updated_at', 'in_reply_to_id', 'id_review', 'pr_id', 'task_type', 'agent_type', 'agent', 'id_pr', 'number', 'title', 'repo_id', 'full_name', 'language', 'stars', 'forks', 'url', 'user_pr', 'user_id', 'created_at_pr', 'closed_at', 'merged_at', 'turnaround_time_hours', 'state', 'pr_outcome', 'body_pr', 'html_url', 'repo_url']

First few rows:


Unnamed: 0,id_comment,pull_request_review_id,user_comment,user_type,diff_hunk,path,position,original_position,commit_id,original_commit_id,...,user_id,created_at_pr,closed_at,merged_at,turnaround_time_hours,state,pr_outcome,body_pr,html_url,repo_url
0,2114184775,2878659478,acornett21,User,"@@ -55,6 +56,7 @@ func (i *Install) BindFlags(...",internal/olm/operator/bundle/install.go,12.0,12,271fcdfb4a6440d3881656924dbf94c79f2d5755,271fcdfb4a6440d3881656924dbf94c79f2d5755,...,11228024,2025-05-28 19:12:52+00:00,2025-05-29 21:24:04+00:00,,26.186667,closed,REJECTED,Add a new --catalog-only flag to the 'operator...,https://github.com/operator-framework/operator...,https://api.github.com/repos/operator-framewor...
1,2114757537,2879625073,kaovilai,User,"@@ -55,6 +56,7 @@ func (i *Install) BindFlags(...",internal/olm/operator/bundle/install.go,12.0,12,271fcdfb4a6440d3881656924dbf94c79f2d5755,271fcdfb4a6440d3881656924dbf94c79f2d5755,...,11228024,2025-05-28 19:12:52+00:00,2025-05-29 21:24:04+00:00,,26.186667,closed,REJECTED,Add a new --catalog-only flag to the 'operator...,https://github.com/operator-framework/operator...,https://api.github.com/repos/operator-framewor...
2,2114361669,2878965930,camilamacedo86,User,"@@ -55,6 +56,7 @@ func (i *Install) BindFlags(...",internal/olm/operator/bundle/install.go,12.0,12,271fcdfb4a6440d3881656924dbf94c79f2d5755,271fcdfb4a6440d3881656924dbf94c79f2d5755,...,11228024,2025-05-28 19:12:52+00:00,2025-05-29 21:24:04+00:00,,26.186667,closed,REJECTED,Add a new --catalog-only flag to the 'operator...,https://github.com/operator-framework/operator...,https://api.github.com/repos/operator-framewor...
3,2114726516,2879573686,joelanford,User,"@@ -55,6 +56,7 @@ func (i *Install) BindFlags(...",internal/olm/operator/bundle/install.go,12.0,12,271fcdfb4a6440d3881656924dbf94c79f2d5755,271fcdfb4a6440d3881656924dbf94c79f2d5755,...,11228024,2025-05-28 19:12:52+00:00,2025-05-29 21:24:04+00:00,,26.186667,closed,REJECTED,Add a new --catalog-only flag to the 'operator...,https://github.com/operator-framework/operator...,https://api.github.com/repos/operator-framewor...
4,2147727534,2929274638,apple-techie,User,"@@ -480,8 +480,8 @@ function getParametersForR...",scripts/modules/config-manager.js,,6,2e314e33c4b4eb86a8119af9697ebbaed5905dc5,a084953dcf1a0ccfe0e6641c0da83327ba3bdb8e,...,203526493,2025-06-15 11:55:29+00:00,2025-06-17 06:37:31+00:00,,42.700556,closed,REJECTED,## Summary\n\nThis PR integrates the Claude Co...,https://github.com/eyaltoledano/claude-task-ma...,https://api.github.com/repos/eyaltoledano/clau...



✓ 3754 PRs have sufficient text for classification

Text length statistics:
count     3754.000000
mean      2427.503729
std       1731.691133
min         24.000000
25%       1042.000000
50%       2151.000000
75%       3249.000000
max      11985.000000
Name: combined_text, dtype: float64


## 6. LLM Classification Function

In [None]:
def classify_pr_rejection(pr_body, review_comments, max_retries=3):
    """
    Classify a single PR rejection using LLM.

    The model reply is parsed as JSON and normalized so that every returned
    dict carries the same core keys whether or not the call succeeded.

    Args:
        pr_body: PR description text
        review_comments: Reviewer comments text
        max_retries: Maximum number of attempts (API call + parsing). Values
            below 1 are treated as 1 so a result dict is always returned.

    Returns:
        dict with keys 'category', 'confidence', 'explanation',
        'secondary_category', 'success' and 'attempt'. On failure, success is
        False, category is 'OTHER' and an 'error' key holds the last error
        message. When the model returns a category that is not a taxonomy
        code, category is 'OTHER' and the model's value is kept under
        'raw_category'.
    """
    prompt = create_classification_prompt(pr_body, review_comments)
    attempts = max(1, max_retries)  # never fall through the loop without trying
    last_error = None

    for attempt in range(attempts):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": "You are an expert software engineering researcher. Respond ONLY with valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,  # Low temperature for consistency
                max_tokens=500,
                response_format={"type": "json_object"}  # Ensure JSON response
            )

            result_text = response.choices[0].message.content
            result = json.loads(result_text)

            # Validate the result (a JSON list/string is as unusable as a
            # dict without 'category', so both trigger a retry)
            if not isinstance(result, dict) or 'category' not in result:
                raise ValueError("Missing 'category' in response")

            result = _normalize_classification(result)

            # Add metadata
            result['success'] = True
            result['attempt'] = attempt + 1

            return result

        except Exception as e:
            # Every failure (API error, empty content, malformed JSON) is retried.
            last_error = e
            if attempt < attempts - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

    # All attempts failed: return a placeholder row flagged success=False so
    # callers can tell it apart from a genuine 'OTHER' classification.
    return {
        'category': 'OTHER',
        'confidence': 'low',
        'explanation': f'Classification failed: {str(last_error)}',
        'secondary_category': None,
        'success': False,
        'error': str(last_error),
        'attempt': attempts
    }


def _normalize_classification(result):
    """Coerce a parsed model reply to the expected schema; mutates and returns `result`."""

    def _as_code(value):
        # Trim/upper-case strings so e.g. " logic_defect " still matches its key.
        return value.strip().upper() if isinstance(value, str) else None

    raw_category = result['category']
    category = _as_code(raw_category)
    if category not in TAXONOMY:
        result['raw_category'] = raw_category  # keep the model's answer for auditing
        category = 'OTHER'
    result['category'] = category

    # Missing or unrecognized confidence is treated conservatively as 'low'.
    confidence = result.get('confidence')
    confidence = confidence.strip().lower() if isinstance(confidence, str) else None
    result['confidence'] = confidence if confidence in ('high', 'medium', 'low') else 'low'

    explanation = result.get('explanation')
    result['explanation'] = '' if explanation is None else str(explanation)

    # Anything that is not a taxonomy code (including the string "null") becomes None.
    secondary = _as_code(result.get('secondary_category'))
    result['secondary_category'] = secondary if secondary in TAXONOMY else None

    return result

# Smoke-test the classifier on the first row before launching the full batch
# (this issues one real API request)
print("Testing classification on first PR...\n")
test_result = classify_pr_rejection(
    df_with_text['body_pr'].iloc[0],
    df_with_text['body_comment'].iloc[0],
)
print("Test Result:", json.dumps(test_result, indent=2), sep="\n")

Testing classification on first PR...

Test Result:
{
  "category": "NOT_COMMUNITY_INTEREST",
  "confidence": "high",
  "explanation": "The reviewer comments indicate that the proposed feature is not needed as there are already existing methods to achieve the same outcome, suggesting it is not aligned with the community's interest or needs.",
  "secondary_category": null,
  "success": true,
  "attempt": 1
}


## 7. Batch Classification with Progress Tracking


In [None]:
# Configuration
SAMPLE_SIZE = 3750  # Rows to classify; lower it (e.g. 500) for a cheaper trial run
# For full dataset: SAMPLE_SIZE = len(df_with_text)

# Sample the data (fixed random_state so the same rows are drawn on every run)
if SAMPLE_SIZE < len(df_with_text):
    df_to_classify = df_with_text.sample(n=SAMPLE_SIZE, random_state=42)
    print(f"Classifying random sample of {SAMPLE_SIZE} PRs")
else:
    df_to_classify = df_with_text.copy()
    print(f"Classifying all {len(df_to_classify)} PRs")

# Storage for results
classification_results = []

# Batch classification with progress bar (one API request per row)
print(f"\nStarting classification...\n")
for idx, row in tqdm(df_to_classify.iterrows(), total=len(df_to_classify), desc="Classifying PRs"):
    result = classify_pr_rejection(
        pr_body=row['body_pr'],
        review_comments=row['body_comment']
    )

    # Add row index for tracking
    result['original_index'] = idx
    classification_results.append(result)

    # Rate limiting (adjust based on API tier)
    time.sleep(0.5)  # 2 requests per second

print("\n✓ Classification complete!")

# Create results DataFrame (one row per classified input row, in the same order)
results_df = pd.DataFrame(classification_results)
assert len(results_df) == len(df_to_classify), "one result is expected per input row"

df_classified = df_to_classify.copy()
# 'success' is carried over as well: failed API calls are stored with category
# 'OTHER', so without the flag they cannot be told apart from genuine
# "Other/Unclear" classifications in the saved file.
for result_column in ('category', 'confidence', 'explanation', 'secondary_category', 'success'):
    if result_column in results_df.columns:
        df_classified[result_column] = results_df[result_column].values
    else:
        # Column absent from every result (or nothing was classified)
        df_classified[result_column] = None

# Success rate
success_rate = results_df['success'].mean() * 100 if len(results_df) else float('nan')
print(f"\nClassification success rate: {success_rate:.1f}%")

# Save results
df_classified.to_csv('/content/pr_classifications_results.csv', index=False)
print("\n✓ Results saved to 'pr_classifications_results.csv'")

Classifying random sample of 3750 PRs

Starting classification...



Classifying PRs: 100%|██████████| 3750/3750 [2:33:05<00:00,  2.45s/it]



✓ Classification complete!

Classification success rate: 94.6%

✓ Results saved to 'pr_classifications_results.csv'


## 8. Results Analysis and Statistics

In [None]:
# Failed API calls are stored with category 'OTHER' and confidence 'low'.
# Exclude them so they are not reported as genuine "Other/Unclear" rejections
# or as low-confidence classifications.
if 'success' in df_classified.columns:
    succeeded_mask = df_classified['success'].astype(bool).values
else:
    # results_df rows are in the same order as df_classified rows
    succeeded_mask = results_df['success'].astype(bool).values
df_analysis = df_classified[succeeded_mask]
n_analyzed = len(df_analysis)
n_failed = len(df_classified) - n_analyzed

# Category distribution
category_counts = df_analysis['category'].value_counts()
category_percentages = (category_counts / n_analyzed * 100).round(2)

# Create summary table (sorted by count, most frequent first)
summary_df = pd.DataFrame({
    # Fall back to the raw code if a category is not a taxonomy key
    'Category': [TAXONOMY.get(cat, {}).get('name', cat) for cat in category_counts.index],
    'Code': category_counts.index,
    'Count': category_counts.values,
    'Percentage': category_percentages.values
})

print("="*80)
print("CLASSIFICATION RESULTS SUMMARY")
print("="*80)
print(f"Analyzed {n_analyzed} successfully classified rows ({n_failed} failed classifications excluded)")
display(summary_df)

# Confidence distribution
print("\n" + "="*80)
print("CONFIDENCE DISTRIBUTION")
print("="*80)
confidence_dist = df_analysis['confidence'].value_counts()
print(confidence_dist)
# Guard against an empty result set (no successful classifications)
high_confidence_rate = (confidence_dist.get('high', 0) / n_analyzed * 100) if n_analyzed else 0.0
print(f"\nHigh confidence rate: {high_confidence_rate:.1f}%")

# Top failure modes
print("\n" + "="*80)
print("TOP 5 FAILURE MODES")
print("="*80)
top_5 = summary_df.head(5)
# Rank by position rather than by index label so the numbering is always 1..5
for rank, (_, row) in enumerate(top_5.iterrows(), start=1):
    print(f"{rank}. {row['Category']} ({row['Code']}): {row['Count']} ({row['Percentage']}%)")

CLASSIFICATION RESULTS SUMMARY


Unnamed: 0,Category,Code,Count,Percentage
0,Specification/Intent Mismatch,SPEC_MISMATCH,924,24.64
1,Logic/Semantic Defects,LOGIC_DEFECT,741,19.76
2,Other/Unclear,OTHER,699,18.64
3,Style/Convention Violations,STYLE_CONVENTION,501,13.36
4,Architectural/Design Misfit,DESIGN_MISFIT,207,5.52
5,Testing Inadequacy,TEST_INADEQUACY,180,4.8
6,Build/CI/Environment Failures,BUILD_CI_FAILURE,171,4.56
7,Alternative/Better Solution Exists,ALTERNATIVE_SOLUTION,88,2.35
8,Process/Policy Violations,POLICY_VIOLATION,82,2.19
9,Not in Community Interest,NOT_COMMUNITY_INTEREST,77,2.05



CONFIDENCE DISTRIBUTION
confidence
high      1753
medium    1347
low        650
Name: count, dtype: int64

High confidence rate: 46.7%

TOP 5 FAILURE MODES
1. Specification/Intent Mismatch (SPEC_MISMATCH): 924 (24.64%)
2. Logic/Semantic Defects (LOGIC_DEFECT): 741 (19.76%)
3. Other/Unclear (OTHER): 699 (18.64%)
4. Style/Convention Violations (STYLE_CONVENTION): 501 (13.36%)
5. Architectural/Design Misfit (DESIGN_MISFIT): 207 (5.52%)
