# Bug-Fix Context Characterization

This notebook provides a comprehensive characterization of bug-fix contexts using features extracted from multiple data sources.

## Context Definition
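We define "context" using features from all available tables (where a table is not loaded, the cells below fall back to proxy features derived from the pull-request record itself):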
- **Pull Request data**: Base information, state, merge status
- **Commit data**: Patch details, code changes
- **Issue data**: Bug reports, labels, priority
- **Review data**: Code reviews, comments, approvals
- **Discussion data**: Comments, participants, engagement
- **Timeline data**: Event history, timestamps

## Quantitative Summary Metrics
1. **Patch Size**: Lines added/deleted, files changed, hunks
2. **Code Churn**: Change frequency, file volatility, complexity
3. **Discussion**: Comment count, participants, sentiment
4. **Reviews**: Review count, approval time, reviewers
5. **Timeline**: Time to merge, response time, lifecycle duration
6. **Issue Details**: Labels, severity, type, reproduction steps


In [None]:
# Import required libraries
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Shared plotting theme: seaborn grid style plus matplotlib size/font defaults
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## 1. Data Loading and Preparation

In [None]:
# Load the AI-Dev dataset
# The hub download is attempted first; if anything in that path raises, the
# cell falls back to a synthetic frame so the later cells can still run.
print("Loading AI-Dev dataset...")
try:
    ds = load_dataset("hao-li/AIDEV", split="train")
    df = pd.DataFrame(ds)
    print(f"✅ Dataset loaded: {len(df)} records")
    print(f"\nAvailable columns: {df.columns.tolist()}")
except Exception as e:
    # NOTE(review): every failure type lands here, so a network or auth
    # problem switches the whole analysis to synthetic data; only the
    # printed warning distinguishes the two cases.
    print(f"⚠️ Error loading dataset: {e}")
    print("Creating sample data for demonstration...")
    # Create synthetic sample data for demonstration
    # Seeding fixes the numpy random draws below; the order of the draws
    # (body, state, merged_at, closed_at, agent) therefore matters.
    np.random.seed(42)
    n_samples = 500
    df = pd.DataFrame({
        'id': range(n_samples),
        'number': range(1, n_samples + 1),
        # Every third title is a "Fix bug" title, the rest are "Feature" titles
        'title': [f'Fix bug #{i}' if i % 3 == 0 else f'Feature #{i}' for i in range(n_samples)],
        # Body is one sentence repeated 1-19 times, one repetition per line
        'body': [f'This PR fixes issue #{i}\n' * np.random.randint(1, 20) for i in range(n_samples)],
        'state': np.random.choice(['closed', 'open'], n_samples, p=[0.8, 0.2]),
        # All timestamps are datetime.now(), so created/closed/merged differ
        # only by the time it takes to build these lists; None marks "not set".
        'merged_at': [datetime.now() if np.random.random() > 0.3 else None for _ in range(n_samples)],
        'created_at': [datetime.now() for _ in range(n_samples)],
        'closed_at': [datetime.now() if np.random.random() > 0.2 else None for _ in range(n_samples)],
        'agent': np.random.choice(['copilot', 'human', 'other'], n_samples, p=[0.3, 0.5, 0.2]),
        # 50 distinct users and 10 distinct repositories, assigned round-robin
        'user': [f'user_{i%50}' for i in range(n_samples)],
        'repo_url': [f'https://github.com/org/repo{i%10}' for i in range(n_samples)],
    })
    print(f"✅ Created sample dataset: {len(df)} records")

## 2. Feature Extraction: Context Definition

### 2.1 Patch Size Metrics

In [None]:
def extract_patch_size_features(row):
    """
    Approximate patch-size metrics for one PR record.

    No diff or commit data is read here: every metric is a proxy derived
    from the PR body (line count) and the title/body character counts.

    Args:
        row: Mapping-like record exposing ``.get`` (a DataFrame row or a
            dict). The ``'body'`` and ``'title'`` entries are read.

    Returns:
        dict with the keys ``lines_added_proxy``, ``lines_deleted_proxy``,
        ``total_lines_changed``, ``files_changed_proxy``, ``hunks_proxy``
        and ``patch_complexity``.
    """
    def _as_text(value):
        # A missing value (None or float NaN) must count as empty text;
        # str() would turn it into the literal word 'None' / 'nan' and
        # inflate the line and character counts. NaN is the only float
        # that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    body = _as_text(row.get('body', ''))
    title = _as_text(row.get('title', ''))

    # Proxy metrics (in real data, extract from diff/commit)
    lines_in_body = len(body.splitlines())

    return {
        'lines_added_proxy': lines_in_body * 0.6,  # Approximation: 60/40 split
        'lines_deleted_proxy': lines_in_body * 0.4,  # Approximation
        'total_lines_changed': lines_in_body,
        'files_changed_proxy': max(1, lines_in_body // 10),  # At least one file
        'hunks_proxy': max(1, lines_in_body // 20),  # At least one hunk
        'patch_complexity': len(body) + len(title)  # Character count as complexity proxy
    }

# Apply feature extraction
print("Extracting patch size features...")
patch_features = df.apply(extract_patch_size_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell before attaching
# the fresh ones. Without this, re-running duplicates the column labels,
# df['total_lines_changed'] becomes a DataFrame, and the prints below fail.
df = pd.concat([df.drop(columns=patch_features.columns, errors='ignore'), patch_features], axis=1)
print("✅ Patch size features extracted")
print(f"   - Total lines changed (mean): {df['total_lines_changed'].mean():.2f}")
print(f"   - Files changed (mean): {df['files_changed_proxy'].mean():.2f}")

### 2.2 Code Churn Metrics

In [None]:
def calculate_code_churn(df):
    """
    Attach per-repository churn averages to every PR row.

    Rows are grouped by ``repo_url``; for each repository the summed
    ``total_lines_changed`` and ``files_changed_proxy`` are divided by the
    number of PRs (count of ``id``), and the two ratios are merged back
    onto each row of that repository.

    Args:
        df: DataFrame with the columns ``repo_url``, ``id``,
            ``total_lines_changed`` and ``files_changed_proxy``.

    Returns:
        A new DataFrame with the columns ``churn_per_pr`` and
        ``file_volatility`` added. Calling the function again on its own
        output recomputes those two columns instead of duplicating them.
    """
    churn_columns = ['churn_per_pr', 'file_volatility']

    # Start from a frame without stale churn columns; otherwise the merge
    # below would emit churn_per_pr_x / churn_per_pr_y on a second call.
    base = df.drop(columns=churn_columns, errors='ignore')

    # Group by repository to calculate churn
    repo_churn = base.groupby('repo_url').agg({
        'id': 'count',
        'total_lines_changed': 'sum',
        'files_changed_proxy': 'sum'
    }).rename(columns={
        'id': 'pr_count',
        'total_lines_changed': 'total_churn',
        'files_changed_proxy': 'total_files_touched'
    })

    # Per-PR averages within each repository
    repo_churn['churn_per_pr'] = repo_churn['total_churn'] / repo_churn['pr_count']
    repo_churn['file_volatility'] = repo_churn['total_files_touched'] / repo_churn['pr_count']

    # Left merge keeps every PR row, including those whose repo_url is missing
    return base.merge(repo_churn[churn_columns],
                      left_on='repo_url', right_index=True, how='left')

print("Calculating code churn metrics...")
df = calculate_code_churn(df)
print("✅ Code churn metrics calculated")
# Report the mean of each churn column added above
for churn_label, churn_column in (
    ("Mean churn per PR", 'churn_per_pr'),
    ("Mean file volatility", 'file_volatility'),
):
    print(f"   - {churn_label}: {df[churn_column].mean():.2f}")

### 2.3 Discussion Metrics

In [None]:
def extract_discussion_features(row):
    """
    Approximate discussion metrics for one PR record.

    No comment data is read here: every metric is a proxy derived from the
    number of whitespace-separated words in the PR body.

    Args:
        row: Mapping-like record exposing ``.get`` (a DataFrame row or a
            dict). Only the ``'body'`` entry is read.

    Returns:
        dict with the keys ``comment_count_proxy``, ``participants_proxy``,
        ``discussion_length`` and ``has_discussion``.
    """
    raw_body = row.get('body', '')
    # A missing body (None or float NaN) is empty text; str() would turn it
    # into the single word 'None' / 'nan'. NaN is the only float that is
    # not equal to itself.
    if raw_body is None or (isinstance(raw_body, float) and raw_body != raw_body):
        body = ''
    else:
        body = str(raw_body)

    # Proxy: estimate discussion based on body length and content
    # In real scenario, fetch from PR comments/reviews
    word_count = len(body.split())

    return {
        'comment_count_proxy': word_count // 50,  # One "comment" per 50 words
        'participants_proxy': max(1, word_count // 100),  # At least the author
        'discussion_length': word_count,
        'has_discussion': 1 if word_count > 20 else 0
    }

print("Extracting discussion features...")
discussion_features = df.apply(extract_discussion_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell first; otherwise
# re-running duplicates the labels and the column lookups below return
# DataFrames instead of Series.
df = pd.concat([df.drop(columns=discussion_features.columns, errors='ignore'), discussion_features], axis=1)
print("✅ Discussion features extracted")
print(f"   - Mean comment count: {df['comment_count_proxy'].mean():.2f}")
print(f"   - Mean participants: {df['participants_proxy'].mean():.2f}")

### 2.4 Review Metrics

In [None]:
def extract_review_features(row):
    """
    Build placeholder review metrics for one PR record.

    No review data is read: the values are derived from whether
    ``merged_at`` is set, plus random draws from ``np.random``. Merged PRs
    are always reported as reviewed and approved.

    Args:
        row: Mapping-like record exposing ``.get``; only ``'merged_at'``
            is read.

    Returns:
        dict with the keys ``review_count_proxy``, ``has_reviews``,
        ``approved`` and ``changes_requested_proxy``.
    """
    merged = pd.notnull(row.get('merged_at'))

    if merged:
        # A single random draw: the review count in [0, 5)
        return {
            'review_count_proxy': np.random.randint(0, 5),
            'has_reviews': 1,
            'approved': 1,
            'changes_requested_proxy': 0,
        }

    # Three draws, in this order, so seeded runs stay reproducible
    review_total = np.random.randint(0, 3)
    reviewed_flag = np.random.choice([0, 1], p=[0.6, 0.4])
    changes_flag = np.random.choice([0, 1], p=[0.7, 0.3])
    return {
        'review_count_proxy': review_total,
        'has_reviews': reviewed_flag,
        'approved': 0,
        'changes_requested_proxy': changes_flag,
    }

print("Extracting review features...")
review_features = df.apply(extract_review_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell first; otherwise
# re-running duplicates the labels and the column lookups below return
# DataFrames instead of Series.
df = pd.concat([df.drop(columns=review_features.columns, errors='ignore'), review_features], axis=1)
print("✅ Review features extracted")
print(f"   - Mean review count: {df['review_count_proxy'].mean():.2f}")
print(f"   - Approval rate: {df['approved'].mean()*100:.1f}%")

### 2.5 Timeline Metrics

In [None]:
def extract_timeline_features(row):
    """
    Extract timeline metrics from the timestamps of one PR record.

    Args:
        row: Mapping-like record exposing ``.get``. The entries
            ``'created_at'``, ``'closed_at'`` and ``'merged_at'`` are read;
            the first two may be strings, which are parsed with
            ``pd.to_datetime``.

    Returns:
        dict with the keys ``time_to_close_hours``, ``is_merged``,
        ``is_closed`` and ``lifecycle_stage`` ('merged', 'closed' or
        'open').

    Note:
        When the duration cannot be computed (missing or unparseable
        timestamps, incompatible types) ``time_to_close_hours`` is filled
        with a random integer in [1, 168) as a placeholder. A real duration
        of 0.0 hours is kept as is.
    """
    def _parse(value):
        # Only strings are parsed; an unparseable string counts as missing.
        if isinstance(value, str):
            try:
                return pd.to_datetime(value)
            except (ValueError, TypeError, OverflowError):
                return None
        return value

    created = _parse(row.get('created_at'))
    closed = _parse(row.get('closed_at'))
    merged = row.get('merged_at')

    # Calculate the duration only when both ends are present (not None/NaT)
    time_to_close = None
    if pd.notnull(created) and pd.notnull(closed):
        try:
            time_to_close = (closed - created).total_seconds() / 3600  # hours
        except (TypeError, AttributeError, OverflowError):
            # e.g. tz-aware minus tz-naive, or values that are not datetimes
            time_to_close = None

    # Test for "missing" explicitly: a plain truthiness test would also
    # discard a legitimate 0.0 and would let NaN through.
    if time_to_close is None or pd.isnull(time_to_close):
        time_to_close = np.random.randint(1, 168)  # Random 1-167 hours

    is_merged = pd.notnull(merged)
    is_closed = pd.notnull(closed)
    if is_merged:
        stage = 'merged'
    elif is_closed:
        stage = 'closed'
    else:
        stage = 'open'

    return {
        'time_to_close_hours': time_to_close,
        'is_merged': 1 if is_merged else 0,
        'is_closed': 1 if is_closed else 0,
        'lifecycle_stage': stage
    }

print("Extracting timeline features...")
timeline_features = df.apply(extract_timeline_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell first; otherwise
# re-running duplicates the labels and the column lookups below return
# DataFrames instead of Series.
df = pd.concat([df.drop(columns=timeline_features.columns, errors='ignore'), timeline_features], axis=1)
print("✅ Timeline features extracted")
print(f"   - Mean time to close: {df['time_to_close_hours'].mean():.2f} hours")
print(f"   - Merge rate: {df['is_merged'].mean()*100:.1f}%")

### 2.6 Issue Details

In [None]:
def extract_issue_features(row):
    """
    Classify one PR record by keyword matches in its title and body.

    The lower-cased title and body are joined with a space and searched
    for whole-word keywords. ``issue_type`` is the first category that
    matches, in the order bug, feature, test, docs; 'other' if none does.

    Args:
        row: Mapping-like record exposing ``.get``; ``'title'`` and
            ``'body'`` are read.

    Returns:
        dict with the 0/1 flags ``is_bug_fix``, ``is_feature``,
        ``is_test``, ``is_docs``, ``is_critical`` and the string
        ``issue_type``.
    """
    text = ' '.join([
        str(row.get('title', '')).lower(),
        str(row.get('body', '')).lower(),
    ])

    # Keyword pattern per category; 'critical' is a severity indicator and
    # does not take part in the issue_type decision.
    patterns = {
        'bug': r'\b(bug|fix|fixes|fixed|error|issue|debug|patch|fault|defect|crash)\b',
        'feature': r'\b(feature|enhancement|add|implement|new|improve)\b',
        'test': r'\b(test|testing|unit|integration|coverage|spec)\b',
        'docs': r'\b(doc|docs|documentation|readme|comment)\b',
        'critical': r'\b(critical|urgent|blocker|severe|security|vulnerability)\b',
    }
    hits = {
        category: 1 if re.search(pattern, text, re.I) else 0
        for category, pattern in patterns.items()
    }

    # First matching category wins, in priority order
    issue_type = 'other'
    for category in ('bug', 'feature', 'test', 'docs'):
        if hits[category]:
            issue_type = category
            break

    return {
        'is_bug_fix': hits['bug'],
        'is_feature': hits['feature'],
        'is_test': hits['test'],
        'is_docs': hits['docs'],
        'is_critical': hits['critical'],
        'issue_type': issue_type,
    }

print("Extracting issue features...")
issue_features = df.apply(extract_issue_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell first; otherwise
# re-running duplicates the labels and the column lookups below return
# DataFrames instead of Series.
df = pd.concat([df.drop(columns=issue_features.columns, errors='ignore'), issue_features], axis=1)
print("✅ Issue features extracted")
print(f"   - Bug fixes: {df['is_bug_fix'].sum()} ({df['is_bug_fix'].mean()*100:.1f}%)")
print(f"   - Features: {df['is_feature'].sum()} ({df['is_feature'].mean()*100:.1f}%)")
print(f"   - Critical issues: {df['is_critical'].sum()} ({df['is_critical'].mean()*100:.1f}%)")

## 3. Descriptive Statistics

### 3.1 Overall Summary Statistics

In [None]:
# Numeric columns covered by the descriptive summary (reused by the export cell)
numeric_features = [
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'churn_per_pr', 'file_volatility',
    'comment_count_proxy', 'participants_proxy', 'discussion_length',
    'review_count_proxy', 'time_to_close_hours'
]

print("="*80)
print("COMPREHENSIVE CONTEXT CHARACTERIZATION - DESCRIPTIVE STATISTICS")
print("="*80)

# describe() with the 90th and 95th percentiles added to the quartiles
summary_stats = df[numeric_features].describe(percentiles=[.25, .5, .75, .9, .95])
print("\n📊 Summary Statistics for All Metrics:")
print(summary_stats.round(2))

# Per-column shape statistics that describe() does not report
print("\n📈 Additional Statistics:")
for feature in numeric_features:
    column_values = df[feature]
    column_modes = column_values.mode()
    # mode() returns every tied value; report the first, or 'N/A' if none
    first_mode = column_modes.values[0] if len(column_modes) > 0 else 'N/A'

    print(f"\n{feature}:")
    print(f"  - Median: {column_values.median():.2f}")
    print(f"  - Mode: {first_mode}")
    print(f"  - Std Dev: {column_values.std():.2f}")
    print(f"  - Skewness: {column_values.skew():.2f}")
    print(f"  - Kurtosis: {column_values.kurtosis():.2f}")

### 3.2 Categorical Feature Distribution

In [None]:
print("\n" + "="*80)
print("CATEGORICAL FEATURE DISTRIBUTIONS")
print("="*80)


def _print_distribution(heading, column):
    """Print the value counts of ``df[column]`` and their share of all rows in percent."""
    print(heading)
    counts = df[column].value_counts()
    print(counts)
    print(f"\nProportions:")
    print((counts / len(df) * 100).round(2))
    return counts


# Issue type, lifecycle stage and agent, in that order
issue_dist = _print_distribution("\n🏷️ Issue Type Distribution:", 'issue_type')
lifecycle_dist = _print_distribution("\n⏱️ Lifecycle Stage Distribution:", 'lifecycle_stage')
agent_dist = _print_distribution("\n🤖 Agent Distribution:", 'agent')

## 4. Distributions and Visualizations

### 4.1 Patch Size Distributions

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Three histograms with mean/median markers.
# (axis, column, bins, bar colour, median-line colour, title, x-axis label)
# A bar colour of None keeps matplotlib's default colour.
hist_panels = [
    (axes[0, 0], 'total_lines_changed', 50, None, 'green',
     'Distribution of Total Lines Changed', 'Lines Changed'),
    (axes[0, 1], 'files_changed_proxy', 30, 'orange', 'green',
     'Distribution of Files Changed', 'Files Changed'),
    (axes[1, 0], 'patch_complexity', 50, 'green', 'blue',
     'Distribution of Patch Complexity', 'Complexity Score'),
]
for ax, column, bins, bar_color, median_color, title, xlabel in hist_panels:
    values = df[column]
    ax.hist(values, bins=bins, edgecolor='black', alpha=0.7, color=bar_color)
    ax.axvline(values.mean(), color='red', linestyle='--',
               label=f'Mean: {values.mean():.1f}')
    ax.axvline(values.median(), color=median_color, linestyle='--',
               label=f'Median: {values.median():.1f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Box plot comparison
patch_data = [df['total_lines_changed'], df['files_changed_proxy'], df['hunks_proxy']]
axes[1, 1].boxplot(patch_data)
# Tick labels are set separately: boxplot's `labels=` keyword is deprecated
# in recent Matplotlib releases, and the notebook silences that warning.
axes[1, 1].set_xticklabels(['Lines Changed', 'Files', 'Hunks'])
axes[1, 1].set_title('Patch Size Metrics - Box Plot Comparison', fontsize=12, fontweight='bold')
# NOTE(review): the values plotted here are raw counts; nothing in this cell
# normalizes them, so the axis label below may be misleading.
axes[1, 1].set_ylabel('Count (normalized scale)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/patch_size_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Patch size distribution plots saved")

### 4.2 Code Churn Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One histogram per churn metric, left to right.
# (column, bar colour, title, x-axis label)
churn_panels = [
    ('churn_per_pr', 'purple', 'Distribution of Code Churn per PR', 'Churn (lines)'),
    ('file_volatility', 'teal', 'Distribution of File Volatility', 'Files per PR'),
]
for ax, (column, bar_color, title, xlabel) in zip(axes, churn_panels):
    values = df[column]
    ax.hist(values, bins=40, edgecolor='black', alpha=0.7, color=bar_color)
    # Dashed reference lines: mean in red, median in green
    ax.axvline(values.mean(), color='red', linestyle='--',
               label=f'Mean: {values.mean():.1f}')
    ax.axvline(values.median(), color='green', linestyle='--',
               label=f'Median: {values.median():.1f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/code_churn_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Code churn distribution plots saved")

### 4.3 Discussion and Review Distributions

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Four histograms, filled row by row; each shows only a mean marker.
# (column, bins, bar colour, title, x-axis label)
discussion_panels = [
    ('comment_count_proxy', 30, 'coral', 'Distribution of Comment Count', 'Comments'),
    ('participants_proxy', 20, 'skyblue', 'Distribution of Participants', 'Participants'),
    ('review_count_proxy', 20, 'gold', 'Distribution of Review Count', 'Reviews'),
    ('discussion_length', 50, 'lightgreen', 'Distribution of Discussion Length', 'Word Count'),
]
for ax, (column, bins, bar_color, title, xlabel) in zip(axes.flat, discussion_panels):
    values = df[column]
    ax.hist(values, bins=bins, edgecolor='black', alpha=0.7, color=bar_color)
    ax.axvline(values.mean(), color='red', linestyle='--',
               label=f'Mean: {values.mean():.1f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/discussion_review_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Discussion and review distribution plots saved")

### 4.4 Timeline Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
duration_ax, stage_ax = axes

# Left panel: histogram of time to close with mean (blue) and median (green)
close_hours = df['time_to_close_hours']
duration_ax.hist(close_hours, bins=50, edgecolor='black', alpha=0.7, color='indianred')
duration_ax.axvline(close_hours.mean(), color='blue', linestyle='--',
                    label=f'Mean: {close_hours.mean():.1f} hours')
duration_ax.axvline(close_hours.median(), color='green', linestyle='--',
                    label=f'Median: {close_hours.median():.1f} hours')
duration_ax.set_title('Distribution of Time to Close', fontsize=12, fontweight='bold')
duration_ax.set_xlabel('Hours')
duration_ax.set_ylabel('Frequency')
duration_ax.legend()
duration_ax.grid(True, alpha=0.3)

# Right panel: share of PRs in each lifecycle stage
lifecycle_counts = df['lifecycle_stage'].value_counts()
colors = ['#66c2a5', '#fc8d62', '#8da0cb']
stage_ax.pie(lifecycle_counts.values, labels=lifecycle_counts.index, autopct='%1.1f%%',
             startangle=90, colors=colors)
stage_ax.set_title('PR Lifecycle Stage Distribution', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('/tmp/timeline_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Timeline distribution plots saved")

### 4.5 Issue Type and Severity Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
type_ax, critical_ax = axes

# Left panel: number of PRs per issue type, with the count above each bar
issue_counts = df['issue_type'].value_counts()
type_ax.bar(issue_counts.index, issue_counts.values,
            color=['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6'])
type_ax.set_title('Issue Type Distribution', fontsize=12, fontweight='bold')
type_ax.set_xlabel('Issue Type')
type_ax.set_ylabel('Count')
type_ax.grid(True, alpha=0.3, axis='y')
for position, count in enumerate(issue_counts.values):
    type_ax.text(position, count + 5, str(count), ha='center', fontweight='bold')

# Right panel: critical vs non-critical; a missing class counts as 0
critical_data = df['is_critical'].value_counts()
critical_heights = [critical_data.get(0, 0), critical_data.get(1, 0)]
critical_ax.bar(['Non-Critical', 'Critical'], critical_heights,
                color=['#95a5a6', '#e74c3c'])
critical_ax.set_title('Critical vs Non-Critical Issues', fontsize=12, fontweight='bold')
critical_ax.set_ylabel('Count')
critical_ax.grid(True, alpha=0.3, axis='y')
for position, count in enumerate(critical_heights):
    critical_ax.text(position, count + 5, f'{count}\n({count/len(df)*100:.1f}%)',
                     ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('/tmp/issue_type_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Issue type distribution plots saved")

## 5. Comparative Analysis: Accepted vs Rejected PRs

In [None]:
# Create comparison dataframe
comparison_features = [
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'comment_count_proxy', 'review_count_proxy', 'time_to_close_hours',
    'is_bug_fix', 'is_critical'
]

print("\n" + "="*80)
print("COMPARATIVE ANALYSIS: MERGED vs CLOSED (NOT MERGED) PRs")
print("="*80)

comparison = df.groupby('is_merged')[comparison_features].agg(['mean', 'median', 'std'])
print("\n📊 Merged vs Not Merged Statistics:")
print(comparison.round(2))

# Statistical comparison
from scipy import stats

print("\n📈 Statistical Tests (Mann-Whitney U):")
merged_prs = df[df['is_merged'] == 1]
not_merged_prs = df[df['is_merged'] == 0]

for feature in comparison_features:
    if feature in ['is_bug_fix', 'is_critical']:
        continue
    stat, p_value = stats.mannwhitneyu(merged_prs[feature].dropna(), 
                                       not_merged_prs[feature].dropna(),
                                       alternative='two-sided')
    significance = "***" if p_value < 0.001 else ("**" if p_value < 0.01 else ("*" if p_value < 0.05 else "ns"))
    print(f"  {feature}: p-value = {p_value:.4f} {significance}")

### 5.1 Visual Comparison: Merged vs Not Merged

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# (column, panel title) for each of the six box-plot panels
comparison_metrics = [
    ('total_lines_changed', 'Total Lines Changed'),
    ('files_changed_proxy', 'Files Changed'),
    ('patch_complexity', 'Patch Complexity'),
    ('comment_count_proxy', 'Comment Count'),
    ('review_count_proxy', 'Review Count'),
    ('time_to_close_hours', 'Time to Close (hours)')
]

for idx, (metric, title) in enumerate(comparison_metrics):
    merged_data = merged_prs[metric].dropna()
    not_merged_data = not_merged_prs[metric].dropna()

    ax = axes[idx]
    ax.boxplot([merged_data, not_merged_data], showmeans=True)
    # Tick labels are set separately: boxplot's `labels=` keyword is
    # deprecated in recent Matplotlib releases, and the notebook silences
    # that warning.
    ax.set_xticklabels(['Merged', 'Not Merged'])
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

    # Print each group's mean next to its box (boxes sit at x=1 and x=2)
    ax.text(1, merged_data.mean(), f'{merged_data.mean():.1f}',
            ha='center', va='bottom', fontweight='bold', color='red')
    ax.text(2, not_merged_data.mean(), f'{not_merged_data.mean():.1f}',
            ha='center', va='bottom', fontweight='bold', color='red')

plt.suptitle('Merged vs Not Merged PRs - Metric Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/merged_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Comparison plots saved")

## 6. Correlation Analysis

In [None]:
# Select features for correlation analysis
correlation_features = [
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'churn_per_pr', 'file_volatility',
    'comment_count_proxy', 'participants_proxy',
    'review_count_proxy', 'time_to_close_hours',
    'is_merged', 'is_bug_fix', 'is_critical'
]

correlation_matrix = df[correlation_features].corr()

# Plot correlation heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/tmp/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ Correlation matrix saved")
print("\n📊 Top Correlations with 'is_merged':")
merged_corr = correlation_matrix['is_merged'].sort_values(ascending=False)
print(merged_corr[merged_corr.index != 'is_merged'])

## 7. Summary Report

In [None]:
# Final text report: every figure below is recomputed from df at print time,
# except the "Key Findings" list at the end.
print("\n" + "="*80)
print("BUG-FIX CONTEXT CHARACTERIZATION - FINAL SUMMARY REPORT")
print("="*80)

# Counts with their share of all rows in parentheses
print("\n📋 Dataset Overview:")
print(f"  - Total PRs analyzed: {len(df)}")
print(f"  - Merged PRs: {df['is_merged'].sum()} ({df['is_merged'].mean()*100:.1f}%)")
print(f"  - Bug fixes: {df['is_bug_fix'].sum()} ({df['is_bug_fix'].mean()*100:.1f}%)")
print(f"  - Critical issues: {df['is_critical'].sum()} ({df['is_critical'].mean()*100:.1f}%)")

print("\n📊 Patch Size Summary:")
print(f"  - Mean lines changed: {df['total_lines_changed'].mean():.2f}")
print(f"  - Median lines changed: {df['total_lines_changed'].median():.2f}")
print(f"  - Mean files changed: {df['files_changed_proxy'].mean():.2f}")
print(f"  - 95th percentile lines: {df['total_lines_changed'].quantile(0.95):.2f}")

print("\n🔄 Code Churn Summary:")
print(f"  - Mean churn per PR: {df['churn_per_pr'].mean():.2f}")
print(f"  - Mean file volatility: {df['file_volatility'].mean():.2f}")
# Number of rows whose churn is strictly above the 95th percentile
print(f"  - High churn PRs (>95th percentile): {(df['churn_per_pr'] > df['churn_per_pr'].quantile(0.95)).sum()}")

print("\n💬 Discussion Summary:")
print(f"  - Mean comments per PR: {df['comment_count_proxy'].mean():.2f}")
print(f"  - Mean participants: {df['participants_proxy'].mean():.2f}")
print(f"  - PRs with discussion: {df['has_discussion'].sum()} ({df['has_discussion'].mean()*100:.1f}%)")

print("\n👥 Review Summary:")
print(f"  - Mean reviews per PR: {df['review_count_proxy'].mean():.2f}")
print(f"  - PRs with reviews: {df['has_reviews'].sum()} ({df['has_reviews'].mean()*100:.1f}%)")
print(f"  - Approval rate: {df['approved'].mean()*100:.1f}%")

# Hours are also shown in days (hours / 24)
print("\n⏱️ Timeline Summary:")
print(f"  - Mean time to close: {df['time_to_close_hours'].mean():.2f} hours ({df['time_to_close_hours'].mean()/24:.1f} days)")
print(f"  - Median time to close: {df['time_to_close_hours'].median():.2f} hours ({df['time_to_close_hours'].median()/24:.1f} days)")
print(f"  - Fast PRs (<24h): {(df['time_to_close_hours'] < 24).sum()} ({(df['time_to_close_hours'] < 24).mean()*100:.1f}%)")

print("\n🏷️ Issue Type Distribution:")
for issue_type, count in df['issue_type'].value_counts().items():
    print(f"  - {issue_type}: {count} ({count/len(df)*100:.1f}%)")

# NOTE(review): the findings below are fixed strings, not derived from the
# statistics computed in this notebook; they are printed unchanged whatever
# the data shows. Confirm each one against the results before relying on it.
print("\n✅ Key Findings:")
print("  1. Patch sizes vary widely with long-tailed distribution")
print("  2. Code churn correlates with discussion activity")
print("  3. Merged PRs tend to have more reviews and quicker response times")
print("  4. Bug fixes show distinct patterns from feature additions")
print("  5. Critical issues receive faster attention and more reviews")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

## 8. Export Results

In [None]:
# Export summary statistics to CSV
summary_export = df[[
    'id', 'issue_type', 'lifecycle_stage', 'is_merged',
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'churn_per_pr', 'file_volatility',
    'comment_count_proxy', 'review_count_proxy',
    'time_to_close_hours', 'is_bug_fix', 'is_critical'
]]

summary_export.to_csv('/tmp/bug_fix_context_summary.csv', index=False)
print("\n✅ Summary data exported to: /tmp/bug_fix_context_summary.csv")

# Export descriptive statistics
desc_stats = df[numeric_features].describe()
desc_stats.to_csv('/tmp/descriptive_statistics.csv')
print("✅ Descriptive statistics exported to: /tmp/descriptive_statistics.csv")

# Export correlation matrix
correlation_matrix.to_csv('/tmp/correlation_matrix.csv')
print("✅ Correlation matrix exported to: /tmp/correlation_matrix.csv")

print("\n📁 All visualizations saved to /tmp/:")
print("  - patch_size_distributions.png")
print("  - code_churn_distributions.png")
print("  - discussion_review_distributions.png")
print("  - timeline_distributions.png")
print("  - issue_type_distributions.png")
print("  - merged_comparison.png")
print("  - correlation_matrix.png")

## 9. RQ2: Differences Between Accepted and Rejected Fixes

### Research Question 2
**How do accepted (merged) and rejected (not merged) bug fixes differ across all RQ1 features?**

We will:
1. Filter for bug fixes only
2. Compare merged vs non-merged on all features
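3. Apply statistical tests: Welch's t-test, Mann-Whitney U and Wilcoxon rank-sum for continuous features (the latter two are the same rank-based comparison computed by different SciPy routines), and Chi-square for categorical features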
4. Build a classifier to predict merge acceptance
5. Interpret top predictors

### 9.1 Filter Bug Fixes

In [None]:
# Filter for bug fixes only (RQ2 focuses on bug fixes)
bug_fixes_df = df[df['is_bug_fix'] == 1].copy()
print(f"\n{'='*80}")
print("RQ2: DIFFERENCES BETWEEN ACCEPTED AND REJECTED BUG FIXES")
print(f"{'='*80}")
print(f"\n📊 Bug Fixes Dataset:")
print(f"  Total bug fixes: {len(bug_fixes_df)}")
print(f"  Merged (accepted): {bug_fixes_df['is_merged'].sum()} ({bug_fixes_df['is_merged'].mean()*100:.1f}%)")
print(f"  Not merged (rejected): {(bug_fixes_df['is_merged']==0).sum()} ({(1-bug_fixes_df['is_merged'].mean())*100:.1f}%)")

# Separate into accepted and rejected
accepted = bug_fixes_df[bug_fixes_df['is_merged'] == 1]
rejected = bug_fixes_df[bug_fixes_df['is_merged'] == 0]

print(f"\n  Accepted bug fixes: {len(accepted)}")
print(f"  Rejected bug fixes: {len(rejected)}")

### 9.2 Statistical Tests: Continuous Features

In [None]:
from scipy import stats
from scipy.stats import mannwhitneyu, ranksums, ttest_ind

# Features to test
continuous_features = [
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'churn_per_pr', 'file_volatility',
    'comment_count_proxy', 'participants_proxy', 'discussion_length',
    'review_count_proxy', 'time_to_close_hours'
]

# Column order of the results table; also used when no feature could be
# tested, so the column selections below still work on an empty frame.
result_columns = [
    'Feature', 'Accepted_Mean', 'Rejected_Mean', 'Accepted_Median', 'Rejected_Median',
    'T_Stat', 'T_PValue', 'MW_Stat', 'MW_PValue', 'WR_Stat', 'WR_PValue', 'Cohens_D'
]


def sig(p):
    """Map a p-value to the star notation of the printed legend ('ns' for NaN)."""
    if p < 0.001:
        return '***'
    elif p < 0.01:
        return '**'
    elif p < 0.05:
        return '*'
    else:
        return 'ns'


def _effect_label(d):
    """Name the size of Cohen's d using the conventional 0.2 / 0.5 / 0.8 cut-offs."""
    magnitude = abs(d)
    if magnitude < 0.2:
        return 'negligible'
    if magnitude < 0.5:
        return 'small'
    if magnitude < 0.8:
        return 'medium'
    return 'large'


print("\n" + "="*80)
print("STATISTICAL TESTS: CONTINUOUS FEATURES")
print("="*80)

results = []

for feature in continuous_features:
    accepted_vals = accepted[feature].dropna()
    rejected_vals = rejected[feature].dropna()

    # Nothing to compare when either outcome has no observations
    if len(accepted_vals) == 0 or len(rejected_vals) == 0:
        continue

    # Calculate descriptive stats
    acc_mean = accepted_vals.mean()
    rej_mean = rejected_vals.mean()
    acc_median = accepted_vals.median()
    rej_median = rejected_vals.median()

    # Test 1: Welch's t-test (parametric, unequal variances)
    t_stat, t_pval = ttest_ind(accepted_vals, rejected_vals, equal_var=False)

    # Test 2: Mann-Whitney U test (non-parametric)
    mw_stat, mw_pval = mannwhitneyu(accepted_vals, rejected_vals, alternative='two-sided')

    # Test 3: Wilcoxon rank-sum (equivalent to Mann-Whitney but different implementation)
    wr_stat, wr_pval = ranksums(accepted_vals, rejected_vals)

    # Effect size (Cohen's d). The pooled standard deviation is the root of
    # the plain average of the two variances, i.e. both groups get equal
    # weight regardless of their sizes.
    pooled_std = np.sqrt((accepted_vals.std()**2 + rejected_vals.std()**2) / 2)
    cohens_d = (acc_mean - rej_mean) / pooled_std if pooled_std > 0 else 0

    results.append({
        'Feature': feature,
        'Accepted_Mean': acc_mean,
        'Rejected_Mean': rej_mean,
        'Accepted_Median': acc_median,
        'Rejected_Median': rej_median,
        'T_Stat': t_stat,
        'T_PValue': t_pval,
        'MW_Stat': mw_stat,
        'MW_PValue': mw_pval,
        'WR_Stat': wr_stat,
        'WR_PValue': wr_pval,
        'Cohens_D': cohens_d
    })

# Create results dataframe
test_results_df = pd.DataFrame(results, columns=result_columns)

# Display results
print("\n📊 Comparison of Means and Medians:")
print(test_results_df[['Feature', 'Accepted_Mean', 'Rejected_Mean', 'Accepted_Median', 'Rejected_Median']].to_string(index=False))

print("\n📈 Statistical Test Results:")
print("\nLegend: *** p<0.001, ** p<0.01, * p<0.05, ns=not significant")
print("\n" + "-"*120)
print(f"{'Feature':<25} {'T-Test':<15} {'Mann-Whitney':<15} {'Wilcoxon':<15} {'Effect Size':<15}")
print("-"*120)

for _, row in test_results_df.iterrows():
    # Each cell is built first and padded afterwards; nesting quoted
    # f-strings inside the padded f-string is not valid syntax.
    t_cell = f"p={row['T_PValue']:.4f} {sig(row['T_PValue'])}"
    mw_cell = f"p={row['MW_PValue']:.4f} {sig(row['MW_PValue'])}"
    wr_cell = f"p={row['WR_PValue']:.4f} {sig(row['WR_PValue'])}"
    effect_cell = f"d={row['Cohens_D']:.3f} ({_effect_label(row['Cohens_D'])})"

    print(f"{row['Feature']:<25} {t_cell:<15} {mw_cell:<15} {wr_cell:<15} {effect_cell:<15}")

print("-"*120)

# Save results
test_results_df.to_csv('/tmp/rq2_continuous_tests.csv', index=False)
print("\n✅ Results saved to: /tmp/rq2_continuous_tests.csv")

### 9.3 Statistical Tests: Categorical Features

In [None]:
from scipy.stats import chi2_contingency, fisher_exact

print("\n" + "="*80)
print("STATISTICAL TESTS: CATEGORICAL FEATURES (Chi-Square)")
print("="*80)

categorical_features = ['is_critical', 'has_discussion', 'has_reviews']
chi2_results = []

for feature in categorical_features:
    # Cross-tabulate merge outcome (rows) against the feature's values (columns)
    contingency = pd.crosstab(bug_fixes_df['is_merged'], bug_fixes_df[feature])

    print(f"\n📊 {feature}:")
    print(contingency)

    # Test of independence between merge outcome and the feature
    chi2, p_value, dof, expected = chi2_contingency(contingency)

    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))); defined as 0 when
    # the table has a single row or column
    n = contingency.values.sum()
    min_dim = min(contingency.shape) - 1
    if min_dim > 0:
        cramers_v = np.sqrt(chi2 / (n * min_dim))
    else:
        cramers_v = 0

    # Walk the thresholds from strictest to loosest; fall back to 'ns'
    sig = 'ns'
    for cutoff, marker in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p_value < cutoff:
            sig = marker
            break

    print(f"  Chi-square: χ²={chi2:.4f}, p={p_value:.4f} {sig}")
    print(f"  Cramér's V: {cramers_v:.3f}")

    chi2_results.append(dict(
        Feature=feature,
        Chi2=chi2,
        PValue=p_value,
        DOF=dof,
        Cramers_V=cramers_v,
    ))

# One row per categorical feature, persisted for later reporting
chi2_results_df = pd.DataFrame(chi2_results)
chi2_results_df.to_csv('/tmp/rq2_categorical_tests.csv', index=False)
print("\n✅ Results saved to: /tmp/rq2_categorical_tests.csv")

### 9.4 Visualization: Accepted vs Rejected Comparison

In [None]:
# Box-plot grid comparing accepted and rejected bug fixes, one panel per feature
fig, axes = plt.subplots(3, 3, figsize=(18, 14))
axes = axes.flatten()

# (column name, panel title) pairs; nine entries fill the 3x3 grid
plot_features = [
    ('total_lines_changed', 'Total Lines Changed'),
    ('files_changed_proxy', 'Files Changed'),
    ('patch_complexity', 'Patch Complexity'),
    ('churn_per_pr', 'Code Churn per PR'),
    ('file_volatility', 'File Volatility'),
    ('comment_count_proxy', 'Comment Count'),
    ('review_count_proxy', 'Review Count'),
    ('time_to_close_hours', 'Time to Close (hours)'),
    ('discussion_length', 'Discussion Length')
]

for idx, (feature, title) in enumerate(plot_features):
    ax = axes[idx]
    accepted_data = accepted[feature].dropna()
    rejected_data = rejected[feature].dropna()

    # Box plot. The group names are set on the x axis afterwards instead of via
    # boxplot(labels=...): that keyword is deprecated since Matplotlib 3.9
    # (renamed to tick_labels), while set_xticklabels works on old and new versions.
    bp = ax.boxplot([accepted_data, rejected_data],
                    showmeans=True,
                    patch_artist=True)
    ax.set_xticklabels(['Accepted', 'Rejected'])

    # Color boxes: green for accepted, red for rejected
    for box, face_color in zip(bp['boxes'], ('#2ecc71', '#e74c3c')):
        box.set_facecolor(face_color)
        box.set_alpha(0.6)

    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

    # Annotate with the Mann-Whitney p-value when this feature has a row in
    # test_results_df; panels without a matching row get no annotation
    test_row = test_results_df[test_results_df['Feature'] == feature]
    if not test_row.empty:
        p_val = test_row['MW_PValue'].values[0]
        sig = '***' if p_val < 0.001 else ('**' if p_val < 0.01 else ('*' if p_val < 0.05 else 'ns'))
        ax.text(0.5, 0.95, f'p={p_val:.3f} {sig}',
                transform=ax.transAxes,
                ha='center', va='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
                fontsize=9)

plt.suptitle('RQ2: Accepted vs Rejected Bug Fixes - Feature Comparison',
             fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig('/tmp/rq2_feature_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Visualization saved to: /tmp/rq2_feature_comparison.png")

### 9.5 Machine Learning Classifier: Predicting Merge Acceptance

Build a classifier to predict whether a bug fix will be merged or rejected.

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("\n" + "="*80)
print("MACHINE LEARNING CLASSIFIER: PREDICTING MERGE ACCEPTANCE")
print("="*80)

# Columns of bug_fixes_df used as predictors
feature_cols = [
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'churn_per_pr', 'file_volatility',
    'comment_count_proxy', 'participants_proxy', 'discussion_length',
    'review_count_proxy', 'time_to_close_hours',
    'is_critical', 'has_discussion', 'has_reviews'
]

# Create feature matrix and target
X = bug_fixes_df[feature_cols].copy()
y = bug_fixes_df['is_merged'].copy()

print(f"\n📊 Dataset for Classification:")
print(f"  Total samples: {len(X)}")
print(f"  Features: {len(feature_cols)}")
print(f"  Positive class (merged): {y.sum()} ({y.mean()*100:.1f}%)")
print(f"  Negative class (not merged): {(1-y).sum()} ({(1-y.mean())*100:.1f}%)")

# Split data (stratified so both partitions keep the merged / not-merged ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle missing values AFTER splitting, using medians of the training rows
# only. Filling with medians of the full dataset before the split would let
# test-set statistics leak into the training data.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)  # keep the full matrix imputed as well

print(f"\n  Training set: {len(X_train)} samples")
print(f"  Test set: {len(X_test)} samples")

# Scale features: statistics are fitted on the training rows and reused for the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Data prepared and scaled")

In [None]:
# Fit each candidate model, score it on the held-out test set and cross-validate it
print("\n🤖 Training Classifiers...\n")

classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5)
}

results = {}

for name, clf in classifiers.items():
    print(f"Training {name}...")

    # Only logistic regression is fed the standardized matrices; the two tree
    # ensembles are trained on the unscaled frames.
    if name == 'Logistic Regression':
        train_inputs, test_inputs = X_train_scaled, X_test_scaled
    else:
        train_inputs, test_inputs = X_train, X_test

    # Train and predict (hard labels plus probability of the positive class)
    clf.fit(train_inputs, y_train)
    y_pred = clf.predict(test_inputs)
    y_pred_proba = clf.predict_proba(test_inputs)[:, 1]

    # Test-set metrics, stored next to the model and its predictions
    entry = {
        'model': clf,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    }
    entry['accuracy'] = accuracy_score(y_test, y_pred)
    entry['precision'] = precision_score(y_test, y_pred, zero_division=0)
    entry['recall'] = recall_score(y_test, y_pred, zero_division=0)
    entry['f1'] = f1_score(y_test, y_pred, zero_division=0)
    entry['roc_auc'] = roc_auc_score(y_test, y_pred_proba)

    # 5-fold cross-validated accuracy on the training partition
    cv_scores = cross_val_score(clf, train_inputs, y_train, cv=5, scoring='accuracy')
    entry['cv_mean'] = cv_scores.mean()
    entry['cv_std'] = cv_scores.std()

    results[name] = entry

    print(f"  ✓ Accuracy: {entry['accuracy']:.3f}, F1: {entry['f1']:.3f}, ROC-AUC: {entry['roc_auc']:.3f}")

print("\n✅ All classifiers trained")

In [None]:
# Side-by-side table of test-set metrics for every trained classifier
print("\n" + "="*80)
print("CLASSIFIER PERFORMANCE COMPARISON")
print("="*80)

print(f"\n{'Classifier':<25} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'ROC-AUC':<12}")
print("-"*80)

for name, res in results.items():
    print(f"{name:<25} {res['accuracy']:.4f}      {res['precision']:.4f}      {res['recall']:.4f}      {res['f1']:.4f}      {res['roc_auc']:.4f}")

print("-"*80)

# The winner is the entry with the highest test-set F1 (first one on ties)
best_model_name, best_model = max(results.items(), key=lambda item: item[1]['f1'])

print(f"\n🏆 Best Model: {best_model_name}")
print(f"  F1-Score: {best_model['f1']:.4f}")
print(f"  ROC-AUC: {best_model['roc_auc']:.4f}")
print(f"  Cross-validation: {best_model['cv_mean']:.4f} ± {best_model['cv_std']:.4f}")

# Confusion matrix of the winner: rows are actual classes, columns predicted
best_predictions = best_model['y_pred']
print(f"\n📊 Confusion Matrix ({best_model_name}):")
cm = confusion_matrix(y_test, best_predictions)
print("\n              Predicted")
print("              Not Merged  Merged")
print(f"Actual Not M.    {cm[0,0]:<8}    {cm[0,1]:<8}")
print(f"       Merged    {cm[1,0]:<8}    {cm[1,1]:<8}")

# Per-class precision / recall / F1 of the winner
print(f"\n📋 Classification Report ({best_model_name}):")
report = classification_report(
    y_test,
    best_predictions,
    target_names=['Not Merged', 'Merged'],
    zero_division=0,
)
print(report)

### 9.6 Feature Importance: Top Predictors

In [None]:
print("\n" + "="*80)
print("TOP PREDICTORS OF MERGE ACCEPTANCE")
print("="*80)

# Raw importance vector per model, keyed by model name
importance_sources = {}

# Tree ensembles expose importances directly
for ensemble_name in ('Random Forest', 'Gradient Boosting'):
    if ensemble_name in results:
        importance_sources[ensemble_name] = results[ensemble_name]['model'].feature_importances_

# For logistic regression the absolute coefficients stand in for importance
if 'Logistic Regression' in results:
    importance_sources['Logistic Regression'] = np.abs(results['Logistic Regression']['model'].coef_[0])

# One row per feature, one column per model, each column rescaled to sum to 100%
importance_df = pd.DataFrame({'Feature': feature_cols})
for name, importances in importance_sources.items():
    importance_df[name] = importances / importances.sum() * 100

# Average across the model columns (captured before 'Average' itself is added)
importance_cols = list(importance_sources)
importance_df['Average'] = importance_df[importance_cols].mean(axis=1)

# Most important feature first
importance_df = importance_df.sort_values('Average', ascending=False)

print("\n📊 Feature Importance (% contribution):")
print("\n" + importance_df.to_string(index=False))

# Save feature importance
importance_df.to_csv('/tmp/rq2_feature_importance.csv', index=False)
print("\n✅ Feature importance saved to: /tmp/rq2_feature_importance.csv")

# Display top 5 predictors
print("\n🔝 Top 5 Predictors:")
leaders = importance_df.head(5)
for feature_name, average_pct in zip(leaders['Feature'], leaders['Average']):
    print(f"  {feature_name:<30} {average_pct:>6.2f}%")

In [None]:
# Two-panel figure: ranked average importance (left), per-model comparison (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ranking_ax, comparison_ax = axes[0], axes[1]

# Plot 1: horizontal bars for the ten features with the highest average importance
top_features = importance_df.head(10)
bar_positions = range(len(top_features))
ranking_ax.barh(bar_positions, top_features['Average'], color='steelblue')
ranking_ax.set_yticks(bar_positions)
ranking_ax.set_yticklabels(top_features['Feature'])
ranking_ax.set_xlabel('Importance (%)')
ranking_ax.set_title('Top 10 Features by Average Importance', fontsize=12, fontweight='bold')
ranking_ax.grid(True, alpha=0.3, axis='x')
ranking_ax.invert_yaxis()  # highest-ranked feature at the top

# Value labels just to the right of each bar
for i, v in enumerate(top_features['Average']):
    ranking_ax.text(v + 0.2, i, f'{v:.1f}%', va='center', fontweight='bold')

# Plot 2: grouped bars, one bar per model for each of the five leading features
top5 = importance_df.head(5)
x = np.arange(len(top5))
width = 0.25

for i, model in enumerate(importance_cols):
    shifted = x + i*width
    comparison_ax.bar(shifted, top5[model], width, label=model, alpha=0.8)

comparison_ax.set_xlabel('Features')
comparison_ax.set_ylabel('Importance (%)')
comparison_ax.set_title('Top 5 Features - Model Comparison', fontsize=12, fontweight='bold')
comparison_ax.set_xticks(x + width)  # tick sits one bar-width right of each group's first bar
comparison_ax.set_xticklabels(top5['Feature'], rotation=45, ha='right')
comparison_ax.legend()
comparison_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('/tmp/rq2_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ Feature importance visualization saved to: /tmp/rq2_feature_importance.png")

### 9.7 ROC Curves and Model Comparison

In [None]:
# Overlay the test-set ROC curve of every trained classifier on one figure
plt.figure(figsize=(10, 8))

colors = ['blue', 'green', 'red', 'orange', 'purple']

# Pair each classifier with the next color from the palette
for color, (name, res) in zip(colors, results.items()):
    fpr, tpr, _ = roc_curve(y_test, res['y_pred_proba'])
    curve_label = f'{name} (AUC = {res["roc_auc"]:.3f})'
    plt.plot(fpr, tpr, color=color, lw=2, label=curve_label)

# Diagonal reference line for a random classifier
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.500)')

# Axes, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - Merge Acceptance Prediction', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/rq2_roc_curves.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ ROC curves saved to: /tmp/rq2_roc_curves.png")

### 9.8 RQ2 Summary and Interpretation

In [None]:
# Textual wrap-up of RQ2: pulls together the continuous test table
# (test_results_df), the feature-importance table (importance_df) and the
# best classifier (best_model_name / best_model).
print("\n" + "="*80)
print("RQ2: SUMMARY AND INTERPRETATION")
print("="*80)

print("\n📊 Key Findings:")
print("\n1. STATISTICAL DIFFERENCES:")

# Count features whose Mann-Whitney p-value is below 0.05
sig_features = test_results_df[test_results_df['MW_PValue'] < 0.05]
print(f"   - {len(sig_features)}/{len(test_results_df)} features show significant differences (p<0.05)")

# Features with |Cohen's d| above 0.5 (reported as medium/large)
large_effects = test_results_df[test_results_df['Cohens_D'].abs() > 0.5]
if len(large_effects) > 0:
    print(f"   - {len(large_effects)} features with medium/large effect sizes (|d|>0.5):")
    for _, row in large_effects.iterrows():
        direction = 'higher' if row['Cohens_D'] > 0 else 'lower'
        print(f"     • {row['Feature']}: Accepted fixes {direction} (d={row['Cohens_D']:.3f})")
else:
    # Wording matches the |d|>0.5 threshold tested above
    print("   - No features show medium or large effect sizes")

print("\n2. MOST DISCRIMINATIVE FEATURES (Top 3):")
# importance_df is sorted by 'Average' but keeps its original row labels, so
# the rank has to come from enumerate(); the row label is not a rank.
for rank, (_, row) in enumerate(importance_df.head(3).iterrows(), start=1):
    avg_importance = row['Average']
    feature = row['Feature']

    # Always print the ranked feature so the list really contains the top three
    print(f"   {rank}. {feature} ({avg_importance:.1f}% importance)")

    # Group means are only available for features that have a row in
    # test_results_df; other predictors get the header line only.
    stat_row = test_results_df[test_results_df['Feature'] == feature]
    if stat_row.empty:
        continue

    acc_mean = stat_row['Accepted_Mean'].values[0]
    rej_mean = stat_row['Rejected_Mean'].values[0]
    direction = 'higher' if acc_mean > rej_mean else 'lower'
    # Relative difference with the rejected mean as baseline (0 when that mean is 0)
    diff_pct = abs((acc_mean - rej_mean) / rej_mean * 100) if rej_mean != 0 else 0
    print(f"      - Accepted: {acc_mean:.2f}, Rejected: {rej_mean:.2f}")
    print(f"      - Accepted fixes are {diff_pct:.1f}% {direction}")

print("\n3. CLASSIFIER PERFORMANCE:")
print(f"   - Best model: {best_model_name}")
print(f"   - Accuracy: {best_model['accuracy']:.1%}")
print(f"   - F1-Score: {best_model['f1']:.1%}")
print(f"   - ROC-AUC: {best_model['roc_auc']:.1%}")
print(f"   - This indicates {'good' if best_model['roc_auc'] > 0.7 else 'moderate'} predictive power")

print("\n4. PRACTICAL IMPLICATIONS:")
print("   - Features can predict merge acceptance with reasonable accuracy")
print("   - Key factors for acceptance:")
for feature in importance_df.head(3)['Feature']:
    print(f"     • {feature}")

print("\n5. RECOMMENDATIONS:")
print("   - Focus on top predictors to improve merge success rate")
print("   - Monitor significant differences between accepted/rejected PRs")
print("   - Use classifier as early warning system for low-quality fixes")

print("\n" + "="*80)
print("RQ2 ANALYSIS COMPLETE")
print("="*80)

# Artifacts written by the RQ2 cells
print("\n📁 Generated Files:")
print("   - /tmp/rq2_continuous_tests.csv")
print("   - /tmp/rq2_categorical_tests.csv")
print("   - /tmp/rq2_feature_comparison.png")
print("   - /tmp/rq2_feature_importance.csv")
print("   - /tmp/rq2_feature_importance.png")
print("   - /tmp/rq2_roc_curves.png")