# Bug-Fix Context Characterization

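This notebook provides a comprehensive characterization of bug-fix contexts. The metrics are designed to draw on multiple data sources; in the current implementation most of them are approximated by proxies derived from the pull-request table (title, body, timestamps), and the review metrics are randomly simulated placeholders.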

## Context Definition
We define "context" using features from all available tables:
- **Pull Request data**: Base information, state, merge status
- **Commit data**: Patch details, code changes
- **Issue data**: Bug reports, labels, priority
- **Review data**: Code reviews, comments, approvals
- **Discussion data**: Comments, participants, engagement
- **Timeline data**: Event history, timestamps

## Quantitative Summary Metrics
1. **Patch Size**: Lines added/deleted, files changed, hunks
2. **Code Churn**: Change frequency, file volatility, complexity
3. **Discussion**: Comment count, participants, sentiment
4. **Reviews**: Review count, approval time, reviewers
5. **Timeline**: Time to merge, response time, lifecycle duration
6. **Issue Details**: Labels, severity, type, reproduction steps


In [None]:
# Import required libraries
import re
import warnings
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from scipy import stats
# Suppress all warnings so library notices do not clutter the cell output
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## 1. Data Loading and Preparation

In [None]:
# Load the AI-Dev dataset
# Seed NumPy's global RNG before anything else: later cells draw random proxy
# values via np.random, so without this the results would change from run to
# run whenever the real dataset loads successfully.
np.random.seed(42)

print("Loading AI-Dev dataset...")
try:
    ds = load_dataset("hao-li/AIDEV", split="train")
    df = pd.DataFrame(ds)
    print(f"‚úÖ Dataset loaded: {len(df)} records")
    print(f"\nAvailable columns: {df.columns.tolist()}")
except Exception as e:
    # Any loading failure (network, missing config, ...) falls back to synthetic
    # data so the rest of the notebook can still be demonstrated.
    print(f"‚ö†Ô∏è Error loading dataset: {e}")
    print("Creating sample data for demonstration...")
    # Create synthetic sample data for demonstration
    # Re-seed so the synthetic frame is identical no matter what ran in the try block
    np.random.seed(42)
    n_samples = 500
    df = pd.DataFrame({
        'id': range(n_samples),
        'number': range(1, n_samples + 1),
        'title': [f'Fix bug #{i}' if i % 3 == 0 else f'Feature #{i}' for i in range(n_samples)],
        'body': [f'This PR fixes issue #{i}\n' * np.random.randint(1, 20) for i in range(n_samples)],
        'state': np.random.choice(['closed', 'open'], n_samples, p=[0.8, 0.2]),
        'merged_at': [datetime.now() if np.random.random() > 0.3 else None for _ in range(n_samples)],
        'created_at': [datetime.now() for _ in range(n_samples)],
        'closed_at': [datetime.now() if np.random.random() > 0.2 else None for _ in range(n_samples)],
        'agent': np.random.choice(['copilot', 'human', 'other'], n_samples, p=[0.3, 0.5, 0.2]),
        'user': [f'user_{i%50}' for i in range(n_samples)],
        'repo_url': [f'https://github.com/org/repo{i%10}' for i in range(n_samples)],
    })
    print(f"‚úÖ Created sample dataset: {len(df)} records")

## 2. Feature Extraction: Context Definition

### 2.1 Patch Size Metrics

In [None]:
def extract_patch_size_features(row):
    """
    Approximate patch size metrics for one PR from its title and body text.

    No diff/commit data is read here; every value is a proxy derived from the
    number of lines and characters in the PR description.

    Args:
        row: Mapping-like object (DataFrame row or dict) exposing ``.get`` with
            optional 'body' and 'title' entries.

    Returns:
        dict with keys 'lines_added_proxy', 'lines_deleted_proxy',
        'total_lines_changed', 'files_changed_proxy', 'hunks_proxy' and
        'patch_complexity'.
    """
    def _text(key):
        # Missing values arrive as None/NaN; str() would turn them into the
        # literal text 'None'/'nan' and be counted as real content.
        value = row.get(key, '')
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    body = _text('body')
    title = _text('title')

    # Proxy metrics (in real data, extract from diff/commit)
    lines_in_body = len(body.splitlines())

    return {
        'lines_added_proxy': lines_in_body * 0.6,  # Approximation: 60% of body lines
        'lines_deleted_proxy': lines_in_body * 0.4,  # Approximation: 40% of body lines
        'total_lines_changed': lines_in_body,
        'files_changed_proxy': max(1, lines_in_body // 10),  # At least one file
        'hunks_proxy': max(1, lines_in_body // 20),  # At least one hunk
        'patch_complexity': len(body) + len(title)  # Character count as complexity proxy
    }

# Apply feature extraction
print("Extracting patch size features...")
patch_features = df.apply(extract_patch_size_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell before concatenating;
# otherwise re-running it would create duplicate column names and the
# formatted prints below would fail.
df = pd.concat([df.drop(columns=patch_features.columns, errors='ignore'), patch_features], axis=1)
print("‚úÖ Patch size features extracted")
print(f"   - Total lines changed (mean): {df['total_lines_changed'].mean():.2f}")
print(f"   - Files changed (mean): {df['files_changed_proxy'].mean():.2f}")

### 2.2 Code Churn Metrics

In [None]:
def calculate_code_churn(df):
    """
    Attach per-repository churn averages to every PR row.

    For each 'repo_url' the function counts PRs (non-null 'id'), sums
    'total_lines_changed' and 'files_changed_proxy', and derives:

    - 'churn_per_pr': summed lines changed divided by the PR count
    - 'file_volatility': summed files touched divided by the PR count

    Args:
        df: DataFrame with 'repo_url', 'id', 'total_lines_changed' and
            'files_changed_proxy' columns.

    Returns:
        A new DataFrame with the two columns added; the input frame is not
        modified. Calling the function again on its own output recomputes the
        columns instead of producing suffixed duplicates.
    """
    churn_columns = ['churn_per_pr', 'file_volatility']

    # Group by repository to calculate churn
    repo_churn = df.groupby('repo_url').agg({
        'id': 'count',
        'total_lines_changed': 'sum',
        'files_changed_proxy': 'sum'
    }).rename(columns={
        'id': 'pr_count',
        'total_lines_changed': 'total_churn',
        'files_changed_proxy': 'total_files_touched'
    })

    # Calculate volatility
    repo_churn['churn_per_pr'] = repo_churn['total_churn'] / repo_churn['pr_count']
    repo_churn['file_volatility'] = repo_churn['total_files_touched'] / repo_churn['pr_count']

    # Remove results of an earlier call so the merge cannot emit *_x / *_y columns
    base = df.drop(columns=churn_columns, errors='ignore')

    # Merge back to main dataframe (left join keeps every PR row)
    return base.merge(repo_churn[churn_columns],
                      left_on='repo_url', right_index=True, how='left')

print("Calculating code churn metrics...")
df = calculate_code_churn(df)
print("‚úÖ Code churn metrics calculated")
# Report the mean of each derived churn column
for label, column in (('Mean churn per PR', 'churn_per_pr'),
                      ('Mean file volatility', 'file_volatility')):
    print(f"   - {label}: {df[column].mean():.2f}")

### 2.3 Discussion Metrics

In [None]:
def extract_discussion_features(row):
    """
    Approximate discussion metrics for one PR from the word count of its body.

    No comment or review data is read; all values are proxies computed from
    the PR description text.

    Args:
        row: Mapping-like object (DataFrame row or dict) exposing ``.get`` with
            an optional 'body' entry.

    Returns:
        dict with keys 'comment_count_proxy' (one per 50 words),
        'participants_proxy' (one per 100 words, minimum 1),
        'discussion_length' (word count) and 'has_discussion'
        (1 when the body has more than 20 words, else 0).
    """
    body = row.get('body', '')
    # A missing body (None/NaN) must count as empty text; str() would turn it
    # into the one-word string 'None'/'nan'.
    if pd.api.types.is_scalar(body) and pd.isna(body):
        body = ''
    body = str(body)

    # Proxy: estimate discussion based on body length and content
    # In real scenario, fetch from PR comments/reviews
    word_count = len(body.split())

    return {
        'comment_count_proxy': word_count // 50,  # Approximation (never negative)
        'participants_proxy': max(1, word_count // 100),  # Approximation
        'discussion_length': word_count,
        'has_discussion': 1 if word_count > 20 else 0
    }

print("Extracting discussion features...")
discussion_features = df.apply(extract_discussion_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell so re-running it
# replaces them instead of creating duplicate column names.
df = pd.concat([df.drop(columns=discussion_features.columns, errors='ignore'), discussion_features], axis=1)
print("‚úÖ Discussion features extracted")
print(f"   - Mean comment count: {df['comment_count_proxy'].mean():.2f}")
print(f"   - Mean participants: {df['participants_proxy'].mean():.2f}")

### 2.4 Review Metrics

In [None]:
def extract_review_features(row):
    """
    Produce placeholder review metrics for one PR.

    The only real input is whether 'merged_at' is non-null; the review count
    (and, for unmerged PRs, the has-reviews and changes-requested flags) are
    drawn from NumPy's global random generator rather than from review data.

    Returns:
        dict with keys 'review_count_proxy', 'has_reviews', 'approved' and
        'changes_requested_proxy'.
    """
    merged = pd.notnull(row.get('merged_at'))

    if merged:
        # Merged PRs are treated as reviewed and approved; only the count is random
        return {
            'review_count_proxy': np.random.randint(0, 5),
            'has_reviews': 1,
            'approved': 1,
            'changes_requested_proxy': 0,
        }

    # Draw order is kept fixed (count, has_reviews, changes_requested) so that
    # seeded runs yield the same values.
    review_count = np.random.randint(0, 3)
    reviewed_flag = np.random.choice([0, 1], p=[0.6, 0.4])
    changes_flag = np.random.choice([0, 1], p=[0.7, 0.3])
    return {
        'review_count_proxy': review_count,
        'has_reviews': reviewed_flag,
        'approved': 0,
        'changes_requested_proxy': changes_flag,
    }

print("Extracting review features...")
review_features = df.apply(extract_review_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell so re-running it
# replaces them instead of creating duplicate column names.
df = pd.concat([df.drop(columns=review_features.columns, errors='ignore'), review_features], axis=1)
print("‚úÖ Review features extracted")
print(f"   - Mean review count: {df['review_count_proxy'].mean():.2f}")
print(f"   - Approval rate: {df['approved'].mean()*100:.1f}%")

### 2.5 Timeline Metrics

In [None]:
def extract_timeline_features(row):
    """
    Extract timeline metrics from one PR's timestamps.

    Args:
        row: Mapping-like object (DataFrame row or dict) exposing ``.get`` with
            optional 'created_at', 'closed_at' and 'merged_at' entries. String
            timestamps are parsed; unparseable strings are treated as missing.

    Returns:
        dict with keys:
        - 'time_to_close_hours': hours between creation and close, or NaN when
          either timestamp is missing or the two cannot be subtracted. Missing
          durations are no longer replaced by random numbers, and a genuine
          0.0 duration is kept.
        - 'is_merged': 1 if 'merged_at' is non-null, else 0
        - 'is_closed': 1 if the (parsed) 'closed_at' is non-null, else 0
        - 'lifecycle_stage': 'merged', 'closed' or 'open'
    """
    def _to_timestamp(value):
        # Only strings need parsing; errors='coerce' turns bad text into NaT
        if isinstance(value, str):
            return pd.to_datetime(value, errors='coerce')
        return value

    created = _to_timestamp(row.get('created_at'))
    closed = _to_timestamp(row.get('closed_at'))
    merged = row.get('merged_at')

    # Calculate durations
    time_to_close = np.nan
    if pd.notnull(created) and pd.notnull(closed):
        try:
            time_to_close = (closed - created).total_seconds() / 3600  # hours
        except (TypeError, AttributeError):
            # e.g. tz-aware vs tz-naive operands, or values that are not datetimes
            time_to_close = np.nan

    if pd.notnull(merged):
        stage = 'merged'
    elif pd.notnull(closed):
        stage = 'closed'
    else:
        stage = 'open'

    return {
        'time_to_close_hours': time_to_close,
        'is_merged': 1 if pd.notnull(merged) else 0,
        'is_closed': 1 if pd.notnull(closed) else 0,
        'lifecycle_stage': stage
    }

print("Extracting timeline features...")
timeline_features = df.apply(extract_timeline_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell so re-running it
# replaces them instead of creating duplicate column names.
df = pd.concat([df.drop(columns=timeline_features.columns, errors='ignore'), timeline_features], axis=1)
print("‚úÖ Timeline features extracted")
print(f"   - Mean time to close: {df['time_to_close_hours'].mean():.2f} hours")
print(f"   - Merge rate: {df['is_merged'].mean()*100:.1f}%")

### 2.6 Issue Details

In [None]:
def extract_issue_features(row):
    """
    Classify one PR by keyword matching on its lower-cased title and body.

    Returns:
        dict with 0/1 flags 'is_bug_fix', 'is_feature', 'is_test', 'is_docs'
        and 'is_critical', plus 'issue_type': the first matching category in
        the order bug, feature, test, docs, or 'other' when none match.
    """
    text = str(row.get('title', '')).lower() + ' ' + str(row.get('body', '')).lower()

    # Whole-word keyword patterns, one per category
    patterns = {
        'bug': r'\b(bug|fix|fixes|fixed|error|issue|debug|patch|fault|defect|crash)\b',
        'feature': r'\b(feature|enhancement|add|implement|new|improve)\b',
        'test': r'\b(test|testing|unit|integration|coverage|spec)\b',
        'docs': r'\b(doc|docs|documentation|readme|comment)\b',
        # Severity indicators
        'critical': r'\b(critical|urgent|blocker|severe|security|vulnerability)\b',
    }

    # Evaluate each pattern once and reuse the result for flags and issue_type
    hits = {
        name: int(re.search(pattern, text, re.I) is not None)
        for name, pattern in patterns.items()
    }

    # Priority order decides the label when several categories match
    issue_type = next(
        (kind for kind in ('bug', 'feature', 'test', 'docs') if hits[kind]),
        'other',
    )

    return {
        'is_bug_fix': hits['bug'],
        'is_feature': hits['feature'],
        'is_test': hits['test'],
        'is_docs': hits['docs'],
        'is_critical': hits['critical'],
        'issue_type': issue_type,
    }

print("Extracting issue features...")
issue_features = df.apply(extract_issue_features, axis=1, result_type='expand')
# Drop columns left over from a previous run of this cell so re-running it
# replaces them instead of creating duplicate column names.
df = pd.concat([df.drop(columns=issue_features.columns, errors='ignore'), issue_features], axis=1)
print("‚úÖ Issue features extracted")
print(f"   - Bug fixes: {df['is_bug_fix'].sum()} ({df['is_bug_fix'].mean()*100:.1f}%)")
print(f"   - Features: {df['is_feature'].sum()} ({df['is_feature'].mean()*100:.1f}%)")
print(f"   - Critical issues: {df['is_critical'].sum()} ({df['is_critical'].mean()*100:.1f}%)")

## 3. Descriptive Statistics

### 3.1 Overall Summary Statistics

In [None]:
# Numeric columns summarised in this section (also reused by the export cell)
numeric_features = [
    'total_lines_changed',
    'files_changed_proxy',
    'patch_complexity',
    'churn_per_pr',
    'file_volatility',
    'comment_count_proxy',
    'participants_proxy',
    'discussion_length',
    'review_count_proxy',
    'time_to_close_hours',
]

section_rule = "="*80
print(section_rule)
print("COMPREHENSIVE CONTEXT CHARACTERIZATION - DESCRIPTIVE STATISTICS")
print(section_rule)

# describe() with extra upper percentiles to expose the tails
summary_stats = df[numeric_features].describe(percentiles=[.25, .5, .75, .9, .95])
print("\nüìä Summary Statistics for All Metrics:")
print(summary_stats.round(2))

# Per-column shape statistics that describe() does not report
print("\nüìà Additional Statistics:")
for feature in numeric_features:
    column = df[feature]
    print(f"\n{feature}:")
    print(f"  - Median: {column.median():.2f}")
    modes = column.mode()
    # mode() can be empty (e.g. an all-NaN column); report 'N/A' then
    mode_value = modes.values[0] if len(modes) > 0 else 'N/A'
    print(f"  - Mode: {mode_value}")
    print(f"  - Std Dev: {column.std():.2f}")
    print(f"  - Skewness: {column.skew():.2f}")
    print(f"  - Kurtosis: {column.kurtosis():.2f}")

### 3.2 Categorical Feature Distribution

In [None]:
def _print_distribution(heading, column):
    """Print value counts and percentage shares of ``df[column]``; return the counts."""
    print(heading)
    counts = df[column].value_counts()
    print(counts)
    print("\nProportions:")
    print((counts / len(df) * 100).round(2))
    return counts


print("\n" + "="*80)
print("CATEGORICAL FEATURE DISTRIBUTIONS")
print("="*80)

# Issue type distribution
issue_dist = _print_distribution("\nüè∑Ô∏è Issue Type Distribution:", 'issue_type')

# Lifecycle stage distribution
lifecycle_dist = _print_distribution("\n‚è±Ô∏è Lifecycle Stage Distribution:", 'lifecycle_stage')

# Agent distribution
agent_dist = _print_distribution("\nü§ñ Agent Distribution:", 'agent')

## 4. Distributions and Visualizations

### 4.1 Patch Size Distributions

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per histogram panel:
# (axes, column, bins, bar colour, median-line colour, title, x-label).
# A bar colour of None keeps matplotlib's default colour.
histogram_panels = [
    (axes[0, 0], 'total_lines_changed', 50, None, 'green',
     'Distribution of Total Lines Changed', 'Lines Changed'),
    (axes[0, 1], 'files_changed_proxy', 30, 'orange', 'green',
     'Distribution of Files Changed', 'Files Changed'),
    (axes[1, 0], 'patch_complexity', 50, 'green', 'blue',
     'Distribution of Patch Complexity', 'Complexity Score'),
]

for panel, column, bin_count, bar_color, median_color, panel_title, x_label in histogram_panels:
    values = df[column]
    panel.hist(values, bins=bin_count, edgecolor='black', alpha=0.7, color=bar_color)
    # Dashed reference lines for mean and median
    panel.axvline(values.mean(), color='red', linestyle='--',
                  label=f'Mean: {values.mean():.1f}')
    panel.axvline(values.median(), color=median_color, linestyle='--',
                  label=f'Median: {values.median():.1f}')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.legend()
    panel.grid(True, alpha=0.3)

# Box plot comparison of the three size proxies on a shared axis
box_panel = axes[1, 1]
patch_data = [df[name] for name in ('total_lines_changed', 'files_changed_proxy', 'hunks_proxy')]
box_panel.boxplot(patch_data, labels=['Lines Changed', 'Files', 'Hunks'])
box_panel.set_title('Patch Size Metrics - Box Plot Comparison', fontsize=12, fontweight='bold')
box_panel.set_ylabel('Count (normalized scale)')
box_panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/patch_size_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n‚úÖ Patch size distribution plots saved")

### 4.2 Code Churn Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (column, bar colour, title, x-label) for the left and right panels
churn_panels = [
    ('churn_per_pr', 'purple', 'Distribution of Code Churn per PR', 'Churn (lines)'),
    ('file_volatility', 'teal', 'Distribution of File Volatility', 'Files per PR'),
]

for panel, (column, bar_color, panel_title, x_label) in zip(axes, churn_panels):
    values = df[column]
    panel.hist(values, bins=40, edgecolor='black', alpha=0.7, color=bar_color)
    # Dashed reference lines for mean (red) and median (green)
    panel.axvline(values.mean(), color='red', linestyle='--',
                  label=f'Mean: {values.mean():.1f}')
    panel.axvline(values.median(), color='green', linestyle='--',
                  label=f'Median: {values.median():.1f}')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/code_churn_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n‚úÖ Code churn distribution plots saved")

### 4.3 Discussion and Review Distributions

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Panels in row-major order: (column, bins, bar colour, title, x-label)
discussion_panels = [
    ('comment_count_proxy', 30, 'coral', 'Distribution of Comment Count', 'Comments'),
    ('participants_proxy', 20, 'skyblue', 'Distribution of Participants', 'Participants'),
    ('review_count_proxy', 20, 'gold', 'Distribution of Review Count', 'Reviews'),
    ('discussion_length', 50, 'lightgreen', 'Distribution of Discussion Length', 'Word Count'),
]

for panel, (column, bin_count, bar_color, panel_title, x_label) in zip(axes.flat, discussion_panels):
    values = df[column]
    panel.hist(values, bins=bin_count, edgecolor='black', alpha=0.7, color=bar_color)
    # Only the mean is marked on these panels
    panel.axvline(values.mean(), color='red', linestyle='--',
                  label=f'Mean: {values.mean():.1f}')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/tmp/discussion_review_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n‚úÖ Discussion and review distribution plots saved")

### 4.4 Timeline Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_panel, pie_panel = axes

# Left panel: histogram of time to close with mean/median markers
close_hours = df['time_to_close_hours']
hist_panel.hist(close_hours, bins=50, edgecolor='black', alpha=0.7, color='indianred')
for stat_name, stat_value, line_color in (
    ('Mean', close_hours.mean(), 'blue'),
    ('Median', close_hours.median(), 'green'),
):
    hist_panel.axvline(stat_value, color=line_color, linestyle='--',
                       label=f'{stat_name}: {stat_value:.1f} hours')
hist_panel.set_title('Distribution of Time to Close', fontsize=12, fontweight='bold')
hist_panel.set_xlabel('Hours')
hist_panel.set_ylabel('Frequency')
hist_panel.legend()
hist_panel.grid(True, alpha=0.3)

# Right panel: share of PRs in each lifecycle stage
lifecycle_counts = df['lifecycle_stage'].value_counts()
colors = ['#66c2a5', '#fc8d62', '#8da0cb']
pie_panel.pie(
    lifecycle_counts.values,
    labels=lifecycle_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
pie_panel.set_title('PR Lifecycle Stage Distribution', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('/tmp/timeline_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n‚úÖ Timeline distribution plots saved")

### 4.5 Issue Type and Severity Distributions

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
type_panel, critical_panel = axes

# Left panel: number of PRs per issue type, with the count written above each bar
issue_counts = df['issue_type'].value_counts()
type_palette = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6']
type_panel.bar(issue_counts.index, issue_counts.values, color=type_palette)
type_panel.set_title('Issue Type Distribution', fontsize=12, fontweight='bold')
type_panel.set_xlabel('Issue Type')
type_panel.set_ylabel('Count')
type_panel.grid(True, alpha=0.3, axis='y')
for position, count in enumerate(issue_counts.values):
    type_panel.text(position, count + 5, str(count), ha='center', fontweight='bold')

# Right panel: critical vs non-critical; .get(..., 0) covers a class with no rows
critical_data = df['is_critical'].value_counts()
critical_heights = [critical_data.get(0, 0), critical_data.get(1, 0)]
critical_panel.bar(['Non-Critical', 'Critical'], critical_heights,
                   color=['#95a5a6', '#e74c3c'])
critical_panel.set_title('Critical vs Non-Critical Issues', fontsize=12, fontweight='bold')
critical_panel.set_ylabel('Count')
critical_panel.grid(True, alpha=0.3, axis='y')
total_rows = len(df)
for position, count in enumerate(critical_heights):
    critical_panel.text(position, count + 5, f'{count}\n({count/total_rows*100:.1f}%)',
                        ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('/tmp/issue_type_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n‚úÖ Issue type distribution plots saved")

## 5. Comparative Analysis: Merged vs Not Merged PRs

In [None]:
# Features compared between merged and not-merged PRs
comparison_features = [
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'comment_count_proxy', 'review_count_proxy', 'time_to_close_hours',
    'is_bug_fix', 'is_critical'
]

print("\n" + "="*80)
print("COMPARATIVE ANALYSIS: MERGED vs CLOSED (NOT MERGED) PRs")
print("="*80)

comparison = df.groupby('is_merged')[comparison_features].agg(['mean', 'median', 'std'])
print("\nüìä Merged vs Not Merged Statistics:")
print(comparison.round(2))

# Statistical comparison (scipy.stats is imported with the other libraries at the top)
print("\nüìà Statistical Tests (Mann-Whitney U):")
# Note: the not-merged group is every row with is_merged == 0, open PRs included
merged_prs = df[df['is_merged'] == 1]
not_merged_prs = df[df['is_merged'] == 0]

for feature in comparison_features:
    # Binary flags are summarised above but not tested with Mann-Whitney
    if feature in ['is_bug_fix', 'is_critical']:
        continue
    merged_values = merged_prs[feature].dropna()
    not_merged_values = not_merged_prs[feature].dropna()
    # The test is undefined when either sample is empty (e.g. every PR merged);
    # skip instead of letting the cell fail or print a meaningless p-value.
    if merged_values.empty or not_merged_values.empty:
        print(f"  {feature}: skipped (one group has no data)")
        continue
    stat, p_value = stats.mannwhitneyu(merged_values,
                                       not_merged_values,
                                       alternative='two-sided')
    significance = "***" if p_value < 0.001 else ("**" if p_value < 0.01 else ("*" if p_value < 0.05 else "ns"))
    print(f"  {feature}: p-value = {p_value:.4f} {significance}")

### 5.1 Visual Comparison: Merged vs Not Merged

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# (column, panel title) for each of the six box-plot panels
comparison_metrics = [
    ('total_lines_changed', 'Total Lines Changed'),
    ('files_changed_proxy', 'Files Changed'),
    ('patch_complexity', 'Patch Complexity'),
    ('comment_count_proxy', 'Comment Count'),
    ('review_count_proxy', 'Review Count'),
    ('time_to_close_hours', 'Time to Close (hours)')
]

mean_label_style = dict(ha='center', va='bottom', fontweight='bold', color='red')

for panel, (metric, panel_title) in zip(axes, comparison_metrics):
    # Box 1 = merged PRs, box 2 = not-merged PRs (missing values removed)
    samples = [merged_prs[metric].dropna(), not_merged_prs[metric].dropna()]

    panel.boxplot(samples, labels=['Merged', 'Not Merged'], showmeans=True)
    panel.set_title(panel_title, fontsize=11, fontweight='bold')
    panel.set_ylabel('Value')
    panel.grid(True, alpha=0.3)

    # Write each group's mean next to its box (box positions start at 1)
    for box_position, sample in enumerate(samples, start=1):
        panel.text(box_position, sample.mean(), f'{sample.mean():.1f}', **mean_label_style)

plt.suptitle('Merged vs Not Merged PRs - Metric Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/merged_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n‚úÖ Comparison plots saved")

## 6. Correlation Analysis

In [None]:
# Columns included in the pairwise correlation matrix
correlation_features = [
    'total_lines_changed',
    'files_changed_proxy',
    'patch_complexity',
    'churn_per_pr',
    'file_volatility',
    'comment_count_proxy',
    'participants_proxy',
    'review_count_proxy',
    'time_to_close_hours',
    'is_merged',
    'is_bug_fix',
    'is_critical',
]

correlation_matrix = df[correlation_features].corr()

# Annotated heatmap, colour scale centred on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/tmp/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Correlation matrix saved")
print("\nüìä Top Correlations with 'is_merged':")
# Sort descending and leave out the trivial self-correlation
merged_corr = correlation_matrix['is_merged'].sort_values(ascending=False)
print(merged_corr.drop('is_merged'))

## 7. Summary Report

In [None]:
report_rule = "="*80

print("\n" + report_rule)
print("BUG-FIX CONTEXT CHARACTERIZATION - FINAL SUMMARY REPORT")
print(report_rule)

# Counts and shares of the 0/1 indicator columns
print("\nüìã Dataset Overview:")
print(f"  - Total PRs analyzed: {len(df)}")
for label, flag_column in (('Merged PRs', 'is_merged'),
                           ('Bug fixes', 'is_bug_fix'),
                           ('Critical issues', 'is_critical')):
    flags = df[flag_column]
    print(f"  - {label}: {flags.sum()} ({flags.mean()*100:.1f}%)")

print("\nüìä Patch Size Summary:")
lines_changed = df['total_lines_changed']
print(f"  - Mean lines changed: {lines_changed.mean():.2f}")
print(f"  - Median lines changed: {lines_changed.median():.2f}")
print(f"  - Mean files changed: {df['files_changed_proxy'].mean():.2f}")
print(f"  - 95th percentile lines: {lines_changed.quantile(0.95):.2f}")

print("\nüîÑ Code Churn Summary:")
churn = df['churn_per_pr']
print(f"  - Mean churn per PR: {churn.mean():.2f}")
print(f"  - Mean file volatility: {df['file_volatility'].mean():.2f}")
high_churn_count = (churn > churn.quantile(0.95)).sum()
print(f"  - High churn PRs (>95th percentile): {high_churn_count}")

print("\nüí¨ Discussion Summary:")
print(f"  - Mean comments per PR: {df['comment_count_proxy'].mean():.2f}")
print(f"  - Mean participants: {df['participants_proxy'].mean():.2f}")
discussed = df['has_discussion']
print(f"  - PRs with discussion: {discussed.sum()} ({discussed.mean()*100:.1f}%)")

print("\nüë• Review Summary:")
print(f"  - Mean reviews per PR: {df['review_count_proxy'].mean():.2f}")
reviewed = df['has_reviews']
print(f"  - PRs with reviews: {reviewed.sum()} ({reviewed.mean()*100:.1f}%)")
print(f"  - Approval rate: {df['approved'].mean()*100:.1f}%")

print("\n‚è±Ô∏è Timeline Summary:")
close_hours = df['time_to_close_hours']
mean_hours = close_hours.mean()
median_hours = close_hours.median()
print(f"  - Mean time to close: {mean_hours:.2f} hours ({mean_hours/24:.1f} days)")
print(f"  - Median time to close: {median_hours:.2f} hours ({median_hours/24:.1f} days)")
fast_mask = close_hours < 24
print(f"  - Fast PRs (<24h): {fast_mask.sum()} ({fast_mask.mean()*100:.1f}%)")

print("\nüè∑Ô∏è Issue Type Distribution:")
total_prs = len(df)
for issue_type, count in df['issue_type'].value_counts().items():
    print(f"  - {issue_type}: {count} ({count/total_prs*100:.1f}%)")

# NOTE: the findings below are fixed text; nothing in this cell derives them
# from the statistics computed above.
print("\n‚úÖ Key Findings:")
for finding in (
    "  1. Patch sizes vary widely with long-tailed distribution",
    "  2. Code churn correlates with discussion activity",
    "  3. Merged PRs tend to have more reviews and quicker response times",
    "  4. Bug fixes show distinct patterns from feature additions",
    "  5. Critical issues receive faster attention and more reviews",
):
    print(finding)

print("\n" + report_rule)
print("ANALYSIS COMPLETE")
print(report_rule)

## 8. Export Results

In [None]:
# Per-PR columns written to the summary CSV
export_columns = [
    'id', 'issue_type', 'lifecycle_stage', 'is_merged',
    'total_lines_changed', 'files_changed_proxy', 'patch_complexity',
    'churn_per_pr', 'file_volatility',
    'comment_count_proxy', 'review_count_proxy',
    'time_to_close_hours', 'is_bug_fix', 'is_critical',
]
summary_export = df[export_columns]

summary_export.to_csv('/tmp/bug_fix_context_summary.csv', index=False)
print("\n‚úÖ Summary data exported to: /tmp/bug_fix_context_summary.csv")

# Descriptive statistics of the numeric features (index kept: it holds the statistic names)
desc_stats = df[numeric_features].describe()
desc_stats.to_csv('/tmp/descriptive_statistics.csv')
print("‚úÖ Descriptive statistics exported to: /tmp/descriptive_statistics.csv")

# Correlation matrix computed in the correlation-analysis cell
correlation_matrix.to_csv('/tmp/correlation_matrix.csv')
print("‚úÖ Correlation matrix exported to: /tmp/correlation_matrix.csv")

# List the figure files written by the plotting cells
print("\nüìÅ All visualizations saved to /tmp/:")
for figure_file in (
    'patch_size_distributions.png',
    'code_churn_distributions.png',
    'discussion_review_distributions.png',
    'timeline_distributions.png',
    'issue_type_distributions.png',
    'merged_comparison.png',
    'correlation_matrix.png',
):
    print(f"  - {figure_file}")