In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os


import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np

# Set plot style
sns.set_theme(style="whitegrid")



# Read each AIDev parquet table (hf:// paths) into its own DataFrame.
# Any exception raised while loading is caught at the bottom of this block:
# the error is printed and pr_df, pr_timeline_df and pr_reviews_df are set to
# None, which is what the `is not None` checks further down test for.
print("Loading datasets...")
try:
    all_pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/all_pull_request.parquet")
    all_repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/all_repository.parquet")
    all_user_df = pd.read_parquet("hf://datasets/hao-li/AIDev/all_user.parquet")

    # Basic
    pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
    repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")
    user_df = pd.read_parquet("hf://datasets/hao-li/AIDev/user.parquet")
    
    # Comments and reviews
    pr_comments_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_comments.parquet")
    pr_reviews_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_reviews.parquet")
    pr_review_comments_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_review_comments_v2.parquet")
    
    # Commits
    pr_commits_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_commits.parquet")
    pr_commit_details_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_commit_details.parquet")
    
    # Related issues
    related_issue_df = pd.read_parquet("hf://datasets/hao-li/AIDev/related_issue.parquet")
    issue_df = pd.read_parquet("hf://datasets/hao-li/AIDev/issue.parquet")
    
    # Events
    pr_timeline_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_timeline.parquet")
    
    # Task type
    pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_task_type.parquet")
    
    # Human-PR
    human_pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/human_pull_request.parquet")
    human_pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/human_pr_task_type.parquet")

    print("Datasets loaded successfully.")
except Exception as e:
    # Best-effort: report the failure instead of stopping the notebook.
    # Only these three names are reset; frames bound before the failure keep
    # their values and frames after it are never assigned.
    print(f"Error loading datasets: {e}")
    pr_df, pr_timeline_df, pr_reviews_df = None, None, None


if pr_df is not None:
    print("Preprocessing PR data...")

    # Restrict to closed PRs, parse both timestamps, and derive the
    # open-to-close duration in hours. `.assign` returns a new frame, so
    # pr_df itself is left untouched; later keyword arguments see the
    # columns produced by earlier ones.
    is_closed = pr_df["state"] == "closed"
    closed_prs = pr_df[is_closed].assign(
        created_at=lambda prs: pd.to_datetime(prs["created_at"]),
        closed_at=lambda prs: pd.to_datetime(prs["closed_at"]),
        lifecycle_hours=lambda prs: (prs["closed_at"] - prs["created_at"]).dt.total_seconds() / 3600,
    )

    # Slim view used by the merges below.
    pr_lifecycle = closed_prs[["id", "state", "lifecycle_hours"]]
    print(pr_lifecycle.head())


if pr_timeline_df is not None:
    print("Calculating iteration cycles...")
    # Number of timeline rows recorded for each pr_id.
    events_per_pr = pr_timeline_df.groupby("pr_id").size()
    # Back to a two-column frame: pr_id, iteration_cycles.
    pr_iterations = events_per_pr.rename("iteration_cycles").reset_index()
    print(pr_iterations.head())


if pr_reviews_df is not None:
    print("Categorizing reviewers...")

    # Reviews with no user_type cannot be classified, so they are left out.
    clean_reviews = pr_reviews_df.dropna(subset=["user_type"]).copy()

    # pr_id -> set of the distinct reviewer user_type values seen on that PR.
    type_map = clean_reviews.groupby("pr_id")["user_type"].apply(set)

    def label_user_type(types):
        """Collapse a set of reviewer user_type values into a single label.

        Returns "User" or "Bot" when that is the only value in ``types``;
        every other set is reported as "Both".

        NOTE(review): a set holding some other single value also maps to
        "Both" - confirm the data only ever contains "User" and "Bot".
        """
        for sole_type in ("User", "Bot"):
            if types == {sole_type}:
                return sole_type
        return "Both"

    # One row per reviewed PR, keyed by "id" so it joins directly onto PR frames.
    reviewer_types = (
        type_map.apply(label_user_type)
        .rename("user_type_final")
        .rename_axis("id")
        .reset_index()
    )

    print(reviewer_types.head())



if pr_df is not None and pr_timeline_df is not None and pr_reviews_df is not None:
    print("Merging data...")

    # Left join: every closed PR is kept, with its timeline event count
    # attached where a matching pr_id exists.
    merged_df = pr_lifecycle.merge(
        pr_iterations, how="left", left_on="id", right_on="pr_id"
    )

    # Inner join: only PRs that have a reviewer classification survive.
    final_df = merged_df.merge(reviewer_types, how="inner", on="id")
    print(final_df.head())


# Attach the authoring agent to every row of final_df.
#
# This uses the same guard as the merge section above, so a failed dataset
# load leaves merged_final_df as None instead of raising on `pr_df` being None.
if pr_df is not None and pr_timeline_df is not None and pr_reviews_df is not None:
    # Deliberately a separate name: `closed_prs` already holds the full
    # closed-PR frame (timestamps + lifecycle_hours) built earlier in this
    # cell and should not be replaced by this two-column lookup.
    closed_pr_agents = pr_df.loc[pr_df["state"] == "closed", ["id", "agent"]]

    # Left join keeps every row of final_df; both sides share the key "id".
    merged_final_df = final_df.merge(closed_pr_agents, on="id", how="left")
else:
    merged_final_df = None


merged_final_df

Loading datasets...


  from .autonotebook import tqdm as notebook_tqdm


Datasets loaded successfully.
Preprocessing PR data...
            id   state  lifecycle_hours
0   3264933329  closed        76.038611
1   3265118634  closed        17.258056
2   3265640341  closed         0.100833
3   3265709660  closed         0.635556
16  3234102722  closed        35.815278
Calculating iteration cycles...
        pr_id  iteration_cycles
0  2756921963                30
1  2757103560                22
2  2757124156                 7
3  2757125491                 7
4  2757179026                15
Categorizing reviewers...
           id user_type_final
0  2756921963             Bot
1  2758636941            User
2  2759620798            User
3  2760115428            User
4  2760243902            User
Merging data...
           id   state  lifecycle_hours       pr_id  iteration_cycles  \
0  3265640341  closed         0.100833  3265640341                12   
1  3265709660  closed         0.635556  3265709660                11   
2  3214555104  closed        47.635833  321

Unnamed: 0,id,state,lifecycle_hours,pr_id,iteration_cycles,user_type_final,agent
0,3265640341,closed,0.100833,3265640341,12,Both,Claude_Code
1,3265709660,closed,0.635556,3265709660,11,Bot,Claude_Code
2,3214555104,closed,47.635833,3214555104,30,User,Claude_Code
3,3214724259,closed,0.004444,3214724259,12,Bot,Claude_Code
4,3214782537,closed,203.482500,3214782537,30,Bot,Claude_Code
...,...,...,...,...,...,...,...
7311,3260345989,closed,29.747500,3260345989,17,Bot,Devin
7312,3260452571,closed,19.248056,3260452571,7,Both,Devin
7313,2857171695,closed,20.076389,2857171695,13,User,Devin
7314,2858429985,closed,15.211944,2858429985,18,User,Devin


In [3]:


import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np

if pr_df is not None:
    print("Preprocessing PR data...")
    # Same derivation as in the first cell: closed PRs with their
    # open-to-close duration in hours.
    closed_prs = pr_df.loc[pr_df["state"].eq("closed")].copy()
    for ts_col in ("created_at", "closed_at"):
        closed_prs[ts_col] = pd.to_datetime(closed_prs[ts_col])
    open_duration = closed_prs["closed_at"] - closed_prs["created_at"]
    closed_prs["lifecycle_hours"] = open_duration.dt.total_seconds() / 3600
    pr_lifecycle = closed_prs[["id", "state", "lifecycle_hours"]]
    print(pr_lifecycle.head())


if pr_reviews_df is not None:
    print("Analyzing sentiment...")

    # SentimentIntensityAnalyzer() raises LookupError when the VADER lexicon
    # is not installed locally, so fetch it once if it is missing. Checking
    # first avoids a network round-trip on every run.
    try:
        nltk.data.find("sentiment/vader_lexicon.zip")
    except LookupError:
        nltk.download("vader_lexicon", quiet=True)

    # Only rows that have both a pr_id and a review body can be scored.
    reviews = pr_reviews_df[["pr_id", "body"]].dropna().copy()

    sia = SentimentIntensityAnalyzer()

    def get_compound_score(text):
        """Return VADER's "compound" polarity score for ``text``."""
        return sia.polarity_scores(text)["compound"]

    # Score every individual review.
    print("Calculating VADER scores (this might take a moment)...")
    reviews["compound_score"] = reviews["body"].apply(get_compound_score)

    # One row per PR: the mean compound score over its reviews. Columns are
    # named explicitly rather than by position, so adding a column later
    # cannot silently mislabel them.
    print("Aggregating sentiment per PR...")
    pr_sentiment = (
        reviews.groupby("pr_id")["compound_score"]
        .mean()
        .rename("mean_compound_score")
        .rename_axis("id")
        .reset_index()
    )

    def classify_sentiment(score):
        """Bucket a compound score into "Positive", "Negative" or "Neutral".

        Scores above 0.05 are "Positive", scores below -0.05 are "Negative",
        and everything in between (bounds included) is "Neutral".

        NOTE(review): VADER's README uses inclusive bounds (>= 0.05 and
        <= -0.05); this only differs at exactly +/-0.05 - confirm intent.
        """
        if score > 0.05:
            return "Positive"
        elif score < -0.05:
            return "Negative"
        else:
            return "Neutral"

    pr_sentiment["sentiment_category"] = pr_sentiment["mean_compound_score"].apply(classify_sentiment)

    print(pr_sentiment.head())


if pr_df is not None and pr_timeline_df is not None and pr_reviews_df is not None:
    print("Merging data...")
    # Closed PRs with their timeline event counts (left join keeps every PR).
    merged_df_sentiment = pd.merge(
        pr_lifecycle,
        pr_iterations,
        left_on="id",
        right_on="pr_id",
        how="left"
    )

    # Join the per-PR sentiment onto the frame built just above. Using
    # merged_df_sentiment (not the `merged_df` left over from the first cell)
    # keeps this cell self-contained and independent of kernel state.
    final_df_sentiment = pd.merge(
        merged_df_sentiment,
        pr_sentiment,
        on="id",
        how="inner" # Only PRs with reviews/sentiment
    )
    print(final_df_sentiment.head())

Preprocessing PR data...
            id   state  lifecycle_hours
0   3264933329  closed        76.038611
1   3265118634  closed        17.258056
2   3265640341  closed         0.100833
3   3265709660  closed         0.635556
16  3234102722  closed        35.815278
Analyzing sentiment...
Calculating VADER scores (this might take a moment)...
Aggregating sentiment per PR...
           id  mean_compound_score sentiment_category
0  2760115428               0.0000            Neutral
1  2766353261              -0.1779           Negative
2  2768057346               0.4926           Positive
3  2768057378               0.4199           Positive
4  2768132850              -0.5267           Negative
Merging data...
           id   state  lifecycle_hours       pr_id  iteration_cycles  \
0  3265709660  closed         0.635556  3265709660                11   
1  3214555104  closed        47.635833  3214555104                30   
2  3214724259  closed         0.004444  3214724259                12 

In [4]:
# Show the lifecycle / iteration / sentiment frame as this cell's output.
final_df_sentiment

Unnamed: 0,id,state,lifecycle_hours,pr_id,iteration_cycles,mean_compound_score,sentiment_category
0,3265709660,closed,0.635556,3265709660,11,0.790600,Positive
1,3214555104,closed,47.635833,3214555104,30,0.332200,Positive
2,3214724259,closed,0.004444,3214724259,12,0.653100,Positive
3,3214876564,closed,0.938333,3214876564,30,0.499460,Positive
4,3215868710,closed,13.475000,3215868710,30,0.398367,Positive
...,...,...,...,...,...,...,...
4397,2977988551,closed,21.619444,2977988551,30,0.278667,Positive
4398,3275246488,closed,15.181944,3275246488,16,0.000000,Neutral
4399,3275451449,closed,2.135278,3275451449,27,-0.044050,Neutral
4400,3260325787,closed,0.711389,3260325787,7,0.954800,Positive


In [5]:
# Show the lifecycle / iteration / reviewer-type / agent frame as this cell's output.
merged_final_df

Unnamed: 0,id,state,lifecycle_hours,pr_id,iteration_cycles,user_type_final,agent
0,3265640341,closed,0.100833,3265640341,12,Both,Claude_Code
1,3265709660,closed,0.635556,3265709660,11,Bot,Claude_Code
2,3214555104,closed,47.635833,3214555104,30,User,Claude_Code
3,3214724259,closed,0.004444,3214724259,12,Bot,Claude_Code
4,3214782537,closed,203.482500,3214782537,30,Bot,Claude_Code
...,...,...,...,...,...,...,...
7311,3260345989,closed,29.747500,3260345989,17,Bot,Devin
7312,3260452571,closed,19.248056,3260452571,7,Both,Devin
7313,2857171695,closed,20.076389,2857171695,13,User,Devin
7314,2858429985,closed,15.211944,2858429985,18,User,Devin


In [6]:
# Columns taken from each side of the final join; "id" is the shared key.
final_df_sentiment_col = ["id", "lifecycle_hours", "iteration_cycles", "sentiment_category"]
merged_final_df_col = ["id", "agent", "user_type_final"]

sentiment_part = final_df_sentiment.loc[:, final_df_sentiment_col]
reviewer_part = merged_final_df.loc[:, merged_final_df_col]

# Left join: every PR with a sentiment score is kept, and agent / reviewer
# type are attached where the id also appears in merged_final_df.
merged = pd.merge(sentiment_part, reviewer_part, how="left", on="id")


In [7]:
# Show the combined sentiment + agent + reviewer-type frame as this cell's output.
merged

Unnamed: 0,id,lifecycle_hours,iteration_cycles,sentiment_category,agent,user_type_final
0,3265709660,0.635556,11,Positive,Claude_Code,Bot
1,3214555104,47.635833,30,Positive,Claude_Code,User
2,3214724259,0.004444,12,Positive,Claude_Code,Bot
3,3214876564,0.938333,30,Positive,Claude_Code,Bot
4,3215868710,13.475000,30,Positive,Claude_Code,Both
...,...,...,...,...,...,...
4397,2977988551,21.619444,30,Positive,Devin,Both
4398,3275246488,15.181944,16,Neutral,Devin,User
4399,3275451449,2.135278,27,Neutral,Devin,Both
4400,3260325787,0.711389,7,Positive,Devin,Both
