# Extra Analysis: 3-Way ANOVA
This notebook runs a three-way ANOVA to analyse the effects of the factors Sentiment, Agent, and User type (each with several levels) on the number of PR iterations per hour.

In [1]:
! pip install pyarrow fastparquet 



In [2]:
import os

from huggingface_hub import login

# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, `login` receives None and
# falls back to prompting for the token interactively.
# NOTE(review): a token was previously hardcoded in this cell and must be
# treated as leaked — revoke it in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [3]:
# Smoke test: confirm the dataset repo is reachable and list what it contains
from huggingface_hub import HfApi
api = HfApi()

try:
    # Fetching the metadata raises if the repo cannot be reached
    dataset_info = api.dataset_info("hao-li/AIDev")
    print(f"Dataset exists: {dataset_info.id}")
    print("Files in dataset:")
    files = api.list_repo_files("hao-li/AIDev", repo_type="dataset")
    for f in files:
        print(f"  - {f}")
except Exception as e:
    print(f"Error accessing dataset: {e}")
    # Troubleshooting hints, printed one per line
    for hint in (
        "\nThis could mean:",
        "1. The dataset doesn't exist at 'hao-li/AIDev'",
        "2. The dataset is private and you don't have access",
        "3. The dataset name is incorrect",
    ):
        print(hint)

Dataset exists: hao-li/AIDev
Files in dataset:
  - .DS_Store
  - .gitattributes
  - README.md
  - aidev_logo.png
  - all_pull_request.parquet
  - all_repository.parquet
  - all_user.parquet
  - data_table.md
  - human_pr_task_type.parquet
  - human_pull_request.parquet
  - issue.parquet
  - pr_comments.parquet
  - pr_commit_details.parquet
  - pr_commits.parquet
  - pr_cumulative.png
  - pr_review_comments.parquet
  - pr_review_comments_v2.parquet
  - pr_reviews.parquet
  - pr_task_type.parquet
  - pr_timeline.parquet
  - pull_request.parquet
  - related_issue.parquet
  - repository.parquet
  - schema.png
  - user.parquet


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np

# Apply seaborn's "whitegrid" theme to the figures drawn in this notebook
sns.set_theme(style="whitegrid")


def _ensure_vader_lexicon():
    """Download the VADER lexicon only if NLTK cannot find it locally."""
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError
        print("Downloading VADER lexicon...")
        nltk.download('vader_lexicon')


_ensure_vader_lexicon()

In [5]:
print("Loading datasets...")
from huggingface_hub import hf_hub_download
import os
import traceback

dataset_name = "hao-li/AIDev"

# Single source of truth: notebook variable name -> parquet file in the dataset
# repo. Both the download list and the dataframe loading are derived from it, so
# the two can no longer drift apart.
parquet_sources = {
    "all_pr_df": "all_pull_request.parquet",
    "all_repo_df": "all_repository.parquet",
    "all_user_df": "all_user.parquet",
    # Basic
    "pr_df": "pull_request.parquet",
    "repo_df": "repository.parquet",
    "user_df": "user.parquet",
    # Comments and reviews
    "pr_comments_df": "pr_comments.parquet",
    "pr_reviews_df": "pr_reviews.parquet",
    "pr_review_comments_df": "pr_review_comments_v2.parquet",
    # Commits
    "pr_commits_df": "pr_commits.parquet",
    "pr_commit_details_df": "pr_commit_details.parquet",
    # Related issues
    "related_issue_df": "related_issue.parquet",
    "issue_df": "issue.parquet",
    # Events
    "pr_timeline_df": "pr_timeline.parquet",
    # Task type
    "pr_task_type_df": "pr_task_type.parquet",
    # Human-PR
    "human_pr_df": "human_pull_request.parquet",
    "human_pr_task_type_df": "human_pr_task_type.parquet",
}
files_to_download = list(parquet_sources.values())
file_paths = {}

try:
    # Phase 1: download every file from Hugging Face into the local cache
    print("Downloading files from Hugging Face...")
    for filename in files_to_download:
        print(f"  Downloading {filename}...")
        file_paths[filename] = hf_hub_download(repo_id=dataset_name, filename=filename, repo_type="dataset")

    # Phase 2: read each cached parquet file into a dataframe
    print("\nLoading parquet files into dataframes...")
    loaded_frames = {
        var_name: pd.read_parquet(file_paths[parquet_file])
        for var_name, parquet_file in parquet_sources.items()
    }

    print("\n✓ All datasets loaded successfully!")
except Exception as e:
    print(f"Error loading datasets: {e}")
    traceback.print_exc()
    # On any failure mark EVERY dataframe as unavailable, so later cells can
    # test `is not None` instead of hitting a NameError or a stale frame left
    # over from an earlier run.
    loaded_frames = dict.fromkeys(parquet_sources)

# Bind the notebook-level names used by the following cells (None on failure).
all_pr_df = loaded_frames["all_pr_df"]
all_repo_df = loaded_frames["all_repo_df"]
all_user_df = loaded_frames["all_user_df"]
pr_df = loaded_frames["pr_df"]
repo_df = loaded_frames["repo_df"]
user_df = loaded_frames["user_df"]
pr_comments_df = loaded_frames["pr_comments_df"]
pr_reviews_df = loaded_frames["pr_reviews_df"]
pr_review_comments_df = loaded_frames["pr_review_comments_df"]
pr_commits_df = loaded_frames["pr_commits_df"]
pr_commit_details_df = loaded_frames["pr_commit_details_df"]
related_issue_df = loaded_frames["related_issue_df"]
issue_df = loaded_frames["issue_df"]
pr_timeline_df = loaded_frames["pr_timeline_df"]
pr_task_type_df = loaded_frames["pr_task_type_df"]
human_pr_df = loaded_frames["human_pr_df"]
human_pr_task_type_df = loaded_frames["human_pr_task_type_df"]

Loading datasets...
Downloading files from Hugging Face...
  Downloading all_pull_request.parquet...
  Downloading all_repository.parquet...
  Downloading all_user.parquet...
  Downloading pull_request.parquet...
  Downloading repository.parquet...
  Downloading user.parquet...
  Downloading pr_comments.parquet...
  Downloading pr_reviews.parquet...
  Downloading pr_review_comments_v2.parquet...
  Downloading pr_commits.parquet...
  Downloading pr_commit_details.parquet...
  Downloading related_issue.parquet...
  Downloading issue.parquet...
  Downloading pr_timeline.parquet...
  Downloading pr_task_type.parquet...
  Downloading human_pull_request.parquet...
  Downloading human_pr_task_type.parquet...

Loading parquet files into dataframes...

✓ All datasets loaded successfully!


In [6]:
if pr_df is not None:
    print("Preprocessing PR data...")
    # Work on an independent copy of the closed PRs so pr_df itself is not modified
    closed_prs = pr_df.loc[pr_df["state"] == "closed"].copy()
    for ts_col in ("created_at", "closed_at"):
        closed_prs[ts_col] = pd.to_datetime(closed_prs[ts_col])
    # Lifetime of each PR (creation to close), expressed in hours
    open_duration = closed_prs["closed_at"] - closed_prs["created_at"]
    closed_prs["lifecycle_hours"] = open_duration.dt.total_seconds() / 3600
    pr_lifecycle = closed_prs[["id", "state", "lifecycle_hours"]]
    print(pr_lifecycle.head())

Preprocessing PR data...
            id   state  lifecycle_hours
0   3264933329  closed        76.038611
1   3265118634  closed        17.258056
2   3265640341  closed         0.100833
3   3265709660  closed         0.635556
16  3234102722  closed        35.815278


In [7]:
if pr_timeline_df is not None:
    print("Calculating iteration cycles...")
    # "iteration_cycles" is the number of timeline rows recorded for each PR
    timeline_rows_per_pr = pr_timeline_df.groupby("pr_id").size()
    pr_iterations = timeline_rows_per_pr.reset_index(name="iteration_cycles")
    print(pr_iterations.head())

Calculating iteration cycles...
        pr_id  iteration_cycles
0  2756921963                30
1  2757103560                22
2  2757124156                 7
3  2757125491                 7
4  2757179026                15


In [8]:
if pr_reviews_df is not None:
    print("Analyzing sentiment...")
    # Keep only the PR key and review text; rows missing either one are dropped
    reviews = pr_reviews_df.loc[:, ["pr_id", "body"]].dropna().copy()

    sia = SentimentIntensityAnalyzer()

    def get_compound_score(text):
        """Return the VADER "compound" polarity score for one review body."""
        scores = sia.polarity_scores(text)
        return scores["compound"]

    def classify_sentiment(score):
        """Label a compound score: above 0.05 Positive, below -0.05 Negative, else Neutral."""
        if score > 0.05:
            return "Positive"
        if score < -0.05:
            return "Negative"
        return "Neutral"

    # Score every individual review
    print("Calculating VADER scores (this might take a moment)...")
    reviews["compound_score"] = reviews["body"].apply(get_compound_score)

    # One row per PR: the mean compound score over that PR's reviews
    print("Aggregating sentiment per PR...")
    pr_sentiment = (
        reviews
        .groupby("pr_id", as_index=False)["compound_score"]
        .mean()
        .rename(columns={"pr_id": "id", "compound_score": "mean_compound_score"})
    )

    # The category is derived from the per-PR mean, not from individual reviews
    pr_sentiment["sentiment_category"] = pr_sentiment["mean_compound_score"].apply(classify_sentiment)

    print(pr_sentiment.head())

Analyzing sentiment...
Calculating VADER scores (this might take a moment)...
Aggregating sentiment per PR...
           id  mean_compound_score sentiment_category
0  2760115428               0.0000            Neutral
1  2766353261              -0.1779           Negative
2  2768057346               0.4926           Positive
3  2768057378               0.4199           Positive
4  2768132850              -0.5267           Negative


In [9]:
if all(frame is not None for frame in (pr_df, pr_timeline_df, pr_reviews_df)):
    print("Merging data...")
    # Left join: every closed PR is kept, with NaN iteration_cycles when it has no timeline rows
    merged_df = pr_lifecycle.merge(pr_iterations, how="left", left_on="id", right_on="pr_id")

    # Inner join: only PRs with reviews/sentiment survive
    final_df = merged_df.merge(pr_sentiment, how="inner", on="id")
    print(final_df.head())

Merging data...
           id   state  lifecycle_hours       pr_id  iteration_cycles  \
0  3265709660  closed         0.635556  3265709660                11   
1  3214555104  closed        47.635833  3214555104                30   
2  3214724259  closed         0.004444  3214724259                12   
3  3214876564  closed         0.938333  3214876564                30   
4  3215868710  closed        13.475000  3215868710                30   

   mean_compound_score sentiment_category  
0             0.790600           Positive  
1             0.332200           Positive  
2             0.653100           Positive  
3             0.499460           Positive  
4             0.398367           Positive  


In [10]:
# Response variable: timeline events per hour of PR lifetime.
# A PR whose closed_at equals its created_at has lifecycle_hours == 0, and the
# division then yields +/-inf. Infinite values would corrupt the means and sums
# of squares in the ANOVA and Tukey steps, so they are recorded as missing (NaN).
final_df["iteration_per_hour"] = (
    final_df["iteration_cycles"] / final_df["lifecycle_hours"]
).replace([np.inf, -np.inf], np.nan)

In [11]:
# Split final_df into one dataframe per sentiment category
sentiment_label = final_df["sentiment_category"]
positive_df = final_df.loc[sentiment_label.eq("Positive")]
negative_df = final_df.loc[sentiment_label.eq("Negative")]
neutral_df = final_df.loc[sentiment_label.eq("Neutral")]

In [12]:
# Attach the `agent` column from closed_prs to each PR in final_df so that
# agents can be compared; the left join keeps every row of final_df.
pr_agents = closed_prs[["id", "agent"]]
merged_final_df = pd.merge(final_df, pr_agents, on="id", how="left")

In [13]:
# Attach the reviewer `user_type` from pr_reviews_df to each PR.
# The right-hand table holds the distinct (pr_id, user_type) pairs, so a PR
# reviewed by more than one user type matches several rows and is repeated
# in the result — NOTE(review): confirm this fan-out is intended for the ANOVA.
# The left frame already carries a `pr_id` column, so pandas suffixes the two
# columns as pr_id_x / pr_id_y.
reviewer_types = pr_reviews_df[["pr_id", "user_type"]].drop_duplicates()
merged_final_df = pd.merge(
    merged_final_df,
    reviewer_types,
    how="left",
    left_on="id",
    right_on="pr_id",
)

In [14]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Full-factorial model: three main effects, all two-way interactions and the
# three-way interaction, each factor treated as categorical via C(...)
anova_formula = 'iteration_per_hour ~ C(agent) * C(user_type) * C(sentiment_category)'
model = ols(anova_formula, data=merged_final_df).fit()

# Type II sums of squares for every term of the fitted model
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(agent),16085840.0,4.0,35.717816,1.591521e-29
C(user_type),3790535.0,1.0,33.666794,6.887059e-09
C(sentiment_category),512519.8,2.0,2.27605,0.1027808
C(agent):C(user_type),5487238.0,4.0,12.184144,7.278689e-10
C(agent):C(sentiment_category),6204049.0,8.0,6.887895,4.677021e-09
C(user_type):C(sentiment_category),1030431.0,2.0,4.576046,0.01033277
C(agent):C(user_type):C(sentiment_category),2178340.0,8.0,2.418449,0.01321321
Residual,652232300.0,5793.0,,


In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Post-hoc tests (Tukey HSD)
def _tukey_for_factor(data, factor, response="iteration_per_hour", alpha=0.05):
    """Run a one-way Tukey HSD comparison of `response` across the levels of `factor`.

    Rows where the response or the factor label is missing are dropped first.
    The formula-based OLS fit used for the ANOVA drops incomplete rows by
    default, whereas the columns would otherwise be handed to Tukey HSD as-is,
    missing values included.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the `response` and `factor` columns.
    factor : str
        Name of the grouping column whose levels are compared pairwise.
    response : str
        Name of the numeric outcome column.
    alpha : float
        Family-wise error rate passed to `pairwise_tukeyhsd`.

    Returns
    -------
    The result object returned by `pairwise_tukeyhsd`.
    """
    complete = data[[response, factor]].dropna()
    return pairwise_tukeyhsd(endog=complete[response], groups=complete[factor], alpha=alpha)


# compare levels of 'agent'
tukey_agent = _tukey_for_factor(merged_final_df, "agent")
print(tukey_agent)

# compare levels of 'user_type'
tukey_user = _tukey_for_factor(merged_final_df, "user_type")
print(tukey_user)

# compare levels of 'sentiment_category'
tukey_sentiment = _tukey_for_factor(merged_final_df, "sentiment_category")
print(tukey_sentiment)

        Multiple Comparison of Means - Tukey HSD, FWER=0.05        
   group1      group2     meandiff p-adj    lower    upper   reject
-------------------------------------------------------------------
Claude_Code      Copilot  -67.0081 0.1149 -143.1407   9.1244  False
Claude_Code       Cursor  118.8639 0.0008   36.3567 201.3711   True
Claude_Code        Devin  -49.4583 0.4148 -127.4152  28.4986  False
Claude_Code OpenAI_Codex   33.2762 0.7535  -42.6197 109.1722  False
    Copilot       Cursor   185.872    0.0  141.9737 229.7703   True
    Copilot        Devin   17.5498 0.6379  -17.0464   52.146  False
    Copilot OpenAI_Codex  100.2843    0.0   70.6226 129.9461   True
     Cursor        Devin -168.3222    0.0 -215.3134 -121.331   True
     Cursor OpenAI_Codex  -85.5877    0.0 -129.0743  -42.101   True
      Devin OpenAI_Codex   82.7346    0.0   48.6621  116.807   True
-------------------------------------------------------------------
 Multiple Comparison of Means - Tukey HSD, FWER=