# Import Required Libraries
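Import the libraries used throughout the notebook: os, csv, json, subprocess, and datetime from the standard library; git (GitPython) for repository access; pandas and SciPy for data analysis; matplotlib for visualization; and defaultdict from collections for counting.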

In [None]:
import os
import csv
import json
import subprocess
import git
import datetime
import pandas as pd
from scipy.stats import mannwhitneyu
from collections import defaultdict
import matplotlib.pyplot as plt

# Define Repository Data
Create the list of repositories with metadata including name, URL, type (AI-Coauthored or Human Written), AI tool used, and AI-coauthorship mention.

In [None]:
# Study corpus: one dict per repository (10 AI-coauthored followed by
# 10 human-written). Keys:
#   name       - repository name; also used as the clone directory name
#                under repos_dir by the processing loop
#   url        - clone URL
#   type       - "AI-Coauthored" or "Human Written"
#   ai_tool    - AI tool recorded for the repository ("N/A" for human-written)
#   ai_mention - note on where the AI co-authorship is mentioned ("N/A" for
#                human-written)
repositories = [
    {
        "name": "MeowAI",
        "url": "https://github.com/charlie-captain/MeowAI.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "Commit messages indicate 'Co-authored-by: GitHub Copilot'"
    },
    {
        "name": "cryptocurrency",
        "url": "https://github.com/antonioparraga/cryptocurrency.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "AI-coauthorship noted in commits"
    },
    {
        "name": "AutoScheduler",
        "url": "https://github.com/zhengyue08/AutoScheduler.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "README and commit notes mention AI assistance"
    },
    {
        "name": "argos-ai-adventure",
        "url": "https://github.com/argosopentech/argos-ai-adventure.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "‘Co-authored-by: GitHub Copilot’ appears in commit messages"
    },
    {
        "name": "ai-generated-games",
        "url": "https://github.com/alexkorep/ai-generated-games.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "AI-generated commits detected via commit annotations"
    },
    {
        "name": "ascii-games",
        "url": "https://github.com/willnippard/ascii-games.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "Indications in commit messages that AI assistance was used"
    },
    {
        "name": "vscode-python-github-copilot-wsl2",
        "url": "https://github.com/buanzo/vscode-python-github-copilot-wsl2.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "Repository name and commits clearly reference GitHub Copilot"
    },
    {
        "name": "written_by_chatgpt",
        "url": "https://github.com/hozgur/written_by_chatgpt.git",
        "type": "AI-Coauthored",
        "ai_tool": "ChatGPT",
        "ai_mention": "Repository title and commit messages indicate ChatGPT involvement"
    },
    {
        "name": "poker-gpt",
        "url": "https://github.com/thebyrd/poker-gpt.git",
        "type": "AI-Coauthored",
        "ai_tool": "ChatGPT",
        "ai_mention": "Title suggests GPT involvement; commit annotations note AI assistance"
    },
    {
        "name": "PyTaskBar",
        "url": "https://github.com/Grassboy/PyTaskBar.git",
        "type": "AI-Coauthored",
        "ai_tool": "GitHub Copilot",
        "ai_mention": "Commit history shows ‘Co-authored-by: GitHub Copilot’"
    },
    {
        "name": "python-frameworks-benchmark",
        "url": "https://github.com/startmatter/python-frameworks-benchmark.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "msspray",
        "url": "https://github.com/0xZDH/msspray.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "pytorch-res2net",
        "url": "https://github.com/4uiiurz1/pytorch-res2net.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "pytorch-dimenet",
        "url": "https://github.com/akirasosa/pytorch-dimenet.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "CRIME-poc",
        "url": "https://github.com/mpgn/crime-poc.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "voxceleb-luigi",
        "url": "https://github.com/maxhollmann/voxceleb-luigi.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "pycamloop",
        "url": "https://github.com/glefundes/pycamloop.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "Gnip-Insights-Interface",
        "url": "https://github.com/xdevplatform/Gnip-Insights-Interface.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "auckland-ai-meetup-x-triage",
        "url": "https://github.com/a-i-joe/auckland-ai-meetup-x-triage.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    },
    {
        "name": "NJUPT-API",
        "url": "https://github.com/gaoliang/njupt-api.git",
        "type": "Human Written",
        "ai_tool": "N/A",
        "ai_mention": "N/A"
    }
]

# Setup Helper Functions
Define utility functions including categorize_refactoring_type, clone_or_update_repo, estimate_loc, and get_commit_metrics.

In [None]:
def categorize_refactoring_type(ref_type):
    """Return the broad category name for a single refactoring type.

    Any type that is not listed in the table below is reported as "Other".
    """
    types_by_category = {
        "Naming Improvements": (
            "Rename Method",
            "Rename Class",
            "Rename Variable",
            "Rename Parameter",
        ),
        "Parameter Modifications": (
            "Add Parameter",
            "Remove Parameter",
            "Change Parameter",
            "Change/Rename Parameter",
        ),
        "Method Composition": (
            "Extract Method",
            "Inline Method",
        ),
        "Method Movement": (
            "Move Method",
            "Pull Up Method",
            "Push Down Method",
        ),
    }

    # Invert the grouped table into a flat type -> category lookup.
    category_of = {}
    for category, type_names in types_by_category.items():
        for type_name in type_names:
            category_of[type_name] = category

    return category_of.get(ref_type, "Other")

def clone_or_update_repo(repo_url, local_path):
    """Ensure a local checkout of ``repo_url`` is present at ``local_path``.

    If ``local_path`` does not exist the repository is cloned into it;
    otherwise the existing checkout is opened and its ``origin`` remote is
    pulled. Progress is reported on stdout.
    """
    if os.path.exists(local_path):
        print(f"Repository {local_path} exists. Pulling latest changes.")
        existing_checkout = git.Repo(local_path)
        existing_checkout.remotes.origin.pull()
        return

    print(f"Cloning {repo_url} into {local_path}")
    git.Repo.clone_from(repo_url, local_path)

def estimate_loc(local_path):
    """Count the non-blank lines of every ``.py`` file below ``local_path``.

    Files are read as UTF-8. A file whose reading raises any exception is
    reported on stdout and contributes nothing to the total.
    """
    non_blank_total = 0
    for dirpath, _subdirs, filenames in os.walk(local_path):
        for filename in filenames:
            if not filename.endswith('.py'):
                continue
            file_path = os.path.join(dirpath, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as source:
                    non_blank = sum(1 for text in source if text.strip())
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
            else:
                # Only add once the whole file was read successfully.
                non_blank_total += non_blank
    return non_blank_total

def get_commit_metrics(local_path):
    """Collect basic history metrics for the repository at ``local_path``.

    Returns a 5-tuple:
        (number of commits yielded by ``iter_commits()``,
         non-blank Python line count from ``estimate_loc``,
         number of distinct author e-mail addresses,
         commit dates as ascending ``datetime`` objects built with
         ``datetime.fromtimestamp`` from each commit's ``committed_date``,
         set of commit hex SHAs)
    """
    history = list(git.Repo(local_path).iter_commits())

    author_emails = set()
    hashes = set()
    dates = []
    for entry in history:
        author_emails.add(entry.author.email)
        dates.append(datetime.datetime.fromtimestamp(entry.committed_date))
        hashes.add(entry.hexsha)
    dates.sort()

    line_count = estimate_loc(local_path)
    return len(history), line_count, len(author_emails), dates, hashes

# Configure Directory and Output Settings
Set up the directory for cloning repositories and output CSV file names and paths.

In [None]:
# Directory (relative to the working directory) under which each repository
# is cloned.
repos_dir = "Repos"
# CSV file that receives one row of aggregated metrics per repository.
output_csv = "aggregated_repo_metrics_with_types.csv"

# Create the clone directory up front; exist_ok=True makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(repos_dir, exist_ok=True)

# Define Refactoring Categories
Create the REFACTORING_CATEGORIES dictionary, which maps each broad refactoring category to the list of specific refactoring types it covers.

In [None]:
# Broad refactoring category -> specific refactoring types it covers.
# The keys (in this order) are also appended to the CSV header as one
# count column per category.
REFACTORING_CATEGORIES = {
    "Naming Improvements": [
        "Rename Method",
        "Rename Class",
        "Rename Variable",
        "Rename Parameter",
    ],
    "Parameter Modifications": [
        "Add Parameter",
        "Remove Parameter",
        "Change Parameter",
        "Change/Rename Parameter",
    ],
    "Method Composition": [
        "Extract Method",
        "Inline Method",
    ],
    "Method Movement": [
        "Move Method",
        "Pull Up Method",
        "Push Down Method",
    ],
}

# Create Repository Processing Functions
Define functions to process repositories including run_pyref, get_refactoring_commits, get_refactoring_contributors, get_pyref_type_counts, get_most_common_refactoring_type, and get_refactoring_time_diffs_and_avg_timestamp.

In [None]:
def run_pyref(local_path):
    """Run PyRef on a cloned repository and load the JSON it leaves behind.

    PyRef is started as
    ``python3 ../PyRef/main.py getrefs -r <local_path> -s 10`` and its result
    is then looked up as ``<basename of local_path>_data.json`` in the current
    working directory.

    Args:
        local_path: Path of the local repository checkout.

    Returns:
        The parsed JSON content, or ``None`` when PyRef cannot be started,
        exits with a non-zero status, leaves no output file, or the file is
        not valid JSON. Every failure is reported on stdout.
    """
    # Pass an argument list rather than a shell string: the path reaches PyRef
    # verbatim, so quotes, spaces, "$" or ";" in it can neither break the
    # command nor be interpreted by a shell.
    cmd = ["python3", "../PyRef/main.py", "getrefs", "-r", local_path, "-s", "10"]
    print(f"Running PyRef for {local_path}")
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers an interpreter that cannot be launched at all; with
        # the former shell invocation that surfaced as a non-zero exit status.
        print(f"PyRef command failed for {local_path}: {e}")
        captured = getattr(e, "stderr", None)
        if captured:
            # Output is captured, so surface PyRef's own error text too.
            print(captured.decode("utf-8", errors="replace").strip())
        return None

    json_file = f"{os.path.basename(local_path)}_data.json"
    if not os.path.exists(json_file):
        print(f"PyRef JSON output file not found: {json_file}")
        return None

    try:
        with open(json_file, "r") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {json_file}: {e}")
        return None

def get_refactoring_commits(ref_data):
    """Return the distinct "Commit" values of the PyRef records.

    An empty set is returned when ``ref_data`` is empty or ``None``.
    """
    return {record["Commit"] for record in ref_data} if ref_data else set()

def get_refactoring_contributors(ref_data, local_path):
    """Return the author e-mail addresses of all refactoring commits.

    Each distinct "Commit" value in ``ref_data`` is looked up in the
    repository at ``local_path``. An empty set is returned, without opening
    the repository, when ``ref_data`` is empty or ``None``.
    """
    if not ref_data:
        return set()

    repo = git.Repo(local_path)
    distinct_hashes = {record["Commit"] for record in ref_data}
    return {repo.commit(sha).author.email for sha in distinct_hashes}

def get_pyref_type_counts(ref_data):
    """Tally how often each refactoring type occurs in the PyRef records.

    A record's "Refactoring Type" may be a single value or a list of values;
    records where it is missing or empty are ignored. The result is a
    ``defaultdict(int)`` keyed by type (empty when there is no data).
    """
    tally = defaultdict(int)
    for record in ref_data or ():
        kinds = record.get("Refactoring Type")
        if not kinds:
            continue
        # Treat a single type like a one-element list.
        if not isinstance(kinds, list):
            kinds = [kinds]
        for kind in kinds:
            tally[kind] += 1
    return tally

def get_most_common_refactoring_type(type_counts):
    """Return the refactoring type with the highest count.

    On a tie the type that appears first in ``type_counts`` wins. The string
    "None" is returned when there are no counts or no count is above zero.
    """
    if not type_counts:
        return "None"
    # max() keeps the first of several equally large entries.
    leading_type, leading_count = max(type_counts.items(), key=lambda pair: pair[1])
    if leading_count > 0:
        return leading_type
    return "None"

def get_refactoring_time_diffs_and_avg_timestamp(ref_data, local_path, commit_dates):
    """Compute the gaps between refactoring commits and their mean timestamp.

    All arithmetic is done on the commits' epoch seconds
    (``commit.committed_date``). Subtracting naive local ``datetime`` objects
    instead would make a gap that spans a daylight-saving change wrong by the
    UTC-offset shift and would tie the result to the machine's timezone.

    Args:
        ref_data: PyRef records; each must carry a "Commit" hash.
        local_path: Path of the local repository checkout.
        commit_dates: Unused; kept so existing callers keep working.

    Returns:
        ``(time_diffs, avg_timestamp)`` where ``time_diffs`` lists the seconds
        (as floats) between chronologically consecutive refactoring commits and
        ``avg_timestamp`` is the mean commit time in epoch seconds.
        ``([], 0)`` when there is no PyRef data; ``avg_timestamp`` is also 0
        when none of the hashes is found in the repository history.
    """
    if not ref_data:
        return [], 0

    ref_commits = {item["Commit"] for item in ref_data}
    repo = git.Repo(local_path)
    ref_epochs = sorted(
        commit.committed_date
        for commit in repo.iter_commits()
        if commit.hexsha in ref_commits
    )

    time_diffs = [
        float(later - earlier)
        for earlier, later in zip(ref_epochs, ref_epochs[1:])
    ]

    avg_refactoring_commit_timestamp = 0
    if ref_epochs:
        avg_refactoring_commit_timestamp = sum(ref_epochs) / len(ref_epochs)
    return time_diffs, avg_refactoring_commit_timestamp

# Process Repositories and Collect Metrics
Implement the main processing loop that iterates through repositories, collects metrics, and prepares data for analysis and output.

In [None]:
# Process every repository once: clone or update it, gather commit and
# refactoring metrics, write one CSV row per repository, and accumulate the
# per-group (AI-coauthored vs. human-written) samples used by the later
# statistical analysis.
all_time_diffs_ai = []
all_time_diffs_human = []
refactoring_category_counts_ai = defaultdict(int)
refactoring_category_counts_human = defaultdict(int)

# Explicit UTF-8: some "ai_mention" texts contain typographic quotes, which a
# locale-dependent default encoding may be unable to write.
with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = [
        "Repository Name", "Repository Link", "Human/AI-Coauthored",
        "AI Tool Used", "AI-coauthorship Mention", "Number of Commits",
        "Lines of Code/Size", "# of Contributors",
        "Total Refactorings", "Average Time-to-Refactor (sec)", "Refactoring Commits Percentage",
        "Average Commit Timestamp (Epoch Sec)", "Average Refactoring Commit Timestamp (Epoch Sec)", "Refactoring Timestamp Difference (days)",
        "Number of Refactoring Contributors", "Most Common Refactoring Type"
    ] + list(REFACTORING_CATEGORIES.keys())

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for repo_info in repositories:
        local_path = os.path.join(repos_dir, repo_info["name"])
        clone_or_update_repo(repo_info["url"], local_path)

        # Get commit metrics
        num_commits, loc, num_contributors, commit_dates, all_commit_hashes = get_commit_metrics(local_path)

        # Run PyRef and collect refactoring data (ref_data is None on failure)
        ref_data = run_pyref(local_path)
        refactoring_commits_hashes = get_refactoring_commits(ref_data)
        refactoring_commits_percentage = (len(refactoring_commits_hashes) / num_commits) * 100 if num_commits > 0 else 0

        # Gaps between consecutive refactoring commits and their mean timestamp
        time_diffs, avg_refactoring_commit_timestamp_epoch = get_refactoring_time_diffs_and_avg_timestamp(ref_data, local_path, commit_dates)
        avg_time_refactor = sum(time_diffs) / len(time_diffs) if time_diffs else 0

        avg_commit_timestamp_epoch = sum(date.timestamp() for date in commit_dates) / len(commit_dates) if commit_dates else 0
        # Positive when refactoring commits sit, on average, later in the
        # history than commits overall; 0 when either average is unavailable.
        timestamp_difference_days = (avg_refactoring_commit_timestamp_epoch - avg_commit_timestamp_epoch) / 86400.0 if avg_refactoring_commit_timestamp_epoch and avg_commit_timestamp_epoch else 0

        # Collect refactoring contributors and type counts
        refactoring_contributors = get_refactoring_contributors(ref_data, local_path)
        num_refactoring_contributors = len(refactoring_contributors)
        type_counts = get_pyref_type_counts(ref_data)
        most_common_type = get_most_common_refactoring_type(type_counts)

        # Roll the per-type counts up into categories instead of walking
        # ref_data a second time with the same list/scalar handling.
        refactoring_category_counts = defaultdict(int)
        total_refactorings = len(ref_data) if ref_data else 0
        for ref_type, count in type_counts.items():
            refactoring_category_counts[categorize_refactoring_type(ref_type)] += count

        # Aggregate time differences and category counts by repository type
        if repo_info["type"] == "AI-Coauthored":
            group_time_diffs = all_time_diffs_ai
            group_category_counts = refactoring_category_counts_ai
        else:
            group_time_diffs = all_time_diffs_human
            group_category_counts = refactoring_category_counts_human
        group_time_diffs.extend(time_diffs)
        for category, count in refactoring_category_counts.items():
            group_category_counts[category] += count

        # Write repository metrics to CSV
        row_data = {
            "Repository Name": repo_info["name"],
            "Repository Link": repo_info["url"],
            "Human/AI-Coauthored": repo_info["type"],
            "AI Tool Used": repo_info["ai_tool"],
            "AI-coauthorship Mention": repo_info["ai_mention"],
            "Number of Commits": num_commits,
            "Lines of Code/Size": f"{loc} lines",
            "# of Contributors": num_contributors,
            "Total Refactorings": total_refactorings,
            "Average Time-to-Refactor (sec)": avg_time_refactor,
            "Refactoring Commits Percentage": refactoring_commits_percentage,
            "Average Commit Timestamp (Epoch Sec)": avg_commit_timestamp_epoch,
            "Average Refactoring Commit Timestamp (Epoch Sec)": avg_refactoring_commit_timestamp_epoch,
            "Refactoring Timestamp Difference (days)": timestamp_difference_days,
            "Number of Refactoring Contributors": num_refactoring_contributors,
            "Most Common Refactoring Type": most_common_type
        }
        # Only the named categories get a column; "Other" is not written.
        for category in REFACTORING_CATEGORIES:
            row_data[category] = refactoring_category_counts.get(category, 0)

        writer.writerow(row_data)
        print(f"Processed repository: {repo_info['name']}")

# Generate CSV Output
Create and write to the CSV file with all the aggregated repository metrics.

In [None]:
# The aggregated metrics CSV is already written by the repository processing
# loop in the previous cell. This cell used to repeat that entire loop, which
# cloned/pulled and analysed every repository a second time and appended every
# time difference and category count to the AI/human accumulators again, so
# the statistical test below ran on duplicated samples. It now only confirms
# that the output file is in place.
if os.path.isfile(output_csv):
    print(
        f"Aggregated metrics available at {os.path.abspath(output_csv)} "
        f"({os.path.getsize(output_csv)} bytes)"
    )
else:
    print(f"{output_csv} not found - run the repository processing cell first.")

# Perform Statistical Analysis
Run a Mann-Whitney U test to compare the time gaps between consecutive refactoring commits in AI-coauthored repositories with those in human-written repositories.

In [None]:
# Compare the gaps between consecutive refactoring commits of the two groups
# with a Mann-Whitney U test. The alternative is spelled out so the reported
# p-value is two-sided regardless of the installed SciPy version's default.
if all_time_diffs_ai and all_time_diffs_human:
    stat, p_value = mannwhitneyu(
        all_time_diffs_ai, all_time_diffs_human, alternative="two-sided"
    )
    print("Mann-Whitney U test results:")
    print(f"Statistic: {stat}, p-value: {p_value}")
else:
    # At least one group produced no time gaps, so there is nothing to compare.
    print("Not enough data for Mann-Whitney U test.")