# TASK 1: Defect Analysis

In [None]:
import subprocess
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
KEYWORDS = ["fix", "bug", "error", "issue", "hotfix", "resolve", "repair"]

- Obtention of commits since a given date (2023-01-01)

In [None]:
def get_git_commits_since(date="2023-01-01"):
    """
    Gets git commits since a given date using git log.
    """
    git_command = [
        "git", "log",
        f'--since={date}',
        "--pretty=format:%H|%ad|%s",
        "--date=short",
        "--name-only"
    ]
    result = subprocess.run(git_command, capture_output=True, text=True, encoding="utf-8", check=True)
    return result.stdout.split("\n") #we split the result in lines to process it more easily

- Creation of a commit dictionary (from unstructured raw git output to structured data)

In [None]:
def parse_commits(raw_lines):
    """
    Convert the raw lines returned by get_commits_since()
    into a clean list of commit dictionaries.
    Each commit contains: hash, date, message, and the files touched.
    """
    commits = []
    current = {}

    for line in raw_lines:
        if "|" in line: # a header line contains hash | date | message
            if current:
                commits.append(current)
            hash_, date, msg = line.split("|", 2)
            current = {"hash": hash_, "date": date, "msg": msg, "files": []}
        elif line.strip() == "":
            # empty line means a separator in git log output
            continue
        else:
            # file modified in this commit
            if "files" in current:
                current["files"].append(line.strip())

    if current:
        commits.append(current)

    return commits

- Detection of defect commits

In [None]:
def is_defect_commit(message):
    """
    If the commit message contains any keyword that is considered as defect, it outputs True.
    """
    msg = message.lower()
    return any(k in msg for k in KEYWORDS)

- Computation of defects per month, defects per file and defects per month for a file

In [None]:
def defects_per_month(commits):
    """
    Counts how many defect commits happened in each month, and returns a dict of the form: { 'YYYY-MM' : number_of_defects }
    """
    counter = defaultdict(int)
    for c in commits:
        # only count commits whose message suggests a defect fix
        if is_defect_commit(c["msg"]):
            month = c["date"][:7] # extracting YYYY-MM
            counter[month] += 1
    return counter


def defects_per_file(commits):
    """
    Counts how many times a file appeared in a defect commit. Returns a dict: { 'path/to/file.py' : number_of_defects }
    """
    file_counts = defaultdict(int)
    for c in commits:
        if is_defect_commit(c["msg"]):
            #every modified file in this commit is counted
            for f in c["files"]:
                file_counts[f] += 1
    return file_counts


def defects_per_month_for_file(commits, target_file):
    """
    Counts defect commits for a specific file, group by month. Returns a dict: { 'YYYY-MM' : number_of_defects_for_that_file }
    """
    counter = defaultdict(int)
    for c in commits:
        # only count defect commits that touched this specific file
        if is_defect_commit(c["msg"]) and target_file in c["files"]:
            month = c["date"][:7]
            counter[month] += 1
    return counter


In [None]:
def build_month_series(commits, defects_by_month):
    """
    Creates a month-by-month series based on all commit dates,
    filling missing months with 0 defects. 
    """
    all_months = sorted({commit["date"][:7] for commit in commits}) # we extract all months that appear in the commit history

    full_series = {month: defects_by_month.get(month, 0) for month in all_months} # we build a complete series, using 0 when a month has no defects. This is useful for ploting all months.
    return full_series

- Plotting utils

In [None]:
def plot_bar_chart(data, title, ylabel):

    months = sorted(data.keys()) # we sort keys to ensure the bars appear in chronological order
    values = [data[m] for m in months]

    plt.rcParams["figure.figsize"] = (10, 5) 
    plt.rcParams["figure.dpi"] = 120
    plt.figure()
    plt.bar(months, values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

- Run Task 1 Analysis

In [None]:
#Obtaining commits sice 2023-01-01
raw = get_git_commits_since()
commits = parse_commits(raw)
len(commits)

In [None]:
#Total number of defects per month
dpm = defects_per_month(commits)
dpm_full = build_month_series(commits, dpm)
print("Defects per month:", dpm_full)
plot_bar_chart(dpm_full, "Defects per Month", "Number of Defects")

In [None]:
#Top 2 defective files
df = defects_per_file(commits)
top2 = sorted(df.items(), key=lambda x: x[1], reverse=True)[:2]
top2

In [None]:
#Defects per month for each of the most defective files
for f, count in top2:
    print(f"\nFile: {f} (total defects: {count})")
    dpf = defects_per_month_for_file(commits, f)
    print("Defects per month for this file:", dpf)
    full_counts = build_month_series(commits, dpf)
    plot_bar_chart(full_counts, f"Defects per Month for {f}", "Number of Defects")

# Task 2: Complexity Analysis

Setup transformers repository

In [29]:
from pathlib import Path

repo_url = "https://github.com/huggingface/transformers"

if not Path("transformers").exists():
    print("Cloning the Transformers repository...")
    subprocess.run(["git", "clone", repo_url], check=True)

/


Implement task

In [33]:
import os

from pathlib import Path
from matplotlib import pyplot as plt
from wordcloud import WordCloud


def dict_to_markdown_table(input_data: dict[str, int]) -> str:
    header = "| Count | File Name |\n"
    separator = "|-----------|------------|\n"
    rows = ""

    for key, value in input_data.items():
        rows += f"| {value} | {key} |\n"

    return header + separator + rows


def count_commits_in_repo(repo_path: Path, since_date: str) -> dict[str, int]:
    result = {}
    command = f"git log --since='{since_date}' --pretty=format: --name-only"

    os.chdir(repo_path)
    output = subprocess.check_output(command, shell=True, text=True)
    files = output.strip().splitlines()

    for file in files:
        if file and file.endswith('.py'):
            result[file] = result.get(file, 0) + 1

    return result


def count_lines_of_python_files(directory: Path, exclude_venv: bool = True, exclude_empty_init: bool = True) -> dict[str, int]:
    result = {}

    if not directory.exists():
        print(f"ERROR: Directory {directory} does not exist")
        return result

    for file in directory.glob("**/*.py"):
        lines_count = len(file.open("r", encoding='utf-8').readlines())
        relative_file_path = file.relative_to(directory).__str__()

        if exclude_venv and relative_file_path.startswith(".venv"):
            continue
        elif exclude_empty_init and file.name == "__init__.py" and lines_count == 0:
            continue

        result[relative_file_path] = lines_count

    return result


def create_bar_chart(input_data: dict[str, int], show_top_n_entries: int = 30) -> None:
    top_files = sorted(input_data.items(), key=lambda item: item[1], reverse=True)[:show_top_n_entries]
    file_names, line_counts = zip(*top_files)

    plt.figure(figsize=(40, 20))
    plt.barh(file_names, line_counts, color='skyblue')
    plt.title(f"Top {show_top_n_entries} Files by Line Count")
    plt.xlabel("Line Count")
    plt.ylabel("File Name")
    plt.gca().invert_yaxis()
    plt.grid(axis="x")
    plt.show()
    plt.clf()


def create_word_cloud(input_data: dict[str, int], width: int = 1080, height: int = 720) -> None:
    wordcloud = WordCloud(width=width, height=height, background_color="white")
    wordcloud.generate_from_frequencies(input_data)

    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    plt.clf()


if __name__ == "__main__":
    os.chdir('/')

    repo_path = Path("transformers")
    figures_output_path = Path("images")

    # Count lines of code
    commit_counts = count_commits_in_repo(repo_path, "2023-01-01")
    sorted_commit_counts = dict(sorted(commit_counts.items(), key=lambda item: item[1], reverse=True))
    markdown_table = dict_to_markdown_table(sorted_commit_counts)
    Path("docs/commits_count_table.md").open("w", encoding="utf-8").write(markdown_table)

    # Count commits
    line_counts_total = count_lines_of_python_files(repo_path, exclude_venv=False, exclude_empty_init=False)
    line_counts_without_venv_and_empty_init_files = count_lines_of_python_files(repo_path)
    line_counts_test_module = {}
    line_counts_src_module = {}

    for key, value in line_counts_without_venv_and_empty_init_files.items():
        if key.startswith("tests"):
            line_counts_test_module[key.replace("tests/", "")] = value
        elif key.startswith("src/transformers"):
            line_counts_src_module[key.replace("src/transformers/", "")] = value

    sorted_line_counts_total = dict(sorted(line_counts_total.items(), key=lambda item: item[1], reverse=True))
    markdown_table = dict_to_markdown_table(sorted_line_counts_total)
    Path("docs/line_count_table.md").open("w", encoding="utf-8").write(markdown_table)

    create_bar_chart(line_counts_src_module, figures_output_path)
    create_bar_chart(line_counts_test_module, figures_output_path)
    create_word_cloud(line_counts_src_module, figures_output_path)
    create_word_cloud(line_counts_test_module, figures_output_path)

    # use both metrics
    top_n = 10

    commit_counts = count_commits_in_repo(repo_path, "2023-01-01")
    sorted_commit_counts = sorted(commit_counts.items(), key=lambda item: item[1], reverse=True)[:top_n]

    line_counts = count_lines_of_python_files(repo_path)
    sorted_line_counts = sorted(line_counts.items(), key=lambda item: item[1], reverse=True)[:top_n]

    header = "| Top n | File Name | Commits / Lines of Code (LoC) | Bar |\n"
    separator = "|-----------|-----------|-----------|------------|\n"
    rows = header + separator
    for i in range(top_n):
        commit_count_file_name = sorted_commit_counts[i][0]
        commit_count = sorted_line_counts[i][1]

        lines_count_file_name = sorted_line_counts[i][0]
        lines_count = sorted_commit_counts[i][1]

        bar = f"*" * (commit_count // 70)
        rows += f"| {i + 1} | {commit_count_file_name} | Commits: {commit_count:,} | {bar} |\n"

        bar = f"*" * (lines_count // 70)
        rows += f"| {i + 1} | {lines_count_file_name} | LoC: {lines_count:,} | {bar} |\n"

    Path("../docs/LoC_and_NCC.md").open("w", encoding="utf-8").write(rows)

FileNotFoundError: [Errno 2] No such file or directory: 'transformers'

# TASK 3: Coupling Analysis

In [None]:
from collections import defaultdict
import itertools
import os
import matplotlib.pyplot as plt

- Computation of logical coupling for all Python file pairs

In [None]:
def is_python_file(path):
    return path.endswith(".py") # check to keep only Python source files.

def compute_logical_coupling(commits):
    coupling_counts = defaultdict(int)

    for commit in commits:
        python_files = [f for f in commit["files"] if is_python_file(f)] # we keep only Python files for this commit

        
        unique_files = sorted(set(python_files)) # we use a set to avoid counting the same file twice in one commit

        # For each unordered pair of files, we increase the coupling count
        for file_a, file_b in itertools.combinations(unique_files, 2):
            pair = (file_a, file_b)
            coupling_counts[pair] += 1

    return coupling_counts

- Plot of the top 10 coupled pairs 

In [None]:
def plot_top_coupled_pairs(coupling_counts, top_n=10, title="Top coupled file pairs"):

    
    top_pairs = sorted(coupling_counts.items(), key=lambda item: item[1], reverse=True)[:top_n] # we sort pairs by co-change count in descending order 

    pair_labels = [f"{a}\n{b}" for (a, b), _ in top_pairs]
    cochange_values = [count for _, count in top_pairs]

    plt.figure(figsize=(10, 6), dpi=120)
    plt.barh(range(len(top_pairs)), cochange_values)
    plt.yticks(range(len(top_pairs)), pair_labels)
    plt.xlabel("Number of co-changes")
    plt.title(title)
    plt.gca().invert_yaxis() 
    plt.tight_layout()
    plt.show()

    return top_pairs 

# we run the analysis for all Python file pairs
all_coupling = compute_logical_coupling(commits)
top10_all = plot_top_coupled_pairs(all_coupling, top_n=10, title="Top 10 logically coupled Python file pairs")
top10_all

- Computation of Logical Coupling restricted to test - non-test pairs

In [None]:
def is_test_file(path):
    return is_python_file(path) and os.path.basename(path).startswith("test") # we consider a python test file those files that start with "test" and end with .py

def filter_test_non_test_pairs(coupling_counts):
    filtered = {}

    for (file_a, file_b), count in coupling_counts.items():
        a_is_test = is_test_file(file_a)
        b_is_test = is_test_file(file_b)

        # we keep pairs where one is test and the other is non-test
        if a_is_test ^ b_is_test:  # XOR-> exactly one True
            filtered[(file_a, file_b)] = count

    return filtered


test_non_test_coupling = filter_test_non_test_pairs(all_coupling)

top10_test_non_test = plot_top_coupled_pairs(
    test_non_test_coupling,
    top_n=10,
    title="Top 10 test - non-test Python file pairs"
)

top10_test_non_test

### Method 1: Matching based on name:
We find the most likely test file for a given source file based on naming patterns.
    Examples:
    - source: utils.py  ->  test_utils.py or utils_test.py
    - source: trainer.py ->  test_trainer.py or trainer_test.py

In [None]:
from pathlib import Path
from typing import Optional, List

In [None]:
def find_all_test_files(tests_root: Path) -> List[Path]:
    return [p for p in tests_root.rglob("test*.py") if p.is_file()] #we collect all Python test files under the given tests_root

In [None]:

def find_test_by_name(source_file: Path, tests_root: Path) -> Optional[Path]:

    source_name = source_file.stem 

    expected_names = {
        f"test_{source_name}.py",
        f"{source_name}_test.py",
    }

    candidates = []
    for test_file in find_all_test_files(tests_root):
        if test_file.name in expected_names:
            candidates.append(test_file)

    if not candidates:
        return None

    # If there are several, we just return the one with the shortest path (more specific)
    return sorted(candidates, key=lambda p: len(p.as_posix()))[0]


In [None]:
project_root = Path(".").resolve()
source_file = project_root / "src/transformers/generation/utils.py"
tests_root = project_root / "tests"

best_name_match = find_test_by_name(source_file, tests_root)
best_name_match

### Method 2: Implement test finding method: find test through import statements

In [None]:
import ast
from pathlib import Path


def get_imports(file_path: Path) -> set:
    node = ast.parse(file_path.open("r").read(), filename=file_path)

    imports = set()
    for n in ast.walk(node):
        if isinstance(n, ast.Import):
            for alias in n.names:
                imports.add(alias.name)
        elif isinstance(n, ast.ImportFrom):
            imports.add(n.module if n.module else '')

    return imports


def find_related_test_file(source_file: Path, test_directory: Path) -> Path | None:
    source_imports = get_imports(source_file)
    related_tests = {}

    for file in test_directory.iterdir():
        if file.name.endswith('_test.py') or file.stem.startswith('test_'):
            test_file_path = file
            test_imports = get_imports(test_file_path)

            match_count = len(source_imports.intersection(test_imports))
            if match_count > 0:
                related_tests[test_file_path] = match_count

    if related_tests:
        return max(related_tests, key=related_tests.get)
    return None


test_source_file = Path('src/transformers/generation/utils.py')
test_directory = Path('tests')
related_test_file = find_related_test_file(test_source_file, test_directory)
print(f'Related test file: {related_test_file}')