In [2]:
import os
import re
import json
import tempfile
import itertools
from urllib.parse import urlparse

import clang.cindex
from tqdm import tqdm
import pandas as pd
from git import Repo, Commit, Diff
from rich import print

In [37]:
# Load data/devign_2023_functions.csv into `df`.
df = pd.read_csv("data/devign_2023_functions.csv")

In [40]:
# Show how many rows carry each distinct value of the "labels" column.
df["labels"].value_counts()

labels
non-bugfix    12945
bugfix         3384
Name: count, dtype: int64

In [3]:
# Create the libclang index; the extraction loop further down parses C files
# through it via `index.parse(...)`.
index = clang.cindex.Index.create()

In [20]:
# Load data/devign.csv and keep only rows that are not merge commits and whose
# labels do not mention "outside-threshold".
df = pd.read_csv("data/devign.csv")

# Compare on the string form so the filter works whether pandas parsed
# "is_merge" as booleans or left it as the text "True"/"False".
not_merge = df["is_merge"].astype(str) == "False"

# regex=False: the needle is a literal substring, not a pattern.
# na=False: a missing label counts as "does not contain" instead of putting NaN
# into the mask (which cannot be negated or used for boolean indexing).
outside_threshold = df["labels"].str.contains(
    "outside-threshold", regex=False, na=False
)

df = df[not_merge & ~outside_threshold]

In [13]:
# Read data/output.csv and retain only the rows whose "label" is "non-bugfix".
df = pd.read_csv("data/output.csv").loc[
    lambda frame: frame["label"] == "non-bugfix"
]

In [15]:
# Draw a reproducible random subset of 15000 rows (fixed seed 42).
# NOTE(review): DataFrame.sample draws without replacement by default, so this
# raises ValueError when `df` has fewer than 15000 rows.
df = df.sample(15000, random_state=42)

In [17]:
# Derive the local clone directory of every row from its remote URL:
# data/repositories/<host>/<first two path segments of the URL>.
urls = list(map(urlparse, df["remote_url"]))
paths = []
for url in urls:
    leading_segments = "/".join(url.path.split("/")[1:3])
    paths.append(f"data/repositories/{url.netloc}/{leading_segments}")
df["path"] = paths

# Open each repository only once and resolve every row's sha to a Commit,
# keeping `commits` in the same order as the rows of `df`.
repos = {}
commits: list[Commit] = []
for clone_dir, sha in zip(df["path"], df["sha"]):
    if clone_dir not in repos:
        repos[clone_dir] = Repo(clone_dir)
    commits.append(repos[clone_dir].commit(sha))

In [20]:
def get_hunk_headers_function(diff: "Diff"):
    """Return the function-context text of every hunk header in a patch.

    A unified-diff hunk header looks like ``@@ -a,b +c,d @@ <context>``, where
    ``<context>`` is the optional trailing text after the second ``@@``. The
    ``,b`` / ``,d`` line counts are omitted by diff tools when they equal 1
    (``@@ -5 +5 @@ ...``), so both spellings are accepted.

    Args:
        diff: object whose ``diff`` attribute holds the patch text, either as
            bytes (decoded as latin-1, which never fails) or as ``str``.

    Returns:
        list[str]: the trailing context of each hunk header, in patch order.
        Headers without trailing context are skipped. Empty or missing patch
        text yields an empty list.
    """
    patch = diff.diff
    if not patch:
        return []
    if isinstance(patch, bytes):
        patch = patch.decode("latin-1")

    # One anchored pattern both recognises the header and captures the context.
    hunk_header = re.compile(r"^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@ (.+)")

    functions = []
    for line in patch.split("\n"):
        match = hunk_header.match(line)
        if match:
            functions.append(match.group(1))
    return functions


def find_function(node, function_name):
    """Find the cursor of the C function called `function_name` below `node`.

    The tree is walked depth-first in source order. A FUNCTION_DECL cursor that
    is the function's definition is preferred, because a forward prototype
    carries the same spelling but its extent covers only the prototype. If no
    definition exists, the first matching declaration is returned.

    Args:
        node: root cursor to search from (e.g. ``translation_unit.cursor``).
        function_name: exact spelling of the function to look for.

    Returns:
        The matching cursor, or None when no function of that name is found.
    """
    first_declaration = None
    # Explicit stack instead of recursion, so very deep ASTs cannot hit Python's
    # recursion limit.
    pending = [node]
    while pending:
        cursor = pending.pop()
        if (
            cursor.kind == clang.cindex.CursorKind.FUNCTION_DECL
            and cursor.spelling == function_name
        ):
            if cursor.is_definition():
                return cursor
            if first_declaration is None:
                first_declaration = cursor
            continue
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(list(cursor.get_children())))
    return first_declaration


def get_function_source(file_path, function):
    """Return the text of `function` as it appears in the file at `file_path`.

    Args:
        file_path: path of the UTF-8 encoded source file to read.
        function: object whose ``extent.start.line`` and ``extent.end.line``
            give the 1-based first and last line of the function.

    Returns:
        str: the lines from the first to the last line (both inclusive),
        joined together with their original line endings.
    """
    extent = function.extent
    first_line, last_line = extent.start.line, extent.end.line

    with open(file_path, "r", encoding="utf-8") as handle:
        all_lines = list(handle)

    # Extent lines are 1-based and inclusive; list slices are 0-based, half-open.
    return "".join(all_lines[first_line - 1 : last_line])


def batch(iterable, size):
    """Yield successive lists of at most `size` items taken from `iterable`.

    Args:
        iterable: any iterable; it is consumed lazily, one chunk at a time.
        size: maximum number of items per yielded list.

    Yields:
        list: consecutive chunks in input order. Only the last chunk may be
        shorter than `size`. Nothing is yielded for an empty iterable.
    """
    sourceiter = iter(iterable)
    while True:
        chunk = list(itertools.islice(sourceiter, size))
        if not chunk:
            # Source exhausted: stop instead of yielding empty lists forever.
            return
        yield chunk


BATCH_SIZE = 10  # number of commits processed between two writes to disk
START_INDEX = 3070  # position in `commits` where this run starts; earlier ones are skipped
TEMP_SOURCE_PATH = "data/temp.c"
JSONL_OUTPUT_PATH = "data/function_sources.jsonl"
CSV_OUTPUT_PATH = "data/function_sources.csv"


def _extract_function_record(commit, label):
    """Build the output record for one commit, or return None if it is filtered out.

    A commit is kept only when all of the following hold:
      * it has a parent to diff against,
      * the diff against its first parent modifies exactly one ``.c`` file,
      * exactly one hunk header of that diff carries function context,
      * a function name (``name(``) can be read from that context,
      * the file can be read at this commit and libclang finds the function.

    Args:
        commit: the GitPython commit to inspect.
        label: label value stored verbatim in the record.

    Returns:
        dict with keys "sha", "remote_url", "label", "commit_msg" and
        "function", or None when the commit does not pass the filters.
    """
    # Root commits have no parent to diff against.
    if not commit.parents:
        return None

    diff_items = commit.parents[0].diff(commit, create_patch=True)
    c_file_diffs = [
        diff
        for diff in diff_items.iter_change_type("M")
        if diff.a_path.endswith(".c")
    ]
    if len(c_file_diffs) != 1:
        return None
    diff = c_file_diffs[0]

    hunk_functions = get_hunk_headers_function(diff)
    if len(hunk_functions) != 1:
        return None

    name_match = re.search(r"(\w+)\(", hunk_functions[0])
    if name_match is None:
        return None
    function_name = name_match.group(1)

    try:
        code = commit.repo.git.show(f"{commit.hexsha}:{diff.a_path}")
    except Exception:
        # Best effort: skip files git cannot show. Catching Exception rather
        # than everything keeps Ctrl-C able to stop this long-running loop.
        return None

    # libclang parses from disk, and get_function_source reads the same file back.
    with open(TEMP_SOURCE_PATH, "w", encoding="utf-8") as file:
        file.write(code)

    translation_unit = index.parse(TEMP_SOURCE_PATH)
    c_func = find_function(translation_unit.cursor, function_name)
    if c_func is None:
        return None

    return {
        "sha": commit.hexsha,
        "remote_url": commit.repo.remotes[0].url,
        "label": label,
        "commit_msg": commit.message,
        "function": get_function_source(TEMP_SOURCE_PATH, c_func),
    }


def _append_records(records, write_header):
    """Append `records` to the JSONL and CSV outputs; emit the CSV header on request."""
    results_df = pd.DataFrame(records)
    results_df.to_json(JSONL_OUTPUT_PATH, mode="a", lines=True, orient="records")
    results_df.to_csv(CSV_OUTPUT_PATH, mode="a", index=False, header=write_header)


pending_commits = commits[START_INDEX:]

# The CSV is opened in append mode, so only write the header when the file is
# being created; a resumed run must not insert a second header row.
header = not (
    os.path.exists(CSV_OUTPUT_PATH) and os.path.getsize(CSV_OUTPUT_PATH) > 0
)

results = []
for offset, commit in enumerate(tqdm(pending_commits)):
    if offset % BATCH_SIZE == 0:
        # Start of a new batch; `results` always holds the current batch only.
        results = []

    # `commits` was built row by row from `df`, so position k in `commits`
    # corresponds to row k of `df`.
    label = df.iloc[START_INDEX + offset]["label"]

    record = _extract_function_record(commit, label)
    if record is not None:
        results.append(record)

    batch_complete = (
        (offset + 1) % BATCH_SIZE == 0 or offset + 1 == len(pending_commits)
    )
    if batch_complete and results:
        _append_records(results, header)
        header = False

100%|██████████| 11930/11930 [21:24<00:00,  9.29it/s]


In [226]:
# Export `results` as a pretty-printed JSON array.
# NOTE(review): the extraction loop re-creates `results` for every batch, so at
# this point it only holds the records of the last batch that was processed.
results_df = pd.DataFrame(results)

# Each record stores a single "function" string, so there is no "functions"
# column to count. Serialise through pandas, then write once with indentation
# instead of writing, re-reading and rewriting the file.
data = json.loads(results_df.to_json(orient="records"))

with open("data/function_sources.json", "w") as file:
    json.dump(data, file, indent=2)

In [None]:
# Reload the extracted functions from the CSV and preview the first 20 rows.
functions = pd.read_csv("data/function_sources.csv")
functions.head(20)