In [111]:
import os
import sys
import time

import git

# NOTE(review): absolute path to one machine's checkout. The relative paths
# used by later cells (repository location, subprocess working directories)
# resolve against this directory, so the notebook only runs where it exists.
os.chdir("/Users/bohrok/Documents/replication-kit-2020-line-validation")
import difflib
import pprint
import subprocess
from typing import Dict, Iterator, List, Set, Tuple, Sequence, Optional

import javalang

# Shared pretty-printer used by later cells to dump nested dictionaries.
pp = pprint.PrettyPrinter(indent=4)


In [112]:
# Relative path: resolved against the working directory set by os.chdir above.
repo_path = "data/repos/ant-ivy"
repo = git.Repo(repo_path)
# Sanity check: stop here if the repository is bare.
assert not repo.bare

In [113]:
# Exploration: dump the main attributes of the two newest commits on master.
# NOTE: `cid` and `commit` stay bound in the notebook namespace after the loop.
for cid, commit in enumerate(repo.iter_commits("master")):
    print(f"{cid=}")
    print(f"{commit=}")
    print(f"{commit.hexsha=}")
    print(f"{commit.message=}")
    print(f"{commit.author=}")
    # Timestamps are rendered in UTC (time.gmtime).
    print(
        f"commit.authored_date={time.strftime('%Y %b %d %H:%M', time.gmtime(commit.authored_date))}"
    )
    print(f"{commit.committer=}")
    print(
        f"commit.committed_date={time.strftime('%Y %b %d %H:%M', time.gmtime(commit.committed_date))}"
    )
    print(f"{commit.parents=}")
    print(f"{commit.tree=}")
    # for entry in commit.tree:
    #     print(f"\t{entry=}")
    print(f"{commit.stats=}")
    print(f"{commit.summary=}")
    # diff() without an argument; labelled below as the diff against the index.
    print(f"diff from index : {commit.diff()}")
    print("diff from parent:")
    # One diff per parent (a merge commit has several).
    for parent in commit.parents:
        print(f"\t{parent=}, diff={commit.diff(parent)}")
    print()
    # Stop once the commits with index 0 and 1 have been printed.
    if cid >= 1:
        break


cid=0
commit=<git.Commit "cd9296a409e3a4ccc7c8a532a56f2d2570b39493">
commit.hexsha='cd9296a409e3a4ccc7c8a532a56f2d2570b39493'
commit.message='add contributor of IVY-1632 to contributors list\n'
commit.author=<git.Actor "Jaikiran Pai <jaikiran@apache.org>">
commit.authored_date=2022 Jan 15 04:27
commit.committer=<git.Actor "Jaikiran Pai <jaikiran@apache.org>">
commit.committed_date=2022 Jan 15 04:27
commit.parents=(<git.Commit "a2701d733a57f28bd4b1e0111c61b659efef6444">,)
commit.tree=<git.Tree "746aa64a9ab4bf5c576096565b05271360c6c77e">
commit.stats=<git.util.Stats object at 0x10ac32bb0>
commit.summary='add contributor of IVY-1632 to contributors list'
diff from index : [<git.diff.Diff object at 0x10ae5a670>, <git.diff.Diff object at 0x10ac64e50>]
diff from parent:
	parent=<git.Commit "a2701d733a57f28bd4b1e0111c61b659efef6444">, diff=[<git.diff.Diff object at 0x11195ddc0>]

cid=1
commit=<git.Commit "a2701d733a57f28bd4b1e0111c61b659efef6444">
commit.hexsha='a2701d733a57f28bd4b1e0111c61b6

In [114]:
# Take the first changed file between HEAD and its first parent.
head_commit = repo.head.commit
diff = head_commit.diff(head_commit.parents[0])[0]
print(diff)
# The diff is taken from HEAD towards the parent, so the "a" side is treated
# as HEAD's version (new) and the "b" side as the parent's version (old).
new_blob = diff.a_blob
old_blob = diff.b_blob
print(f"{new_blob=}")
print(f"{old_blob=}")
# Confirm the assumption above by searching each commit's tree for the blob.
for tree_elem in head_commit.tree.traverse():
    if new_blob == tree_elem:
        print("new blob is in the head.")

prnt_commit = head_commit.parents[0]
for tree_elem in prnt_commit.tree.traverse():
    if old_blob == tree_elem:
        print("old blob is in the parent.")
print(new_blob.hexsha)
print(old_blob.hexsha)

asciidoc/release-notes.adoc
lhs: 100644 | 61b02f90b4372f5e7596eccb90c0ecd276b0efaa
rhs: 100644 | 1a181067c3220556650991ee8eec720e1a75e754
new_blob=<git.Blob "61b02f90b4372f5e7596eccb90c0ecd276b0efaa">
old_blob=<git.Blob "1a181067c3220556650991ee8eec720e1a75e754">
new blob is in the head.
old blob is in the parent.
61b02f90b4372f5e7596eccb90c0ecd276b0efaa
1a181067c3220556650991ee8eec720e1a75e754


In [160]:
# Read both blob contents with `git cat-file -p`; bytes that are not valid
# UTF-8 are kept as backslash escapes instead of raising.
# NOTE: `new_blob` / `old_blob` are whatever an earlier cell left bound.
new_blob_str = subprocess.check_output(
    f"git cat-file -p {new_blob.hexsha}", shell=True, cwd=repo_path
).decode("utf-8", "backslashreplace")
old_blob_str = subprocess.check_output(
    f"git cat-file -p {old_blob.hexsha}", shell=True, cwd=repo_path
).decode("utf-8", "backslashreplace")

# d = difflib.Differ()
# diff = d.compare(new_blob_str.splitlines(), old_blob_str.splitlines())
# n=0 requests zero lines of context around each change.
diff_result = difflib.context_diff(
    [s + "\n" for s in old_blob_str.splitlines()],
    [s + "\n" for s in new_blob_str.splitlines()],
    n=0
)
sys.stdout.writelines(diff_result)
# print("\n".join(diff))
# TODO: next, given a file and a line number, look up the enclosing class and (if possible) method

*** 
--- 
***************
*** 43 ****
--- 44,45 ----
+     private static final String ACCEPT_HEADER_VALUE = "*/*";
+ 
***************
*** 98 ****
--- 101 ----
+             con.setRequestProperty("Accept", ACCEPT_HEADER_VALUE);
***************
*** 201 ****
--- 205 ----
+             conn.setRequestProperty("Accept", ACCEPT_HEADER_VALUE);
***************
*** 247 ****
--- 252 ----
+             srcConn.setRequestProperty("Accept", ACCEPT_HEADER_VALUE);


In [116]:
# commit -> its parents
# commit, parent -> diffs
# diff -> blob a, blob b -> file, line num
# file, line num -> class, method (optional)

In [120]:
def get_parents(commit: git.Commit) -> Sequence[git.Commit]:
    """Return the parent commits of ``commit`` as exposed by ``commit.parents``."""
    parent_commits = commit.parents
    return parent_commits


def get_diff(new: git.Commit, old: git.Commit) -> git.DiffIndex:
    """Return the result of ``new.diff(old)``.

    Elsewhere in this notebook the ``a`` side of each resulting entry is
    treated as belonging to ``new`` and the ``b`` side to ``old``.
    """
    diff_index = new.diff(old)
    return diff_index


def get_blobs(diff: git.Diff) -> tuple:
    """Return the ``(a_blob, b_blob)`` pair of one diff entry."""
    side_a = diff.a_blob
    side_b = diff.b_blob
    return side_a, side_b


def _read_blob_lines(blob_sha: str, repo_path: str) -> List[bytes]:
    """Return the raw content of blob ``blob_sha`` split into lines (bytes)."""
    # An argument list needs no shell, so nothing is re-parsed or quoted.
    raw = subprocess.check_output(
        ["git", "cat-file", "-p", blob_sha], cwd=repo_path
    )
    # bytes.splitlines() only breaks on \n, \r and \r\n. str.splitlines() would
    # also break on form feeds, U+2028, etc. and shift every later line number.
    return raw.splitlines()


def get_changed_lines(
    a_blob: "git.Blob", b_blob: "git.Blob", repo_path: str
) -> Tuple[List[int], List[int]]:
    """
    Return the 1-based line numbers that differ between two blobs.

    The blobs are compared as raw byte lines, so files that are not valid
    UTF-8 are handled without any decoding error.

    Args:
        a_blob: blob whose changed lines are reported first (needs ``hexsha``).
        b_blob: blob whose changed lines are reported second (needs ``hexsha``).
        repo_path: directory of the repository holding both blobs; used as the
            working directory for ``git cat-file``.

    Returns:
        ``(line_num_a, line_num_b)``:
        line_num_a: line numbers in ``a_blob`` that were deleted or replaced.
        line_num_b: line numbers in ``b_blob`` that were inserted or replaced.

    Raises:
        subprocess.CalledProcessError: if ``git cat-file`` fails.
    """
    matcher = difflib.SequenceMatcher(
        None,
        _read_blob_lines(a_blob.hexsha, repo_path),
        _read_blob_lines(b_blob.hexsha, repo_path),
    )
    line_num_a: List[int] = []
    line_num_b: List[int] = []
    for tag, a1, a2, b1, b2 in matcher.get_opcodes():
        # Opcode ranges are 0-based and half-open; shift them to 1-based lines.
        if tag in ("delete", "replace"):
            line_num_a.extend(range(a1 + 1, a2 + 1))
        if tag in ("insert", "replace"):
            line_num_b.extend(range(b1 + 1, b2 + 1))
    return line_num_a, line_num_b


def get_changes(
    diff: git.Diff, repo_path: str
) -> Tuple[str, List[int], List[int]]:
    """
    Summarise one diff entry.

    return: (file, line_num_new, line_num_old)
    file: path of the diff's ``a`` blob (the post-commit version)
    line_num_new: changed line numbers, relative to the post-commit version
    line_num_old: changed line numbers, relative to the pre-commit version
    """
    blob_after, blob_before = get_blobs(diff)
    path_after = blob_after.path
    changed_after, changed_before = get_changed_lines(
        blob_after, blob_before, repo_path
    )
    return path_after, changed_after, changed_before


def is_modified_file(diff: git.Diff) -> bool:
    """Return True when the diff entry is not flagged as deleted, new, renamed or copied."""
    return not (
        diff.deleted_file
        or diff.new_file
        or diff.renamed_file
        or diff.copied_file
    )


def get_changes_from_commit(commit: git.Commit, repo_path: str) -> Dict:
    """
    Collect the changed line numbers of ``commit`` against each of its parents.

    return: {parent hexsha: {file: (line_num_new, line_num_old), ...}}
    Only entries accepted by ``is_modified_file`` are included.
    """
    changes_by_parent = {}
    for parent in get_parents(commit):
        parent_sha = parent.hexsha
        per_file = {}
        for entry in filter(is_modified_file, get_diff(commit, parent)):
            path, new_lines, old_lines = get_changes(entry, repo_path)
            per_file[path] = (new_lines, old_lines)
        changes_by_parent[parent_sha] = per_file
    return changes_by_parent


class Changes:
    """Line-level changes of one commit against each of its parents.

    Attributes set by ``__init__``:
        repo_path: repository directory, used as cwd for ``git cat-file``.
        commit: the commit being described.
        author: ``commit.author.name``.
        cid: ``commit.hexsha``.
        parents: ``{parent hexsha: parent commit}``.
        diffs: ``{parent hexsha: {file path: (line_num_new, line_num_old)}}``,
            restricted to entries accepted by ``is_modified_file``.
    """

    def __init__(self, repo_path: str, commit: "git.Commit"):
        self.repo_path = repo_path
        self.commit = commit
        self.author = commit.author.name
        self.cid = commit.hexsha
        self.parents = {
            parent.hexsha: parent for parent in get_parents(commit)
        }
        self.diffs = {}
        for pcid, parent in self.parents.items():
            self.diffs[pcid] = {}
            for diff in get_diff(commit, parent):
                if not is_modified_file(diff):
                    continue
                file_path, line_num_new, line_num_old = get_changes(
                    diff, repo_path
                )
                self.diffs[pcid][file_path] = (line_num_new, line_num_old)

    def get_blob(self, pcid, file_path, is_old):
        """Return the blob of ``file_path`` on one side of the diff against ``pcid``.

        Args:
            pcid: hexsha of one of this commit's parents (key of ``self.parents``).
            file_path: path of the file on this commit's side of the diff.
            is_old: True for the parent's blob, False for this commit's blob.

        Returns:
            The requested blob, or None when no diff entry matches ``file_path``.

        Raises:
            KeyError: if ``pcid`` is not a parent of this commit.
        """
        parent = self.parents[pcid]
        # Diff this object's own commit, not whatever `commit` happens to be
        # bound to in the notebook namespace.
        for diff in get_diff(self.commit, parent):
            new_blob = diff.a_blob
            # Entries without an "a" side have no path to compare against.
            if new_blob is None or new_blob.path != file_path:
                continue
            return diff.b_blob if is_old else new_blob
        return None

    def _blob_lines(self, blob):
        """Return the blob's text as a list of newline-terminated lines."""
        raw = subprocess.check_output(
            ["git", "cat-file", "-p", blob.hexsha], cwd=self.repo_path
        )
        # Keep undecodable bytes visible as escapes instead of raising.
        text = raw.decode("utf-8", "backslashreplace")
        return [line + "\n" for line in text.splitlines()]

    def print_context_diff(self, pcid, file_path):
        """Write a 3-line context diff (parent -> this commit) of ``file_path`` to stdout.

        Raises:
            KeyError: if ``pcid`` is not a parent of this commit.
            ValueError: if either side of ``file_path`` cannot be found in the
                diff against ``pcid``.
        """
        old_blob = self.get_blob(pcid, file_path, True)
        new_blob = self.get_blob(pcid, file_path, False)
        if old_blob is None or new_blob is None:
            raise ValueError(
                f"no two-sided diff for {file_path!r} against parent {pcid}"
            )
        diff_result = difflib.context_diff(
            self._blob_lines(old_blob),
            self._blob_lines(new_blob),
            n=3,
        )
        sys.stdout.writelines(diff_result)


# print("[HEAD]")
# changes = get_changes_from_commit(head_commit, repo_path)
# print(changes)
print("[HEAD~1]")
# `head_commit` was bound by an earlier cell (repo.head.commit).
head_1_commit = head_commit.parents[0]
# changes_1 = get_changes_from_commit(head_1_commit, repo_path)
# print(changes_1)
# Building the object computes the per-parent line diffs eagerly.
cobj_1 = Changes(repo_path, head_1_commit)
print(f"{cobj_1.cid=}")
print(f"{cobj_1.author=}")
print(f"{cobj_1.diffs=}")
# old_blob = cobj_1.get_blob(
#     "1fe3c3cff6bc8b2801354d6e2a605948d2b9cb67",
#     "src/java/org/apache/ivy/util/url/BasicURLHandler.java",
#     True,
# )
# new_blob = cobj_1.get_blob(
#     "1fe3c3cff6bc8b2801354d6e2a605948d2b9cb67",
#     "src/java/org/apache/ivy/util/url/BasicURLHandler.java",
#     False,
# )

# new_blob_str = subprocess.check_output(
#     f"git cat-file -p {new_blob.hexsha}", shell=True, cwd=repo_path
# ).decode("utf-8")
# old_blob_str = subprocess.check_output(
#     f"git cat-file -p {old_blob.hexsha}", shell=True, cwd=repo_path
# ).decode("utf-8")

# diff_result = difflib.context_diff(
#     [s + "\n" for s in old_blob_str.splitlines()],
#     [s + "\n" for s in new_blob_str.splitlines()],
#     n=0,
# )
# sys.stdout.writelines(diff_result)
# Print the context diff of one Java file against one parent of HEAD~1.
# The hash and path are hardcoded and must be keys of cobj_1.diffs.
cobj_1.print_context_diff(
    "1fe3c3cff6bc8b2801354d6e2a605948d2b9cb67",
    "src/java/org/apache/ivy/util/url/BasicURLHandler.java",
)


[HEAD~1]
cobj_1.cid='a2701d733a57f28bd4b1e0111c61b659efef6444'
cobj_1.author='Jaikiran'
cobj_1.diffs={'1fe3c3cff6bc8b2801354d6e2a605948d2b9cb67': {'asciidoc/release-notes.adoc': ([53], []), 'src/java/org/apache/ivy/util/url/BasicURLHandler.java': ([44, 45, 101, 205, 252], [])}, '17a0d80da3f3ca228a2665297d2b515677218c26': {}}
*** 
--- 
***************
*** 41,46 ****
--- 41,48 ----
  
      private static final int BUFFER_SIZE = 64 * 1024;
  
+     private static final String ACCEPT_HEADER_VALUE = "*/*";
+ 
      private static final class HttpStatus {
          static final int SC_OK = 200;
  
***************
*** 96,101 ****
--- 98,104 ----
              con.setConnectTimeout(connectionTimeout);
              con.setReadTimeout(readTimeout);
              con.setRequestProperty("User-Agent", getUserAgent());
+             con.setRequestProperty("Accept", ACCEPT_HEADER_VALUE);
              if (con instanceof HttpURLConnection) {
                  HttpURLConnection httpCon = (HttpURLConn

In [162]:
# Resolve the commit directly. A linear scan over iter_commits() silently
# ends on the oldest commit when the hash is not reachable from HEAD.
commit = repo.commit("17f272782618066b672977266d19aa8c71d26e09")
changes = Changes(repo_path, commit)
pp.pprint(changes.diffs)

{   '0864db0364aaf07509762d04f56ef6f8a8d478a3': {   'test/java/org/apache/ivy/ant/IvyRetrieveTest.java': (   [   31,
                                                                                                                 104,
                                                                                                                 105,
                                                                                                                 106,
                                                                                                                 107,
                                                                                                                 108,
                                                                                                                 111,
                                                                                                                 112,
                                                         

In [133]:
import contextlib
def is_java_file(file_path: str) -> bool:
    """Return True when ``file_path`` ends with the ``.java`` suffix (case-sensitive)."""
    java_suffix = ".java"
    return file_path[-len(java_suffix):] == java_suffix


def is_merge_commit(commit: git.Commit) -> bool:
    """Return True when ``commit`` has more than one parent."""
    parent_count = len(commit.parents)
    return parent_count > 1


def parse_java_file(file_str: str) -> javalang.tree.CompilationUnit:
    """Parse Java source text with ``javalang.parse.parse`` and return the result."""
    compilation_unit = javalang.parse.parse(file_str)
    return compilation_unit


def get_final_line(node: javalang.tree.Node) -> int:
    """Return the largest ``position[0]`` found among the nodes under ``node``.

    Nodes whose position cannot be read (``TypeError``) are skipped; the
    result is 0 when no node yields a position.
    """
    last_line = 0
    for _, descendant in node.filter(javalang.tree.Node):
        with contextlib.suppress(TypeError):
            candidate = descendant.position[0]
            if candidate > last_line:
                last_line = candidate
    return last_line


def build_position_dict(tree: javalang.tree.CompilationUnit) -> Dict:
    """Build a line-span index of the classes and methods in a parsed Java file.

    Returns:
        ``{"package": str, "classes": [class_dict, ...]}`` where each
        ``class_dict`` has:
            "name":    class name
            "pos":     (first line, last line as reported by get_final_line)
            "methods": [{"name", "pos", "paramtypes"}, ...] for the class's methods
            "inner":   name of the immediately enclosing class, or None for a
                       class that is not nested (note: despite the key name,
                       this is the *outer* class)
        "package" is "" when the file has no package declaration.
    """
    # A file in the default package has no package declaration.
    package_name = tree.package.name if tree.package is not None else ""
    ret = {"package": package_name, "classes": []}
    for _, class_node in tree.filter(javalang.tree.ClassDeclaration):
        class_pos = (class_node.position[0], get_final_line(class_node))
        methods = [
            {
                "name": method.name,
                "pos": (method.position[0], get_final_line(method)),
                "paramtypes": [p.type.name for p in method.parameters],
            }
            for method in class_node.methods
        ]
        # Classes are visited parents-first, so every enclosing class is
        # already recorded. Searching newest-first finds the immediate
        # parent; oldest-first would return the outermost class instead.
        enclosing_name = next(
            (
                prev["name"]
                for prev in reversed(ret["classes"])
                if prev["pos"][0] <= class_pos[0]
                and prev["pos"][1] >= class_pos[1]
            ),
            None,
        )
        ret["classes"].append(
            {
                "name": class_node.name,
                "pos": class_pos,
                "methods": methods,
                "inner": enclosing_name,
            }
        )
    return ret



def get_posdict_from_blob(blob: git.Blob) -> Dict:
    """Parse the Java source stored in ``blob`` and return its position dict.

    The content is read through the blob object itself, so the function does
    not depend on a notebook-global repository path.

    Returns:
        The dictionary produced by ``build_position_dict``.

    Raises:
        Whatever ``parse_java_file`` raises for source it cannot parse.
    """
    raw = blob.data_stream.read()
    # Replace undecodable bytes rather than abort; one-to-one replacement
    # keeps the line structure intact.
    file_str = raw.decode("utf-8", "replace")
    tree: javalang.tree.CompilationUnit = parse_java_file(file_str)
    return build_position_dict(tree)




# diffidx_1 = head_1_commit.diff(head_1_commit.parents[0])
# print(f"{is_java_file(diffidx_1[0].a_blob.path)=}")
# print(f"{is_java_file(diffidx_1[1].a_blob.path)=}")
# print(f"{diffidx_1[1].a_blob.path=}")
# file_str = subprocess.check_output(
#     f"git cat-file -p {diffidx_1[1].a_blob.hexsha}", shell=True, cwd=repo_path
# ).decode("utf-8")
# print("[file_str]=============================")
# print("\n".join(file_str.splitlines()[45:55]))
# print("=======================================")
# tree: javalang.tree.CompilationUnit = parse_java_file(file_str)
# pos_dict = build_position_dict(tree)
# pp.pprint(pos_dict)


# Resolve the commit directly. A linear scan over iter_commits() silently
# ends on the oldest commit when the hash is not reachable from HEAD.
commit = repo.commit("17a0d80da3f3ca228a2665297d2b515677218c26")
changes = Changes(repo_path, commit)
# Use the most recently inserted parent entry (same choice as popitem(),
# without mutating changes.diffs).
pcid, diff_dict = list(changes.diffs.items())[-1]
for file_path in diff_dict:
    if not is_java_file(file_path):
        continue
    # False -> the post-commit version of the file.
    blob = changes.get_blob(pcid, file_path, False)
    pos_dict = get_posdict_from_blob(blob)
    pp.pprint(pos_dict)

{   'classes': [   {   'inner': None,
                       'methods': [   {   'name': 'getURLInfo',
                                          'paramtypes': ['URL'],
                                          'pos': (57, 58)},
                                      {   'name': 'getURLInfo',
                                          'paramtypes': ['URL', 'int'],
                                          'pos': (63, 64)},
                                      {   'name': 'isReachable',
                                          'paramtypes': [   'URL',
                                                            'TimeoutConstraint'],
                                          'pos': (69, 70)},
                                      {   'name': 'getContentLength',
                                          'paramtypes': [   'URL',
                                                            'TimeoutConstraint'],
                                          'pos': (75, 76)},
                        

In [165]:

def get_clsNmeth(
    pos_dict: Dict, line_num: int
) -> Tuple[Optional[str], Optional[str]]:
    """Locate the class and method that contain ``line_num``.

    Args:
        pos_dict: ``{"package": ..., "classes": [...]}`` where each class has
            "name", "pos" (first line, last line), "methods" and "inner"
            (name of its enclosing class, or None).
        line_num: 1-based line number to look up.

    Returns:
        ``(clspath, methsig)``:
        clspath: ``package#Outer#Inner`` for the innermost class whose span
            contains the line (just the class chain when the package is
            empty), or None when no class contains it.
        methsig: ``name(Type1,Type2)`` of the containing method of that class
            (just ``name`` when it has no parameters), or None when the line
            is outside every method.
    """
    classes = pos_dict["classes"]
    containing = [
        c for c in classes if c["pos"][0] <= line_num <= c["pos"][1]
    ]
    if not containing:
        return (None, None)
    # A nested class lies inside its outer class's span, so the narrowest
    # span is the innermost class. reversed() lets later entries win ties.
    cls = min(reversed(containing), key=lambda c: c["pos"][1] - c["pos"][0])

    # Outer classes are referenced by name; the first class with that name wins.
    first_by_name = {}
    for c in classes:
        first_by_name.setdefault(c["name"], c)

    clsname = cls["name"]
    outer_name = cls["inner"]
    visited = {cls["name"]}
    # Walk outwards; `visited` guards against cycles in malformed input.
    while outer_name and outer_name not in visited:
        visited.add(outer_name)
        clsname = f"{outer_name}#{clsname}"
        outer_cls = first_by_name.get(outer_name)
        outer_name = outer_cls["inner"] if outer_cls else None

    package = pos_dict["package"]
    clspath = f"{package}#{clsname}" if package else clsname

    methsig = None
    for method in cls["methods"]:
        if method["pos"][0] <= line_num <= method["pos"][1]:
            methsig = method["name"]
            if method["paramtypes"]:
                methsig = f"{methsig}({','.join(method['paramtypes'])})"
            break
    return (clspath, methsig)


def get_change_dict(
    changed_line_nums: List[int],
    changes: Changes,
    pcid: str,
    file_path: str,
    is_old: bool,
) -> Dict:
    """Group changed line numbers by the class and method that contain them.

    The position dict is built from the blob returned by
    ``changes.get_blob(pcid, file_path, is_old)``.

    Returns:
        ``{(clspath, methsig): [line_num, ...]}`` with line numbers kept in
        the order they appear in ``changed_line_nums``.
    """
    pos_dict = get_posdict_from_blob(changes.get_blob(pcid, file_path, is_old))
    lines_by_location = {}
    for line_num in changed_line_nums:
        cls_path, meth_sig = get_clsNmeth(pos_dict, line_num)
        lines_by_location.setdefault((cls_path, meth_sig), []).append(line_num)
    return lines_by_location


# Walk the 100 newest commits and print the class/method-level changes of the
# first non-merge commit that touches Java code.
for idx, commit in enumerate(repo.iter_commits()):
    if idx == 100:
        break
    print(time.strftime("%Y %b %d %H:%M", time.gmtime(commit.authored_date)))
    # A merge commit has several parents and a root commit has none; either
    # way there is no single parent to diff against.
    if is_merge_commit(commit) or not commit.parents:
        continue
    changes = Changes(repo_path, commit)
    assert len(changes.diffs) == 1
    pcid, diff_dict = next(iter(changes.diffs.items()))
    change_dict = {}
    for file_path, (line_num_new, line_num_old) in diff_dict.items():
        if not is_java_file(file_path):
            continue
        for side, line_nums, is_old in (
            ("new", line_num_new, False),
            ("old", line_num_old, True),
        ):
            if not line_nums:
                continue
            # Merge into the per-side dict instead of overwriting it, so a
            # commit touching several Java files keeps all of them.
            merged = change_dict.setdefault(side, {})
            per_file = get_change_dict(
                line_nums, changes, pcid, file_path, is_old
            )
            for location, lines in per_file.items():
                merged.setdefault(location, []).extend(lines)
    if not change_dict:
        continue
    print(f"{commit.hexsha=}")
    print(f"{commit.message=}")
    print(f"{commit.author.name=}")
    pp.pprint(change_dict)
    break


2022 Jan 15 04:27
2022 Jan 15 04:21
2021 Dec 23 05:12
2021 Dec 20 08:21
commit.hexsha='17a0d80da3f3ca228a2665297d2b515677218c26'
commit.message='IVY-1632: Use valid value for HTTP header "Accept".\n\nThe default accept header of Java isn\'t valid as described at\nhttps://bugs.openjdk.java.net/browse/JDK-8163921\n\nTherefore set an accept header that accepts simply anything in the\nivy:retrieve Ant task.'
commit.author.name='Berno Langer'
{   'new': {   ('org.apache.ivy.util.url#BasicURLHandler', None): [44, 45],
               ('org.apache.ivy.util.url#BasicURLHandler', 'download(URL,File,CopyProgressListener,TimeoutConstraint)'): [   252],
               ('org.apache.ivy.util.url#BasicURLHandler', 'getURLInfo(URL,TimeoutConstraint)'): [   101],
               ('org.apache.ivy.util.url#BasicURLHandler', 'openStream(URL,TimeoutConstraint)'): [   205]}}


In [166]:
# Resolve the commit directly. A linear scan over iter_commits() silently
# ends on the oldest commit when the hash is not reachable from HEAD.
commit = repo.commit("17f272782618066b672977266d19aa8c71d26e09")
changes = Changes(repo_path, commit)
pp.pprint(changes.diffs)
# Use the most recently inserted parent entry (same choice as popitem(),
# without mutating changes.diffs).
pcid, diff_dict = list(changes.diffs.items())[-1]
change_dict = {}
for file_path, (line_num_new, line_num_old) in diff_dict.items():
    if not is_java_file(file_path):
        continue
    for side, line_nums, is_old in (
        ("new", line_num_new, False),
        ("old", line_num_old, True),
    ):
        if not line_nums:
            continue
        # Merge into the per-side dict instead of overwriting it, so every
        # changed Java file of the commit is kept.
        merged = change_dict.setdefault(side, {})
        per_file = get_change_dict(line_nums, changes, pcid, file_path, is_old)
        for location, lines in per_file.items():
            merged.setdefault(location, []).extend(lines)

print(f"{commit.hexsha=}")
print(f"{commit.message=}")
print(f"{commit.author.name=}")
pp.pprint(change_dict)

{   '0864db0364aaf07509762d04f56ef6f8a8d478a3': {   'test/java/org/apache/ivy/ant/IvyRetrieveTest.java': (   [   31,
                                                                                                                 104,
                                                                                                                 105,
                                                                                                                 106,
                                                                                                                 107,
                                                                                                                 108,
                                                                                                                 111,
                                                                                                                 112,
                                                         