In [2]:
import json
import re
import stat
from collections import Counter, defaultdict
from pathlib import Path
from textwrap import dedent

import numpy as np
import pandas as pd

# Comparison with line annotations in the HaPy-Bug dataset

**HaPy-Bug** dataset comprises annotated diff files from three sources. None
of them had previously been subjected to human annotation at the
granular (line-by-line) level.

$D_{BIP}$: **BugsInPy** subset is an extension of dataset of 496 real
bugs proposed in 
_"[BugsInPy: a database of existing bugs in Python programs to enable controlled testing and debugging studies][BugsInPy]"_ (2020).
This subset focuses on bugs in source code
and excludes issues related to configurations, build scripts, docu-
mentation, and test cases. It also requires bugs to be reproducible,
i.e. at least one test case from the fixed version must fail with the
faulty version. Only changes involving isolated bugs are included.

$D_{CVE}$: **Python CVE** and $D_{CRAWL}$: **Crawled Python CVE** are
new custom made, specialized collections of Python-related bugs
sourced from the [CVE DB](https://cve.mitre.org/) and projects git repositories.

$D_{CVE}$ comprises bugs identified through a comprehensive full-text search
of CVE DB. This subset was refined by selecting bugs with direct
links to source code fixes that involved modifications to Python code.

$D_{CRAWL}$ is a subset created by scanning repositories of most
popular Python projects for commits that have a CVE id pattern
inside. Each bug found was cross-referenced with CVE DB.

[BugsInPy]: https://doi.org/10.1145/3368089.3417943

## $D_{BIP}$: BugsInPy subset of HaPy-Bug dataset

Here all entries are **single diff**

### Experiments extracting data for a single entry (single bug)

> Larger outputs are stored collapsed

In [3]:
annotator_json = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.v2.json'

In [4]:
with open(annotator_json, mode='r') as json_fp:
    annotator_data = json.load(json_fp)

In [5]:
annotator_data.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [6]:
annotator_data['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [7]:
hapybug_json = '/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.json'

In [8]:
with open(hapybug_json, mode='r') as json_fp:
    hapybug_data = json.load(json_fp)

In [9]:
hapybug_data.keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [10]:
annotator_data['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

In [11]:
hapybug_data['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '+': [{'id': 85, 'type': 'bug(fix)'}],
 '-': [{'id': 85, 'type': 'bug(fix)'}]}

In [12]:
hapybug_data['tests/test_generate_context.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'test',
 '+': [{'id': 111, 'type': 'test'},
  {'id': 112, 'type': 'test'},
  {'id': 113, 'type': 'test'},
  {'id': 114, 'type': 'test'},
  {'id': 115, 'type': 'test'},
  {'id': 116, 'type': 'test'},
  {'id': 117, 'type': 'test'},
  {'id': 118, 'type': 'test'},
  {'id': 119, 'type': 'test'},
  {'id': 120, 'type': 'test'},
  {'id': 121, 'type': 'test'}],
 '-': []}

In [13]:
print(Path('/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/cookiecutter-1/patches/7f6804c4953a18386809f11faf4d86898570debc.diff').read_text())

diff --git a/cookiecutter/generate.py b/cookiecutter/generate.py
index 37365a4..c526b97 100644
--- a/cookiecutter/generate.py
+++ b/cookiecutter/generate.py
@@ -82,7 +82,7 @@ def generate_context(
     context = OrderedDict([])
 
     try:
-        with open(context_file) as file_handle:
+        with open(context_file, encoding='utf-8') as file_handle:
             obj = json.load(file_handle, object_pairs_hook=OrderedDict)
     except ValueError as e:
         # JSON decoding error.  Let's throw a new exception that is more
diff --git a/tests/test-generate-context/non_ascii.json b/tests/test-generate-context/non_ascii.json
new file mode 100644
index 0000000..af0edf6
--- /dev/null
+++ b/tests/test-generate-context/non_ascii.json
@@ -0,0 +1,3 @@
+{
+    "full_name": "éèà"
+}
diff --git a/tests/test_generate_context.py b/tests/test_generate_context.py
index 26e7d4d..69d0148 100644
--- a/tests/test_generate_context.py
+++ b/tests/test_generate_context.py
@@ -108,6 +108,17 @@ def test_def

In [14]:
with open('/mnt/data/CVE/final_bugs_packages.json', mode='r') as json_fp:
    where_labeling_data = json.load(json_fp)

In [15]:
where_labeling_data['cookiecutter-1']

{'rA': 1, 'rB': 1, 'rC': 0, 'rD': 1, 'pA': 2, 'pB': 4, 'pC': 1, 'pD': 3}

In [16]:
label_studio_json_1 = '/mnt/data/HaPy-Bug/annotated_data/D_4_3.json'

In [17]:
with open(label_studio_json_1, mode='r') as json_fp:
    label_studio_data_1 = json.load(json_fp)

In [18]:
[elem['annotations'][0]['result'][3]['value'] for elem in label_studio_data_1]

[{'hyperlinks': [{'url': 'http://lists.fedoraproject.org/pipermail/package-announce/2013-May/106220.html',
    'dates': {'min': '2013-01-01', 'max': '2020-05-24'},
    'labels': ['lists.fedoraproject.org',
     'lists.fedoraproject.org/pipermail',
     'lists.fedoraproject.org/pipermail/package-announce',
     'lists.fedoraproject.org/pipermail/package-announce/2013-May']},
   {'url': 'http://lists.fedoraproject.org/pipermail/package-announce/2013-May/105916.html',
    'dates': {'min': '2001-05-22', 'max': '2013-05-14'},
    'labels': ['lists.fedoraproject.org',
     'lists.fedoraproject.org/pipermail',
     'lists.fedoraproject.org/pipermail/package-announce',
     'lists.fedoraproject.org/pipermail/package-announce/2013-May']},
   {'url': 'http://rhn.redhat.com/errata/RHSA-2013-0806.html',
    'dates': {'min': '2012-02-04', 'max': '2013-05-09'},
    'labels': ['Vendor Advisory']},
   {'url': 'https://bugs.launchpad.net/keystone/+bug/1172195',
    'dates': {'min': '2013-01-01', 'max':

### Using the collective.{csv,json}, generated by Paper.ipynb

In [19]:
collective_dir = '../../data/experiments/HaPy-Bug/'
list(Path(collective_dir).glob('*'))

[PosixPath('../../data/experiments/HaPy-Bug/run_annotation_bugsinpy_repos.sh'),
 PosixPath('../../data/experiments/HaPy-Bug/consensus.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/bip_blame.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/repositories.json'),
 PosixPath('../../data/experiments/HaPy-Bug/crawl_blame.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/hapybug_line_callback_func.py'),
 PosixPath('../../data/experiments/HaPy-Bug/run_annotation_hapy_bip_repos.sh'),
 PosixPath('../../data/experiments/HaPy-Bug/collective.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/cve_blame.csv')]

In [20]:
%ls -l '../../data/experiments/HaPy-Bug/'

total 81736
-rw-r--r-- 1 jnareb jnareb  2558150 Dec  1 01:30 bip_blame.csv
-rw-r--r-- 1 jnareb jnareb 50424028 Dec  1 01:30 collective.csv
-rw-r--r-- 1 jnareb jnareb 18222220 Dec  1 01:30 consensus.csv
-rw-r--r-- 1 jnareb jnareb  6895847 Dec  1 01:30 crawl_blame.csv
-rw-r--r-- 1 jnareb jnareb  5385717 Dec  1 01:30 cve_blame.csv
-rw-r--r-- 1 jnareb jnareb      939 Dec  1 01:30 hapybug_line_callback_func.py
-rw-r--r-- 1 jnareb jnareb    15132 Dec  1 01:30 repositories.json
-rwxr-xr-x 1 jnareb jnareb    26627 Dec  1 01:30 [0m[01;32mrun_annotation_bugsinpy_repos.sh[0m*
-rwxr-xr-x 1 jnareb jnareb   150891 Dec  1 01:30 [01;32mrun_annotation_hapy_bip_repos.sh[0m*


In [21]:
collective_csv = Path(collective_dir) / 'collective.csv'
collective_csv

PosixPath('../../data/experiments/HaPy-Bug/collective.csv')

In [22]:
collective_df = pd.read_csv(collective_csv, index_col=0)
collective_df.index = collective_df.index.rename(name='')
collective_df

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U1,False,cve,CVE-2020-10289
1,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137,bug(fix),U1,False,cve,CVE-2020-10289
2,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103,bug(fix),U1,False,cve,CVE-2020-10289
3,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137,bug(fix),U1,False,cve,CVE-2020-10289
4,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U2,False,cve,CVE-2020-10289
...,...,...,...,...,...,...,...,...,...,...,...
391913,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,361,bug(fix),U2,True,cve,CVE-2018-16876
391914,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,362,bug(fix),U2,True,cve,CVE-2018-16876
391915,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,363,bug(fix),U2,True,cve,CVE-2018-16876


In [23]:
collective_df['ds'].value_counts()

ds
crawl         146366
cve           125176
bugs-in-py    120376
Name: count, dtype: int64

In [24]:
collective_df_manual = collective_df[collective_df['auto'] == False]
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

In [25]:
collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']['id'].drop_duplicates()


16414          bugs-in-py_keras-17
16459        bugs-in-py_thefuck-23
16633          bugs-in-py_scrapy-2
16735           bugs-in-py_luigi-6
16969         bugs-in-py_scrapy-29
                    ...           
194691       bugs-in-py_pandas-164
194742        bugs-in-py_pandas-88
194820    bugs-in-py_youtube-dl-42
195465        bugs-in-py_pandas-36
195884        bugs-in-py_pandas-54
Name: id, Length: 496, dtype: object

In [26]:
collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']['id'].drop_duplicates().shape

(496,)

### Running annotation on BugsInPy dataset

The annotation data was generated using the following command:

```console
diff-annotate \
    --purpose-to-annotation=data \
    --purpose-to-annotation=documentation \
    --purpose-to-annotation=markup \
    --purpose-to-annotation=other \
    --purpose-to-annotation=project \
    --purpose-to-annotation=test \
    dataset \
    --output-prefix=/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug \
    /mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/
```

And as can be seen, it is present in `/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/`

In [27]:
bugsinpy_annotated_from_dataset_dir = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/'

In [28]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/

[0m[01;34mansible-1[0m/       [01;34mkeras-40[0m/       [01;34mpandas-125[0m/  [01;34mpandas-64[0m/    [01;34mthefuck-12[0m/
[01;34mansible-10[0m/      [01;34mkeras-41[0m/       [01;34mpandas-126[0m/  [01;34mpandas-65[0m/    [01;34mthefuck-13[0m/
[01;34mansible-11[0m/      [01;34mkeras-42[0m/       [01;34mpandas-127[0m/  [01;34mpandas-66[0m/    [01;34mthefuck-14[0m/
[01;34mansible-12[0m/      [01;34mkeras-43[0m/       [01;34mpandas-128[0m/  [01;34mpandas-67[0m/    [01;34mthefuck-15[0m/
[01;34mansible-13[0m/      [01;34mkeras-44[0m/       [01;34mpandas-129[0m/  [01;34mpandas-68[0m/    [01;34mthefuck-16[0m/
[01;34mansible-14[0m/      [01;34mkeras-45[0m/       [01;34mpandas-13[0m/   [01;34mpandas-69[0m/    [01;34mthefuck-17[0m/
[01;34mansible-15[0m/      [01;34mkeras-5[0m/        [01;34mpandas-130[0m/  [01;34mpandas-7[0m/     [01;34mthefuck-18[0m/
[01;34mansible-16[0m/      [01;34mkeras-6[0m/        [01;34mpandas-

In [29]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/

7f6804c4953a18386809f11faf4d86898570debc.v2.json


In [30]:
example_repo = 'cookiecutter'
example_bug = 'cookiecutter-1'

example_path = next(Path(bugsinpy_annotated_from_dataset_dir).joinpath(example_bug, 'annotation').glob('*.json'))
example_path

PosixPath('/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.v2.json')

In [31]:
with open(example_path, mode='r') as json_fp:
    example_data_from_dataset = json.load(json_fp)

type(example_data_from_dataset)

dict

In [32]:
example_data_from_dataset.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [33]:
example_data_from_dataset['commit_metadata']

{'id': '7f6804c4953a18386809f11faf4d86898570debc'}

In [34]:
example_data_from_dataset['diff_metadata']

{'n_files': 3,
 'hunk_span_src': 11,
 'hunk_span_dst': 24,
 'n_hunks': 3,
 'n_lines_added': 15,
 'n_lines_removed': 1,
 'n_lines_all': 28,
 'n_mod': 1,
 'n_groups': 3,
 'patch_size': 15,
 'n_added_files': 1,
 'n_add': 14}

In [35]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [36]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

### Running annotation with line callback on BugsInPy dataset

The annotation data was generated using the following command:

```console
diff-annotate \
    --line-callback='data/experiments/HaPy-Bug/hapybug_line_callback_func.py' \
    dataset \
    --output-prefix=/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug_bip \
    /mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/
```

In [37]:
bugsinpy_hapy_from_dataset_dir = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug_bip/bugsinpy-dataset/'
Path(bugsinpy_hapy_from_dataset_dir).is_dir()

True

In [38]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug_bip/bugsinpy-dataset/cookiecutter-1/annotation/

7f6804c4953a18386809f11faf4d86898570debc.v2.json


### Extracting commit ids from BugsInPy dataset

For each bug in **BugsInPy** dataset we want repository and commit id, to be able to use more powerful `diff-annotate from-repo`, rather than `diff-annotate dataset`.

In [39]:
bugsinpy_dir = '/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/'

In [40]:
repo_commits = {}

for bug_dir in Path(bugsinpy_dir).iterdir():
    repo_name = bug_dir.name.rsplit('-', maxsplit=1)[0]

    #print(f"{bug_dir.name=}, {repo_name=}")
    if repo_name not in repo_commits:
        repo_commits[repo_name] = { 'commits': [], 'bugs': [] }

    repo_commits[repo_name]['bugs'].append(bug_dir.name)
    
    for diff_file in bug_dir.joinpath('patches').glob('*.diff'):
        #print(f"  {diff_file.stem=}")
        repo_commits[repo_name]['commits'].append(diff_file.stem)

repo_commits['cookiecutter']

{'commits': ['7f6804c4953a18386809f11faf4d86898570debc',
  '7129d474206761a6156925db78eee4b62a0e3944',
  '90434ff4ea4477941444f1e83313beb414838535',
  '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'],
 'bugs': ['cookiecutter-1',
  'cookiecutter-3',
  'cookiecutter-2',
  'cookiecutter-4']}

In [41]:
repo_commits.keys()

dict_keys(['pandas', 'thefuck', 'tornado', 'black', 'youtube-dl', 'spacy', 'keras', 'ansible', 'scrapy', 'fastapi', 'luigi', 'matplotlib', 'tqdm', 'sanic', 'cookiecutter', 'httpie', 'PySnooper'])

Find where repositories were cloned to (locally):

In [42]:
repositories_json = '../../data/experiments/HaPy-Bug/repositories.json'
%ls -l '../../data/experiments/HaPy-Bug/repositories.json'

-rw-r--r-- 1 jnareb jnareb 15132 Dec  1 01:30 ../../data/experiments/HaPy-Bug/repositories.json


In [43]:
with open(repositories_json, mode='r') as json_fp:
    repositories_data = json.load(json_fp)

repositories_data[:3]

[{'project': 'pandas',
  'repository_url': 'https://github.com/pandas-dev/pandas',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/pandas'},
 {'project': 'ansible',
  'repository_url': 'https://github.com/ansible/ansible',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/ansible'},
 {'project': 'black',
  'repository_url': 'https://github.com/psf/black',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/black'}]

In [44]:
repositories_map = {
    elem['project']: {'url': elem['repository_url'], 'path': elem['repository_path'] }
    for elem in repositories_data
}

repositories_map['cookiecutter']

{'url': 'https://github.com/cookiecutter/cookiecutter',
 'path': '/mnt/data/python_bug_localization_data/repositories/cookiecutter'}

### Running annotation on BugsInPy repos

In [45]:
script_file = '../../run_annotation_bugsinpy_repos.sh'

In [46]:
file_purpose_list = [
    "data",
    "documentation",
    "markup",
    "other",
    "project",
    #"test",  # rely on diff-annotate to return "documentation" or "test" for test files
]

with open(script_file, 'wt') as fp:
    print('#!/usr/bin/sh', file=fp)
    print('', file=fp)
    print('echo "running annotations on BugsInPy repos for BugsInPy buggy commits"', file=fp)
    print('', file=fp)

Path(script_file).chmod(0o755)  # 0755/-rwxr-xr-x

In [47]:
for repo_name, repo_data in repo_commits.items():
    print(f"{repo_name}:")
    cmd_str = ''.join([
        "diff-annotate ",
        *[f"--purpose-to-annotation={file_purpose} " for file_purpose in file_purpose_list],
        "from-repo ",
        f"--output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/{repo_name} ",
        f"{repositories_map[repo_name]['path']} --no-walk=sorted {' '.join(repo_data['commits'])}",
    ])
    print("  arg_length <=", len(cmd_str))
    
    with open(script_file, 'at') as fp:
        print(f"# {repo_name}", file=fp)
        print(cmd_str, file=fp)
    
    print("")

pandas:
  arg_length <= 7238

thefuck:
  arg_length <= 1664

tornado:
  arg_length <= 1008

black:
  arg_length <= 1291

youtube-dl:
  arg_length <= 2121

spacy:
  arg_length <= 758

keras:
  arg_length <= 2193

ansible:
  arg_length <= 1090

scrapy:
  arg_length <= 1990

fastapi:
  arg_length <= 1008

luigi:
  arg_length <= 1701

matplotlib:
  arg_length <= 1465

tqdm:
  arg_length <= 715

sanic:
  arg_length <= 553

cookiecutter:
  arg_length <= 526

httpie:
  arg_length <= 555

PySnooper:
  arg_length <= 479



Run for example:

```console
uptime && time diff-annotate \
    --purpose-to-annotation=data \
    --purpose-to-annotation=documentation \
    --purpose-to-annotation=markup \
    --purpose-to-annotation=other \
    --purpose-to-annotation=project \
    --purpose-to-annotation=test \
    from-repo \
    --output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter \
    /mnt/data/python_bug_localization_data/repositories/cookiecutter --no-walk=sorted \
    7f6804c4953a18386809f11faf4d86898570debc 7129d474206761a6156925db78eee4b62a0e3944 \
    90434ff4ea4477941444f1e83313beb414838535 457a1a4e862aab4102b644ff1d2b2e2b5a766b3c
```

The output below as for the run without `--purpose-to-annotation=` parameters

```
 02:36:41 up 289 days,  4:26, 12 users,  load average: 1.08, 1.18, 1.54
Logging to 'diff-annotate.log' file, with log level=WARNING
Computing patch sizes and spreads (# files, # change groups, # spanned lines,...)
Storing annotations in <output_dir>/<commit_id>.json
  with output dir: '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter'
Ensuring that output directory '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter' exists
Generating patches from local Git repo '/mnt/data/python_bug_localization_data/repositories/cookiecutter'
  using `git log -p '--no-walk=sorted' '7f6804c4953a18386809f11faf4d86898570debc' '7129d474206761a6156925db78eee4b62a0e3944' '90434ff4ea4477941444f1e83313beb414838535' '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'`
  took 0.212 seconds (includes parsing unified diffs)
Annotating commits and saving annotated data, for 4 commits
  lexing pre- and post-image file contents, from repo 'cookiecutter'
  using sequential processing
commits: 100%|█████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.38it/s]

real    0m1.720s
user    0m8.406s
sys     0m0.176s
```

Extract the same commit annotated data:

In [48]:
example_repo = 'cookiecutter'
example_commit = '7f6804c4953a18386809f11faf4d86898570debc'

In [49]:
bugsinpy_annotated_from_repo_dir = '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

%ls -1 '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

[0m[01;34mansible[0m/
[01;34mblack[0m/
[01;34mcookiecutter[0m/
[01;34mfastapi[0m/
[01;34mhttpie[0m/
[01;34mkeras[0m/
[01;34mluigi[0m/
[01;34mmatplotlib[0m/
[01;34mpandas[0m/
[01;34mPySnooper[0m/
[01;34msanic[0m/
[01;34mscrapy[0m/
[01;34mspacy[0m/
[01;34mthefuck[0m/
[01;34mtornado[0m/
[01;34mtqdm[0m/
[01;34myoutube-dl[0m/


In [50]:
%ls -1 '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter'

457a1a4e862aab4102b644ff1d2b2e2b5a766b3c.v2.json
7129d474206761a6156925db78eee4b62a0e3944.v2.json
7f6804c4953a18386809f11faf4d86898570debc.v2.json
90434ff4ea4477941444f1e83313beb414838535.v2.json


In [51]:
example_path_2 = Path(bugsinpy_annotated_from_repo_dir).joinpath(example_repo, f"{example_commit}.v2.json")
example_path_2

PosixPath('/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter/7f6804c4953a18386809f11faf4d86898570debc.v2.json')

In [52]:
with open(example_path_2, mode='r') as json_fp:
    example_data_from_repo = json.load(json_fp)

type(example_data_from_repo)

dict

In [53]:
example_data_from_repo.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

There is more commit metadata, because `diff-annotate dataset ...` does not yet try to parse `*.message` files

In [54]:
example_data_from_repo['commit_metadata']

{'id': '7f6804c4953a18386809f11faf4d86898570debc',
 'parents': ['c15633745df6abdb24e02746b82aadb20b8cdf8c'],
 'tree': 'd04faaa47bc47a2f2cda28dcba057ac3865d842e',
 'author': {'author': 'Aurélien Gâteau <mail@agateau.com>',
  'name': 'Aurélien Gâteau',
  'email': 'mail@agateau.com',
  'timestamp': 1590790310,
  'tz_info': '+0200'},
 'committer': {'committer': 'GitHub <noreply@github.com>',
  'name': 'GitHub',
  'email': 'noreply@github.com',
  'timestamp': 1590790310,
  'tz_info': '+0300'},
 'message': 'Fix default values being loaded with wrong encoding on Windows (#1414)\n\nExplicitly set the encoding to utf-8 when reading the context file to\nensure values are correctly loaded.\n\nCo-authored-by: Andrey Shpak <insspb@users.noreply.github.com>\n'}

In [55]:
example_data_from_repo['diff_metadata']

{'n_files': 3,
 'hunk_span_src': 11,
 'hunk_span_dst': 24,
 'n_hunks': 3,
 'n_lines_added': 15,
 'n_lines_removed': 1,
 'n_lines_all': 28,
 'n_mod': 1,
 'n_groups': 3,
 'patch_size': 15,
 'n_added_files': 1,
 'n_add': 14}

In [56]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [57]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

### Running annotation with line callback on BugsInPy repo

In [58]:
script_file_2 = '../../run_annotation_hapy_bip_repos.sh'

In [59]:
%ls -l '../../run_annotation_hapy_bip_repos.sh'

-rwxr-xr-x 1 jnareb jnareb 429 Nov 30 20:12 [0m[01;32m../../run_annotation_hapy_bip_repos.sh[0m*


In [60]:
callback_file='data/experiments/HaPy-Bug/hapybug_line_callback_func.py'

In [61]:
bugsinpy_annotated_hapy_bip_dir='/mnt/data/python-diff-annotator/example_annotations/hapy_bip-from-repo'

In [62]:
with open(script_file_2, 'wt') as fp:
    print('#!/usr/bin/sh', file=fp)
    print('', file=fp)
    print(f'CALLBACK_FILE="{callback_file}"', file=fp)
    print('if [ ! -f "$CALLBACK_FILE" ]; then', file=fp)
    print('    echo "Could not find file $CALLBACK_FILE"', file=fp)
    print('    echo "You are in directory $PWD"', file=fp)
    print('    echo "Change directory to the top dir of this repo"', file=fp)
    print('    exit 1', file=fp)
    print('fi', file=fp)
    print('', file=fp)
    print('echo "running annotations on BugsInPy repos for BugsInPy buggy commits"', file=fp)
    print('echo "using the original-ish code for generating initial automatic annotations"', file=fp)
    print('', file=fp)

Path(script_file_2).chmod(0o755)  # 0755/-rwxr-xr-x

In [63]:
%ls -l '../../data/experiments/HaPy-Bug/hapybug_line_callback_func.py'

-rw-r--r-- 1 jnareb jnareb 939 Dec  1 01:30 ../../data/experiments/HaPy-Bug/hapybug_line_callback_func.py


In [64]:
for repo_name, repo_data in repo_commits.items():
    print(f"{repo_name:12s}", end='')
    cmd_str = ''.join([
        "diff-annotate ",
        f"--line-callback='{callback_file}' "
        "from-repo ",
        f"--output-dir={bugsinpy_annotated_hapy_bip_dir}/{repo_name} ",
        f"{repositories_map[repo_name]['path']} --no-walk=sorted {' '.join(repo_data['commits'])}",
    ])
    print("\targ_length <=", len(cmd_str))
    
    with open(script_file, 'at') as fp:
        print(f"# {repo_name}", file=fp)
        print(cmd_str, file=fp)

pandas      	arg_length <= 7152
thefuck     	arg_length <= 1578
tornado     	arg_length <= 922
black       	arg_length <= 1205
youtube-dl  	arg_length <= 2035
spacy       	arg_length <= 672
keras       	arg_length <= 2107
ansible     	arg_length <= 1004
scrapy      	arg_length <= 1904
fastapi     	arg_length <= 922
luigi       	arg_length <= 1615
matplotlib  	arg_length <= 1379
tqdm        	arg_length <= 629
sanic       	arg_length <= 467
cookiecutter	arg_length <= 440
httpie      	arg_length <= 469
PySnooper   	arg_length <= 393


In [65]:
%ls -l '../../run_annotation_hapy_bip_repos.sh'

-rwxr-xr-x 1 jnareb jnareb 429 Dec  5 09:35 [0m[01;32m../../run_annotation_hapy_bip_repos.sh[0m*


### Creating DataFrame for comparison

In [66]:
collective_df_manual.columns

Index(['id', 'bundle', 'file', 'fcat', 'image', 'line', 'annotation', 'user',
       'auto', 'ds', 'bug'],
      dtype='object')

In [67]:
collective_df_manual.dtypes

id            object
bundle        object
file          object
fcat          object
image         object
line           int64
annotation    object
user          object
auto            bool
ds            object
bug           object
dtype: object

In [68]:
collective_df_manual.head(5)

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U1,False,cve,CVE-2020-10289
1.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137.0,bug(fix),U1,False,cve,CVE-2020-10289
2.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103.0,bug(fix),U1,False,cve,CVE-2020-10289
3.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137.0,bug(fix),U1,False,cve,CVE-2020-10289
4.0,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U2,False,cve,CVE-2020-10289


In [69]:
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

In [70]:
collective_df_manual_bugsinpy = collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']
collective_df_manual_bugsinpy

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
16414,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,37,documentation,U1,False,bugs-in-py,keras-17
16415,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,38,bug(fix),U1,False,bugs-in-py,keras-17
16416,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,beforeChange,37,bug(fix),U1,False,bugs-in-py,keras-17
16417,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,50,test,U1,False,bugs-in-py,keras-17
16418,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,51,test,U1,False,bugs-in-py,keras-17
...,...,...,...,...,...,...,...,...,...,...,...
195909,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,133,test,E1,False,bugs-in-py,pandas-54
195910,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,134,test,E1,False,bugs-in-py,pandas-54
195911,bugs-in-py_pandas-54,A_1_24,pandas/tests/indexes/common.py,test,afterChange,608,test,E1,False,bugs-in-py,pandas-54


In [71]:
collective_df_manual_bugsinpy[collective_df_manual_bugsinpy['bug'] == 'cookiecutter-1']['bundle'].value_counts()

bundle
D_4_3     16
B_5_14    16
A_3_22    16
Name: count, dtype: int64

In [72]:
example_collective = collective_df_manual_bugsinpy[
    (collective_df_manual_bugsinpy['bug'] == 'cookiecutter-1') & 
    (collective_df_manual_bugsinpy['bundle'] == 'D_4_3')
]

example_collective

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
145609.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145610.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145611.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test,U3,False,bugs-in-py,cookiecutter-1
145612.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test,U3,False,bugs-in-py,cookiecutter-1
145613.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test,U3,False,bugs-in-py,cookiecutter-1
145614.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,111.0,test,U3,False,bugs-in-py,cookiecutter-1
145615.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,112.0,documentation,U3,False,bugs-in-py,cookiecutter-1
145616.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,113.0,test,U3,False,bugs-in-py,cookiecutter-1
145617.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,114.0,test,U3,False,bugs-in-py,cookiecutter-1


In [73]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [74]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

In [75]:
example_records = []
dataset = "bugs-in-py"
bug = "cookiecutter-1"

for patched_file, file_data in example_data_from_dataset['changes'].items():
    if patched_file == '/dev/null':
        continue
        
    for pm in list("-+"):
        if pm not in file_data:
            continue

        for line_data in file_data[pm]:
            example_records.append({
                'id': f"{dataset}_{bug}",
                'file': patched_file,
                'fcat': file_data['purpose'],
                'image': 'beforeChange' if pm == '-' else 'afterChange',
                'line': line_data['file_line_no'],
                'annotation': 'bug(fix)' if line_data['type'] == 'code' else line_data['type'],
                'ds': dataset,
                'bug': bug,
            })

example_records[:5]

[{'id': 'bugs-in-py_cookiecutter-1',
  'file': 'cookiecutter/generate.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 85,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'cookiecutter/generate.py',
  'fcat': 'programming',
  'image': 'afterChange',
  'line': 85,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'image': 'afterChange',
  'line': 1,
  'annotation': 'test',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'image': 'afterChange',
  'line': 2,
  'annotation': 'test',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'i

In [76]:
example_df = pd.DataFrame.from_records(example_records)
example_df

Unnamed: 0,id,file,fcat,image,line,annotation,ds,bug
0,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),bugs-in-py,cookiecutter-1
1,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,afterChange,85,bug(fix),bugs-in-py,cookiecutter-1
2,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,bugs-in-py,cookiecutter-1
3,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,bugs-in-py,cookiecutter-1
4,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,bugs-in-py,cookiecutter-1
5,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,111,test,bugs-in-py,cookiecutter-1
6,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,112,test,bugs-in-py,cookiecutter-1
7,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,113,test,bugs-in-py,cookiecutter-1
8,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,114,test,bugs-in-py,cookiecutter-1
9,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,115,test,bugs-in-py,cookiecutter-1


In [77]:
example_df.head(5)

Unnamed: 0,id,file,fcat,image,line,annotation,ds,bug
0,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),bugs-in-py,cookiecutter-1
1,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,afterChange,85,bug(fix),bugs-in-py,cookiecutter-1
2,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,bugs-in-py,cookiecutter-1
3,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,bugs-in-py,cookiecutter-1
4,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,bugs-in-py,cookiecutter-1


In [78]:
example_collective.head(5)

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
145609.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145610.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145611.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test,U3,False,bugs-in-py,cookiecutter-1
145612.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test,U3,False,bugs-in-py,cookiecutter-1
145613.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test,U3,False,bugs-in-py,cookiecutter-1


### Join/merge for comparison

In [79]:
example_collective_sel = example_collective[['ds', 'bug', 'bundle', 'user', 'file', 'fcat', 'image', 'line', 'annotation']]
example_collective_sel.head(5)

Unnamed: 0,ds,bug,bundle,user,file,fcat,image,line,annotation
,,,,,,,,,
145609.0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix)
145610.0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix)
145611.0,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test
145612.0,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test
145613.0,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test


In [80]:
example_df_sel = example_df[['bug', 'file', 'fcat', 'image', 'line', 'annotation']]
example_df_sel.head(5)

Unnamed: 0,bug,file,fcat,image,line,annotation
0,cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,85,bug(fix)
1,cookiecutter-1,cookiecutter/generate.py,programming,afterChange,85,bug(fix)
2,cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,1,test
3,cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,2,test
4,cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,3,test


In [81]:
example_merge_sel = pd.merge(
    example_collective_sel, example_df_sel,
    how='outer', on=['bug', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)
example_merge_sel.head()

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,fcat_auto,annotation_auto,indicator_column
0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,afterChange,85,bug(fix),programming,bug(fix),both
1,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),programming,bug(fix),both
2,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,test,test,both
3,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,test,test,both
4,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,test,test,both


In [82]:
example_merge_sel['fcat_eq'] = example_merge_sel['fcat_hapy'] == example_merge_sel['fcat_auto']
example_merge_sel['annotation_eq'] = example_merge_sel['annotation_hapy'] == example_merge_sel['annotation_auto']

example_merge_sel.head()

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,fcat_auto,annotation_auto,indicator_column,fcat_eq,annotation_eq
0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,afterChange,85,bug(fix),programming,bug(fix),both,True,True
1,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),programming,bug(fix),both,True,True
2,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,test,test,both,True,True
3,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,test,test,both,True,True
4,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,test,test,both,True,True


In [83]:
example_merge_sel[['fcat_eq', 'annotation_eq']].value_counts()

fcat_eq  annotation_eq
True     True             15
         False             1
Name: count, dtype: int64

In [84]:
example_merge_sel[(example_merge_sel['fcat_eq'] == False) | (example_merge_sel['annotation_eq'] == False)]

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,fcat_auto,annotation_auto,indicator_column,fcat_eq,annotation_eq
6,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test_generate_context.py,test,afterChange,112,documentation,test,test,both,True,False


### Full comparison

In [85]:
sha_to_bug = {}
for repo_data in repo_commits.values():
    for sha, bug in zip(repo_data['commits'], repo_data['bugs']):
        sha_to_bug[sha] = bug

{sha: bug for sha, bug in sha_to_bug.items() if bug.startswith('cookiecutter')}

{'7f6804c4953a18386809f11faf4d86898570debc': 'cookiecutter-1',
 '7129d474206761a6156925db78eee4b62a0e3944': 'cookiecutter-3',
 '90434ff4ea4477941444f1e83313beb414838535': 'cookiecutter-2',
 '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c': 'cookiecutter-4'}

#### for bugsinpy-from-repo

In [86]:
bugsinpy_annotated_from_repo_dir

'/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

In [87]:
records_from_repos = []

dataset = 'bugs-in-py'

for subdir in Path(bugsinpy_annotated_from_repo_dir).iterdir():
    print(f"{subdir.name}")
    count = 0
    n_files = 0
    n_lines = 0

    for json_file in subdir.glob('*.json'):
        sha = json_file.name.split('.', maxsplit=1)[0]
        bug = sha_to_bug[sha]
        #print(f"  {json_file.name} -> {sha=}, {bug=}")
        count += 1

        with open(json_file, mode='r') as json_fp:
            json_data = json.load(json_fp)

        for patched_file, file_data in json_data['changes'].items():
            if patched_file == '/dev/null':
                continue

            n_files += 1
            
            for pm in list("-+"):
                if pm not in file_data:
                    continue

                for line_data in file_data[pm]:
                    n_lines += 1
                    records_from_repos.append({
                        'id': f"{dataset}_{bug}",
                        'ds': dataset,
                        'bug': bug,
                        'sha': sha,
                        'file': patched_file,
                        'fcat': file_data['purpose'],
                        'image': 'beforeChange' if pm == '-' else 'afterChange',
                        'line': line_data['file_line_no'],
                        'annotation': 'bug(fix)' if line_data['type'] == 'code' else line_data['type'],
                    })

    print(f"  {count} commits, {n_files} changed files, {n_lines} changed lines") 

httpie
  5 commits, 13 changed files, 145 changed lines
PySnooper
  3 commits, 8 changed files, 117 changed lines
keras
  45 commits, 107 changed files, 2122 changed lines
pandas
  168 commits, 582 changed files, 7464 changed lines
spacy
  10 commits, 29 changed files, 270 changed lines
tornado
  16 commits, 39 changed files, 575 changed lines
scrapy
  40 commits, 98 changed files, 1196 changed lines
youtube-dl
  43 commits, 100 changed files, 702 changed lines
matplotlib
  27 commits, 64 changed files, 714 changed lines
black
  23 commits, 74 changed files, 1638 changed lines
sanic
  5 commits, 14 changed files, 207 changed lines
cookiecutter
  4 commits, 11 changed files, 108 changed lines
fastapi
  16 commits, 43 changed files, 1415 changed lines
luigi
  33 commits, 70 changed files, 1308 changed lines
tqdm
  9 commits, 19 changed files, 215 changed lines
ansible
  18 commits, 54 changed files, 803 changed lines
thefuck
  32 commits, 72 changed files, 891 changed lines


In [88]:
records_from_repos[:5]

[{'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'CHANGELOG.rst',
  'fcat': 'documentation',
  'image': 'afterChange',
  'line': 30,
  'annotation': 'documentation'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 142,
  'annotation': 'bug(fix)'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 143,
  'annotation': 'bug(fix)'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'afterChange',
  'line': 10,
  'annot

In [89]:
from_repos_df = pd.DataFrame.from_records(records_from_repos)
from_repos_df

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,documentation,afterChange,30,documentation
1,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,142,bug(fix)
2,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,143,bug(fix)
3,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,10,bug(fix)
4,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,139,bug(fix)
...,...,...,...,...,...,...,...,...,...
19885,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,27,bug(fix)
19886,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,28,bug(fix)
19887,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,29,bug(fix)
19888,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,30,documentation


#### for hapy_bip-from-repo

In [90]:
bugsinpy_annotated_hapy_bip_dir

'/mnt/data/python-diff-annotator/example_annotations/hapy_bip-from-repo'

In [91]:
records_from_repos_2 = []

dataset = 'bugs-in-py'

for subdir in Path(bugsinpy_annotated_hapy_bip_dir).iterdir():
    print(f"{subdir.name}", end='')
    count = 0
    n_files = 0
    n_lines = 0

    for json_file in subdir.glob('*.json'):
        sha = json_file.name.split('.', maxsplit=1)[0]
        bug = sha_to_bug[sha]
        #print(f"  {json_file.name} -> {sha=}, {bug=}")
        count += 1

        with open(json_file, mode='r') as json_fp:
            json_data = json.load(json_fp)

        for patched_file, file_data in json_data['changes'].items():
            if patched_file == '/dev/null':
                continue

            n_files += 1
            
            for pm in list("-+"):
                if pm not in file_data:
                    continue

                for line_data in file_data[pm]:
                    n_lines += 1
                    records_from_repos_2.append({
                        'id': f"{dataset}_{bug}",
                        'ds': dataset,
                        'bug': bug,
                        'sha': sha,
                        'file': patched_file,
                        'fcat': file_data['purpose'],
                        'image': 'beforeChange' if pm == '-' else 'afterChange',
                        'line': line_data['file_line_no'],
                        'annotation': line_data['type'],
                    })

    print(f"  {count} commits, {n_files} changed files, {n_lines} changed lines")

records_from_repos_2[:2]

httpie  5 commits, 13 changed files, 145 changed lines
PySnooper  3 commits, 8 changed files, 117 changed lines
keras  45 commits, 107 changed files, 2122 changed lines
pandas  168 commits, 582 changed files, 7464 changed lines
spacy  10 commits, 29 changed files, 270 changed lines
tornado  16 commits, 39 changed files, 575 changed lines
scrapy  40 commits, 98 changed files, 1196 changed lines
youtube-dl  43 commits, 100 changed files, 702 changed lines
matplotlib  27 commits, 64 changed files, 714 changed lines
black  23 commits, 74 changed files, 1638 changed lines
sanic  5 commits, 14 changed files, 207 changed lines
cookiecutter  4 commits, 11 changed files, 108 changed lines
fastapi  16 commits, 43 changed files, 1415 changed lines
luigi  33 commits, 70 changed files, 1308 changed lines
tqdm  9 commits, 19 changed files, 215 changed lines
ansible  18 commits, 54 changed files, 803 changed lines
thefuck  32 commits, 72 changed files, 891 changed lines


[{'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'CHANGELOG.rst',
  'fcat': 'documentation',
  'image': 'afterChange',
  'line': 30,
  'annotation': 'documentation'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 142,
  'annotation': 'bug(fix)'}]

In [92]:
hapy_bip_from_repos_df = pd.DataFrame.from_records(records_from_repos_2)
hapy_bip_from_repos_df

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,documentation,afterChange,30,documentation
1,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,142,bug(fix)
2,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,143,bug(fix)
3,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,10,bug(fix)
4,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,139,bug(fix)
...,...,...,...,...,...,...,...,...,...
19885,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,27,bug(fix)
19886,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,28,bug(fix)
19887,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,29,bug(fix)
19888,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,30,documentation


In [93]:
hapy_bip_from_repos_df['annotation'].value_counts()

annotation
test             11557
bug(fix)          5798
documentation     2535
Name: count, dtype: int64

#### for HaPy-Bug_bip/bugsinpy-dataset

In [94]:
bugsinpy_dataset_hapy_bip_dir = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug_bip/bugsinpy-dataset'
bugsinpy_dataset_hapy_bip_dir

'/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug_bip/bugsinpy-dataset'

In [95]:
records_from_dataset = []

dataset = 'bugs-in-py'

for subdir in sorted(Path(bugsinpy_dataset_hapy_bip_dir).iterdir()):
    print(f"{subdir.name}", end='')
    bug = subdir.name
    repo = subdir.name.rsplit('-', maxsplit=1)[0]

    print(f" -> {repo=}, {bug=}", end='')
    
    count = 0
    n_files = 0
    n_lines = 0

    for json_file in subdir.joinpath('annotation').glob('*.json'):
        sha = json_file.name.split('.', maxsplit=1)[0]
        #bug = sha_to_bug[sha]
        #print(f"  {json_file.name} -> {sha=}, {bug=}")
        count += 1

        with open(json_file, mode='r') as json_fp:
            json_data = json.load(json_fp)

        for patched_file, file_data in json_data['changes'].items():
            if patched_file == '/dev/null':
                continue

            n_files += 1
            
            for pm in list("-+"):
                if pm not in file_data:
                    continue

                for line_data in file_data[pm]:
                    n_lines += 1
                    records_from_dataset.append({
                        'id': f"{dataset}_{bug}",
                        'ds': dataset,
                        'bug': bug,
                        'sha': sha,
                        'file': patched_file,
                        'fcat': file_data['purpose'],
                        'image': 'beforeChange' if pm == '-' else 'afterChange',
                        'line': line_data['file_line_no'],
                        'annotation': line_data['type'],
                    })

    print(f"  {count} commit(s), {n_files} changed file(s), {n_lines} changed line(s)")

PySnooper-1 -> repo='PySnooper', bug='PySnooper-1'  1 commit(s), 4 changed file(s), 60 changed line(s)
PySnooper-2 -> repo='PySnooper', bug='PySnooper-2'  1 commit(s), 2 changed file(s), 27 changed line(s)
PySnooper-3 -> repo='PySnooper', bug='PySnooper-3'  1 commit(s), 2 changed file(s), 30 changed line(s)
ansible-1 -> repo='ansible', bug='ansible-1'  1 commit(s), 2 changed file(s), 39 changed line(s)
ansible-10 -> repo='ansible', bug='ansible-10'  1 commit(s), 3 changed file(s), 24 changed line(s)
ansible-11 -> repo='ansible', bug='ansible-11'  1 commit(s), 2 changed file(s), 57 changed line(s)
ansible-12 -> repo='ansible', bug='ansible-12'  1 commit(s), 5 changed file(s), 64 changed line(s)
ansible-13 -> repo='ansible', bug='ansible-13'  1 commit(s), 4 changed file(s), 43 changed line(s)
ansible-14 -> repo='ansible', bug='ansible-14'  1 commit(s), 3 changed file(s), 67 changed line(s)
ansible-15 -> repo='ansible', bug='ansible-15'  1 commit(s), 2 changed file(s), 8 changed line(s)
a

In [96]:
records_from_dataset[:2]

[{'id': 'bugs-in-py_PySnooper-1',
  'ds': 'bugs-in-py',
  'bug': 'PySnooper-1',
  'sha': '56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2',
  'file': 'pysnooper/pycompat.py',
  'fcat': 'programming',
  'image': 'afterChange',
  'line': 11,
  'annotation': 'bug(fix)'},
 {'id': 'bugs-in-py_PySnooper-1',
  'ds': 'bugs-in-py',
  'bug': 'PySnooper-1',
  'sha': '56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2',
  'file': 'pysnooper/tracer.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 87,
  'annotation': 'bug(fix)'}]

In [97]:
hapy_bip_from_dataset_df = pd.DataFrame.from_records(records_from_dataset)
hapy_bip_from_dataset_df

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py_PySnooper-1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/pycompat.py,programming,afterChange,11,bug(fix)
1,bugs-in-py_PySnooper-1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,programming,beforeChange,87,bug(fix)
2,bugs-in-py_PySnooper-1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,programming,beforeChange,133,bug(fix)
3,bugs-in-py_PySnooper-1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,programming,afterChange,17,bug(fix)
4,bugs-in-py_PySnooper-1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,programming,afterChange,18,bug(fix)
...,...,...,...,...,...,...,...,...,...
20263,bugs-in-py_youtube-dl-9,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,programming,afterChange,960,bug(fix)
20264,bugs-in-py_youtube-dl-9,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,programming,afterChange,971,bug(fix)
20265,bugs-in-py_youtube-dl-9,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,programming,afterChange,972,bug(fix)
20266,bugs-in-py_youtube-dl-9,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,programming,afterChange,975,bug(fix)


In [98]:
hapy_bip_from_dataset_df['annotation'].value_counts()

annotation
test             11728
bug(fix)          6077
documentation     2463
Name: count, dtype: int64

#### selecting columns

In [99]:
collective_df_manual

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U1,False,cve,CVE-2020-10289
1,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137,bug(fix),U1,False,cve,CVE-2020-10289
2,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103,bug(fix),U1,False,cve,CVE-2020-10289
3,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137,bug(fix),U1,False,cve,CVE-2020-10289
4,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U2,False,cve,CVE-2020-10289
...,...,...,...,...,...,...,...,...,...,...,...
195960,cve_CVE-2018-16876,C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,365,bug(fix) + refactoring,U2,False,cve,CVE-2018-16876
195961,cve_CVE-2018-16876,C_5_8,lib/ansible/plugins/connection/ssh.py,programming,beforeChange,335,bug(fix),U2,False,cve,CVE-2018-16876
195962,cve_CVE-2018-16876,C_5_8,lib/ansible/plugins/connection/ssh.py,programming,beforeChange,339,bug(fix),U2,False,cve,CVE-2018-16876


In [100]:
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

In [101]:
collective_df_bugsinpy = collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']
collective_df_bugsinpy

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
16414,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,37,documentation,U1,False,bugs-in-py,keras-17
16415,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,38,bug(fix),U1,False,bugs-in-py,keras-17
16416,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,beforeChange,37,bug(fix),U1,False,bugs-in-py,keras-17
16417,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,50,test,U1,False,bugs-in-py,keras-17
16418,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,51,test,U1,False,bugs-in-py,keras-17
...,...,...,...,...,...,...,...,...,...,...,...
195909,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,133,test,E1,False,bugs-in-py,pandas-54
195910,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,134,test,E1,False,bugs-in-py,pandas-54
195911,bugs-in-py_pandas-54,A_1_24,pandas/tests/indexes/common.py,test,afterChange,608,test,E1,False,bugs-in-py,pandas-54


In [102]:
collective_df_bugsinpy_sel = collective_df_bugsinpy[['ds', 'bug', 'bundle', 'user', 'file', 'fcat', 'image', 'line', 'annotation']]
collective_df_bugsinpy_sel.head(5)

Unnamed: 0,ds,bug,bundle,user,file,fcat,image,line,annotation
,,,,,,,,,
16414.0,bugs-in-py,keras-17,B_6_13,U1,keras/metrics.py,programming,afterChange,37.0,documentation
16415.0,bugs-in-py,keras-17,B_6_13,U1,keras/metrics.py,programming,afterChange,38.0,bug(fix)
16416.0,bugs-in-py,keras-17,B_6_13,U1,keras/metrics.py,programming,beforeChange,37.0,bug(fix)
16417.0,bugs-in-py,keras-17,B_6_13,U1,tests/keras/metrics_test.py,test,afterChange,50.0,test
16418.0,bugs-in-py,keras-17,B_6_13,U1,tests/keras/metrics_test.py,test,afterChange,51.0,test


In [103]:
from_repos_df_sel = from_repos_df[['ds', 'bug', 'sha', 'file', 'fcat', 'image', 'line', 'annotation']]
from_repos_df_sel.head(5)

Unnamed: 0,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,documentation,afterChange,30,documentation
1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,142,bug(fix)
2,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,143,bug(fix)
3,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,10,bug(fix)
4,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,139,bug(fix)


#### Merge for bugsinpy-from-repos (from_repos_df_sel)

In [104]:
merge_sel = pd.merge(
    collective_df_bugsinpy_sel, from_repos_df_sel,
    how='outer', on=['ds', 'bug', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)

merge_sel['fcat_neq'] = merge_sel['fcat_hapy'] != merge_sel['fcat_auto']
merge_sel['annotation_neq'] = merge_sel['annotation_hapy'] != merge_sel['annotation_auto']

merge_sel[[
    'ds', 'bug', 'sha',
    'bundle', 'user',
    'file', 'fcat_hapy', 'fcat_auto',
    'image', 'line', 'annotation_hapy', 'annotation_auto',
    'fcat_neq', 'annotation_neq'
]].head()

Unnamed: 0,ds,bug,sha,bundle,user,file,fcat_hapy,fcat_auto,image,line,annotation_hapy,annotation_auto,fcat_neq,annotation_neq
0,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,B_6_13,U1,pysnooper/pycompat.py,programming,programming,afterChange,11,other,bug(fix),False,True
1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,D_2_5,U3,pysnooper/pycompat.py,programming,programming,afterChange,11,bug(fix),bug(fix),False,False
2,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,A_4_21,E1,pysnooper/pycompat.py,programming,programming,afterChange,11,bug(fix),bug(fix),False,False
3,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,B_6_13,U1,pysnooper/tracer.py,programming,programming,afterChange,17,bug(fix) + refactoring,bug(fix),False,True
4,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,D_2_5,U3,pysnooper/tracer.py,programming,programming,afterChange,17,bug(fix),bug(fix),False,False


In [105]:
merge_sel['indicator_column'].value_counts()

indicator_column
both          58604
left_only      1590
right_only      211
Name: count, dtype: int64

In [106]:
merge_sel[['fcat_neq', 'annotation_neq']].value_counts()

fcat_neq  annotation_neq
False     False             53944
          True               4421
True      True               1929
          False               111
Name: count, dtype: int64

In [107]:
merge_sel['fcat_neq'].value_counts()

fcat_neq
False    58365
True      2040
Name: count, dtype: int64

In [108]:
merge_sel['annotation_neq'].value_counts()

annotation_neq
False    54055
True      6350
Name: count, dtype: int64

### Analysis of comparison results

Disagreement, as percentage

In [109]:
merge_sel.shape

(60405, 15)

In [110]:
merge_sel['annotation_neq'].value_counts().sum()

np.int64(60405)

In [111]:
merge_sel['annotation_neq'].value_counts()/merge_sel.shape[0]

annotation_neq
False    0.894876
True     0.105124
Name: count, dtype: float64

Analyze what was the source of disagreement

In [112]:
merge_sel[merge_sel['annotation_neq']]['annotation_hapy'].value_counts()

annotation_hapy
bug(fix) + refactoring    1534
test + refactoring        1155
refactoring               1136
test                      1046
documentation              750
other                      303
bug(fix)                   215
Name: count, dtype: int64

In [113]:
merge_sel[merge_sel['annotation_neq']]['annotation_hapy'].value_counts()/merge_sel.shape[0]

annotation_hapy
bug(fix) + refactoring    0.025395
test + refactoring        0.019121
refactoring               0.018806
test                      0.017316
documentation             0.012416
other                     0.005016
bug(fix)                  0.003559
Name: count, dtype: float64

Let's examine the case for **df['annotation_hapy'] == 'documentation'** and disagreement

In [114]:
df = merge_sel[merge_sel['annotation_neq'] & (merge_sel['annotation_hapy'] == 'documentation')][[
    'bug','bundle','user',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]
df

Unnamed: 0,bug,bundle,user,file,image,line,annotation_hapy,annotation_auto
468,ansible-10,A_2_23,E2,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
469,ansible-10,C_4_9,U2,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
470,ansible-10,B_4_15,U1,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
471,ansible-10,A_2_23,E2,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,2,documentation,data
472,ansible-10,C_4_9,U2,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,2,documentation,data
...,...,...,...,...,...,...,...,...
56833,tornado-3,B_4_15,U1,.travis.yml,afterChange,87,documentation,data
56834,tornado-3,A_5_20,E2,.travis.yml,afterChange,87,documentation,data
56838,tornado-3,C_6_7,U2,.travis.yml,beforeChange,87,documentation,data
56839,tornado-3,B_4_15,U1,.travis.yml,beforeChange,87,documentation,data


In [115]:
df[['annotation_hapy', 'annotation_auto']].value_counts()

annotation_hapy  annotation_auto
documentation    data               77
                 bug(fix)           16
                 test               12
Name: count, dtype: int64

In [116]:
df_2 = merge_sel[
    merge_sel['annotation_neq'] &
    (merge_sel['annotation_hapy'] == 'documentation') &
    (merge_sel['annotation_auto'] == 'test')
][[
    'bug','bundle','user','sha',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]
df_2

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
491,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,137,documentation,test
494,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,138,documentation,test
497,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,139,documentation,test
500,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,140,documentation,test
503,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,141,documentation,test
27686,pandas-113,C_6_7,U2,8705aad961dd227d38ff93a39697547b98109c9d,pandas/conftest.py,afterChange,671,documentation,test
27692,pandas-113,C_6_7,U2,8705aad961dd227d38ff93a39697547b98109c9d,pandas/conftest.py,afterChange,673,documentation,test
27695,pandas-113,C_6_7,U2,8705aad961dd227d38ff93a39697547b98109c9d,pandas/conftest.py,afterChange,674,documentation,test
35887,pandas-19,D_4_3,U3,c6a1638bcd99df677a8f76f036c0b30027eb243c,pandas/tests/indexing/multiindex/test_loc.py,afterChange,298,documentation,test
35888,pandas-19,C_3_10,U2,c6a1638bcd99df677a8f76f036c0b30027eb243c,pandas/tests/indexing/multiindex/test_loc.py,afterChange,298,documentation,test


In [117]:
df_2[df_2['file'].str.count(r'^test|/test|conftest\.py$|_testing\.py') == 0]

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto


In [118]:
df_2.shape[0]/merge_sel.shape[0]

0.00019865905140302955

In [119]:
df_2.sample(4)

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
494,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,138,documentation,test
491,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,137,documentation,test
27692,pandas-113,C_6_7,U2,8705aad961dd227d38ff93a39697547b98109c9d,pandas/conftest.py,afterChange,673,documentation,test
500,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,test/units/modules/system/test_pamd.py,afterChange,140,documentation,test


In all cases for project, files, and lines selected at current run thought the notebook, I got comment or comment-like i.e. docstring.
Here are the results:

```console
repositories/keras$ git show fe38f9dfc8c732a77ac03507b63c79b1d2acfba2:tests/keras/test_sequential_model.py | sed -n '171p'
    # Test serialization
repositories/ansible$ git show 18a66e291dad71128a32d662aa808213acefe0e9:test/units/playbook/test_collectionsearch.py | sed -n '27p'
    """Test that collection name is not templated.
repositories/black$ git show 6316e293ac30a2837ec20eba289fd28a2a18cf89:tests/python2.py | sed -n '17p'
# output
repositories/luigi$ git show b7115974c3deadf77113686248b39567cb67e38f:test/retcodes_test.py | sed -n '176p'
    """
```

Which means that in 4 test cases, it was 2 times comment, 2 times docstring - in a test file

In [120]:
df_3 = merge_sel[
    merge_sel['annotation_neq'] &
    (merge_sel['annotation_hapy'] == 'documentation') &
    (merge_sel['annotation_auto'] == 'bug(fix)')
][[
    'bug','bundle','user','sha',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]
df_3

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
3793,black-15,D_6_1,U3,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,afterChange,2567,documentation,bug(fix)
7721,cookiecutter-4,C_6_7,U2,457a1a4e862aab4102b644ff1d2b2e2b5a766b3c,cookiecutter/exceptions.py,afterChange,84,documentation,bug(fix)
7724,cookiecutter-4,C_6_7,U2,457a1a4e862aab4102b644ff1d2b2e2b5a766b3c,cookiecutter/exceptions.py,afterChange,85,documentation,bug(fix)
13909,keras-11,C_6_7,U2,d6b5c5ebb410e3366c9d7aca41977a60134bfe10,keras/engine/training_utils.py,afterChange,593,documentation,bug(fix)
13912,keras-11,C_6_7,U2,d6b5c5ebb410e3366c9d7aca41977a60134bfe10,keras/engine/training_utils.py,afterChange,594,documentation,bug(fix)
26358,pandas-105,C_5_8,U2,cb5f9d1ff407f5ccef7c717e0c23bbd6ed96cf5f,pandas/core/generic.py,beforeChange,668,documentation,bug(fix)
45587,pandas-90,C_3_10,U2,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,165,documentation,bug(fix)
45590,pandas-90,C_3_10,U2,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,166,documentation,bug(fix)
45593,pandas-90,C_3_10,U2,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,167,documentation,bug(fix)
45596,pandas-90,C_3_10,U2,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,168,documentation,bug(fix)


In [121]:
df_3.shape[0]/merge_sel.shape[0]

0.0002648787352040394

In [122]:
df_3.sample(5)

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
13912,keras-11,C_6_7,U2,d6b5c5ebb410e3366c9d7aca41977a60134bfe10,keras/engine/training_utils.py,afterChange,594,documentation,bug(fix)
3793,black-15,D_6_1,U3,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,afterChange,2567,documentation,bug(fix)
13909,keras-11,C_6_7,U2,d6b5c5ebb410e3366c9d7aca41977a60134bfe10,keras/engine/training_utils.py,afterChange,593,documentation,bug(fix)
45599,pandas-90,C_3_10,U2,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,169,documentation,bug(fix)
7724,cookiecutter-4,C_6_7,U2,457a1a4e862aab4102b644ff1d2b2e2b5a766b3c,cookiecutter/exceptions.py,afterChange,85,documentation,bug(fix)


- 2f3edf96078d78450b985bdf3bfffe7e0c627169:keras/engine/training.py:1946 - inside very long docstring (should be detected by new annotator for `from-repo` case)
- 1c3d64bae7c07b5ae1be337e0ebd751385b7ce27:pandas/io/pickle.py:165 - code just after the end of the docstring; here the user is _**wrong**_,<br> though the context visible in the UI of Label Studio might be the cause of this mistake
- 1c3d64bae7c07b5ae1be337e0ebd751385b7ce27:pandas/io/pickle.py:115 - inside quite a long docstring (should be detected by new annotator)
- 50817487ce5b1a2c4896495509e2b53e22fa3212:pandas/core/indexes/timedeltas.py:218 - 2-nd line of 3-line docstring (should be detected by new annotator)
- 65c7c05060fd2d1fc161d4904243d5e0b31e202b^:scrapy/utils/response.py:50 - empty line inside just started docstring - one line prior (should be detected by new annotator)

In [123]:
merge_sel[merge_sel['annotation_neq']]['annotation_auto'].value_counts()/merge_sel.shape[0]

annotation_auto
bug(fix)         0.040659
test             0.031074
documentation    0.005579
data             0.001440
project          0.000050
Name: count, dtype: float64

In [124]:
merge_sel[merge_sel['annotation_auto']=='project']

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,sha,fcat_auto,annotation_auto,indicator_column,fcat_neq,annotation_neq
56649,bugs-in-py,tornado-15,A_6_19,E3,MANIFEST.in,project,afterChange,12,bug(fix),ecb3ea7543cc942659faf3d2144853018afa6139,project,project,both,False,True
56650,bugs-in-py,tornado-15,C_2_11,U2,MANIFEST.in,project,afterChange,12,bug(fix),ecb3ea7543cc942659faf3d2144853018afa6139,project,project,both,False,True
56651,bugs-in-py,tornado-15,B_4_15,U1,MANIFEST.in,project,afterChange,12,test,ecb3ea7543cc942659faf3d2144853018afa6139,project,project,both,False,True


In [125]:
merge_sel[merge_sel['annotation_auto']=='data']

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,sha,fcat_auto,annotation_auto,indicator_column,fcat_neq,annotation_neq
468,bugs-in-py,ansible-10,A_2_23,E2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,1,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
469,bugs-in-py,ansible-10,C_4_9,U2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,1,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
470,bugs-in-py,ansible-10,B_4_15,U1,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,1,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
471,bugs-in-py,ansible-10,A_2_23,E2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,2,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
472,bugs-in-py,ansible-10,C_4_9,U2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,2,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56839,bugs-in-py,tornado-3,B_4_15,U1,.travis.yml,project,beforeChange,87,documentation,aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True
56840,bugs-in-py,tornado-3,A_5_20,E2,.travis.yml,project,beforeChange,87,documentation,aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True
56841,bugs-in-py,tornado-3,C_6_7,U2,.travis.yml,project,beforeChange,88,bug(fix),aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True
56842,bugs-in-py,tornado-3,B_4_15,U1,.travis.yml,project,beforeChange,88,other,aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True


### Mismatch between labels: human vs tool

In [126]:
merge_sel['annotation_hapy'].value_counts()

annotation_hapy
test                      33343
bug(fix)                  14764
documentation              7959
bug(fix) + refactoring     1534
test + refactoring         1155
refactoring                1136
other                       303
Name: count, dtype: int64

In [127]:
merge_sel['annotation_auto'].value_counts()

annotation_auto
test             34174
bug(fix)         17005
documentation     7546
data                87
project              3
Name: count, dtype: int64

Possible values of annotation available to human annotators:
- bug(fix)
- bug(fix) + refactoring
- test
- test + refactoring
- refactoring
- documentation
- other

Possible values of line types that `diff-annotate` can produce:
- code (for files with "programming" purpose)
  - $=$ bug(fix)
- test
- documentation (includes comments in "programming")
- project
- _markup_ (not present for HaPy-Bug)
- data
- _other_ (not present for HaPy-Bug)

In [128]:
merge_sel[
    merge_sel['annotation_neq'] &
    (
        (merge_sel['annotation_auto'] == 'project') |
        (merge_sel['annotation_auto'] == 'data') |
        (merge_sel['annotation_auto'] == 'markup') |
        (merge_sel['annotation_auto'] == 'other')
    )
][['annotation_hapy', 'annotation_auto']].value_counts()

annotation_hapy  annotation_auto
documentation    data               77
bug(fix)         data                4
refactoring      data                4
bug(fix)         project             2
other            data                2
test             project             1
Name: count, dtype: int64

In [129]:
merge_sel[
    merge_sel['annotation_neq'] &
    (
        (merge_sel['annotation_auto'] == 'project') |
        (merge_sel['annotation_auto'] == 'data') |
        (merge_sel['annotation_auto'] == 'markup') |
        (merge_sel['annotation_auto'] == 'other')
    ) &
    (merge_sel['annotation_hapy'] != 'documentation')
][['bug', 'bundle', 'user', 'sha', 'file', 'image', 'line', 'annotation_hapy', 'annotation_auto', 'annotation_neq']]

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto,annotation_neq
18681,keras-43,A_1_24,E1,b17169ca5d6cd1c8aeb237fc2bb0555c9e1b6a02,docs/mkdocs.yml,afterChange,2,refactoring,data,True
18684,keras-43,A_1_24,E1,b17169ca5d6cd1c8aeb237fc2bb0555c9e1b6a02,docs/mkdocs.yml,afterChange,3,refactoring,data,True
18687,keras-43,A_1_24,E1,b17169ca5d6cd1c8aeb237fc2bb0555c9e1b6a02,docs/mkdocs.yml,beforeChange,2,refactoring,data,True
18690,keras-43,A_1_24,E1,b17169ca5d6cd1c8aeb237fc2bb0555c9e1b6a02,docs/mkdocs.yml,beforeChange,3,refactoring,data,True
56649,tornado-15,A_6_19,E3,ecb3ea7543cc942659faf3d2144853018afa6139,MANIFEST.in,afterChange,12,bug(fix),project,True
56650,tornado-15,C_2_11,U2,ecb3ea7543cc942659faf3d2144853018afa6139,MANIFEST.in,afterChange,12,bug(fix),project,True
56651,tornado-15,B_4_15,U1,ecb3ea7543cc942659faf3d2144853018afa6139,MANIFEST.in,afterChange,12,test,project,True
56835,tornado-3,C_6_7,U2,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,afterChange,88,bug(fix),data,True
56836,tornado-3,B_4_15,U1,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,afterChange,88,other,data,True
56837,tornado-3,A_5_20,E2,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,afterChange,88,bug(fix),data,True


In [130]:
merge_sel[
    merge_sel['annotation_neq'] &
    (merge_sel['annotation_auto'] == 'data') &
    (merge_sel['annotation_hapy'] == 'documentation')
][['bug', 'bundle', 'user', 'sha', 'file', 'image', 'line', 'annotation_hapy', 'annotation_auto']]

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
468,ansible-10,A_2_23,E2,a4b59d021368285490f7cda50c11ac4f7a8030b5,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
469,ansible-10,C_4_9,U2,a4b59d021368285490f7cda50c11ac4f7a8030b5,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
470,ansible-10,B_4_15,U1,a4b59d021368285490f7cda50c11ac4f7a8030b5,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
471,ansible-10,A_2_23,E2,a4b59d021368285490f7cda50c11ac4f7a8030b5,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,2,documentation,data
472,ansible-10,C_4_9,U2,a4b59d021368285490f7cda50c11ac4f7a8030b5,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,2,documentation,data
...,...,...,...,...,...,...,...,...,...
56833,tornado-3,B_4_15,U1,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,afterChange,87,documentation,data
56834,tornado-3,A_5_20,E2,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,afterChange,87,documentation,data
56838,tornado-3,C_6_7,U2,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,beforeChange,87,documentation,data
56839,tornado-3,B_4_15,U1,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,beforeChange,87,documentation,data


In [131]:
merge_sel[
    merge_sel['annotation_neq'] &
    (merge_sel['annotation_auto'] == 'data') &
    (merge_sel['annotation_hapy'] == 'documentation')
][['bug', 'bundle', 'user', 'sha', 'file', 'image', 'line', 'annotation_hapy', 'annotation_auto']].sample(6)

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
1729,ansible-4,D_2_5,U3,18a66e291dad71128a32d662aa808213acefe0e9,changelogs/fragments/68723-force-static-collec...,afterChange,2,documentation,data
905,ansible-13,A_3_22,E3,694ef5660d45fcb97c9beea5b2750f6eadcf5e93,changelogs/fragments/collection-install-url.yaml,afterChange,1,documentation,data
56840,tornado-3,A_5_20,E2,aa622e724f80e0f7fcee369f75d69d1db13d72f2,.travis.yml,beforeChange,87,documentation,data
468,ansible-10,A_2_23,E2,a4b59d021368285490f7cda50c11ac4f7a8030b5,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,documentation,data
715,ansible-12,D_5_2,U3,2fa8f9cfd80daf32c7d222190edf7cfc7234582a,changelogs/fragments/65541-fix-utf8-issue-env-...,afterChange,2,documentation,data
1309,ansible-17,B_1_18,U1,b38cb37728df76e0529243bdce694b18ca0e1163,changelogs/fragments/mount-facts-octal-escapes...,afterChange,1,documentation,data


In [132]:
merge_sel_ren = merge_sel.copy()
merge_sel_ren['annotation_auto'] = merge_sel_ren['annotation_auto'].str.replace('data', 'documentation')

merge_sel_ren['annotation_neq'] = merge_sel_ren['annotation_hapy'] != merge_sel_ren['annotation_auto']

merge_sel_ren[[
    'ds', 'bug', 'sha',
    'bundle', 'user',
    'file', 'fcat_hapy', 'fcat_auto',
    'image', 'line', 'annotation_hapy', 'annotation_auto',
    'fcat_neq', 'annotation_neq'
]].head()

Unnamed: 0,ds,bug,sha,bundle,user,file,fcat_hapy,fcat_auto,image,line,annotation_hapy,annotation_auto,fcat_neq,annotation_neq
0,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,B_6_13,U1,pysnooper/pycompat.py,programming,programming,afterChange,11,other,bug(fix),False,True
1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,D_2_5,U3,pysnooper/pycompat.py,programming,programming,afterChange,11,bug(fix),bug(fix),False,False
2,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,A_4_21,E1,pysnooper/pycompat.py,programming,programming,afterChange,11,bug(fix),bug(fix),False,False
3,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,B_6_13,U1,pysnooper/tracer.py,programming,programming,afterChange,17,bug(fix) + refactoring,bug(fix),False,True
4,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,D_2_5,U3,pysnooper/tracer.py,programming,programming,afterChange,17,bug(fix),bug(fix),False,False


In [133]:
merge_sel['annotation_neq'].value_counts()/merge_sel.shape[0]

annotation_neq
False    0.894876
True     0.105124
Name: count, dtype: float64

In [134]:
merge_sel_ren['annotation_neq'].value_counts()/merge_sel_ren.shape[0]

annotation_neq
False    0.896151
True     0.103849
Name: count, dtype: float64

So renaming "data" to "documentation" in the "annotation_auto" column did not help much

### Per-author comparison (WIP)

In [135]:
user_counts_s = merge_sel['user'].value_counts()
user_counts_s

user
U2    15711
U3    15589
U1    14874
E2     5198
E3     4931
E1     3891
Name: count, dtype: int64

In [136]:
users = user_counts_s.index.to_list()
users

['U2', 'U3', 'U1', 'E2', 'E3', 'E1']

In [137]:
merge_sel_U1 = merge_sel[merge_sel['user'] == 'U1']
merge_sel_U1[merge_sel_U1['annotation_neq']][[
    'bug','bundle','user','sha',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
0,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/pycompat.py,afterChange,11,other,bug(fix)
3,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,17,bug(fix) + refactoring,bug(fix)
6,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,18,bug(fix) + refactoring,bug(fix)
12,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,135,bug(fix) + refactoring,bug(fix)
21,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,beforeChange,133,bug(fix) + refactoring,bug(fix)
...,...,...,...,...,...,...,...,...,...
60194,youtube-dl-5,B_2_17,U1,7dc2a74e0ac9cfa74cc9de6f586ffd5cc8bac0d9,youtube_dl/utils.py,beforeChange,1112,bug(fix) + refactoring,bug(fix)
60220,youtube-dl-6,B_3_16,U1,d631d5f9f27f93767226192e4288990413fa9dbd,youtube_dl/utils.py,afterChange,1979,other,bug(fix)
60223,youtube-dl-6,B_3_16,U1,d631d5f9f27f93767226192e4288990413fa9dbd,youtube_dl/utils.py,afterChange,2023,bug(fix) + refactoring,bug(fix)
60244,youtube-dl-6,B_3_16,U1,d631d5f9f27f93767226192e4288990413fa9dbd,youtube_dl/utils.py,beforeChange,1979,other,bug(fix)


In [138]:
merge_sel_U1.shape

(14874, 15)

In [139]:
merge_sel_U1['annotation_neq'].value_counts()/merge_sel_U1.shape[0]

annotation_neq
False    0.879857
True     0.120143
Name: count, dtype: float64

In [140]:
merge_sel_U1[merge_sel_U1['annotation_neq']][['annotation_hapy', 'annotation_auto']].value_counts()/merge_sel_U1.shape[0]

annotation_hapy         annotation_auto
bug(fix) + refactoring  bug(fix)           0.053046
other                   bug(fix)           0.015867
test + refactoring      test               0.010891
refactoring             bug(fix)           0.003025
bug(fix)                test               0.002151
refactoring             test               0.001546
                        documentation      0.001412
test                    documentation      0.001412
documentation           data               0.001345
bug(fix)                documentation      0.000605
other                   test               0.000403
documentation           test               0.000336
other                   documentation      0.000269
                        data               0.000134
bug(fix) + refactoring  test               0.000134
test                    bug(fix)           0.000067
                        project            0.000067
Name: count, dtype: float64

In [141]:
merge_sel_U1_refactoring = merge_sel_U1['annotation_hapy'].str.contains('refactoring')
merge_sel_U1['annotation_hapy'].str.contains('refactoring').sum()/merge_sel_U1.shape[0]

np.float64(0.0700551297566223)

In [142]:
merge_sel_U1[~merge_sel_U1_refactoring]['annotation_neq'].value_counts()

annotation_neq
False    13087
True       745
Name: count, dtype: int64

In [143]:
count_not_refactoring = (merge_sel_U1.shape[0] - merge_sel_U1['annotation_hapy'].str.contains('refactoring').sum())
count_not_refactoring

np.int64(13832)

In [144]:
merge_sel_U1[~merge_sel_U1_refactoring]['annotation_neq'].value_counts()/count_not_refactoring

annotation_neq
False    0.946139
True     0.053861
Name: count, dtype: float64

In [145]:
merge_sel_U1[~merge_sel_U1_refactoring & merge_sel_U1['annotation_neq']][[
    'annotation_hapy', 'annotation_auto'
]].value_counts()/count_not_refactoring

annotation_hapy  annotation_auto
other            bug(fix)           0.017062
bug(fix)         test               0.002313
test             documentation      0.001518
documentation    data               0.001446
bug(fix)         documentation      0.000651
other            test               0.000434
documentation    test               0.000361
other            documentation      0.000289
                 data               0.000145
test             bug(fix)           0.000072
                 project            0.000072
Name: count, dtype: float64

## Consensus in HaPy-Bug

Code copied from [`00-HaPy_Bug-Paper.ipynb`](./00-HaPy_Bug-Paper.ipynb) notebook.

(with some changes)

In [146]:
user_counts_s = merge_sel['user'].value_counts()
user_counts_s

user
U2    15711
U3    15589
U1    14874
E2     5198
E3     4931
E1     3891
Name: count, dtype: int64

In [147]:
users = user_counts_s.index.to_list()
users

['U2', 'U3', 'U1', 'E2', 'E3', 'E1']

In [148]:
user = 'U1'
mask = (collective_df['user'] == user) & (collective_df['auto'] == False)

#collective_df[filter]

In [149]:
df = collective_df

user_l = {}
df_inter_rater = None
users_ids = ['U1', 'U2', 'U3', 'E1', 'E2', 'E3']
cols = ['ds', 'id', 'file', 'image', 'line', 'annotation']  # no 'fileid', like at that point of 00-HaPy_Bug-Paper.ipynb
colsj = cols[:-1]

for user in users_ids:
    mask = (df['user'] == user) & (df['auto'] == False)
    if df_inter_rater is None:
        df_inter_rater = df[mask][cols].copy()
    else:
        df_inter_rater = df_inter_rater.merge(df[mask][cols], left_on = colsj, right_on = colsj, suffixes=('', "_"+user), how='outer')

    #df_inter_rater = df_inter_rater.drop_duplicates(subset=['id', 'file', 'image', 'line', 'annotation'])
    
    #tmp_g_debug = df_inter_rater.groupby(colsj).count().reset_index()
    #display(tmp_g_debug[tmp_g_debug['annotation'] > 1])

In [150]:
df.shape

(391918, 11)

In [151]:
mask = (df['auto'] == False)
len(df[mask])/3, len(df[~mask])/3,

(65321.666666666664, 65317.666666666664)

In [152]:
df_inter_rater.columns = [ c if c != 'annotation' else 'annotation_U1' for c in df_inter_rater.columns]

In [153]:
possible_line_annotations = df['annotation'].unique().tolist()
possible_line_annotations

['bug(fix)',
 'bug(fix) + refactoring',
 'documentation',
 'refactoring',
 'other',
 'test + refactoring',
 'test']

In [154]:
df['annotation'].value_counts()

annotation
test                      183689
bug(fix)                  122846
documentation              69243
refactoring                 6727
bug(fix) + refactoring      4606
test + refactoring          2861
other                       1946
Name: count, dtype: int64

In [155]:
df_inter_rater

Unnamed: 0,ds,id,file,image,line,annotation_U1,annotation_U2,annotation_U3,annotation_E1,annotation_E2,annotation_E3
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,other,,bug(fix),bug(fix),,
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,bug(fix) + refactoring,,bug(fix),bug(fix),,
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,bug(fix) + refactoring,,bug(fix),bug(fix),,
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,bug(fix),,bug(fix),bug(fix),,
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,bug(fix) + refactoring,,bug(fix) + refactoring,bug(fix),,
...,...,...,...,...,...,...,...,...,...,...,...
145264,cve,cve_CVE-2022-31124,tests/fuzzer/fuzz_valid_magic.py,afterChange,27,test,test,,,test,
145265,cve,cve_CVE-2022-31124,tests/fuzzer/requirements.txt,afterChange,1,other,bug(fix),,,other,
145266,cve,cve_CVE-2022-31124,tests/fuzzer/requirements.txt,afterChange,2,other,bug(fix),,,other,
145267,cve,cve_CVE-2022-31124,tests/fuzzer/requirements.txt,afterChange,3,other,bug(fix),,,other,


In [156]:
def consensus(row):
    ret = {k:0 for k in possible_line_annotations}
    val = None
    for u in users_ids:
        u_val = row['annotation_' + u]
        if not pd.isnull(u_val):
            ret[u_val] += 1
            val = u_val
            
    if sum(ret.values()) < 3: # Check if all 3 reviewers annotated line
        print(f"Error at {row['id']}: {sum(ret.values())} < 3 reviewers")

    return Counter(ret).most_common(1)[0][0]

In [157]:
def n_reviewers(row):
    ret = 0
    for u in users_ids:
        u_val = row['annotation_' + u]
        if not pd.isnull(u_val):
            ret += 1

    return ret

In [158]:
def most_common_count(row):
    ret = {k:0 for k in possible_line_annotations}
    val = None
    for u in users_ids:
        u_val = row['annotation_' + u]
        if not pd.isnull(u_val):
            ret[u_val] += 1
            val = u_val
            
    #if sum(ret.values()) < 3: # Check if all 3 reviewers annotated line
    #    print(f"Error at {row['id']}: {ret.values()} < 3 reviewers")

    return Counter(ret).most_common(1)[0][1]

In [159]:
tmp = pd.DataFrame(df_inter_rater.apply(consensus, axis=1).tolist())
G = df_inter_rater.copy()
G['most_common'] = tmp
G['common_count'] = pd.DataFrame(df_inter_rater.apply(most_common_count, axis=1).tolist())
G['n_reviewers'] = pd.DataFrame(df_inter_rater.apply(n_reviewers, axis=1).tolist())

Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at bugs-in-py_black-6: 2 < 3 reviewers
Error at b

In [160]:
G.loc[G['common_count'] >= 2, 'consensus'] = G[G['common_count'] >= 2]['most_common']

In [161]:
G

Unnamed: 0,ds,id,file,image,line,annotation_U1,annotation_U2,annotation_U3,annotation_E1,annotation_E2,annotation_E3,most_common,common_count,n_reviewers,consensus
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,other,,bug(fix),bug(fix),,,bug(fix),2,3,bug(fix)
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,bug(fix) + refactoring,,bug(fix),bug(fix),,,bug(fix),2,3,bug(fix)
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,bug(fix) + refactoring,,bug(fix),bug(fix),,,bug(fix),2,3,bug(fix)
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,bug(fix),,bug(fix),bug(fix),,,bug(fix),3,3,bug(fix)
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,bug(fix) + refactoring,,bug(fix) + refactoring,bug(fix),,,bug(fix) + refactoring,2,3,bug(fix) + refactoring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145264,cve,cve_CVE-2022-31124,tests/fuzzer/fuzz_valid_magic.py,afterChange,27,test,test,,,test,,test,3,3,test
145265,cve,cve_CVE-2022-31124,tests/fuzzer/requirements.txt,afterChange,1,other,bug(fix),,,other,,other,2,3,other
145266,cve,cve_CVE-2022-31124,tests/fuzzer/requirements.txt,afterChange,2,other,bug(fix),,,other,,other,2,3,other
145267,cve,cve_CVE-2022-31124,tests/fuzzer/requirements.txt,afterChange,3,other,bug(fix),,,other,,other,2,3,other


In [162]:
G['n_reviewers'].value_counts()

n_reviewers
3    131259
2     14010
Name: count, dtype: int64

In [163]:
G['n_reviewers'].value_counts()/G.shape[0]

n_reviewers
3    0.903558
2    0.096442
Name: count, dtype: float64

In [164]:
G['consensus'].value_counts()/G.shape[0]

consensus
test                      0.435702
documentation             0.283729
bug(fix)                  0.233154
refactoring               0.011083
test + refactoring        0.006650
bug(fix) + refactoring    0.004640
other                     0.001583
Name: count, dtype: float64

In [165]:
(G['consensus'].value_counts()/G.shape[0]).sum()

np.float64(0.9765400739318093)

In [166]:
G['consensus'].isna().value_counts()/G.shape[0]

consensus
False    0.97654
True     0.02346
Name: count, dtype: float64

In [167]:
G[G['consensus'].isna()][['common_count', 'n_reviewers']].value_counts()/G.shape[0]

common_count  n_reviewers
1             2              0.014167
              3              0.009293
Name: count, dtype: float64

In [168]:
G.to_csv("consensus.csv")
%ls -l -h 'consensus.csv'

-rw-r--r-- 1 jnareb jnareb 19M Dec  5 09:35 consensus.csv


### Consensus in $D_{BIP}$

In [169]:
G_bip = G[G['ds'] == 'bugs-in-py']
G_bip

Unnamed: 0,ds,id,file,image,line,annotation_U1,annotation_U2,annotation_U3,annotation_E1,annotation_E2,annotation_E3,most_common,common_count,n_reviewers,consensus
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,other,,bug(fix),bug(fix),,,bug(fix),2,3,bug(fix)
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,bug(fix) + refactoring,,bug(fix),bug(fix),,,bug(fix),2,3,bug(fix)
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,bug(fix) + refactoring,,bug(fix),bug(fix),,,bug(fix),2,3,bug(fix)
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,bug(fix),,bug(fix),bug(fix),,,bug(fix),3,3,bug(fix)
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,bug(fix) + refactoring,,bug(fix) + refactoring,bug(fix),,,bug(fix) + refactoring,2,3,bug(fix) + refactoring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,955,,bug(fix),bug(fix),,bug(fix),,bug(fix),3,3,bug(fix)
20217,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,966,,bug(fix),bug(fix),,bug(fix),,bug(fix),3,3,bug(fix)
20218,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,969,,bug(fix),bug(fix),,bug(fix),,bug(fix),3,3,bug(fix)
20219,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,970,,bug(fix),bug(fix),,bug(fix),,bug(fix),3,3,bug(fix)


Consensus only

In [170]:
G.shape

(145269, 15)

In [171]:
G_bip.shape

(20221, 15)

In [172]:
G_bip_consensus = G_bip[~G_bip['consensus'].isna()]
G_bip_consensus.shape

(19762, 15)

In [173]:
G_bip_consensus.columns

Index(['ds', 'id', 'file', 'image', 'line', 'annotation_U1', 'annotation_U2',
       'annotation_U3', 'annotation_E1', 'annotation_E2', 'annotation_E3',
       'most_common', 'common_count', 'n_reviewers', 'consensus'],
      dtype='object')

### Compare with consensus

In [174]:
from_repos_df['id'] = from_repos_df['ds'] + '_' + from_repos_df['bug']
from_repos_df.head(5)

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,documentation,afterChange,30,documentation
1,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,142,bug(fix)
2,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,143,bug(fix)
3,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,10,bug(fix)
4,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,139,bug(fix)


In [175]:
from_repos_df_sel_2 = from_repos_df[['ds', 'id', 'sha', 'file', 'image', 'line', 'annotation']]
from_repos_df_sel_2.head(5)

Unnamed: 0,ds,id,sha,file,image,line,annotation
0,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,afterChange,30,documentation
1,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,beforeChange,142,bug(fix)
2,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,beforeChange,143,bug(fix)
3,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,afterChange,10,bug(fix)
4,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,afterChange,139,bug(fix)


In [176]:
G_bip_consensus_sel_2 = G_bip_consensus[['ds', 'id', 'file', 'image', 'line', 'n_reviewers', 'common_count', 'consensus']]
G_bip_consensus_sel_2.head(5)

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3,2,bug(fix)
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3,2,bug(fix)
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3,2,bug(fix)
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3,3,bug(fix)
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3,2,bug(fix) + refactoring


In [177]:
hapy_bip_from_repos_df_sel_2 = hapy_bip_from_repos_df[['ds', 'id', 'sha', 'file', 'image', 'line', 'annotation']]
hapy_bip_from_repos_df_sel_2.head(5)

Unnamed: 0,ds,id,sha,file,image,line,annotation
0,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,afterChange,30,documentation
1,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,beforeChange,142,bug(fix)
2,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,beforeChange,143,bug(fix)
3,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,afterChange,10,bug(fix)
4,bugs-in-py,bugs-in-py_httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,afterChange,139,bug(fix)


#### Merge consensus vs bugsinpy-from-repos (from_repos_df_sel_2)

In [178]:
merge_sel_consensus = pd.merge(
    G_bip_consensus_sel_2, from_repos_df_sel_2,
    how='outer', on=['ds', 'id', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)
merge_sel_consensus.head()

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3.0,3.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3.0,2.0,bug(fix) + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both


#### Merge consensus vs hapy_bip-from-repos (hapy_bip_from_repos_df_sel_2)

In [179]:
hapy_bip_merge_sel_consensus = pd.merge(
    G_bip_consensus_sel_2, hapy_bip_from_repos_df_sel_2,
    how='outer', on=['ds', 'id', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)
hapy_bip_merge_sel_consensus.head()

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3.0,3.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3.0,2.0,bug(fix) + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both


#### Merge consensus vs dataset (hapy_bip_from_dataset_df_sel)

In [180]:
hapy_bip_from_dataset_df_sel = hapy_bip_from_dataset_df[['ds', 'id', 'sha', 'file', 'image', 'line', 'annotation']]
hapy_bip_from_dataset_df_sel.head(5)

Unnamed: 0,ds,id,sha,file,image,line,annotation
0,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/pycompat.py,afterChange,11,bug(fix)
1,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,beforeChange,87,bug(fix)
2,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,beforeChange,133,bug(fix)
3,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,17,bug(fix)
4,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,18,bug(fix)


In [181]:
hapy_bip_dataset_merge_sel_consensus = pd.merge(
    G_bip_consensus_sel_2, hapy_bip_from_dataset_df_sel,
    how='outer', on=['ds', 'id', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)
hapy_bip_dataset_merge_sel_consensus.head()

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3.0,3.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3.0,2.0,bug(fix) + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both


In [182]:
hapy_bip_dataset_merge_sel_consensus['indicator_column'].value_counts()

indicator_column
both          19762
right_only      518
left_only         0
Name: count, dtype: int64

#### Examining results

In [183]:
merge_sel_consensus[[
    'ds', 'id', 'sha',
    'file', 'image', 'line', 
    'consensus', 'annotation',
    'n_reviewers', 'common_count',
    'indicator_column',
]].head()

Unnamed: 0,ds,id,sha,file,image,line,consensus,annotation,n_reviewers,common_count,indicator_column
0,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/pycompat.py,afterChange,11,bug(fix),bug(fix),3.0,2.0,both
1,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,17,bug(fix),bug(fix),3.0,2.0,both
2,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,18,bug(fix),bug(fix),3.0,2.0,both
3,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,89,bug(fix),bug(fix),3.0,3.0,both
4,bugs-in-py,bugs-in-py_PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,135,bug(fix) + refactoring,bug(fix),3.0,2.0,both


In [184]:
merge_sel_consensus['annotation_neq'] = merge_sel_consensus['consensus'] != merge_sel_consensus['annotation']

merge_sel_consensus['annotation_neq'].value_counts()

annotation_neq
False    18624
True      1808
Name: count, dtype: int64

In [185]:
hapy_bip_merge_sel_consensus['annotation_neq'] = hapy_bip_merge_sel_consensus['consensus'] != hapy_bip_merge_sel_consensus['annotation']

hapy_bip_merge_sel_consensus['annotation_neq'].value_counts()

annotation_neq
False    18629
True      1803
Name: count, dtype: int64

In [186]:
hapy_bip_dataset_merge_sel_consensus['annotation_neq'] = \
    hapy_bip_dataset_merge_sel_consensus['consensus'] != hapy_bip_dataset_merge_sel_consensus['annotation']

print(hapy_bip_dataset_merge_sel_consensus.shape)
hapy_bip_dataset_merge_sel_consensus['annotation_neq'].value_counts()

(20280, 12)


annotation_neq
False    18707
True      1573
Name: count, dtype: int64

In [187]:
merge_sel_consensus.shape[0]

20432

In [188]:
merge_sel_consensus['annotation_neq'].value_counts()/merge_sel_consensus.shape[0]

annotation_neq
False    0.911511
True     0.088489
Name: count, dtype: float64

In [189]:
hapy_bip_merge_sel_consensus['annotation_neq'].value_counts()/hapy_bip_merge_sel_consensus.shape[0]

annotation_neq
False    0.911756
True     0.088244
Name: count, dtype: float64

In [190]:
hapy_bip_dataset_merge_sel_consensus['annotation_neq'].value_counts()/hapy_bip_dataset_merge_sel_consensus.shape[0]

annotation_neq
False    0.922436
True     0.077564
Name: count, dtype: float64

In [191]:
merge_sel_consensus[merge_sel_consensus['annotation_neq']][['consensus', 'annotation']].value_counts()

consensus               annotation   
bug(fix) + refactoring  bug(fix)         224
test + refactoring      test             148
refactoring             bug(fix)          87
other                   bug(fix)          40
refactoring             test              36
documentation           data              27
test                    documentation     24
bug(fix)                documentation     14
refactoring             documentation      3
bug(fix)                data               2
documentation           test               2
bug(fix)                project            1
Name: count, dtype: int64

In [192]:
merge_sel_consensus[merge_sel_consensus['annotation_neq']][['consensus', 'annotation']].value_counts()/merge_sel_consensus.shape[0]

consensus               annotation   
bug(fix) + refactoring  bug(fix)         0.010963
test + refactoring      test             0.007244
refactoring             bug(fix)         0.004258
other                   bug(fix)         0.001958
refactoring             test             0.001762
documentation           data             0.001321
test                    documentation    0.001175
bug(fix)                documentation    0.000685
refactoring             documentation    0.000147
bug(fix)                data             0.000098
documentation           test             0.000098
bug(fix)                project          0.000049
Name: count, dtype: float64

In [193]:
hapy_bip_merge_sel_consensus[hapy_bip_merge_sel_consensus['annotation_neq']][['consensus', 'annotation']].value_counts()

consensus               annotation   
bug(fix) + refactoring  bug(fix)         224
test + refactoring      test             148
refactoring             bug(fix)          87
other                   bug(fix)          40
refactoring             test              36
documentation           bug(fix)          27
test                    documentation     22
bug(fix)                documentation     14
refactoring             documentation      3
documentation           test               2
Name: count, dtype: int64

In [194]:
hapy_bip_merge_sel_consensus[hapy_bip_merge_sel_consensus['annotation_neq']][[
    'consensus', 'annotation'
]].value_counts()/hapy_bip_merge_sel_consensus.shape[0]

consensus               annotation   
bug(fix) + refactoring  bug(fix)         0.010963
test + refactoring      test             0.007244
refactoring             bug(fix)         0.004258
other                   bug(fix)         0.001958
refactoring             test             0.001762
documentation           bug(fix)         0.001321
test                    documentation    0.001077
bug(fix)                documentation    0.000685
refactoring             documentation    0.000147
documentation           test             0.000098
Name: count, dtype: float64

In [195]:
hapy_bip_merge_sel_consensus[
    hapy_bip_merge_sel_consensus['annotation_neq'] &
    hapy_bip_merge_sel_consensus['annotation'].notna() &
    hapy_bip_merge_sel_consensus['consensus'].notna()
][[
    'consensus', 'annotation'
]].value_counts()/hapy_bip_merge_sel_consensus.shape[0]

consensus               annotation   
bug(fix) + refactoring  bug(fix)         0.010963
test + refactoring      test             0.007244
refactoring             bug(fix)         0.004258
other                   bug(fix)         0.001958
refactoring             test             0.001762
documentation           bug(fix)         0.001321
test                    documentation    0.001077
bug(fix)                documentation    0.000685
refactoring             documentation    0.000147
documentation           test             0.000098
Name: count, dtype: float64

In [196]:
hapy_bit_repos_vs_consensus = hapy_bip_merge_sel_consensus[
    hapy_bip_merge_sel_consensus['annotation_neq'] &
    hapy_bip_merge_sel_consensus['annotation'].notna() &
    hapy_bip_merge_sel_consensus['consensus'].notna()
]
hapy_bit_repos_vs_consensus

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3.0,2.0,bug(fix) + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,True
7,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,beforeChange,133,3.0,2.0,bug(fix) + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,True
56,bugs-in-py,bugs-in-py_PySnooper-1,tests/utils.py,afterChange,257,3.0,2.0,test + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,test,both,True
156,bugs-in-py,bugs-in-py_ansible-10,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,1,3.0,3.0,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,bug(fix),both,True
157,bugs-in-py,bugs-in-py_ansible-10,changelogs/fragments/66398-pamd_fix-attributee...,afterChange,2,3.0,3.0,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,bug(fix),both,True
...,...,...,...,...,...,...,...,...,...,...,...,...
20186,bugs-in-py,bugs-in-py_youtube-dl-34,youtube_dl/utils.py,afterChange,1605,3.0,2.0,bug(fix) + refactoring,410f3e73ab268f74a455798ee39de5caba90caea,bug(fix),both,True
20282,bugs-in-py,bugs-in-py_youtube-dl-40,youtube_dl/downloader/f4m.py,afterChange,13,3.0,2.0,bug(fix) + refactoring,b53466e1680db3d710415329674c887d38af46c5,bug(fix),both,True
20283,bugs-in-py,bugs-in-py_youtube-dl-40,youtube_dl/downloader/f4m.py,afterChange,14,3.0,2.0,bug(fix) + refactoring,b53466e1680db3d710415329674c887d38af46c5,bug(fix),both,True
20344,bugs-in-py,bugs-in-py_youtube-dl-42,youtube_dl/utils.py,afterChange,1095,3.0,2.0,bug(fix) + refactoring,5aafe895fce2a7be9595cb2e56b7bd73a748e6b6,bug(fix),both,True


In [197]:
hapy_bit_repos_vs_consensus[['id', 'sha', 'file', 'image', 'line', 'consensus', 'annotation', 'common_count', 'n_reviewers']].sample(4)

Unnamed: 0,id,sha,file,image,line,consensus,annotation,common_count,n_reviewers
3682,bugs-in-py_fastapi-2,02441ff0313d5b471b662293244c53e712f1243f,fastapi/routing.py,afterChange,504,bug(fix) + refactoring,bug(fix),2.0,3.0
1793,bugs-in-py_black-22,c55d08d0b96c8de8bd867ca315e380d9e9d2d7ec,black.py,afterChange,13,bug(fix) + refactoring,bug(fix),2.0,3.0
17983,bugs-in-py_spacy-8,5efae495f18f37316bd641a05ca26e62cb78e242,spacy/matcher/matcher.pyx,afterChange,139,bug(fix) + refactoring,bug(fix),2.0,3.0
2759,bugs-in-py_fastapi-1,3397d4d69a9c2d64c1219fcbf291ea5697a4abb8,fastapi/routing.py,afterChange,54,refactoring,bug(fix),2.0,3.0


> NOTE: analyzed rows might be not the same as currently randomly selected sample

- pandas, 1c3d64bae7c07b5ae1be337e0ebd751385b7ce27:pandas/io/pickle.py:165<br>
  case where user consensus is "documentation", and our tool (`diff-annotate`) gives "bug(fix)"<br>
  actually it is code (i.e. "bug(fix)") just after the end of the docstring; here the consensus is wrong
- sanic, 44973125c15304b4262c51c78b5a86bd1daafa86:examples/blueprint_middlware_execution_order.py:25<br>
  2 out of 3 "consensus" is "other", tool "annotation" is "bug(fix)",<br>
  what we hit is `@bp.middleware('response')` function decorator; "annotation" is never "other", it can be "bug(fix)", "documentation", or "test"
- ansible, 7acae62fa849481b2a5e2e2d56961c5e1dcea96c:changelogs/fragments/galaxy-role-version.yaml<br>
  3 out of 3 "consensus" is "documentation", and not what "annotation" gives - namely "bug(fix)"<br>
  this is the question of protocol and tool configuration (perhaps for specific project): what are files in `changelogs/` subdirectory
- pandas, 6367bd23b935a85f1bcd2ae762c7f08433d0efbd^:pandas/core/arrays/datetimes.py:897<br>
  3 out of 3 "consensus" is that it is a "bug(fix)", while tool "annotation" states that it is "documentation"<br>
  this is fragment of code inside docstring, so called doctest, which might be not visible if all one uses is diff and its context lines
- scrapy, 9548691fdd47077a53f85daace091ef4af599cb9:tests/test_http_request.py:811<br>
  the "consensus" of human annotators is that it is "test" (i.e. code in test file), tool "annotation" says "documentation"<br>
  it is triple quoted string passed as parameter to a function ("""..."""), which may look for Pygments like dostring<br>
  Pygments mis-detects the `                <body>` line as `Literal.String.Doc`, i.e. docstring; it is not

    ```python
    def test_html_base_form_action(self):
        response = _buildresponse(
            """
            <html>
                <head>
                    <base href="http://b.com/">
                </head>
                <body>
                    <form action="test_form">
                    </form>
                </body>
            </html>
            """,
            url='http://a.com/'
        )
    ```

In [198]:
hapy_bip_dataset_merge_sel_consensus[
    hapy_bip_dataset_merge_sel_consensus['annotation_neq']
][['consensus', 'annotation']].value_counts()

consensus               annotation   
documentation           bug(fix)         388
bug(fix) + refactoring  bug(fix)         224
test + refactoring      test             148
bug(fix)                documentation     82
refactoring             bug(fix)          73
other                   bug(fix)          40
test                    documentation     37
refactoring             test              36
                        documentation     17
documentation           test              10
Name: count, dtype: int64

In [199]:
hapy_bip_dataset_merge_sel_consensus[
    hapy_bip_dataset_merge_sel_consensus['annotation_neq']
][['consensus', 'annotation']].value_counts()/hapy_bip_dataset_merge_sel_consensus.shape[0]

consensus               annotation   
documentation           bug(fix)         0.019132
bug(fix) + refactoring  bug(fix)         0.011045
test + refactoring      test             0.007298
bug(fix)                documentation    0.004043
refactoring             bug(fix)         0.003600
other                   bug(fix)         0.001972
test                    documentation    0.001824
refactoring             test             0.001775
                        documentation    0.000838
documentation           test             0.000493
Name: count, dtype: float64

In [200]:
hapy_bip_dataset_merge_sel_consensus[
    hapy_bip_dataset_merge_sel_consensus['annotation_neq'] &
    hapy_bip_dataset_merge_sel_consensus['consensus'].notna() &
    hapy_bip_dataset_merge_sel_consensus['annotation'].notna()
][['consensus', 'annotation']].value_counts()/hapy_bip_dataset_merge_sel_consensus.shape[0]

consensus               annotation   
documentation           bug(fix)         0.019132
bug(fix) + refactoring  bug(fix)         0.011045
test + refactoring      test             0.007298
bug(fix)                documentation    0.004043
refactoring             bug(fix)         0.003600
other                   bug(fix)         0.001972
test                    documentation    0.001824
refactoring             test             0.001775
                        documentation    0.000838
documentation           test             0.000493
Name: count, dtype: float64

#### Without "* refactoring *"

First, lets consider only those case where there was a match between HaPy-Bug and diff-annotate

In [201]:
hapy_bip_merge_sel_consensus_match = hapy_bip_merge_sel_consensus[
    hapy_bip_merge_sel_consensus['consensus'].notna() &
    hapy_bip_merge_sel_consensus['annotation'].notna()
]
hapy_bip_merge_sel_consensus_match.sample(3)

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
15325,bugs-in-py,bugs-in-py_pandas-9,pandas/tests/indexes/categorical/test_indexing.py,afterChange,288,3.0,3.0,test,ebb727e5cd8865a7f5d6cfb4b22d3278b6bf5e6b,test,both,False
4635,bugs-in-py,bugs-in-py_keras-1,tests/keras/backend/backend_test.py,afterChange,606,3.0,3.0,test,8e23a3ec47a2ccbf6cdd222a80886c6b9f17264f,test,both,False
19566,bugs-in-py,bugs-in-py_tqdm-3,tqdm/tests/tests_tqdm.py,beforeChange,1722,3.0,3.0,test,73962a47026dd980ac0758820efc9c41cbf938e0,test,both,False


Examine how do the mismatch looks like in this case

In [202]:
hapy_bip_merge_sel_consensus_match[
    hapy_bip_merge_sel_consensus_match['annotation_neq'] &  # does not match with consensus
    ~(hapy_bip_merge_sel_consensus_match['consensus'].str.contains('refactoring').astype(bool))
][[
    'consensus', 'annotation'
]].value_counts()

consensus      annotation   
other          bug(fix)         40
documentation  bug(fix)         27
test           documentation    22
bug(fix)       documentation    14
documentation  test              2
Name: count, dtype: int64

In [203]:
hapy_bip_merge_sel_consensus_match[
    hapy_bip_merge_sel_consensus_match['annotation_neq'] &  # does not match with consensus
    ~(hapy_bip_merge_sel_consensus_match['consensus'].str.contains('refactoring').astype(bool))
][[
    'consensus', 'annotation'
]].value_counts()/hapy_bip_merge_sel_consensus_match.shape[0]

consensus      annotation   
other          bug(fix)         0.002080
documentation  bug(fix)         0.001404
test           documentation    0.001144
bug(fix)       documentation    0.000728
documentation  test             0.000104
Name: count, dtype: float64

In [204]:
hapy_bip_merge_sel_consensus_match[
    hapy_bip_merge_sel_consensus_match['annotation_neq'] &  # does not match with consensus
    ~(hapy_bip_merge_sel_consensus_match['consensus'].str.contains('refactoring').astype(bool))
][[
    'consensus', 'annotation'
]].value_counts().sum()/hapy_bip_merge_sel_consensus_match.shape[0]

np.float64(0.005459650582362729)

#### Examine N/A cases (no corresponding patches)

It looks like there quite a few case where we didn't find corresponding line in new `diff-annotate` annotations, which is a bit strange...

In [205]:
hapy_bip_merge_sel_consensus[hapy_bip_merge_sel_consensus['annotation'].isna()]

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
604,bugs-in-py,bugs-in-py_ansible-4,test/units/playbook/test_collectionsearch.py,afterChange,2,3.0,3.0,documentation,,,left_only,True
605,bugs-in-py,bugs-in-py_ansible-4,test/units/playbook/test_collectionsearch.py,afterChange,3,3.0,3.0,documentation,,,left_only,True
606,bugs-in-py,bugs-in-py_ansible-4,test/units/playbook/test_collectionsearch.py,afterChange,4,3.0,3.0,documentation,,,left_only,True
607,bugs-in-py,bugs-in-py_ansible-4,test/units/playbook/test_collectionsearch.py,afterChange,5,3.0,3.0,documentation,,,left_only,True
608,bugs-in-py,bugs-in-py_ansible-4,test/units/playbook/test_collectionsearch.py,afterChange,6,3.0,3.0,documentation,,,left_only,True
...,...,...,...,...,...,...,...,...,...,...,...,...
17865,bugs-in-py,bugs-in-py_spacy-4,.github/contributors/onlyanegg.md,afterChange,94,3.0,3.0,documentation,,,left_only,True
17866,bugs-in-py,bugs-in-py_spacy-4,.github/contributors/onlyanegg.md,afterChange,95,3.0,3.0,documentation,,,left_only,True
17867,bugs-in-py,bugs-in-py_spacy-4,.github/contributors/onlyanegg.md,afterChange,96,3.0,3.0,documentation,,,left_only,True
17868,bugs-in-py,bugs-in-py_spacy-4,.github/contributors/onlyanegg.md,afterChange,97,3.0,3.0,documentation,,,left_only,True


There are also quite a few annotations in `diff-annotate` that somehow have not found matches in HaPy-Bug dataset (???)

In [206]:
hapy_bip_merge_sel_consensus[hapy_bip_merge_sel_consensus['consensus'].isna()]

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
57,bugs-in-py,bugs-in-py_PySnooper-1,tests/utils.py,afterChange,276,,,,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,test,right_only,True
472,bugs-in-py,bugs-in-py_ansible-2,lib/ansible/utils/version.py,afterChange,78,,,,5b9418c06ca6d51507468124250bb58046886be6,bug(fix),right_only,True
476,bugs-in-py,bugs-in-py_ansible-2,lib/ansible/utils/version.py,afterChange,121,,,,5b9418c06ca6d51507468124250bb58046886be6,bug(fix),right_only,True
480,bugs-in-py,bugs-in-py_ansible-2,lib/ansible/utils/version.py,beforeChange,75,,,,5b9418c06ca6d51507468124250bb58046886be6,bug(fix),right_only,True
484,bugs-in-py,bugs-in-py_ansible-2,lib/ansible/utils/version.py,beforeChange,118,,,,5b9418c06ca6d51507468124250bb58046886be6,bug(fix),right_only,True
...,...,...,...,...,...,...,...,...,...,...,...,...
20183,bugs-in-py,bugs-in-py_youtube-dl-34,youtube_dl/utils.py,afterChange,677,,,,410f3e73ab268f74a455798ee39de5caba90caea,bug(fix),right_only,True
20330,bugs-in-py,bugs-in-py_youtube-dl-42,youtube_dl/extractor/clipsyndicate.py,afterChange,6,,,,5aafe895fce2a7be9595cb2e56b7bd73a748e6b6,bug(fix),right_only,True
20332,bugs-in-py,bugs-in-py_youtube-dl-42,youtube_dl/extractor/clipsyndicate.py,beforeChange,6,,,,5aafe895fce2a7be9595cb2e56b7bd73a748e6b6,bug(fix),right_only,True
20334,bugs-in-py,bugs-in-py_youtube-dl-42,youtube_dl/extractor/metacritic.py,afterChange,7,,,,5aafe895fce2a7be9595cb2e56b7bd73a748e6b6,bug(fix),right_only,True


There is similar amount of mismatch for an earlier case (with `--purpose-to-annotation`).

In [207]:
merge_sel_consensus['consensus'].isna().sum()

np.int64(670)

In [208]:
merge_sel_consensus['annotation'].isna().sum()

np.int64(530)

In [209]:
hapy_bip_merge_sel_consensus[hapy_bip_merge_sel_consensus['consensus'].isna()].sample(3)

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
2335,bugs-in-py,bugs-in-py_black-6,blib2to3/pgen2/driver.py,afterChange,122,,,,f8617f975d56e81cfb4070ce65584f7b29a77e7a,bug(fix),right_only,True
19433,bugs-in-py,bugs-in-py_tornado-7,tornado/test/ioloop_test.py,afterChange,630,,,,a3b44cd701e0e82693363701bc0346b0125d2362,test,right_only,True
18673,bugs-in-py,bugs-in-py_thefuck-6,tests/rules/test_git_branch_exists.py,afterChange,7,,,,7c858fadb3458be829d3d43666ccb46c3ed5b8a0,test,right_only,True


In [210]:
G_bip_consensus_sel_2.columns

Index(['ds', 'id', 'file', 'image', 'line', 'n_reviewers', 'common_count',
       'consensus'],
      dtype='object')

In [211]:
hapy_bip_merge_sel_consensus[hapy_bip_merge_sel_consensus['annotation'].isna()].sample(3)

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
4017,bugs-in-py,bugs-in-py_fastapi-4,tests/test_param_in_path_and_dependency.py,afterChange,80,3.0,3.0,test,,,left_only,True
17696,bugs-in-py,bugs-in-py_spacy-3,.github/contributors/elben10,afterChange,55,3.0,3.0,documentation,,,left_only,True
17718,bugs-in-py,bugs-in-py_spacy-3,.github/contributors/elben10,afterChange,77,3.0,3.0,documentation,,,left_only,True


Example:
- id: `bugs-in-py_fastapi-4`
- file: `tests/test_param_in_path_and_dependency.py`
- image: `afterChange`
- line: 80

In [220]:
# let's emind ourself of the structure of `repo_commits` dict
repo_commits['cookiecutter']

{'commits': ['7f6804c4953a18386809f11faf4d86898570debc',
  '7129d474206761a6156925db78eee4b62a0e3944',
  '90434ff4ea4477941444f1e83313beb414838535',
  '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'],
 'bugs': ['cookiecutter-1',
  'cookiecutter-3',
  'cookiecutter-2',
  'cookiecutter-4']}

In [215]:
[i for i, name in enumerate(repo_commits['fastapi']['bugs']) if name == 'fastapi-4']

[2]

In [216]:
repo_commits['fastapi']['bugs'][2]

'fastapi-4'

In [217]:
repo_commits['fastapi']['commits'][2]

'74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3'

In [218]:
from_repos_df_sel_2.columns

Index(['ds', 'id', 'sha', 'file', 'image', 'line', 'annotation'], dtype='object')

In [222]:
from_repos_df_sel_2[
    (from_repos_df_sel_2['sha'] == '74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3') &
    (from_repos_df_sel_2['file'] == 'tests/test_param_in_path_and_dependency.py')
].tail(8)

Unnamed: 0,ds,id,sha,file,image,line,annotation
16665,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,64,test
16666,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,65,test
16667,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,85,test
16668,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,86,test
16669,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,87,test
16670,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,88,test
16671,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,91,test
16672,bugs-in-py,bugs-in-py_fastapi-4,74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3,tests/test_param_in_path_and_dependency.py,afterChange,92,test


In [225]:
lines = from_repos_df_sel_2[
    (from_repos_df_sel_2['sha'] == '74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3') &
    (from_repos_df_sel_2['file'] == 'tests/test_param_in_path_and_dependency.py')
]['line'].to_list()
80 in lines

False

Let's examine the diff itself
```console
przybysz:/mnt/data/python_bug_localization_data/repositories/fastapi$ git show 74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3
[...]
diff --git a/tests/test_param_in_path_and_dependency.py b/tests/test_param_in_path_and_dependency.py
new file mode 100644
index 00000000..55b667ee
--- /dev/null
+++ b/tests/test_param_in_path_and_dependency.py
@@ -0,0 +1,93 @@
+from fastapi import Depends, FastAPI
+from starlette.testclient import TestClient
+
+app = FastAPI()
+
+
+async def user_exists(user_id: int):
+    return True
+
+
+@app.get("/users/{user_id}", dependencies=[Depends(user_exists)])
[...]
```

Let's examine the annotations from `diff-annotate from-repo ...`

```console
$ jq '.changes["tests/test_param_in_path_and_dependency.py"]["+"]' 74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3.v2.json | grep '"file_line_no"'
    "file_line_no": 1,
    "file_line_no": 7,
    "file_line_no": 8,
[...]
```

It looks like there are some lines missing from parse...

Let's also examine the annotations from `diff-annotate dataset ...`

```console
$ jq '.changes["tests/test_param_in_path_and_dependency.py"]["+"]' \
    HaPy-Bug/bugsinpy-dataset/fastapi-4/annotation/74c4d1c1dbe6bfdb05d6e4fc767ffe062398f0a3.v2.json | 
    grep '"file_line_no"'
    "file_line_no": 1,
    "file_line_no": 2,
    "file_line_no": 3,
    "file_line_no": 4,
    "file_line_no": 5,
    "file_line_no": 6,
    "file_line_no": 7,
    "file_line_no": 8,
    "file_line_no": 9,
    "file_line_no": 10,
[...]
```

There are no problems here. So the problem is with the code that tries to match changed lines with result of lexing whole file with Pygments...

**TODO:** ...

#### Single-bug perspective

In [1154]:
hapy_bip_merge_sel_consensus_match

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,sha,annotation,indicator_column,annotation_neq
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,False
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,False
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3.0,2.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,False
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3.0,3.0,bug(fix),56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,False
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3.0,2.0,bug(fix) + refactoring,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,bug(fix),both,True
...,...,...,...,...,...,...,...,...,...,...,...,...
20427,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,955,3.0,3.0,bug(fix),cf2ac6df6896dac4d23918867bb86fac1e1088d9,bug(fix),both,False
20428,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,966,3.0,3.0,bug(fix),cf2ac6df6896dac4d23918867bb86fac1e1088d9,bug(fix),both,False
20429,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,969,3.0,3.0,bug(fix),cf2ac6df6896dac4d23918867bb86fac1e1088d9,bug(fix),both,False
20430,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,970,3.0,3.0,bug(fix),cf2ac6df6896dac4d23918867bb86fac1e1088d9,bug(fix),both,False


In [1157]:
df = hapy_bip_merge_sel_consensus_match.groupby('id')['annotation_neq'].agg(['sum', 'count'])
df['n_eq'] = df['count'] - df['sum']
df['ratio'] = df['n_eq']/df['count']
df

Unnamed: 0_level_0,sum,count,n_eq,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bugs-in-py_PySnooper-1,3,59,56,0.949153
bugs-in-py_PySnooper-2,0,27,27,1.000000
bugs-in-py_PySnooper-3,0,30,30,1.000000
bugs-in-py_ansible-1,0,39,39,1.000000
bugs-in-py_ansible-10,2,24,22,0.916667
...,...,...,...,...
bugs-in-py_youtube-dl-5,0,7,7,1.000000
bugs-in-py_youtube-dl-6,0,18,18,1.000000
bugs-in-py_youtube-dl-7,0,7,7,1.000000
bugs-in-py_youtube-dl-8,0,11,11,1.000000


In [1163]:
df['ratio'].describe(percentiles=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 0.999, 0.9999, 0.99999])

count      496.000000
mean         0.966731
std          0.089689
min          0.440000
25%          1.000000
50%          1.000000
75%          1.000000
90%          1.000000
95%          1.000000
99%          1.000000
99.9%        1.000000
99.99%       1.000000
99.999%      1.000000
max          1.000000
Name: ratio, dtype: float64

### Extract post-processed auto

In [205]:
collective_df_auto = collective_df[collective_df['auto'] == True].copy()
collective_df_auto.head()

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
195965.0,cve_CVE-2020-10289,auto_B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103.0,bug(fix),U1,True,cve,CVE-2020-10289
195966.0,cve_CVE-2020-10289,auto_B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137.0,bug(fix),U1,True,cve,CVE-2020-10289
195967.0,cve_CVE-2020-10289,auto_B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U1,True,cve,CVE-2020-10289
195968.0,cve_CVE-2020-10289,auto_B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137.0,bug(fix),U1,True,cve,CVE-2020-10289
195969.0,cve_CVE-2020-10289,auto_C_4_9,actionlib_tools/scripts/library.py,programming,beforeChange,103.0,bug(fix),U2,True,cve,CVE-2020-10289


In [206]:
collective_df[collective_df['auto'] == True].shape

(195953, 11)

In [207]:
collective_df_auto_simpl = collective_df_auto.drop(["user", "bundle"], axis=1).drop_duplicates()
collective_df_auto_simpl

Unnamed: 0,id,file,fcat,image,line,annotation,auto,ds,bug
,,,,,,,,,
195965,cve_CVE-2020-10289,actionlib_tools/scripts/library.py,programming,beforeChange,103,bug(fix),True,cve,CVE-2020-10289
195966,cve_CVE-2020-10289,actionlib_tools/scripts/library.py,programming,beforeChange,137,bug(fix),True,cve,CVE-2020-10289
195967,cve_CVE-2020-10289,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),True,cve,CVE-2020-10289
195968,cve_CVE-2020-10289,actionlib_tools/scripts/library.py,programming,afterChange,137,bug(fix),True,cve,CVE-2020-10289
195977,cve_CVE-2016-10516,werkzeug/debug/tbtools.py,programming,beforeChange,361,bug(fix),True,cve,CVE-2016-10516
...,...,...,...,...,...,...,...,...,...
391879,cve_CVE-2018-16876,lib/ansible/plugins/connection/ssh.py,programming,afterChange,361,bug(fix),True,cve,CVE-2018-16876
391880,cve_CVE-2018-16876,lib/ansible/plugins/connection/ssh.py,programming,afterChange,362,bug(fix),True,cve,CVE-2018-16876
391881,cve_CVE-2018-16876,lib/ansible/plugins/connection/ssh.py,programming,afterChange,363,bug(fix),True,cve,CVE-2018-16876


In [208]:
collective_df_auto_simpl_bip = collective_df_auto_simpl[collective_df_auto_simpl['ds'] == 'bugs-in-py']
collective_df_auto_simpl_bip

Unnamed: 0,id,file,fcat,image,line,annotation,auto,ds,bug
,,,,,,,,,
212379,bugs-in-py_keras-17,keras/metrics.py,programming,beforeChange,37,bug(fix),True,bugs-in-py,keras-17
212380,bugs-in-py_keras-17,keras/metrics.py,programming,afterChange,37,documentation,True,bugs-in-py,keras-17
212381,bugs-in-py_keras-17,keras/metrics.py,programming,afterChange,38,bug(fix),True,bugs-in-py,keras-17
212382,bugs-in-py_keras-17,tests/keras/metrics_test.py,test,afterChange,50,test,True,bugs-in-py,keras-17
212383,bugs-in-py_keras-17,tests/keras/metrics_test.py,test,afterChange,51,test,True,bugs-in-py,keras-17
...,...,...,...,...,...,...,...,...,...
391842,bugs-in-py_pandas-54,pandas/tests/dtypes/test_dtypes.py,test,afterChange,133,test,True,bugs-in-py,pandas-54
391843,bugs-in-py_pandas-54,pandas/tests/dtypes/test_dtypes.py,test,afterChange,134,test,True,bugs-in-py,pandas-54
391844,bugs-in-py_pandas-54,pandas/tests/indexes/common.py,test,beforeChange,608,test,True,bugs-in-py,pandas-54


### Compare manual with auto (from collective)

In [209]:
G_bip_consensus_sel_2

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3,2,bug(fix)
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3,2,bug(fix)
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3,2,bug(fix)
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3,3,bug(fix)
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3,2,bug(fix) + refactoring
...,...,...,...,...,...,...,...,...
20216,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,955,3,3,bug(fix)
20217,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,966,3,3,bug(fix)
20218,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,969,3,3,bug(fix)
20219,bugs-in-py,bugs-in-py_youtube-dl-9,youtube_dl/YoutubeDL.py,beforeChange,970,3,3,bug(fix)


In [210]:
G_bip_consensus_sel_2.columns

Index(['ds', 'id', 'file', 'image', 'line', 'n_reviewers', 'common_count',
       'consensus'],
      dtype='object')

In [211]:
collective_df_auto_simpl_bip.columns

Index(['id', 'file', 'fcat', 'image', 'line', 'annotation', 'auto', 'ds',
       'bug'],
      dtype='object')

In [212]:
collective_df_auto_simpl_bip_2 = collective_df_auto_simpl_bip[['ds', 'id', 'file', 'image', 'line', 'annotation']]

In [213]:
merge_sel_vs_auto = pd.merge(
    G_bip_consensus_sel_2, collective_df_auto_simpl_bip_2,
    how='outer', on=['ds', 'id', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_consensus", "_is_auto"),
)
merge_sel_vs_auto.head()

Unnamed: 0,ds,id,file,image,line,n_reviewers,common_count,consensus,annotation,indicator_column
0,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/pycompat.py,afterChange,11,3.0,2.0,bug(fix),bug(fix),both
1,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,17,3.0,2.0,bug(fix),bug(fix),both
2,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,18,3.0,2.0,bug(fix),bug(fix),both
3,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,89,3.0,3.0,bug(fix),bug(fix),both
4,bugs-in-py,bugs-in-py_PySnooper-1,pysnooper/tracer.py,afterChange,135,3.0,2.0,bug(fix) + refactoring,bug(fix),both


In [214]:
merge_sel_vs_auto['annotation_neq'] = merge_sel_vs_auto['consensus'] != merge_sel_vs_auto['annotation']

merge_sel_vs_auto['annotation_neq'].value_counts()

annotation_neq
False    18425
True      1796
Name: count, dtype: int64

In [215]:
merge_sel_consensus['annotation_neq'].value_counts()

annotation_neq
False    18624
True      1808
Name: count, dtype: int64

In [216]:
merge_sel_vs_auto['annotation_neq'].value_counts()/merge_sel_vs_auto.shape[0]

annotation_neq
False    0.911181
True     0.088819
Name: count, dtype: float64

In [217]:
merge_sel_vs_auto.shape[0]

20221

In [218]:
merge_sel_consensus['annotation_neq'].value_counts()/merge_sel_consensus.shape[0]

annotation_neq
False    0.911511
True     0.088489
Name: count, dtype: float64

In [219]:
merge_sel_consensus.shape[0]

20432

### Compare dataset with from-repo (BugsInPy from HaPy-Bug)

Using `diff-annotator from-repo` with `--line-callback`, via 'run_annotation_hapy_bip_repos.sh' script.

```
        Command being timed: "./run_annotation_hapy_bip_repos.sh"
        User time (seconds): 321.45 = 5m21.45s
        System time (seconds): 7.26
        Percent of CPU this job got: 157%
        Elapsed (wall clock) time (h:mm:ss or m:ss): 3:28.30
        Average shared text size (kbytes): 0
        Average unshared data size (kbytes): 0
        Average stack size (kbytes): 0
        Average total size (kbytes): 0
        Maximum resident set size (kbytes): 148508
        Average resident set size (kbytes): 0
        Major (requiring I/O) page faults: 0
        Minor (reclaiming a frame) page faults: 2051209
        Voluntary context switches: 19340
        Involuntary context switches: 1595
        Swaps: 0
        File system inputs: 0
        File system outputs: 20248
        Socket messages sent: 0
        Socket messages received: 0
        Signals delivered: 0
        Page size (bytes): 4096
        Exit status: 0
```

In [220]:
hapy_bip_from_repos_df.columns

Index(['id', 'ds', 'bug', 'sha', 'file', 'fcat', 'image', 'line',
       'annotation'],
      dtype='object')

In [221]:
hapy_bip_from_repos_df.shape

(19890, 9)

In [222]:
hapy_bip_from_repos_df.sample(4)

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
1014,bugs-in-py_keras-42,bugs-in-py,keras-42,2f3edf96078d78450b985bdf3bfffe7e0c627169,keras/engine/training.py,programming,afterChange,2025,bug(fix)
6375,bugs-in-py_pandas-2,bugs-in-py,pandas-2,55e8891f6d33be14e0db73ac06513129503f995c,pandas/tests/indexing/test_scalar.py,test,afterChange,381,test
3685,bugs-in-py_pandas-151,bugs-in-py,pandas-151,5a227a410c520ceec2d94369a44e2ab774a40dc3,pandas/tests/arrays/test_numpy.py,test,afterChange,228,test
10839,bugs-in-py_scrapy-4,bugs-in-py,scrapy-4,16dad81715d3970149c0cf7a318e73a0d84be1ff,tests/test_contracts.py,test,afterChange,201,test


Using `diff-annotator dataset ...` with `--line-callback`

```
        User time (seconds): 12.63
        System time (seconds): 0.21
        Percent of CPU this job got: 222%
        Elapsed (wall clock) time (h:mm:ss or m:ss): 0:05.77
        Average shared text size (kbytes): 0
        Average unshared data size (kbytes): 0
        Average stack size (kbytes): 0
        Average total size (kbytes): 0
        Maximum resident set size (kbytes): 99328
        Average resident set size (kbytes): 0
        Major (requiring I/O) page faults: 0
        Minor (reclaiming a frame) page faults: 52504
        Voluntary context switches: 153
        Involuntary context switches: 40
        Swaps: 0
        File system inputs: 0
        File system outputs: 19040
        Socket messages sent: 0
        Socket messages received: 0
        Signals delivered: 0
        Page size (bytes): 4096
        Exit status: 0
```

In [223]:
hapy_bip_from_dataset_df.columns

Index(['id', 'ds', 'bug', 'sha', 'file', 'fcat', 'image', 'line',
       'annotation'],
      dtype='object')

In [224]:
hapy_bip_from_dataset_df.shape

(20268, 9)

In [225]:
hapy_bip_from_dataset_df.sample(4)

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
16046,bugs-in-py_sanic-1,bugs-in-py,sanic-1,44973125c15304b4262c51c78b5a86bd1daafa86,examples/blueprint_middlware_execution_order.py,programming,afterChange,20,bug(fix)
14955,bugs-in-py_pandas-84,bugs-in-py,pandas-84,24d7c06130f9c2aeebedc26971b244ce076f7d0a,pandas/tests/series/test_reshaping.py,test,afterChange,17,test
18059,bugs-in-py_thefuck-15,bugs-in-py,thefuck-15,41707b80c61acadb7c87b0efcbf10f4186dc5937,tests/rules/test_git_add.py,test,afterChange,19,test
6178,bugs-in-py_keras-42,bugs-in-py,keras-42,2f3edf96078d78450b985bdf3bfffe7e0c627169,keras/engine/training.py,programming,beforeChange,2063,bug(fix)


In [226]:
print(f"User time: {321.45} vs {12.63}, ratio {321.45/12.63}, 1/ratio {12.63/321.45}")
print(f"Wall time: {3*60+28.30} vs {5.77}, ratio {(3*60+28.30)/5.77}")

User time: 321.45 vs 12.63, ratio 25.451306413301662, 1/ratio 0.039290713952403175
Wall time: 208.3 vs 5.77, ratio 36.100519930675915


In [227]:
print(f"Shape: {hapy_bip_from_repos_df.shape} from-repo vs {hapy_bip_from_dataset_df.shape} dataset")
print(f"Difference: {hapy_bip_from_dataset_df.shape[0] - hapy_bip_from_repos_df.shape[0]} (should be 0)")

Shape: (19890, 9) from-repo vs (20268, 9) dataset
Difference: 378 (should be 0)


In [228]:
sel_columns = ['ds', 'bug', 'sha', 'file', 'image', 'line', 'fcat', 'annotation']
hapy_bip_dataset_vs_from_repos_merge = pd.merge(
    hapy_bip_from_dataset_df[sel_columns], 
    hapy_bip_from_repos_df[sel_columns],
    how='outer', on=['ds', 'bug', 'sha', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_dataset", "_repos"),
)

hapy_bip_dataset_vs_from_repos_merge

Unnamed: 0,ds,bug,sha,file,image,line,fcat_dataset,annotation_dataset,fcat_repos,annotation_repos,indicator_column
0,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/pycompat.py,afterChange,11,programming,bug(fix),programming,bug(fix),both
1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,17,programming,bug(fix),programming,bug(fix),both
2,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,18,programming,bug(fix),programming,bug(fix),both
3,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,89,programming,bug(fix),programming,bug(fix),both
4,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,pysnooper/tracer.py,afterChange,135,programming,bug(fix),programming,bug(fix),both
...,...,...,...,...,...,...,...,...,...,...,...
20415,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,beforeChange,955,programming,bug(fix),programming,bug(fix),both
20416,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,beforeChange,966,programming,bug(fix),programming,bug(fix),both
20417,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,beforeChange,969,programming,bug(fix),programming,bug(fix),both
20418,bugs-in-py,youtube-dl-9,cf2ac6df6896dac4d23918867bb86fac1e1088d9,youtube_dl/YoutubeDL.py,beforeChange,970,programming,bug(fix),programming,bug(fix),both


In [229]:
hapy_bip_dataset_vs_from_repos_merge['indicator_column'].value_counts()

indicator_column
both          19738
left_only       530
right_only      152
Name: count, dtype: int64

In [230]:
print(f"Shape: {hapy_bip_from_repos_df.shape} from-repo vs {hapy_bip_from_dataset_df.shape} dataset")
print(f"Difference: {hapy_bip_from_dataset_df.shape[0] - hapy_bip_from_repos_df.shape[0]} (should be 0)")

Shape: (19890, 9) from-repo vs (20268, 9) dataset
Difference: 378 (should be 0)


In [231]:
530-152

378

In [232]:
hapy_bip_dataset_vs_from_repos_merge.columns

Index(['ds', 'bug', 'sha', 'file', 'image', 'line', 'fcat_dataset',
       'annotation_dataset', 'fcat_repos', 'annotation_repos',
       'indicator_column'],
      dtype='object')

In [233]:
hapy_bip_dataset_vs_from_repos_merge['annotation_neq'] = \
    hapy_bip_dataset_vs_from_repos_merge['annotation_dataset'] != hapy_bip_dataset_vs_from_repos_merge['annotation_repos']

hapy_bip_dataset_vs_from_repos_merge['annotation_neq'].value_counts()

annotation_neq
False    19334
True      1086
Name: count, dtype: int64

In [234]:
hapy_bip_dataset_vs_from_repos_merge_inner = hapy_bip_dataset_vs_from_repos_merge[
    hapy_bip_dataset_vs_from_repos_merge['annotation_dataset'].notna() &
    hapy_bip_dataset_vs_from_repos_merge['annotation_repos'].notna()
]

hapy_bip_dataset_vs_from_repos_merge_inner['annotation_neq'].value_counts()

annotation_neq
False    19334
True       404
Name: count, dtype: int64

In [235]:
hapy_bip_dataset_vs_from_repos_merge_inner[hapy_bip_dataset_vs_from_repos_merge_inner['annotation_neq']][[
    'annotation_dataset', 'annotation_repos'
]].value_counts()

annotation_dataset  annotation_repos
bug(fix)            documentation       273
documentation       bug(fix)             96
                    test                 21
test                documentation        14
Name: count, dtype: int64

In [236]:
hapy_bip_dataset_vs_from_repos_merge_inner[
    (hapy_bip_dataset_vs_from_repos_merge_inner['annotation_dataset'] == 'bug(fix)') &
    (hapy_bip_dataset_vs_from_repos_merge_inner['annotation_repos'] == 'documentation')
].sample(4)

Unnamed: 0,ds,bug,sha,file,image,line,fcat_dataset,annotation_dataset,fcat_repos,annotation_repos,indicator_column,annotation_neq
19188,bugs-in-py,tornado-15,ecb3ea7543cc942659faf3d2144853018afa6139,tornado/web.py,afterChange,2382,programming,bug(fix),programming,documentation,both,True
8930,bugs-in-py,pandas-105,cb5f9d1ff407f5ccef7c717e0c23bbd6ed96cf5f,pandas/core/frame.py,afterChange,2500,programming,bug(fix),programming,documentation,both,True
2653,bugs-in-py,cookiecutter-4,457a1a4e862aab4102b644ff1d2b2e2b5a766b3c,cookiecutter/exceptions.py,afterChange,88,programming,bug(fix),programming,documentation,both,True
1400,bugs-in-py,black-15,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,beforeChange,1531,programming,bug(fix),programming,documentation,both,True


Results of manual analysis (**spot check**), where "annotation_dataset" is "bug(fix)" (i.e. "code"), and "annotation_repos" is "documentation":
- tornado-15, ecb3ea7543cc942659faf3d2144853018afa6139, tornado/web.py, afterChange, 2382<br>
  it is ordinary comment in Python, don't know why `diff-annotate dataset ...` says code
- pandas-90, 1c3d64bae7c07b5ae1be337e0ebd751385b7ce27, pandas/io/pickle.py, beforeChange, 19<br>
  it is docstring, where start of docstring might be outside the context `diff-annotate dataset ...` sees
- matplotlib-15, c7df5d2770030fe4588a0fc1ab4449a689554dfc, lib/matplotlib/colors.py, beforeChange, 1219<br>
  it is last line of docstring, see previous entry
- black-15, df2ae3bbe6c45298aabb6c04e85cb353205626f1, black.py, beforeChange, 95<br>
  empty line inside a docstring, see previous entry

Example of manual analysis:
```console
$ cd ../matplotlib/
$ git show c7df5d2770030fe4588a0fc1ab4449a689554dfc^:lib/matplotlib/colors.py | sed -n '1219p'
            to one decade in the logarithmic range.
$ git show c7df5d2770030fe4588a0fc1ab4449a689554dfc^:lib/matplotlib/colors.py | sed -n '1200,1230p'
[...]
```

Here {commit}^ is because of beforeChange; it would be {commit} for afterChange

In [237]:
hapy_bip_dataset_vs_from_repos_merge_inner[
    (hapy_bip_dataset_vs_from_repos_merge_inner['annotation_dataset'] == 'documentation') &
    (hapy_bip_dataset_vs_from_repos_merge_inner['annotation_repos'] == 'bug(fix)')
].sample(4)

Unnamed: 0,ds,bug,sha,file,image,line,fcat_dataset,annotation_dataset,fcat_repos,annotation_repos,indicator_column,annotation_neq
10367,bugs-in-py,pandas-132,bd8f07fb29d2ac819f4c8e8e1b8e6d40f8b0f40c,pandas/core/nanops.py,beforeChange,712,programming,documentation,programming,bug(fix),both,True
17937,bugs-in-py,spacy-6,afe4a428f78abe45d6104d74ef42a066570fa43d,spacy/language.py,afterChange,408,programming,documentation,programming,bug(fix),both,True
7825,bugs-in-py,luigi-6,ce881b2a95743887c6147ff4ba23ce5f622b3f5e,luigi/parameter.py,afterChange,950,programming,documentation,programming,bug(fix),both,True
10365,bugs-in-py,pandas-132,bd8f07fb29d2ac819f4c8e8e1b8e6d40f8b0f40c,pandas/core/nanops.py,afterChange,715,programming,documentation,programming,bug(fix),both,True


Results of manual analysis (spot check), where "annotation_repos" is "bug(fix)" (i.e. "code"), and "annotation_dataset" is "documentation":
- scrapy-16, 68dedf54cb27847f6d035099b61aa06226549fad, scrapy/utils/url.py, afterChange, 159<br>
  is an ordinary code, not a comment, but there is end of docstring close to it

#### Limit to differences with consensus

In [238]:
hapy_bit_repos_vs_consensus[['id', 'sha', 'file', 'image', 'line', 'consensus', 'annotation', 'common_count', 'n_reviewers']].sample(4)

Unnamed: 0,id,sha,file,image,line,consensus,annotation,common_count,n_reviewers
9388,bugs-in-py_pandas-112,8a354b7630f74739212725c38cbaa9b069191a88,pandas/tests/indexes/interval/test_indexing.py,afterChange,8,test + refactoring,test,2.0,3.0
18178,bugs-in-py_thefuck-15,41707b80c61acadb7c87b0efcbf10f4186dc5937,tests/rules/test_git_add.py,afterChange,17,test + refactoring,test,2.0,3.0
828,bugs-in-py_ansible-6,4881af2e7e0506ada0225fd764e874e20569d5b2,test/units/galaxy/test_collection_install.py,afterChange,193,test + refactoring,test,2.0,3.0
16409,bugs-in-py_scrapy-10,db408528928b2d15043593032913fe40d6eb6783,tests/test_downloadermiddleware_redirect.py,beforeChange,163,refactoring,test,2.0,3.0


In [239]:
hapy_bip_dataset_vs_from_repos_merge_inner.columns

Index(['ds', 'bug', 'sha', 'file', 'image', 'line', 'fcat_dataset',
       'annotation_dataset', 'fcat_repos', 'annotation_repos',
       'indicator_column', 'annotation_neq'],
      dtype='object')

In [240]:
hapy_bip_dataset_vs_from_repos_merge_inner_sel = hapy_bip_dataset_vs_from_repos_merge_inner[
    (hapy_bip_dataset_vs_from_repos_merge_inner['indicator_column'] == 'both') &
    hapy_bip_dataset_vs_from_repos_merge_inner['annotation_neq']
][[
    'ds', 'bug', 'sha', 'file', 'image', 'line', 'annotation_dataset', 'annotation_repos'
]]

hapy_bip_dataset_vs_from_repos_merge_inner_sel.sample(4)

Unnamed: 0,ds,bug,sha,file,image,line,annotation_dataset,annotation_repos
16599,bugs-in-py,scrapy-16,68dedf54cb27847f6d035099b61aa06226549fad,scrapy/utils/url.py,afterChange,162,documentation,bug(fix)
1310,bugs-in-py,black-15,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,beforeChange,114,bug(fix),documentation
7831,bugs-in-py,luigi-6,ce881b2a95743887c6147ff4ba23ce5f622b3f5e,luigi/parameter.py,beforeChange,862,documentation,bug(fix)
11149,bugs-in-py,pandas-15,71d610596ed128055614eb660f13c88168bfe22f,pandas/core/arrays/datetimelike.py,beforeChange,413,bug(fix),documentation


In [241]:
hapy_bip_consensus_vs_dataset_vs_from_repos_merge = pd.merge(
    hapy_bit_repos_vs_consensus[['id', 'sha', 'file', 'image', 'line', 'consensus', 'annotation', 'common_count', 'n_reviewers']], 
    hapy_bip_dataset_vs_from_repos_merge_inner_sel,
    how='inner', on=['sha', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_r_vs_cons", "_d_vs_r"),
).reindex(columns=[
    'id', 'ds', 'bug', 'sha', 'file', 'image', 'line', 'consensus', 'annotation', 'annotation_dataset', 'annotation_repos',
    'common_count', 'n_reviewers', 'indicator_column',
])

df = hapy_bip_consensus_vs_dataset_vs_from_repos_merge[
    hapy_bip_consensus_vs_dataset_vs_from_repos_merge['consensus'] == hapy_bip_consensus_vs_dataset_vs_from_repos_merge['annotation_dataset']
]
df

Unnamed: 0,id,ds,bug,sha,file,image,line,consensus,annotation,annotation_dataset,annotation_repos,common_count,n_reviewers,indicator_column
0,bugs-in-py_black-15,bugs-in-py,black-15,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,beforeChange,95,bug(fix),documentation,bug(fix),documentation,2.0,3.0,both
1,bugs-in-py_black-15,bugs-in-py,black-15,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,beforeChange,96,bug(fix),documentation,bug(fix),documentation,2.0,3.0,both
2,bugs-in-py_black-15,bugs-in-py,black-15,df2ae3bbe6c45298aabb6c04e85cb353205626f1,black.py,beforeChange,97,bug(fix),documentation,bug(fix),documentation,2.0,3.0,both
21,bugs-in-py_pandas-24,bugs-in-py,pandas-24,6367bd23b935a85f1bcd2ae762c7f08433d0efbd,pandas/core/arrays/datetimes.py,afterChange,889,bug(fix),documentation,bug(fix),documentation,3.0,3.0,both
22,bugs-in-py_pandas-24,bugs-in-py,pandas-24,6367bd23b935a85f1bcd2ae762c7f08433d0efbd,pandas/core/arrays/datetimes.py,afterChange,897,bug(fix),documentation,bug(fix),documentation,3.0,3.0,both
23,bugs-in-py_pandas-24,bugs-in-py,pandas-24,6367bd23b935a85f1bcd2ae762c7f08433d0efbd,pandas/core/arrays/datetimes.py,beforeChange,889,bug(fix),documentation,bug(fix),documentation,3.0,3.0,both
24,bugs-in-py_pandas-24,bugs-in-py,pandas-24,6367bd23b935a85f1bcd2ae762c7f08433d0efbd,pandas/core/arrays/datetimes.py,beforeChange,897,bug(fix),documentation,bug(fix),documentation,3.0,3.0,both
25,bugs-in-py_scrapy-7,bugs-in-py,scrapy-7,074caf434e255bc96f106e57e3e288028f372485,tests/test_http_request.py,afterChange,991,test,documentation,test,documentation,3.0,3.0,both
26,bugs-in-py_scrapy-7,bugs-in-py,scrapy-7,074caf434e255bc96f106e57e3e288028f372485,tests/test_http_request.py,beforeChange,992,test,documentation,test,documentation,3.0,3.0,both
27,bugs-in-py_thefuck-28,bugs-in-py,thefuck-28,9b30ae0424607a4e268bd26eaee8ccb91a5588f9,tests/rules/test_fix_file.py,afterChange,116,test,documentation,test,documentation,2.0,3.0,both


In [242]:
df['id'].value_counts()

id
bugs-in-py_pandas-24     4
bugs-in-py_thefuck-28    4
bugs-in-py_black-15      3
bugs-in-py_scrapy-7      2
Name: count, dtype: int64

Analysis of those differences (consensus == tool dataset != tool from-repo):
- black, df2ae3bbe6c45298aabb6c04e85cb353205626f1^:black.py:95-97<br>
  "consensus" (2 out of 3) and "annotation_dataset" give "bug(fix)", while "annotation_repos" gives "documentation"<br>
  `git show df2ae3bbe6c45298aabb6c04e85cb353205626f1^:black.py | sed -n '95,97p'`<br>
  it is actually inside docstring, so "annotation_repos" is correct in giving "documentation"
- pandas, 6367bd23b935a85f1bcd2ae762c7f08433d0efbd (before and after changes), pandas/core/arrays/datetimes.py, 889, 897<br>
  "consensus" (3 out of 3) and "annotation_dataset" give "bug(fix)", while "annotation_repos" gives "documentation"<br>
  \[...]<br>
  lines are `                      dtype='datetime64[ns]', freq='D')`, etc.<br>
  it is actually an example code, or rather result returned by example code, inside a _long_ docstring (doctest),
  so "annotation_repos" is probably correct in giving "documentation" to those lines
- scrapy, af434e255bc96f106e57e3e288028f372485:tests/test_http_request.py, 991 (beforeChanges) -> 992 (afterChanges)<br>
  "consensus" (3 out of 3) of human annotators is that it is "test" (i.e. code in test file), while "annotation_repos" says "documentation"<br>
  it is triple quoted string passed as parameter to a function ("""..."""), which may look for Pygments like dostring<br>
  Pygments mis-detects the `                <body>` line as `Literal.String.Doc`, i.e. docstring; it is not docstring
- thefuck, 9b30ae0424607a4e268bd26eaee8ccb91a5588f9:tests/rules/test_fix_file.py, 115,123 (beforeChange) -> 116,124 (afterChange)<br>
  "consensus" (2  out of 3) and "annotation_dataset" give "test", while "annotation_repos" gives "documentation"<br>
  e.g. `llc: a.ll:1:2: error: expected top-level entity`<br>
  it is actually triple quoted string as an element of tuple<br>
  Pygments somehow misdetects it as docstring (or `Literal.String.Doc` is used not only for docstrings)

2nd case:

```python
    def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"):
        """
        Localize tz-naive Datetime Array/Index to tz-aware
        Datetime Array/Index.

        [...]

        Examples
        --------
        >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3)
        >>> tz_naive
        DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
                       '2018-03-03 09:00:00'],
                      dtype='datetime64[ns]', freq='D')
        [...]
        """
```

3rd case

```python
    def test_html_base_form_action(self):
        response = _buildresponse(
            """
            <html>
                <head>
                    <base href=" http://b.com/">
                </head>
                <body>
                    <form action="test_form">
                    </form>
                </body>
            </html>
            """,
            url='http://a.com/'
        )
```

4th case

```python
('llc a.ll', 'a.ll', 1, 2, '',
"""
llc: a.ll:1:2: error: expected top-level entity
+
^
"""),
```

# Comparison with line annotations in the Herbold et al. dataset<br>(hunk_labels.json only)

- S. Herbold, A. Trautsch, B. Ledel, A. Aghamohammadi, T. A. Ghaleb, K. K. Chahal, T. Bossenmaier, B. Nagaria, P. Makedonski, M. N. Ahmadabadi, K. Szabados, H. Spieker, M. Madeja, N. Hoy, V. Lenarduzzi, S. Wang, G. Rodríguez-Pérez, R. Colomo-Palacios, R. Verdecchia, P. Singh, Y. Qin, D. Chakroborti, W. Davis, V. Walunj, H. Wu, D. Marcilio, O. Alam, A. Aldaeej, I. Amit, B. Turhan, S. Eismann, A.-K. Wickert, I. Malavolta, M. Sulír, F. Fard, A. Z. Henley, S. Kourtzanidis, E. Tuzun, C. Treude, S. M. Shamasbi, I. Pashchenko, M. Wyrich,
J. Davis, A. Serebrenik, E. Albrecht, E. U. Aktas, D. Strüber, and J. Erbel, _“A fine-grained data set and analysis of tangling in bug fixing
commits,”_ Empirical Software Engineering, vol. 27, no. 6, p. 125, Nov. 2022<br>https://doi.org/10.1007/s10664-021-10083-5
- S. Herbold, A. Trautsch, B. Ledel, _“Large-Scale Manual Validation of Bugfixing Changes.”_ In Proceedings of the 17th International Conference on Mining Software Repositories (MSR '20). 2020. Association for Computing Machinery, New York, NY, USA, 611–614.<br> https://doi.org/10.1145/3379597.3387504

Replication package: https://github.com/sherbold/replication-kit-2020-line-validation

Release of Replication Kit on May 6, 2022

Python data collection library for SmartSHARK: https://github.com/smartshark/pycoSHARK

> **Contents of the repository**
> - The `Replication-Notebook.ipynb` with all code required to reproduce our results from the raw data.
> - The data folder with the `hunk_labels.json` and the `leaderboard.json` file.
>     - The `hunk_labels.json` contains the relevant raw data for this study, i.e., the manual labels for the hunks.
>     - The `leaderboard.json` contains daily snapshots of the progress per user and per project.
> - The `figures/` folder with all result figures that are generated by the Replication-Notebook.

Attempt at comparing annotations using only the data from `hunk_labels.json` file with serialized data

See [`02-compare_annotations_Herbold.ipynb`](./02-compare_annotations_Herbold.ipynb) for an attempt using also the data from SmartSHARK MongoDB database.

Regular expressions for excluding non-production code files:
- ... (**TODO**)

In [452]:
smartshark_dir = '/mnt/data/msr/smartshark_repositories/replication-kit-2020-line-validation'

In [453]:
hunk_data_file = 'data/hunk_labels.json'

In [454]:
hunk_data_path = Path(smartshark_dir).joinpath(hunk_data_file)
hunk_data_path.is_file()

True

In [455]:
with open(hunk_data_path, mode='r') as json_fp:
    smartshark_data = json.load(json_fp)

type(smartshark_data)

list

## Examining the format of SmartSHARK's 'hunk_labels.json'

In [456]:
smartshark_data[0]

{'lines_manual': {'KKC': {'test': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26]},
  'aaghamohammadi': {'documentation': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    9,
    11,
    12,
    13,
    14,
    15,
    16,
    17],
   'whitespace': [8, 10],
   'test': [18, 19, 20, 21, 22, 23, 24, 25, 26]},
  'riruk': {'test': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26]},
  'davisjam': {'test': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26]}},
 'file': 'test/repositories/IVY-1300/ivysettings.xml',
 'issue_id': 'IVY-130

https://github.com/apache/ant-ivy/commit/406704aae851f4a5f09d42031ddfaa1756639db4#diff-eb780b45958946b5cf8bfc6efb6cf6ebd5369a4e09d7e1b50537d7413d9180db
- changed file `test/repositories/IVY-1300/ivysettings.xml` has single hunk of changes:<br>
  `@@ -0,0 +1,27 @@`

In [457]:
smartshark_data[1]

{'lines_manual': {'KKC': {'documentation': [0]},
  'kaka727': {'documentation': [0]},
  'aaghamohammadi': {'documentation': [0]},
  'omalam': {'documentation': [0]}},
 'file': 'CHANGES.txt',
 'issue_id': 'IVY-1356',
 'revision_hash': 'de976b00e98b730a3a532c0378e9eb4177d386a6',
 'hunk_id': {'$oid': '5a82f5aa912063217a8893ef'},
 'repository_url': 'https://github.com/apache/ant-ivy',
 'project': 'ant-ivy'}

Examining this change (via GitHub):

https://github.com/apache/ant-ivy/commit/de976b00e98b730a3a532c0378e9eb4177d386a6#diff-59130575b4fb2932c957db2922977d7d89afb0b2085357db1a14615a2fcad776

- changed file `CHANGES.txt` has single hunk of changes:<br>
  `@@ -129,6 +129,7 @@`
- the diff for that file looks like this:

  -----

  ```diff
  diff --git a/CHANGES.txt b/CHANGES.txt
  index 74896162a..097bc9b9b 100644
  --- a/CHANGES.txt
  +++ b/CHANGES.txt
  @@ -129,6 +129,7 @@ for detailed view of each issue, please consult http://issues.apache.org/jira/br
   	
      2.3.x
   =====================================
  +- FIX: Ivy descriptors are merged incorrectly when there is an <exclude> element (IVY-1356)
   - FIX: SimpleDateFormat is not thread safe (IVY-1373)
   - FIX: Maven 'hk2-jar' packaging is now supported (IVY-1357)
   - FIX: Maven 'orbit' and 'pear' packaging is now supported (IVY-899)
  ```

  -----

  See https://github.com/apache/ant-ivy/commit/de976b00e98b730a3a532c0378e9eb4177d386a6.diff

This means that those numbers are indexes of changed lines in hunk

In [458]:
# for trying to find changes where there is more than one hunk, and there are - and + lines
def github_diff(smartshark_data, idx):
    print(f"{smartshark_data[idx]['repository_url']}/commit/{smartshark_data[idx]['revision_hash']}.diff")
    print(f"  file={smartshark_data[idx]['file']}")
    print(f"  hunk={smartshark_data[idx]['hunk_id']}")
    line_numbers = []
    for user, user_data in smartshark_data[idx]['lines_manual'].items():
        for line_type, lines_list in user_data.items():
            for line_no in lines_list:
                if line_no not in line_numbers:
                    line_numbers.append(line_no)
    print(f"  lines={line_numbers}")

In [459]:
github_diff(smartshark_data, 23)

https://github.com/apache/ant-ivy/commit/441accf45b674608d624d6d79d919de2c3ff08e2.diff
  file=src/java/org/apache/ivy/util/url/ApacheURLLister.java
  hunk={'$oid': '5a82f5ab912063217d88943b'}
  lines=[0, 1, 2, 3, 4, 5]


- `github_diff(smartshark_data, 17)` - one hunk, one added and one removed line
- `github_diff(smartshark_data, 23)`: - two hunks:
   1. one with 0 × '−', 1 × '+' which gives 1 line in total
   2. one with 2 × '−', 4 × '+' which gives 6 lines in total: 0..5

```diff
diff --git a/src/java/org/apache/ivy/util/url/ApacheURLLister.java b/src/java/org/apache/ivy/util/url/ApacheURLLister.java
index e15d37b02..a55df0e30 100644
--- a/src/java/org/apache/ivy/util/url/ApacheURLLister.java
+++ b/src/java/org/apache/ivy/util/url/ApacheURLLister.java
@@ -19,6 +19,7 @@
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.net.URL;
 import java.util.ArrayList;
@@ -106,8 +107,10 @@ public List retrieveListing(URL url, boolean includeFiles, boolean includeDirect
             url = new URL(url.getProtocol(), url.getHost(), url.getPort(), url.getPath() + "/");
         }
 
-        BufferedReader r = new BufferedReader(new InputStreamReader(URLHandlerRegistry.getDefault()
-                .openStream(url)));
+        URLHandler urlHandler = URLHandlerRegistry.getDefault();
+        String charset = urlHandler.getURLInfo(url).getBodyCharset();
+        InputStream contentStream = urlHandler.openStream(url);
+        BufferedReader r = new BufferedReader(new InputStreamReader(contentStream, charset));
 
         String htmlText = FileUtil.readEntirely(r);
 

In [460]:
github_diff(smartshark_data, 22)

https://github.com/apache/ant-ivy/commit/441accf45b674608d624d6d79d919de2c3ff08e2.diff
  file=src/java/org/apache/ivy/util/url/ApacheURLLister.java
  hunk={'$oid': '5a82f5ab912063217d88943a'}
  lines=[0]


It looks like previous entry _is_ previous hunk

In [461]:
len(smartshark_data)

31065

## Comparing the format of `diff-annotate` (PatchScope)

Let's take a look if it would be easy to match this data with annotations generated with `diff-annotate from-repo ...`

In [462]:
smartshark_repos_dir = '/mnt/data/msr/smartshark_repositories'  # local for the computer I work from; adjust accordingly

In [463]:
!pwd

/home/jnareb/python-diff-annotator/notebooks/experiments


In [464]:
!diff-annotate --version

Diff Annotator version: 0.2.0


In [465]:
!printenv VIRTUAL_ENV

/home/jnareb/python-diff-annotator/.venv


In [466]:
!diff-annotate from-repo --output-dir=/mnt/data/python-diff-annotator/example_annotations/smartshark_repos \
    /mnt/data/msr/smartshark_repositories/ant-ivy \
    --no-merges --no-walk=sorted 441accf45b674608d624d6d79d919de2c3ff08e2

Computing patch sizes and spreads (# files, # change groups, # spanned lines,...)
Storing annotations in <output_dir>/<commit_id>.json
  with output dir: '/mnt/data/python-diff-annotator/example_annotations/smartshark_repos'
Ensuring that output directory '/mnt/data/python-diff-annotator/example_annotations/smartshark_repos' exists
Generating patches from local Git repo '/mnt/data/msr/smartshark_repositories/ant-ivy'
  using `git log -p '--no-merges' '--no-walk=sorted' '441accf45b674608d624d6d79d919de2c3ff08e2'`
  took 0.013 seconds (includes parsing unified diffs)
Annotating commits and saving annotated data, for 1 commits
  lexing pre- and post-image file contents, from repo 'ant-ivy'
  using sequential processing
commits: 100%|████████████████████████████████████| 1/1 [00:00<00:00,  3.33it/s]


In [467]:
%ls -l /mnt/data/python-diff-annotator/example_annotations/smartshark_repos

total 32
-rw-r--r-- 1 jnareb jnareb 31241 Dec  2 14:36 441accf45b674608d624d6d79d919de2c3ff08e2.v2.json


In [468]:
annotation_data_path = '/mnt/data/python-diff-annotator/example_annotations/smartshark_repos/441accf45b674608d624d6d79d919de2c3ff08e2.v2.json'

In [469]:
with open(annotation_data_path, mode='r') as json_fp:
    annotation_data = json.load(json_fp)

type(annotation_data)

dict

In [470]:
annotation_data.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [471]:
annotation_data['changes'].keys()

dict_keys(['CHANGES.txt', 'src/java/org/apache/ivy/util/url/ApacheURLLister.java', 'src/java/org/apache/ivy/util/url/BasicURLHandler.java', 'src/java/org/apache/ivy/util/url/HttpClientHandler.java', 'src/java/org/apache/ivy/util/url/URLHandler.java'])

In [472]:
annotation_data['changes']['src/java/org/apache/ivy/util/url/ApacheURLLister.java']

{'language': 'Java',
 'type': 'programming',
 'purpose': 'programming',
 '+': [{'id': 3,
   'hunk_idx': 0,
   'in_hunk_chg_idx': 0,
   'file_line_no': 22,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[910, ['Keyword', 'Namespace'], 'import'],
    [916, ['Text', 'Whitespace'], ' '],
    [917, ['Name', 'Namespace'], 'java.io.InputStream'],
    [936, ['Punctuation'], ';'],
    [937, ['Text', 'Whitespace'], '\n']]},
  {'id': 5,
   'hunk_idx': 1,
   'in_hunk_chg_idx': 0,
   'file_line_no': 110,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[4168, ['Text', 'Whitespace'], '        '],
    [4176, ['Name'], 'URLHandler'],
    [4186, ['Text', 'Whitespace'], ' '],
    [4187, ['Name'], 'urlHandler'],
    [4197, ['Text', 'Whitespace'], ' '],
    [4198, ['Operator'], '='],
    [4199, ['Text', 'Whitespace'], ' '],
    [4200, ['Name'], 'URLHandlerRegistry'],
    [4218, ['Punctuation'], '.'],
    [4219, ['Name', 'Attribute'], 'getDefault'],
    [4229, ['Punctuation'], '

If there is no `'hunk_idx'` and `'in_hunk_chg_idx'` fields, you need to upgrade PatchScope, and re-run cells in this subsection

## DataFrame from SmartSHARK

In [473]:
[len(lines) for user_data in smartshark_data[0]['lines_manual'].values() for lines in user_data.values()]

[27, 16, 2, 9, 27, 27]

In [474]:
[max(lines) for user_data in smartshark_data[0]['lines_manual'].values() for lines in user_data.values()]

[26, 17, 10, 26, 26, 26]

In [475]:
max([max(lines) for user_data in smartshark_data[0]['lines_manual'].values() for lines in user_data.values()])+1

27

In [476]:
16+2+9

27

In [477]:
int('5a82f5ab912063217d88943a', base=16)

28011971385109964975598244922

In [478]:
def extract_smartshark_records(smartshark_data: list) -> list:
    smartshark_records = []

    sha = ''
    file_name = ''
    hunk_idx = 0
    per_file_hunk_no = Counter()
    for entry in smartshark_data:
        per_file_hunk_no[f"{entry['project']}:{entry['revision_hash']}:{entry['file']}"] += 1
        if sha != entry['revision_hash']:  # different revisions can change the same file
            sha = entry['revision_hash']
            file_name = entry['file']
            hunk_idx = 0
        elif file_name != entry['file']:
            file_name = entry['file']
            hunk_idx = 0
        else:
            hunk_idx += 1
        
        entry_data = {
            'project': entry['project'],
            'issue': entry['issue_id'],
            'sha': entry['revision_hash'],
            'file': entry['file'],
            'hunk_idx': hunk_idx,  # not sure (?)
            'hunk_idx_hash': per_file_hunk_no[f"{entry['project']}:{entry['revision_hash']}:{entry['file']}"] - 1,
            'hunk_oid': entry['hunk_id']['$oid'],
            'hunk_oid_num': int(entry['hunk_id']['$oid'], base=16),
            'n_lines_in_hunk': max([
                max(lines) 
                for user_data in entry['lines_manual'].values()
                for lines in user_data.values()
            ])+1,
        }

        lines_info = defaultdict(list)
        for user, user_data in entry['lines_manual'].items():
            for line_type, lines_list in user_data.items():
                for line_no in lines_list:
                    lines_info[line_no].append((user, line_type))
        #print(f"{entry_data=}, {lines_info=}")

        for i, line_ann in lines_info.items():
            line_data = {
                **entry_data,
                'in_hunk_idx': i,
                **{
                    f"user_{user}": line_type  # NOTE: alternative would be to use hierarchical index for columns
                    for user, line_type in line_ann
                }
            }

            smartshark_records.append(line_data)

    return smartshark_records
        

In [479]:
extract_smartshark_records(smartshark_data[:2])[:2]

[{'project': 'ant-ivy',
  'issue': 'IVY-1300',
  'sha': '406704aae851f4a5f09d42031ddfaa1756639db4',
  'file': 'test/repositories/IVY-1300/ivysettings.xml',
  'hunk_idx': 0,
  'hunk_idx_hash': 0,
  'hunk_oid': '5a82f5aa912063217b88940a',
  'hunk_oid_num': 28011971366663220901855138826,
  'n_lines_in_hunk': 27,
  'in_hunk_idx': 0,
  'user_KKC': 'test',
  'user_aaghamohammadi': 'documentation',
  'user_riruk': 'test',
  'user_davisjam': 'test'},
 {'project': 'ant-ivy',
  'issue': 'IVY-1300',
  'sha': '406704aae851f4a5f09d42031ddfaa1756639db4',
  'file': 'test/repositories/IVY-1300/ivysettings.xml',
  'hunk_idx': 0,
  'hunk_idx_hash': 0,
  'hunk_oid': '5a82f5aa912063217b88940a',
  'hunk_oid_num': 28011971366663220901855138826,
  'n_lines_in_hunk': 27,
  'in_hunk_idx': 1,
  'user_KKC': 'test',
  'user_aaghamohammadi': 'documentation',
  'user_riruk': 'test',
  'user_davisjam': 'test'}]

In [480]:
extract_smartshark_records(smartshark_data[:2])[-2:]

[{'project': 'ant-ivy',
  'issue': 'IVY-1300',
  'sha': '406704aae851f4a5f09d42031ddfaa1756639db4',
  'file': 'test/repositories/IVY-1300/ivysettings.xml',
  'hunk_idx': 0,
  'hunk_idx_hash': 0,
  'hunk_oid': '5a82f5aa912063217b88940a',
  'hunk_oid_num': 28011971366663220901855138826,
  'n_lines_in_hunk': 27,
  'in_hunk_idx': 26,
  'user_KKC': 'test',
  'user_aaghamohammadi': 'test',
  'user_riruk': 'test',
  'user_davisjam': 'test'},
 {'project': 'ant-ivy',
  'issue': 'IVY-1356',
  'sha': 'de976b00e98b730a3a532c0378e9eb4177d386a6',
  'file': 'CHANGES.txt',
  'hunk_idx': 0,
  'hunk_idx_hash': 0,
  'hunk_oid': '5a82f5aa912063217a8893ef',
  'hunk_oid_num': 28011971366663220901838361583,
  'n_lines_in_hunk': 1,
  'in_hunk_idx': 0,
  'user_KKC': 'documentation',
  'user_kaka727': 'documentation',
  'user_aaghamohammadi': 'documentation',
  'user_omalam': 'documentation'}]

In [481]:
smartshark_records = extract_smartshark_records(smartshark_data)

In [482]:
smartshark_df = pd.DataFrame.from_records(smartshark_records)
smartshark_df

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk,in_hunk_idx,...,user_ap,user_Apollo,user_vAvXBcvvrFJwUfHc,user_deepblue,user_atuz,user_JG,user_yyh,user_LuCH,user_LeBron,user_Mohammad
0,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,0,...,,,,,,,,,,
1,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,1,...,,,,,,,,,,
2,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,2,...,,,,,,,,,,
3,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,3,...,,,,,,,,,,
4,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290800,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/XMLUtils.java,0,0,5caeed464dd2d95741417447,28684094493913000519472870471,2,1,...,,,,,,,,,,
290801,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/resolver/imp...,0,0,5caeed464dd2d9574141744a,28684094493913000519472870474,2,0,...,,,,,,,,,,
290802,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/resolver/imp...,0,0,5caeed464dd2d9574141744a,28684094493913000519472870474,2,1,...,,,,,,,,,,
290803,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/resolver/imp...,1,1,5caeed464dd2d9574141744b,28684094493913000519472870475,2,0,...,,,,,,,,,,


In [483]:
smartshark_df.columns

Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk', 'in_hunk_idx',
       'user_KKC', 'user_aaghamohammadi', 'user_riruk', 'user_davisjam',
       'user_kaka727', 'user_omalam', 'user_bossenti', 'user_aserebrenik',
       'user_sherbold', 'user_LDlnwznmYPWdNVSA', 'user_atrautsch',
       'user_dvmarcilio', 'user_evidencebp', 'user_bledel', 'user_vijaybw',
       'user_rcolomo', 'user_uGAVaWPHVmMZKjBZ', 'user_melkor54248',
       'user_matin', 'user_grodrig', 'user_JjGyDTFSqlCnXIzk', 'user_Jiong',
       'user_CUE', 'user_TeeKea', 'user_NGKRdSgaxGAtmSqP', 'user_ladybug',
       'user_lancelot', 'user_simin', 'user_ealbrec', 'user_psingh',
       'user_perfwxc', 'user_nikoHu', 'user_ivanomalavolta', 'user_Pyrrhon',
       'user_ppp23', 'user_LastButNotLeast', 'user_ctreude', 'user_turtle',
       'user_Badger', 'user_azhenley', 'user_sulir', 'user_ethemutku',
       'user_joydeba', 'user_rNSKGGMSeMXtAEET', 'user_aar

In [484]:
smartshark_df[smartshark_df['hunk_idx'] != smartshark_df['hunk_idx_hash']]

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk,in_hunk_idx,...,user_ap,user_Apollo,user_vAvXBcvvrFJwUfHc,user_deepblue,user_atuz,user_JG,user_yyh,user_LuCH,user_LeBron,user_Mohammad
4391,ant-ivy,IVY-1186,7f640b0badf5106821908fcae610400849cdccc5,src/java/org/apache/ivy/core/cache/DefaultRepo...,0,3,5a82f5b2912063217d889bad,28011971514237173491565108141,2,0,...,,,,,,,,,,
4392,ant-ivy,IVY-1186,7f640b0badf5106821908fcae610400849cdccc5,src/java/org/apache/ivy/core/cache/DefaultRepo...,0,3,5a82f5b2912063217d889bad,28011971514237173491565108141,2,1,...,,,,,,,,,,
4393,ant-ivy,IVY-1186,7f640b0badf5106821908fcae610400849cdccc5,src/java/org/apache/ivy/core/cache/DefaultRepo...,1,4,5a82f5b2912063217d889bae,28011971514237173491565108142,2,0,...,,,,,,,,,,
4394,ant-ivy,IVY-1186,7f640b0badf5106821908fcae610400849cdccc5,src/java/org/apache/ivy/core/cache/DefaultRepo...,1,4,5a82f5b2912063217d889bae,28011971514237173491565108142,2,1,...,,,,,,,,,,
4395,ant-ivy,IVY-1186,7f640b0badf5106821908fcae610400849cdccc5,src/java/org/apache/ivy/core/cache/DefaultRepo...,2,5,5a82f5b2912063217d889baf,28011971514237173491565108143,2,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251019,giraph,GIRAPH-34,9dd9af5294cca4ef0e4a19ec4829294b2a5d850f,src/main/java/org/apache/giraph/graph/BasicVer...,0,5,5bf51ca9d2f8190dadf3c51f,28459458078511182297471108383,10,4,...,,,,,,,,,,
251020,giraph,GIRAPH-34,9dd9af5294cca4ef0e4a19ec4829294b2a5d850f,src/main/java/org/apache/giraph/graph/BasicVer...,0,5,5bf51ca9d2f8190dadf3c51f,28459458078511182297471108383,10,6,...,,,,,,,,,,
251021,giraph,GIRAPH-34,9dd9af5294cca4ef0e4a19ec4829294b2a5d850f,src/main/java/org/apache/giraph/graph/BasicVer...,0,5,5bf51ca9d2f8190dadf3c51f,28459458078511182297471108383,10,7,...,,,,,,,,,,
251022,giraph,GIRAPH-34,9dd9af5294cca4ef0e4a19ec4829294b2a5d850f,src/main/java/org/apache/giraph/graph/BasicVer...,0,5,5bf51ca9d2f8190dadf3c51f,28459458078511182297471108383,10,8,...,,,,,,,,,,


In [485]:
smartshark_df[smartshark_df['hunk_idx'] != smartshark_df['hunk_idx_hash']].shape[0]

3199

In [486]:
smartshark_df.shape[0]

290805

In [487]:
smartshark_df[smartshark_df['hunk_idx'] != smartshark_df['hunk_idx_hash']].shape[0]/smartshark_df.shape[0]

0.011000498615911005

In [488]:
df = smartshark_df.groupby(['project', 'sha', 'file'])['hunk_oid_num'].agg(['first', 'min'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,first,min
project,sha,file,Unnamed: 3_level_1,Unnamed: 4_level_1
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,28011971661811126081191190463,28011971661811126081191190463
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConfigure.java,28011971661811126081191190472,28011971661811126081191190472
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,28011971661811126081191190478,28011971661811126081191190478
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyExtractFromSources.java,28011971680257870154900742096,28011971680257870154900742096
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,28011971680257870154900742100,28011971680257870154900742100
...,...,...,...,...
wss4j,fec38d39c4c4980f119bb3d23bf034a47939ced3,src/main/java/org/apache/ws/security/message/DOMCallbackLookup.java,28251151403369693544709657713,28251151403369693544709657713
wss4j,fec38d39c4c4980f119bb3d23bf034a47939ced3,src/main/java/org/apache/ws/security/util/WSSecurityUtil.java,28251151403369693544709657719,28251151403369693544709657719
wss4j,feda7818be88e1b8bd8c6bf42a63da6f3ef1f9bf,src/org/apache/ws/security/util/WSSecurityUtil.java,28251151015988067996825851870,28251151015988067996825851870
wss4j,feda7818be88e1b8bd8c6bf42a63da6f3ef1f9bf,test/wssec/PackageTests.java,28251151015988067996825851874,28251151015988067996825851874


In [489]:
(df['first'] != df['min']).value_counts()

False    11232
True         8
Name: count, dtype: int64

In [490]:
hunk_idx_max = smartshark_df['hunk_idx'].max()
print(f"{hunk_idx_max=}")
smartshark_df[smartshark_df['hunk_idx'] == hunk_idx_max]

hunk_idx_max=np.int64(180)


Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk,in_hunk_idx,...,user_ap,user_Apollo,user_vAvXBcvvrFJwUfHc,user_deepblue,user_atuz,user_JG,user_yyh,user_LuCH,user_LeBron,user_Mohammad
203968,commons-jcs,JCS-153,32fb3adb49f4e729ea541f9dabe960d7aee04de0,commons-jcs-core/src/main/java/org/apache/comm...,180,180,5bee7d6f544cee7992de2fc3,28451452588284126088285532099,3,0,...,,,,,,,,,,
203969,commons-jcs,JCS-153,32fb3adb49f4e729ea541f9dabe960d7aee04de0,commons-jcs-core/src/main/java/org/apache/comm...,180,180,5bee7d6f544cee7992de2fc3,28451452588284126088285532099,3,1,...,,,,,,,,,,
203970,commons-jcs,JCS-153,32fb3adb49f4e729ea541f9dabe960d7aee04de0,commons-jcs-core/src/main/java/org/apache/comm...,180,180,5bee7d6f544cee7992de2fc3,28451452588284126088285532099,3,2,...,,,,,,,,,,


In [491]:
smartshark_df['hunk_idx_hash'].max()

np.int64(196)

In [492]:
smartshark_users_cols = [col for col in smartshark_df.columns if col.startswith('user_')]
smartshark_users = [col[5:] for col in smartshark_df.columns if col.startswith('user_')]
print(f"there were {len(smartshark_users)} SmartSHARK dataset annotators (users)")
smartshark_users

there were 64 SmartSHARK dataset annotators (users)


['KKC',
 'aaghamohammadi',
 'riruk',
 'davisjam',
 'kaka727',
 'omalam',
 'bossenti',
 'aserebrenik',
 'sherbold',
 'LDlnwznmYPWdNVSA',
 'atrautsch',
 'dvmarcilio',
 'evidencebp',
 'bledel',
 'vijaybw',
 'rcolomo',
 'uGAVaWPHVmMZKjBZ',
 'melkor54248',
 'matin',
 'grodrig',
 'JjGyDTFSqlCnXIzk',
 'Jiong',
 'CUE',
 'TeeKea',
 'NGKRdSgaxGAtmSqP',
 'ladybug',
 'lancelot',
 'simin',
 'ealbrec',
 'psingh',
 'perfwxc',
 'nikoHu',
 'ivanomalavolta',
 'Pyrrhon',
 'ppp23',
 'LastButNotLeast',
 'ctreude',
 'turtle',
 'Badger',
 'azhenley',
 'sulir',
 'ethemutku',
 'joydeba',
 'rNSKGGMSeMXtAEET',
 'aarvQeMWXRSETYJu',
 'matt',
 'turhanb',
 'tAkmSjmapQPEZrVe',
 'ekrisza',
 'danielstrueber',
 'Ruuku',
 'erbel',
 'Vale',
 'kentu713',
 'ap',
 'Apollo',
 'vAvXBcvvrFJwUfHc',
 'deepblue',
 'atuz',
 'JG',
 'yyh',
 'LuCH',
 'LeBron',
 'Mohammad']

In [493]:
uniq_annotations = pd.unique(smartshark_df[smartshark_users_cols].values.flatten())
uniq_annotations = uniq_annotations[~pd.isnull(uniq_annotations)]
sorted(uniq_annotations.tolist())

['bugfix', 'documentation', 'refactoring', 'test', 'unrelated', 'whitespace']

SmartSHARK / Herbold used the following pre-processing steps:

> - Heuristic pre-labeling of lines as "documentation" changes using regular expressions.
> - Heuristic pre-labeling of lines with only "whitespace" changes.
> - Heuristic pre-labeling of lines as "refactoring" by automatically marking changed lines as refactorings, in case they were detected as refactorings by the RefactoringMiner 1.0 ([Tsantalis et al. 2018](https://doi.org/10.1145/3180155.3180206)).

- "bugfix" - contributes to the bug fix;
- "whitespace" - only changes to whitespaces;
- "documentation" - documentation change;
- "refactoring" - refactoring;
- "test" - change to tests; and
- "unrelated" - unrelated improvement not required for the bug fix.

### Consensus in SmartSHARK

In [494]:
users_ids = smartshark_users

In [495]:
def consensus_value(row):
    ret = Counter()
    val = None
    for u in users_ids:
        u_val = row['user_' + u]
        if not pd.isnull(u_val):
            ret[u_val] += 1
            val = u_val
            
    #if sum(ret.values()) < 3: # Check if all 3 reviewers annotated line
    #    print(f"Error at {row['id']}: {sum(ret.values())} < 3 reviewers")

    return Counter(ret).most_common(1)[0][0]

In [496]:
def n_reviewers(row):
    ret = 0
    for u in users_ids:
        u_val = row['user_' + u]
        if not pd.isnull(u_val):
            ret += 1

    return ret

In [497]:
def most_common_count(row):
    ret = Counter()
    val = None
    for u in users_ids:
        u_val = row['user_' + u]
        if not pd.isnull(u_val):
            ret[u_val] += 1
            val = u_val
            
    #if sum(ret.values()) < 3: # Check if all 3 reviewers annotated line
    #    print(f"Error at {row['id']}: {ret.values()} < 3 reviewers")

    return Counter(ret).most_common(1)[0][1]

In [498]:
%%time

smartshark_df['most_common'] = pd.DataFrame(smartshark_df.apply(consensus_value, axis=1))
smartshark_df['n_reviewers'] = pd.DataFrame(smartshark_df.apply(n_reviewers, axis=1))
smartshark_df['common_count'] = pd.DataFrame(smartshark_df.apply(most_common_count, axis=1))

smartshark_df.columns

CPU times: user 1min 34s, sys: 252 ms, total: 1min 34s
Wall time: 1min 34s


Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk', 'in_hunk_idx',
       'user_KKC', 'user_aaghamohammadi', 'user_riruk', 'user_davisjam',
       'user_kaka727', 'user_omalam', 'user_bossenti', 'user_aserebrenik',
       'user_sherbold', 'user_LDlnwznmYPWdNVSA', 'user_atrautsch',
       'user_dvmarcilio', 'user_evidencebp', 'user_bledel', 'user_vijaybw',
       'user_rcolomo', 'user_uGAVaWPHVmMZKjBZ', 'user_melkor54248',
       'user_matin', 'user_grodrig', 'user_JjGyDTFSqlCnXIzk', 'user_Jiong',
       'user_CUE', 'user_TeeKea', 'user_NGKRdSgaxGAtmSqP', 'user_ladybug',
       'user_lancelot', 'user_simin', 'user_ealbrec', 'user_psingh',
       'user_perfwxc', 'user_nikoHu', 'user_ivanomalavolta', 'user_Pyrrhon',
       'user_ppp23', 'user_LastButNotLeast', 'user_ctreude', 'user_turtle',
       'user_Badger', 'user_azhenley', 'user_sulir', 'user_ethemutku',
       'user_joydeba', 'user_rNSKGGMSeMXtAEET', 'user_aar

> Each commit was shown to four participants. Consensus is achieved if at least three participants agree on the same label. If this is not the case, no consensus for the line is achieved, i.e., the participants could not clearly identify which type of change a line is.

In [499]:
smartshark_df['n_reviewers'].value_counts()

n_reviewers
4     290572
43       219
64        14
Name: count, dtype: int64

In [500]:
smartshark_df['common_count'].value_counts()

common_count
4     172544
3      74589
2      42046
1       1393
40        90
39        40
41        31
42        31
58         7
38         5
26         5
36         4
43         4
57         3
61         2
31         2
37         2
33         1
54         1
52         1
22         1
32         1
27         1
30         1
Name: count, dtype: int64

In [501]:
smartshark_df['has_consensus'] = smartshark_df['common_count'] >= 3
smartshark_df['has_consensus'].value_counts()

has_consensus
True     247366
False     43439
Name: count, dtype: int64

In [502]:
smartshark_df_sel = smartshark_df[[
    'project', 'issue', 'sha', 'file', 'hunk_idx', 'n_lines_in_hunk', 'in_hunk_idx',
    'most_common', 'n_reviewers', 'common_count', 'has_consensus',
]]
smartshark_df_sel

Unnamed: 0,project,issue,sha,file,hunk_idx,n_lines_in_hunk,in_hunk_idx,most_common,n_reviewers,common_count,has_consensus
0,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,27,0,test,4,3,True
1,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,27,1,test,4,3,True
2,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,27,2,test,4,3,True
3,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,27,3,test,4,3,True
4,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,27,4,test,4,3,True
...,...,...,...,...,...,...,...,...,...,...,...
290800,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/XMLUtils.java,0,2,1,bugfix,4,3,True
290801,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/resolver/imp...,0,2,0,bugfix,4,3,True
290802,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/resolver/imp...,0,2,1,bugfix,4,3,True
290803,santuario-java,SANTUARIO-262,b7739627e8f7cd33f58c10e2df30d1dc1d311f5c,src/org/apache/xml/security/utils/resolver/imp...,1,2,0,bugfix,4,4,True


## DataFrame from diff-annotator on SmartSHARK repos

First step is to generate annotation data

### Commits for repo

In [503]:
smartshark_commits_df = smartshark_df[['project', 'issue', 'sha']].drop_duplicates()
smartshark_commits_df

Unnamed: 0,project,issue,sha
0,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4
27,ant-ivy,IVY-1356,de976b00e98b730a3a532c0378e9eb4177d386a6
224,ant-ivy,IVY-1357,e8d6dde98a14744219dcb19eb71e8e9214c210e6
225,ant-ivy,IVY-1343,b9059448ca6f9e787b0c87dd47000d46f2c0f605
357,ant-ivy,IVY-1060,441accf45b674608d624d6d79d919de2c3ff08e2
...,...,...,...
290581,santuario-java,SANTUARIO-266,eab37d86220586067c468d2b31540ecd71ca275a
290584,santuario-java,SANTUARIO-253,ee49598cda495738e3020be37c606dfcd3763163
290585,santuario-java,SANTUARIO-253,3752cfbf7fb39193ed2bce69d4b7889548ee4c62
290608,santuario-java,SANTUARIO-263,53e9483cc70cd351e3de884f3e6da6cb34233dfb


In [504]:
smartshark_projects = smartshark_commits_df['project'].unique()
smartshark_projects

array(['ant-ivy', 'commons-math', 'opennlp', 'parquet-mr', 'wss4j',
       'archiva', 'deltaspike', 'systemml', 'commons-lang', 'commons-net',
       'commons-collections', 'commons-beanutils', 'commons-codec',
       'commons-compress', 'commons-configuration', 'commons-digester',
       'commons-jcs', 'commons-imaging', 'commons-io', 'commons-scxml',
       'commons-validator', 'commons-vfs', 'giraph', 'jspwiki', 'eagle',
       'commons-bcel', 'commons-dbcp', 'gora', 'santuario-java'],
      dtype=object)

In [1151]:
len(smartshark_projects)

29

In [505]:
sorted(smartshark_projects)

['ant-ivy',
 'archiva',
 'commons-bcel',
 'commons-beanutils',
 'commons-codec',
 'commons-collections',
 'commons-compress',
 'commons-configuration',
 'commons-dbcp',
 'commons-digester',
 'commons-imaging',
 'commons-io',
 'commons-jcs',
 'commons-lang',
 'commons-math',
 'commons-net',
 'commons-scxml',
 'commons-validator',
 'commons-vfs',
 'deltaspike',
 'eagle',
 'giraph',
 'gora',
 'jspwiki',
 'opennlp',
 'parquet-mr',
 'santuario-java',
 'systemml',
 'wss4j']

In [506]:
%ls -1 /mnt/data/msr/smartshark_repositories

[0m[01;34mant-ivy[0m/
[01;34marchiva[0m/
[01;34mcommons-bcel[0m/
[01;34mcommons-beanutils[0m/
[01;34mcommons-codec[0m/
[01;34mcommons-collections[0m/
[01;34mcommons-compress[0m/
[01;34mcommons-configuration[0m/
[01;34mcommons-dbcp[0m/
[01;34mcommons-digester[0m/
[01;34mcommons-imaging[0m/
[01;34mcommons-io[0m/
[01;34mcommons-jcs[0m/
[01;34mcommons-lang[0m/
[01;34mcommons-math[0m/
[01;34mcommons-net[0m/
[01;34mcommons-scxml[0m/
[01;34mcommons-validator[0m/
[01;34mcommons-vfs[0m/
[01;34mdeltaspike[0m/
[01;34meagle[0m/
[01;34mgiraph[0m/
[01;34mgora[0m/
[01;34mjspwiki[0m/
[01;34mopennlp[0m/
[01;34mparquet-java[0m/
[01;34mparquet-mr[0m/
[01;34mreplication-kit-2020-line-validation[0m/
repositories.txt
[01;34msantuario-java[0m/
[01;34msantuario-xml-security-java[0m/
[01;34msystemds[0m/
[01;34msystemml[0m/
[01;34mwss4j[0m/
[01;34mws-wss4j[0m/


In [507]:
dir_list = [path.name for path in Path('/mnt/data/msr/smartshark_repositories').iterdir() if path.is_dir()]
set(smartshark_projects) - set(dir_list)

set()

On previous runs, the following projects were missing:
- commons-imaging: https://github.com/apache/commons-imaging.git
- parquet-mr: https://github.com/apache/parquet-mr.git -> https://github.com/apache/parquet-java.git
- santuario-java: https://github.com/apache/santuario-java.git (archived)
- systemml: https://github.com/apache/systemml.git -> https://github.com/apache/systemds
- wss4j: https://github.com/apache/wss4j.git -> 404 (repository **no longer exists** at that URL) -> https://gitbox.apache.org/repos/asf/ws-wss4j.git

In [508]:
smartshark_projects_shas = {}
for project in smartshark_projects:
    print(f"{project} has ", end="")

    smartshark_projects_shas[project] = smartshark_commits_df[smartshark_commits_df['project'] == project]['sha'].unique()
    print(f"{len(smartshark_projects_shas[project])} commits")

ant-ivy has 548 commits
commons-math has 391 commits
opennlp has 150 commits
parquet-mr has 119 commits
wss4j has 245 commits
archiva has 5 commits
deltaspike has 8 commits
systemml has 6 commits
commons-lang has 225 commits
commons-net has 176 commits
commons-collections has 92 commits
commons-beanutils has 61 commits
commons-codec has 58 commits
commons-compress has 204 commits
commons-configuration has 253 commits
commons-digester has 26 commits
commons-jcs has 73 commits
commons-imaging has 20 commits
commons-io has 116 commits
commons-scxml has 67 commits
commons-validator has 75 commits
commons-vfs has 118 commits
giraph has 146 commits
jspwiki has 1 commits
eagle has 2 commits
commons-bcel has 52 commits
commons-dbcp has 89 commits
gora has 98 commits
santuario-java has 95 commits


In [1152]:
[len(shas) for shas in smartshark_projects_shas.values()]

[548,
 391,
 150,
 119,
 245,
 5,
 8,
 6,
 225,
 176,
 92,
 61,
 58,
 204,
 253,
 26,
 73,
 20,
 116,
 67,
 75,
 118,
 146,
 1,
 2,
 52,
 89,
 98,
 95]

In [1153]:
sum([len(shas) for shas in smartshark_projects_shas.values()])

3519

### Generate script for running diff-annotate

In [509]:
sorted(uniq_annotations.tolist())

['bugfix', 'documentation', 'refactoring', 'test', 'unrelated', 'whitespace']

In [510]:
smartshark_df[smartshark_df['has_consensus']]['most_common'].value_counts()

most_common
test             115073
bugfix            73068
documentation     40650
whitespace        11937
refactoring        5297
unrelated          1341
Name: count, dtype: int64

In [511]:
smartshark_df[smartshark_df['has_consensus']]['most_common'].value_counts()/smartshark_df['has_consensus'].sum()

most_common
test             0.465193
bugfix           0.295384
documentation    0.164331
whitespace       0.048256
refactoring      0.021414
unrelated        0.005421
Name: count, dtype: float64

We need to decide on how `diff-annotate` is to annotate lines, what possible line types we can assign.

PatchScope does not implement refactoring detection - it is out of scope of the project (for now?).  This means that it would not assign the "refactoring" label (5297 = 2.14%).  Similarly, we would not be able to assign the "unrelated" label (1341 = 0.54%).

Currently PatchScope cannot detect whitespace-only **_changes_** (11937 = 4.83%), as it currently does not do any matching between removed and added lines.  We can however assign "whitespace" label to whitespace-only _lines_.

The paper does not say whether documentation in test should be labelled "documentation" or "test".  We can try either way.

Most probably comments in the code should be considered "documentation", and changes that are not comments to a file in programming language should be considered "bugfix"

Now the goal is to write line callback code for SmartSHARK-compatibile labelling

In [512]:
%%writefile '../../smartshark_line_callback.py'
def line_callback(file_data, tokens):
    # NOTE: function definition *must* currently be first line of the file

    line_type = "bugfix"

    if file_data['type'] != "programming":
        if file_data['purpose'] not in ["documentation", "test"]:
            line_type = "bugfix"  # or "unrelated"
    else:
        # For programming languages
        if line_is_whitespace(tokens):
            line_type = "whitespace"
        elif line_is_comment(tokens):
            line_type = "documentation"  # or "test", for test files
        elif file_data['purpose'] == "test":
            line_type = "test"
        else:
            line_type = "bugfix"

    return line_type


Overwriting ../../smartshark_line_callback.py


Next step is generate the script that would run `diff-annotate from-repo` on SmartSHARK repos for SmartSHARK commits

In [740]:
run_smartshark_script = '../../run_annotation_smartshark_repos.sh'
smartshark_callback_file_relative = 'smartshark_line_callback.py'  # relative to script workdir
smartshark_repos_dir = '/mnt/data/msr/smartshark_repositories'  # NOTE: this is value for this local computer (!!!)
smartshark_annotations_dir = '/mnt/data/python-diff-annotator/example_annotations/smartshark'  # NOTE: configure this

with open(run_smartshark_script, 'wt') as fp:
    print('#!/usr/bin/sh', file=fp)
    print('', file=fp)
    print(f'CALLBACK_FILE="{smartshark_callback_file_relative}"', file=fp)
    print(f'REPOS_DIR="{smartshark_repos_dir}"', file=fp)
    print(f'ANNOTATIONS_DIR="{smartshark_annotations_dir}"', file=fp)
    print('', file=fp)
    print('if [ ! -f "$CALLBACK_FILE" ]; then', file=fp)
    print('    echo "Could not find file $CALLBACK_FILE"', file=fp)
    print('    echo "You are in directory $PWD"', file=fp)
    print('    echo "Change directory to the top dir of this repo $(git rev-parse --show-toplevel 2>/dev/null)"', file=fp)
    print('    exit 1', file=fp)
    print('fi', file=fp)
    print('if [ ! -d "$REPOS_DIR" ]; then', file=fp)
    print('    echo "Could not find directory $REPOS_DIR"', file=fp)
    print('    exit 2', file=fp)
    print('fi', file=fp)
    print('', file=fp)
    print('echo "running annotations on SmartSHARK repos for SmartSHARK buggy commits, v2"', file=fp)
    print('echo "trying to generate the same line annotations as used by SmartSHARK dataset"', file=fp)
    print('echo "saving annotations to $ANNOTATIONS_DIR"', file=fp)
    print('', file=fp)

Path(run_smartshark_script).chmod(0o755)  # 0755/-rwxr-xr-x

One thing to consider is whether to use `--use-fanout` option, or not.

In [741]:
for repo_name, repo_shas in smartshark_projects_shas.items():
    print(f"{repo_name:20s}", end='')
    cmd_str = ''.join([
        'diff-annotate ',
        '--ext-to-language=".pom:Maven POM" ',
        '--ext-to-language=".plist:XML Property List" ',
        '--ext-to-language=".jar:Java Archive" ',
        '--pattern-to-purpose="*.jar:other" ',
        '--filename-to-language=RELEASE_NOTES:Text ',
        # OLD VERSION
        #f'--line-callback="$CALLBACK_FILE" ',
        # NEW VERSION
        '--purpose-to-annotation=test ',
        '--purpose-to-annotation=documentation ',
        'from-repo ',
        f'--output-dir="$ANNOTATIONS_DIR/{repo_name}" ',
        f'"$REPOS_DIR/{repo_name}" --no-walk=sorted {" ".join(repo_shas)}',
    ])
    print("\targ_length <=", len(cmd_str))
    
    with open(run_smartshark_script, 'at') as fp:
        print(f"# {repo_name}", file=fp)
        print(cmd_str, file=fp)

ant-ivy             	arg_length <= 22831
commons-math        	arg_length <= 16404
opennlp             	arg_length <= 6513
parquet-mr          	arg_length <= 5248
wss4j               	arg_length <= 10404
archiva             	arg_length <= 568
deltaspike          	arg_length <= 697
systemml            	arg_length <= 611
commons-lang        	arg_length <= 9598
commons-net         	arg_length <= 7587
commons-collections 	arg_length <= 4159
commons-beanutils   	arg_length <= 2884
commons-codec       	arg_length <= 2753
commons-compress    	arg_length <= 8745
commons-configuration	arg_length <= 10764
commons-digester    	arg_length <= 1447
commons-jcs         	arg_length <= 3364
commons-imaging     	arg_length <= 1199
commons-io          	arg_length <= 5125
commons-scxml       	arg_length <= 3122
commons-validator   	arg_length <= 3458
commons-vfs         	arg_length <= 5209
giraph              	arg_length <= 6347
jspwiki             	arg_length <= 404
eagle               	arg_length <= 441


### Read annotations, create DataFrame

If needed, change directory to top directory of the PatchScope repository (this repository).

Run the just generated script, `./run_annotation_smartshark_repos.sh`, or with
```console
/usr/bin/time --verbose --append --output=time.run_smartshark.log ./run_annotation_smartshark_repos.sh
```

In [742]:
smartshark_annotations_dir

'/mnt/data/python-diff-annotator/example_annotations/smartshark'

In [743]:
smartshark_df_sel.columns

Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk', 'in_hunk_idx',
       'most_common', 'n_reviewers', 'common_count', 'has_consensus'],
      dtype='object')

In [744]:
annotation_data['changes']['src/java/org/apache/ivy/util/url/ApacheURLLister.java']

{'language': 'Java',
 'type': 'programming',
 'purpose': 'programming',
 '+': [{'id': 3,
   'hunk_idx': 0,
   'in_hunk_chg_idx': 0,
   'file_line_no': 22,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[910, ['Keyword', 'Namespace'], 'import'],
    [916, ['Text', 'Whitespace'], ' '],
    [917, ['Name', 'Namespace'], 'java.io.InputStream'],
    [936, ['Punctuation'], ';'],
    [937, ['Text', 'Whitespace'], '\n']]},
  {'id': 5,
   'hunk_idx': 1,
   'in_hunk_chg_idx': 0,
   'file_line_no': 110,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[4168, ['Text', 'Whitespace'], '        '],
    [4176, ['Name'], 'URLHandler'],
    [4186, ['Text', 'Whitespace'], ' '],
    [4187, ['Name'], 'urlHandler'],
    [4197, ['Text', 'Whitespace'], ' '],
    [4198, ['Operator'], '='],
    [4199, ['Text', 'Whitespace'], ' '],
    [4200, ['Name'], 'URLHandlerRegistry'],
    [4218, ['Punctuation'], '.'],
    [4219, ['Name', 'Attribute'], 'getDefault'],
    [4229, ['Punctuation'], '

In [1037]:
%%time

records_from_repos = []

for subdir in sorted(Path(smartshark_annotations_dir).iterdir()):
    print(f"{subdir.name:22s}", end='')
    repo = subdir.name
    
    n_commits = 0
    n_files = 0
    n_lines = 0

    for json_file in subdir.glob('*.json'):
        sha = json_file.name.split('.', maxsplit=1)[0]
        n_commits += 1

        with open(json_file, mode='r') as json_fp:
            json_data = json.load(json_fp)

        for patched_file, file_data in json_data['changes'].items():
            if patched_file == '/dev/null':
                continue

            n_files += 1

            n_lines_in_file = 0
            for pm in list("-+"):
                if pm in file_data:
                    n_lines_in_file += len(file_data[pm])

            hunk_max_line = Counter()
            for pm in list("-+"):
                if pm not in file_data:
                    continue
                hunk_max_line[line_data['hunk_idx']] = max(
                    hunk_max_line[line_data['hunk_idx']],
                    line_data['in_hunk_chg_idx']
                )
            
            for pm in list("-+"):
                if pm not in file_data:
                    continue

                for line_data in file_data[pm]:
                    n_lines += 1

                    # adjusted for better fit
                    if line_data['type'] == 'code':
                        annotation = 'bugfix'  # just a name for code change in bugfix
                    elif line_data['type'] == 'data':
                        # we lost information abou whether it is in documentation directory, or test directory
                        annotation = 'documentation'  # most common case
                    else:
                        annotation = line_data['type']
                    
                    records_from_repos.append({
                        'project': repo,
                        'sha': sha,
                        'file': patched_file,
                        'pm': pm,
                        'line_id': line_data['id'],
                        'file_line': line_data['file_line_no'],
                        'hunk_idx': line_data['hunk_idx'],
                        'in_hunk_idx': line_data['in_hunk_chg_idx'],
                        # This change is generic enough that could stay as is
                        #'annotation': 'bugfix' if line_data['type'] == 'code' else line_data['type'],
                        'annotation': annotation,
                        'n_lines_in_file': n_lines_in_file,
                        'n_lines_in_hunk': hunk_max_line[line_data['hunk_idx']]+1,
                        'purpose': line_data['purpose'],
                    })

    print(f" {n_commits} commits, {n_files} changed files, {n_lines} changed lines")

ant-ivy                548 commits, 2766 changed files, 43348 changed lines
archiva                5 commits, 62 changed files, 1941 changed lines
commons-bcel           52 commits, 185 changed files, 4472 changed lines
commons-beanutils      61 commits, 131 changed files, 4746 changed lines
commons-codec          58 commits, 167 changed files, 5145 changed lines
commons-collections    92 commits, 283 changed files, 6412 changed lines
commons-compress       204 commits, 523 changed files, 8806 changed lines
commons-configuration  253 commits, 853 changed files, 22563 changed lines
commons-dbcp           89 commits, 268 changed files, 6811 changed lines
commons-digester       26 commits, 76 changed files, 1400 changed lines
commons-imaging        20 commits, 58 changed files, 1024 changed lines
commons-io             116 commits, 232 changed files, 5226 changed lines
commons-jcs            73 commits, 451 changed files, 15640 changed lines
commons-lang           225 commits, 495 changed

In [1038]:
for project in sorted(smartshark_projects):
    print(f"{project:22s} {len(smartshark_projects_shas[project])} commits")

ant-ivy                548 commits
archiva                5 commits
commons-bcel           52 commits
commons-beanutils      61 commits
commons-codec          58 commits
commons-collections    92 commits
commons-compress       204 commits
commons-configuration  253 commits
commons-dbcp           89 commits
commons-digester       26 commits
commons-imaging        20 commits
commons-io             116 commits
commons-jcs            73 commits
commons-lang           225 commits
commons-math           391 commits
commons-net            176 commits
commons-scxml          67 commits
commons-validator      75 commits
commons-vfs            118 commits
deltaspike             8 commits
eagle                  2 commits
giraph                 146 commits
gora                   98 commits
jspwiki                1 commits
opennlp                150 commits
parquet-mr             119 commits
santuario-java         95 commits
systemml               6 commits
wss4j                  245 commits


In [1039]:
projects_to_drop = []

for project in sorted(smartshark_projects):
    n_commits_smartshark = len(smartshark_projects_shas[project])
    print(f"{project:22s} {n_commits_smartshark:3d} commits smartshark", end='')
    if Path(smartshark_annotations_dir).joinpath(project).is_dir():
        n_commits_from_repo = len(list(Path(smartshark_annotations_dir).joinpath(project).glob('*.json')))
        print(f" {n_commits_from_repo:3d} commits from-repo", end='')
        if n_commits_smartshark != n_commits_from_repo:
            projects_to_drop.append(project)
            print(" *")
        else:
            print("")
    else:
        print("")

sorted(projects_to_drop)

ant-ivy                548 commits smartshark 548 commits from-repo
archiva                  5 commits smartshark   5 commits from-repo
commons-bcel            52 commits smartshark  52 commits from-repo
commons-beanutils       61 commits smartshark  61 commits from-repo
commons-codec           58 commits smartshark  58 commits from-repo
commons-collections     92 commits smartshark  92 commits from-repo
commons-compress       204 commits smartshark 204 commits from-repo
commons-configuration  253 commits smartshark 253 commits from-repo
commons-dbcp            89 commits smartshark  89 commits from-repo
commons-digester        26 commits smartshark  26 commits from-repo
commons-imaging         20 commits smartshark  20 commits from-repo
commons-io             116 commits smartshark 116 commits from-repo
commons-jcs             73 commits smartshark  73 commits from-repo
commons-lang           225 commits smartshark 225 commits from-repo
commons-math           391 commits smartshark 39

['santuario-java', 'wss4j']

In [1040]:
records_from_repos[:2]

[{'project': 'ant-ivy',
  'sha': '535c0c34334862d47ad339a99d0933e5eba76ae0',
  'file': 'CHANGES.txt',
  'pm': '+',
  'line_id': 0,
  'file_line': 1,
  'hunk_idx': 0,
  'in_hunk_idx': 0,
  'annotation': 'documentation',
  'n_lines_in_file': 2,
  'n_lines_in_hunk': 1,
  'purpose': 'documentation'},
 {'project': 'ant-ivy',
  'sha': '535c0c34334862d47ad339a99d0933e5eba76ae0',
  'file': 'CHANGES.txt',
  'pm': '+',
  'line_id': 1,
  'file_line': 2,
  'hunk_idx': 0,
  'in_hunk_idx': 1,
  'annotation': 'documentation',
  'n_lines_in_file': 2,
  'n_lines_in_hunk': 1,
  'purpose': 'documentation'}]

In [1041]:
from_repos_df = pd.DataFrame.from_records(records_from_repos)
from_repos_df

Unnamed: 0,project,sha,file,pm,line_id,file_line,hunk_idx,in_hunk_idx,annotation,n_lines_in_file,n_lines_in_hunk,purpose
0,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,CHANGES.txt,+,0,1,0,0,documentation,2,1,documentation
1,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,CHANGES.txt,+,1,2,0,1,documentation,2,1,documentation
2,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,src/java/fr/jayasoft/ivy/ModuleId.java,-,3,55,1,0,bugfix,7,1,programming
3,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,src/java/fr/jayasoft/ivy/ModuleId.java,-,7,58,1,1,bugfix,7,1,programming
4,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,src/java/fr/jayasoft/ivy/ModuleId.java,-,10,60,1,2,bugfix,7,1,programming
...,...,...,...,...,...,...,...,...,...,...,...,...
252530,systemml,ebdf770c872429a03ca125f2d69b25db59d0fa8c,src/main/java/org/apache/sysml/runtime/matrix/...,+,25,2691,2,7,bugfix,25,1,programming
252531,systemml,ebdf770c872429a03ca125f2d69b25db59d0fa8c,src/main/java/org/apache/sysml/runtime/matrix/...,+,4,2719,3,0,bugfix,25,1,programming
252532,systemml,ebdf770c872429a03ca125f2d69b25db59d0fa8c,src/main/java/org/apache/sysml/runtime/matrix/...,+,4,2733,4,0,bugfix,25,1,programming
252533,systemml,ebdf770c872429a03ca125f2d69b25db59d0fa8c,src/main/java/org/apache/sysml/runtime/matrix/...,+,6,2735,4,1,bugfix,25,1,programming


## Compare SmartSHARK and diff-annotate

In [1042]:
print(f"{smartshark_df.shape=}, {smartshark_df_sel.shape=}")
print(f"{from_repos_df.shape=}")

smartshark_df.shape=(290805, 78), smartshark_df_sel.shape=(290805, 14)
from_repos_df.shape=(252535, 12)


### Join/merge for comparison

In [1043]:
smartshark_df.columns

Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk', 'in_hunk_idx',
       'user_KKC', 'user_aaghamohammadi', 'user_riruk', 'user_davisjam',
       'user_kaka727', 'user_omalam', 'user_bossenti', 'user_aserebrenik',
       'user_sherbold', 'user_LDlnwznmYPWdNVSA', 'user_atrautsch',
       'user_dvmarcilio', 'user_evidencebp', 'user_bledel', 'user_vijaybw',
       'user_rcolomo', 'user_uGAVaWPHVmMZKjBZ', 'user_melkor54248',
       'user_matin', 'user_grodrig', 'user_JjGyDTFSqlCnXIzk', 'user_Jiong',
       'user_CUE', 'user_TeeKea', 'user_NGKRdSgaxGAtmSqP', 'user_ladybug',
       'user_lancelot', 'user_simin', 'user_ealbrec', 'user_psingh',
       'user_perfwxc', 'user_nikoHu', 'user_ivanomalavolta', 'user_Pyrrhon',
       'user_ppp23', 'user_LastButNotLeast', 'user_ctreude', 'user_turtle',
       'user_Badger', 'user_azhenley', 'user_sulir', 'user_ethemutku',
       'user_joydeba', 'user_rNSKGGMSeMXtAEET', 'user_aar

In [1044]:
smartshark_df_sel = smartshark_df[[col for col in smartshark_df.columns if not col.startswith('user_')]]
smartshark_df_sel.columns

Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk', 'in_hunk_idx',
       'most_common', 'n_reviewers', 'common_count', 'has_consensus'],
      dtype='object')

In [1045]:
smartshark_df_sel.shape

(290805, 14)

In [1046]:
from_repos_df.columns

Index(['project', 'sha', 'file', 'pm', 'line_id', 'file_line', 'hunk_idx',
       'in_hunk_idx', 'annotation', 'n_lines_in_file', 'n_lines_in_hunk',
       'purpose'],
      dtype='object')

In [1047]:
from_repos_df.shape

(252535, 12)

In [1048]:
smartshark_vs_from_repos_df_merge = pd.merge(
    smartshark_df_sel, from_repos_df,
    how='outer', on=['project', 'sha', 'file', 'hunk_idx', 'in_hunk_idx'],
    indicator="indicator_column", suffixes=("_shark", "_repos"),
)
smartshark_vs_from_repos_df_merge.head()

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk_shark,in_hunk_idx,...,common_count,has_consensus,pm,line_id,file_line,annotation,n_lines_in_file,n_lines_in_hunk_repos,purpose,indicator_column
0,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,0,0.0,5a82f5ba912063217a889fbf,28011971661811126081191190463,4.0,0,...,2.0,False,-,3.0,112.0,bugfix,30.0,1.0,programming,both
1,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,0,0.0,5a82f5ba912063217a889fbf,28011971661811126081191190463,4.0,0,...,2.0,False,+,5.0,112.0,bugfix,30.0,1.0,programming,both
2,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,0,0.0,5a82f5ba912063217a889fbf,28011971661811126081191190463,4.0,1,...,3.0,True,-,4.0,113.0,bugfix,30.0,1.0,programming,both
3,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,0,0.0,5a82f5ba912063217a889fbf,28011971661811126081191190463,4.0,1,...,3.0,True,+,6.0,113.0,bugfix,30.0,1.0,programming,both
4,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,0,0.0,5a82f5ba912063217a889fbf,28011971661811126081191190463,4.0,2,...,2.0,False,,,,,,,,left_only


### Examining the results

In [1049]:
smartshark_vs_from_repos_df_merge['indicator_column'].value_counts()

indicator_column
both          170094
left_only     139339
right_only     83278
Name: count, dtype: int64

In [1050]:
smartshark_vs_from_repos_df_merge['indicator_column'].value_counts()/smartshark_vs_from_repos_df_merge.shape[0]

indicator_column
both          0.433128
left_only     0.354813
right_only    0.212059
Name: count, dtype: float64

In [1051]:
(smartshark_vs_from_repos_df_merge['indicator_column'] == 'both').sum()

np.int64(170094)

In [1052]:
(smartshark_vs_from_repos_df_merge['indicator_column'] == 'both').sum()/smartshark_df_sel.shape[0]

np.float64(0.5849074121834219)

In [1053]:
(smartshark_vs_from_repos_df_merge['indicator_column'] == 'both').sum()/from_repos_df.shape[0]

np.float64(0.673546241115093)

Quite a few mismatches

In [1054]:
projects_to_drop

['santuario-java', 'wss4j']

In [1055]:
smartshark_vs_from_repos_df_merge[smartshark_vs_from_repos_df_merge['indicator_column'] == 'left_only'].sample(4)

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk_shark,in_hunk_idx,...,common_count,has_consensus,pm,line_id,file_line,annotation,n_lines_in_file,n_lines_in_hunk_repos,purpose,indicator_column
391468,wss4j,WSS-126,f92e91faf3c72f0dee92a9fcf64cf79fbf844873,test/wssec/TestWSSecurityNew17.java,0,0.0,5b48cce056677a7efff890fb,28251146053813912168839024891,221.0,67,...,4.0,True,,,,,,,,left_only
358486,santuario-java,SANTUARIO-307,3c4217d48a99010f8fd0fa2d952ad6ad609fa4fe,src/main/java/org/apache/xml/security/c14n/imp...,0,0.0,5caeecfb4dd2d957414158ac,28684093110407194991256492204,441.0,196,...,3.0,True,,,,,,,,left_only
290518,giraph,GIRAPH-259,4309753f2eddfa193ae8e8ab2dd5cc6b0b16410e,src/test/java/org/apache/giraph/aggregators/Te...,2,2.0,5bf51c9ed2f8190d8af3c521,28459457875596997486078838049,2.0,1,...,4.0,True,,,,,,,,left_only
201840,commons-math,MATH-1138,31fae6431438e26d6b47b988164847048ceab314,src/main/java/org/apache/commons/math3/analysi...,13,13.0,5a8562a877d2b433dd7101e1,28014903917532289044375994849,207.0,71,...,2.0,False,,,,,,,,left_only


In [1056]:
smartshark_vs_from_repos_df_merge.loc[327611]

project                                                            opennlp
issue                                                           OPENNLP-59
sha                               2a3be6b98e0a7f19996e977eafa84da514cb8299
file                     opennlp-tools/src/main/java/opennlp/tools/util...
hunk_idx                                                                 1
hunk_idx_hash                                                          1.0
hunk_oid                                          5b27c659eee553716c77d4b7
hunk_oid_num                                 28211220688245459438980814007
n_lines_in_hunk_shark                                                  3.0
in_hunk_idx                                                              2
most_common                                                  documentation
n_reviewers                                                            4.0
common_count                                                           4.0
has_consensus            

In [1057]:
smartshark_vs_from_repos_df_merge.loc[327611]['file']

'opennlp-tools/src/main/java/opennlp/tools/util/eval/FMeasure.java'

In [1058]:
with open(Path(smartshark_annotations_dir).joinpath('opennlp', '4496b1e746451527bfb05e7ea48358c4ae509e1b.v2.json')) as fp:
    example_data = json.load(fp)

example_data['changes']['opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java'].keys()

dict_keys(['language', 'type', 'purpose', '+', '-'])

In [1059]:
max([elem['hunk_idx'] for elem in example_data['changes']['opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java']['+']])

13

In [1060]:
max([elem['hunk_idx'] for elem in example_data['changes']['opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java']['-']])

13

In [1061]:
example_data['diff_metadata']

{'n_files': 1,
 'hunk_span_src': 434,
 'hunk_span_dst': 434,
 'n_hunks': 14,
 'n_lines_added': 86,
 'n_lines_removed': 86,
 'n_lines_all': 409,
 'spread_inner': 153,
 'n_mod': 86,
 'n_groups': 77,
 'patch_size': 86,
 'groups_spread': 343,
 'hunk_spread_src': 125,
 'hunk_spread_dst': 125}

In [1062]:
smartshark_data[0]['hunk_id']['$oid']

'5a82f5aa912063217b88940a'

In [1063]:
[elem for elem in smartshark_data if elem['hunk_id']['$oid'] == '5b27c661eee553716977ddcb']

[{'lines_manual': {'KKC': {'whitespace': [0, 1, 2, 3, 5, 6, 8, 9, 11, 12]},
   'grodrig': {'unrelated': [0, 1, 2, 3], 'whitespace': [5, 6, 8, 9, 11, 12]},
   'bossenti': {'whitespace': [0, 1, 2, 3, 5, 6, 8, 9, 11, 12]},
   'evidencebp': {'unrelated': [0, 1, 2, 3],
    'whitespace': [5, 6, 8, 9, 11, 12]}},
  'file': 'opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java',
  'issue_id': 'OPENNLP-417',
  'revision_hash': '4496b1e746451527bfb05e7ea48358c4ae509e1b',
  'hunk_id': {'$oid': '5b27c661eee553716977ddcb'},
  'repository_url': 'https://github.com/apache/opennlp.git',
  'project': 'opennlp'}]

https://github.com/apache/opennlp/commit/4496b1e746451527bfb05e7ea48358c4ae509e1b.diff

In [1064]:
sha_file_matches = [
    elem for elem in smartshark_data 
     if elem['revision_hash'] == '4496b1e746451527bfb05e7ea48358c4ae509e1b' 
     and elem['file'] == 'opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java'
]
len(sha_file_matches)

55

In [1065]:
len([elem['hunk_id']['$oid'] for elem in sha_file_matches])

55

In [1066]:
len(set([elem['hunk_id']['$oid'] for elem in sha_file_matches]))

55

This means that **'hunk_id'** is not about hunk in the git diff sense, and it cannot be in the sense of group of consecutive changed lines (chunk).

In the example above: number of hunks in diff for a (single) changed file is 14, number of different hunk_id.$oid is 55, and number of change groups is 77.

#### Actual comparison: 'most_common' vs 'annotation'

In [1067]:
smartshark_vs_from_repos_df_merge['eq'] = smartshark_vs_from_repos_df_merge['most_common'] == smartshark_vs_from_repos_df_merge['annotation']
smartshark_vs_from_repos_df_merge['eq'].value_counts()

eq
False    255517
True     137194
Name: count, dtype: int64

In [1068]:
mask_1 = smartshark_vs_from_repos_df_merge['indicator_column'] == 'both'

In [1069]:
smartshark_vs_from_repos_df_merge[mask_1]['eq'].value_counts()

eq
True     137194
False     32900
Name: count, dtype: int64

In [1070]:
mask_2 = smartshark_vs_from_repos_df_merge['has_consensus']

In [1071]:
(mask_1 & mask_2).sum()/smartshark_vs_from_repos_df_merge.shape[0]

np.float64(0.37626397019691327)

In [1072]:
smartshark_vs_from_repos_df_merge[mask_1 & mask_2]['eq'].value_counts()

eq
True     128982
False     18781
Name: count, dtype: int64

In [1073]:
smartshark_vs_from_repos_df_merge[mask_1 & mask_2]['eq'].value_counts()/smartshark_vs_from_repos_df_merge[mask_1 & mask_2].shape[0]

eq
True     0.872898
False    0.127102
Name: count, dtype: float64

In [1074]:
mask_3 = smartshark_vs_from_repos_df_merge['most_common'] != 'refactoring'
mask_3.sum()

np.int64(382103)

In [1075]:
smartshark_vs_from_repos_df_merge[mask_1 & mask_2 & mask_3]['eq'].value_counts()

eq
True     128982
False     16958
Name: count, dtype: int64

In [1076]:
smartshark_vs_from_repos_df_merge[
    mask_1 & mask_2 & mask_3
]['eq'].value_counts()/smartshark_vs_from_repos_df_merge[
    mask_1 & mask_2 & mask_3
].shape[0]

eq
True     0.883802
False    0.116198
Name: count, dtype: float64

In [1077]:
mask_4 = smartshark_vs_from_repos_df_merge['most_common'] != 'unrelated'

In [1078]:
smartshark_vs_from_repos_df_merge[
    mask_1 & mask_2 & mask_3 & mask_4
]['eq'].value_counts()/smartshark_vs_from_repos_df_merge[
    mask_1 & mask_2 & mask_3 & mask_4
].shape[0]

eq
True     0.889267
False    0.110733
Name: count, dtype: float64

In [1079]:
per_repo_df = smartshark_vs_from_repos_df_merge.groupby('project')['eq'].agg(['sum', 'count'])
per_repo_df['ratio'] = per_repo_df['sum']/per_repo_df['count']
per_repo_df

Unnamed: 0_level_0,sum,count,ratio
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ant-ivy,23558,62386,0.377617
archiva,515,3483,0.147861
commons-bcel,2269,6404,0.35431
commons-beanutils,3157,6025,0.523983
commons-codec,1924,6900,0.278841
commons-collections,3646,9014,0.404482
commons-compress,6051,11577,0.522674
commons-configuration,14064,30286,0.464373
commons-dbcp,5533,7774,0.711731
commons-digester,998,1834,0.544166


In [1080]:
per_repo_df_2 = smartshark_vs_from_repos_df_merge[mask_1 & mask_2].groupby('project')['eq'].agg(['sum', 'count'])
per_repo_df_2['ratio'] = per_repo_df_2['sum']/per_repo_df_2['count']
per_repo_df_2

Unnamed: 0_level_0,sum,count,ratio
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ant-ivy,20514,23201,0.884186
archiva,355,488,0.727459
commons-bcel,2166,2572,0.842146
commons-beanutils,3121,3212,0.971669
commons-codec,1857,2983,0.622528
commons-collections,3605,4091,0.881203
commons-compress,5962,6682,0.892248
commons-configuration,13456,14467,0.930117
commons-dbcp,5466,5961,0.91696
commons-digester,989,1101,0.898274


In [1081]:
per_repo_df_2.sort_values(by='ratio')

Unnamed: 0_level_0,sum,count,ratio
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
commons-codec,1857,2983,0.622528
eagle,13,20,0.65
archiva,355,488,0.727459
systemml,83,113,0.734513
gora,3323,4404,0.754541
commons-jcs,5762,7581,0.760058
giraph,4485,5781,0.775817
deltaspike,58,71,0.816901
opennlp,4072,4885,0.833572
commons-net,3445,4114,0.837385


#### Limit to single-hunk changes

In [1082]:
smartshark_groupby_n_hunks_df = smartshark_df.groupby(['project', 'issue', 'sha', 'file'])[['hunk_idx', 'hunk_idx_hash']].max()+1
smartshark_groupby_n_hunks_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,hunk_idx,hunk_idx_hash
project,issue,sha,file,Unnamed: 4_level_1,Unnamed: 5_level_1
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,CHANGES.txt,3,3
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,doc/use/makepom.html,1,1
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,src/java/org/apache/ivy/ant/IvyMakePom.java,3,3
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,src/java/org/apache/ivy/plugins/parser/m2/PomModuleDescriptorWriter.java,6,6
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,src/java/org/apache/ivy/plugins/parser/xml/XmlModuleDescriptorWriter.java,1,1
...,...,...,...,...,...
wss4j,WSS-93,349c010f966f5e69b878d4c3d6936f0098516d09,test/components/PackageTests.java,1,1
wss4j,WSS-93,349c010f966f5e69b878d4c3d6936f0098516d09,test/components/TestReference.java,1,1
wss4j,WSS-94,a7c52da1d167bfaf2a3b465b6e9d05bf40cc4001,src/org/apache/ws/security/components/crypto/CryptoBase.java,5,5
wss4j,WSS-94,a7c52da1d167bfaf2a3b465b6e9d05bf40cc4001,src/org/apache/ws/security/handler/WSHandler.java,1,1


In [1083]:
smartshark_groupby_n_hunks_df.describe()

Unnamed: 0,hunk_idx,hunk_idx_hash
count,11240.0,11240.0
mean,2.742883,2.76379
std,4.206657,4.339657
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,3.0,3.0
max,181.0,197.0


In [1084]:
smartshark_groupby_single_hunk_df = smartshark_groupby_n_hunks_df[smartshark_groupby_n_hunks_df['hunk_idx_hash'] == 1]
smartshark_groupby_single_hunk_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,hunk_idx,hunk_idx_hash
project,issue,sha,file,Unnamed: 4_level_1,Unnamed: 5_level_1
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,doc/use/makepom.html,1,1
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,src/java/org/apache/ivy/plugins/parser/xml/XmlModuleDescriptorWriter.java,1,1
ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,test/java/org/apache/ivy/plugins/parser/m2/test-write-compile-dependencies.xml,1,1
ant-ivy,IVY-1007,ac5fc2cacdea64878aa0bb9c0c397d45caff9f63,CHANGES.txt,1,1
ant-ivy,IVY-1007,ac5fc2cacdea64878aa0bb9c0c397d45caff9f63,src/java/org/apache/ivy/plugins/repository/url/URLRepository.java,1,1
...,...,...,...,...,...
wss4j,WSS-77,78b04449a2d8a1e554d5efe2d9d41032a9ce2ca8,src/org/apache/ws/security/message/WSSecDKEncrypt.java,1,1
wss4j,WSS-89,33a22319878b598a1b04911e5d9d09fc5a7b540e,src/org/apache/ws/security/processor/SignatureProcessor.java,1,1
wss4j,WSS-93,349c010f966f5e69b878d4c3d6936f0098516d09,test/components/PackageTests.java,1,1
wss4j,WSS-93,349c010f966f5e69b878d4c3d6936f0098516d09,test/components/TestReference.java,1,1


In [1085]:
smartshark_groupby_single_hunk_df['hunk_idx'].value_counts()

hunk_idx
1    6071
Name: count, dtype: int64

In [1086]:
smartshark_groupby_single_hunk_df.index.tolist()[:3]

[('ant-ivy',
  'IVY-1005',
  '7129090417afd60dad011c1823f927c0f1a1fd37',
  'doc/use/makepom.html'),
 ('ant-ivy',
  'IVY-1005',
  '7129090417afd60dad011c1823f927c0f1a1fd37',
  'src/java/org/apache/ivy/plugins/parser/xml/XmlModuleDescriptorWriter.java'),
 ('ant-ivy',
  'IVY-1005',
  '7129090417afd60dad011c1823f927c0f1a1fd37',
  'test/java/org/apache/ivy/plugins/parser/m2/test-write-compile-dependencies.xml')]

In [1087]:
#smartshark_groupby_single_hunk_df.index.to_flat_index()
#smartshark_groupby_single_hunk_df.index.to_frame()
smartshark_groupby_single_hunk_df_reset = smartshark_groupby_single_hunk_df.reset_index()
smartshark_groupby_single_hunk_df_reset

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash
0,ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,doc/use/makepom.html,1,1
1,ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,src/java/org/apache/ivy/plugins/parser/xml/Xml...,1,1
2,ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,test/java/org/apache/ivy/plugins/parser/m2/tes...,1,1
3,ant-ivy,IVY-1007,ac5fc2cacdea64878aa0bb9c0c397d45caff9f63,CHANGES.txt,1,1
4,ant-ivy,IVY-1007,ac5fc2cacdea64878aa0bb9c0c397d45caff9f63,src/java/org/apache/ivy/plugins/repository/url...,1,1
...,...,...,...,...,...,...
6066,wss4j,WSS-77,78b04449a2d8a1e554d5efe2d9d41032a9ce2ca8,src/org/apache/ws/security/message/WSSecDKEncr...,1,1
6067,wss4j,WSS-89,33a22319878b598a1b04911e5d9d09fc5a7b540e,src/org/apache/ws/security/processor/Signature...,1,1
6068,wss4j,WSS-93,349c010f966f5e69b878d4c3d6936f0098516d09,test/components/PackageTests.java,1,1
6069,wss4j,WSS-93,349c010f966f5e69b878d4c3d6936f0098516d09,test/components/TestReference.java,1,1


In [1088]:
projects_to_drop

['santuario-java', 'wss4j']

In [1089]:
smartshark_groupby_single_hunk_subsel_df = smartshark_groupby_single_hunk_df_reset[
    (smartshark_groupby_single_hunk_df_reset['project'] != 'santuario-java') &
    (smartshark_groupby_single_hunk_df_reset['project'] != 'wss4j')
][['project', 'issue', 'sha', 'file']]
smartshark_groupby_single_hunk_subsel_df

Unnamed: 0,project,issue,sha,file
0,ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,doc/use/makepom.html
1,ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,src/java/org/apache/ivy/plugins/parser/xml/Xml...
2,ant-ivy,IVY-1005,7129090417afd60dad011c1823f927c0f1a1fd37,test/java/org/apache/ivy/plugins/parser/m2/tes...
3,ant-ivy,IVY-1007,ac5fc2cacdea64878aa0bb9c0c397d45caff9f63,CHANGES.txt
4,ant-ivy,IVY-1007,ac5fc2cacdea64878aa0bb9c0c397d45caff9f63,src/java/org/apache/ivy/plugins/repository/url...
...,...,...,...,...
5542,parquet-mr,PARQUET-952,cc8bdf1d13639d12d02170d40cc4890180bbabc5,parquet-avro/src/test/java/org/apache/parquet/...
5736,systemml,SYSTEMML-1668,d0c5c5d29d4cc299649573fed3a2e112e828412d,src/main/java/org/apache/sysml/hops/BinaryOp.java
5737,systemml,SYSTEMML-1715,723a7517ab937096135e911631c18188a634a922,src/main/java/org/apache/sysml/runtime/control...
5738,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...


In [1090]:
smartshark_single_hunk_df_sel = pd.merge(
    smartshark_df_sel, smartshark_groupby_single_hunk_subsel_df,
    how='inner', on=['project', 'issue', 'sha', 'file'],
)
smartshark_single_hunk_df_sel

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk,in_hunk_idx,most_common,n_reviewers,common_count,has_consensus
0,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,0,test,4,3,True
1,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,1,test,4,3,True
2,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,2,test,4,3,True
3,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,3,test,4,3,True
4,ant-ivy,IVY-1300,406704aae851f4a5f09d42031ddfaa1756639db4,test/repositories/IVY-1300/ivysettings.xml,0,0,5a82f5aa912063217b88940a,28011971366663220901855138826,27,4,test,4,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95387,gora,GORA-174,db3a06c85ab229c7b868755ce6d8b112c5403745,gora-core/src/test/java/org/apache/gora/mapred...,0,0,5cae09c8be4a167297ba978f,28683020200535649064706545551,12,9,test,4,4,True
95388,gora,GORA-174,db3a06c85ab229c7b868755ce6d8b112c5403745,gora-core/src/test/java/org/apache/gora/mapred...,0,0,5cae09c8be4a167297ba978f,28683020200535649064706545551,12,10,test,4,4,True
95389,gora,GORA-174,db3a06c85ab229c7b868755ce6d8b112c5403745,gora-core/src/test/java/org/apache/gora/mapred...,0,0,5cae09c8be4a167297ba978f,28683020200535649064706545551,12,11,test,4,4,True
95390,gora,GORA-210,187d7b98c60a9b7beef7f546f2aa9ba1735b623d,CHANGES.txt,0,0,5cae09c8be4a167297ba979f,28683020200535649064706545567,2,0,documentation,4,4,True


In [1091]:
smartshark_single_hunk_df_sel.shape[0]/smartshark_df_sel.shape[0]

0.328027372294149

In [1092]:
smartshark_single_hunk_df_sel[['hunk_idx', 'hunk_idx_hash']].value_counts()

hunk_idx  hunk_idx_hash
0         0                95392
Name: count, dtype: int64

------

------

Same for `diff-annotate from-repo ...` results

In [1093]:
from_repos_df.columns

Index(['project', 'sha', 'file', 'pm', 'line_id', 'file_line', 'hunk_idx',
       'in_hunk_idx', 'annotation', 'n_lines_in_file', 'n_lines_in_hunk',
       'purpose'],
      dtype='object')

In [1094]:
from_repos_groupby_n_hunks_df = (smartshark_df.groupby(['project', 'sha', 'file'])[['hunk_idx']].max()+1).rename(columns={'hunk_idx': 'n_hunks_per_file'})
from_repos_groupby_n_hunks_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_hunks_per_file
project,sha,file,Unnamed: 3_level_1
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyAntSettings.java,8
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConfigure.java,5
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,1
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyExtractFromSources.java,3
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,1
...,...,...,...
wss4j,fec38d39c4c4980f119bb3d23bf034a47939ced3,src/main/java/org/apache/ws/security/message/DOMCallbackLookup.java,4
wss4j,fec38d39c4c4980f119bb3d23bf034a47939ced3,src/main/java/org/apache/ws/security/util/WSSecurityUtil.java,1
wss4j,feda7818be88e1b8bd8c6bf42a63da6f3ef1f9bf,src/org/apache/ws/security/util/WSSecurityUtil.java,2
wss4j,feda7818be88e1b8bd8c6bf42a63da6f3ef1f9bf,test/wssec/PackageTests.java,2


In [1095]:
from_repos_groupby_n_hunks_df.describe()

Unnamed: 0,n_hunks_per_file
count,11240.0
mean,2.742883
std,4.206657
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,181.0


In [1096]:
from_repos_groupby_single_hunk_df = from_repos_groupby_n_hunks_df[from_repos_groupby_n_hunks_df['n_hunks_per_file'] == 1]
from_repos_groupby_single_hunk_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_hunks_per_file
project,sha,file,Unnamed: 3_level_1
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,1
ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,1
ant-ivy,002f8ec7aa01956b282ffd78062ee6fed6ab1eb9,CHANGES.txt,1
ant-ivy,01ecd7c66d6965ad54e0180c822e6a5ba4731c10,CHANGES.txt,1
ant-ivy,01ecd7c66d6965ad54e0180c822e6a5ba4731c10,src/java/org/apache/ivy/plugins/parser/m2/PomModuleDescriptorParser.java,1
...,...,...,...
wss4j,fcda1dd8b9f048ae34192e7b72e09aa62f4ab99d,src/org/apache/ws/security/processor/ReferenceListProcessor.java,1
wss4j,fcda1dd8b9f048ae34192e7b72e09aa62f4ab99d,src/org/apache/ws/security/processor/SignatureProcessor.java,1
wss4j,fd576a33405378bd640e4fde2fe469cd8f84ac6a,test/wssec/SOAPUtil.java,1
wss4j,fec38d39c4c4980f119bb3d23bf034a47939ced3,src/main/java/org/apache/ws/security/util/WSSecurityUtil.java,1


In [1097]:
from_repos_groupby_single_hunk_df['n_hunks_per_file'].value_counts()

n_hunks_per_file
1    6078
Name: count, dtype: int64

In [1098]:
from_repos_groupby_single_hunk_df_reset = from_repos_groupby_single_hunk_df.reset_index()
from_repos_groupby_single_hunk_df_reset

Unnamed: 0,project,sha,file,n_hunks_per_file
0,ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,1
1,ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,1
2,ant-ivy,002f8ec7aa01956b282ffd78062ee6fed6ab1eb9,CHANGES.txt,1
3,ant-ivy,01ecd7c66d6965ad54e0180c822e6a5ba4731c10,CHANGES.txt,1
4,ant-ivy,01ecd7c66d6965ad54e0180c822e6a5ba4731c10,src/java/org/apache/ivy/plugins/parser/m2/PomM...,1
...,...,...,...,...
6073,wss4j,fcda1dd8b9f048ae34192e7b72e09aa62f4ab99d,src/org/apache/ws/security/processor/Reference...,1
6074,wss4j,fcda1dd8b9f048ae34192e7b72e09aa62f4ab99d,src/org/apache/ws/security/processor/Signature...,1
6075,wss4j,fd576a33405378bd640e4fde2fe469cd8f84ac6a,test/wssec/SOAPUtil.java,1
6076,wss4j,fec38d39c4c4980f119bb3d23bf034a47939ced3,src/main/java/org/apache/ws/security/util/WSSe...,1


In [1099]:
from_repos_groupby_single_hunk_subsel_df = from_repos_groupby_single_hunk_df_reset[
    (from_repos_groupby_single_hunk_df_reset['project'] != 'santuario-java') &
    (from_repos_groupby_single_hunk_df_reset['project'] != 'wss4j')
][['project', 'sha', 'file']]
from_repos_groupby_single_hunk_subsel_df

Unnamed: 0,project,sha,file
0,ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java
1,ant-ivy,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java
2,ant-ivy,002f8ec7aa01956b282ffd78062ee6fed6ab1eb9,CHANGES.txt
3,ant-ivy,01ecd7c66d6965ad54e0180c822e6a5ba4731c10,CHANGES.txt
4,ant-ivy,01ecd7c66d6965ad54e0180c822e6a5ba4731c10,src/java/org/apache/ivy/plugins/parser/m2/PomM...
...,...,...,...
5549,parquet-mr,fc2c29df71c8455346a00b43dd1c4f118c335d2c,parquet-hive/parquet-hive-storage-handler/src/...
5743,systemml,723a7517ab937096135e911631c18188a634a922,src/main/java/org/apache/sysml/runtime/control...
5744,systemml,7c12992af95b013b9380acd3a6843ee89fae7a3e,src/main/java/org/apache/sysml/hops/OptimizerU...
5745,systemml,d0c5c5d29d4cc299649573fed3a2e112e828412d,src/main/java/org/apache/sysml/hops/BinaryOp.java


In [1100]:
from_repos_groupby_single_hunk_subsel_df.shape[0]

5554

In [1101]:
smartshark_groupby_single_hunk_subsel_df.shape[0]

5547

In [1102]:
from_repos_single_hunk_df_sel = pd.merge(
    from_repos_df, from_repos_groupby_single_hunk_subsel_df,
    how='inner', on=['project', 'sha', 'file'],
)
from_repos_single_hunk_df_sel

Unnamed: 0,project,sha,file,pm,line_id,file_line,hunk_idx,in_hunk_idx,annotation,n_lines_in_file,n_lines_in_hunk,purpose
0,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,CHANGES.txt,+,0,1,0,0,documentation,2,1,documentation
1,ant-ivy,535c0c34334862d47ad339a99d0933e5eba76ae0,CHANGES.txt,+,1,2,0,1,documentation,2,1,documentation
2,ant-ivy,0e7eb17fe72237e6e264d21d4d955a7843ea1d2b,CHANGES.txt,-,3,4,0,0,documentation,3,1,documentation
3,ant-ivy,0e7eb17fe72237e6e264d21d4d955a7843ea1d2b,CHANGES.txt,+,4,4,0,1,documentation,3,1,documentation
4,ant-ivy,0e7eb17fe72237e6e264d21d4d955a7843ea1d2b,CHANGES.txt,+,5,5,0,2,documentation,3,1,documentation
...,...,...,...,...,...,...,...,...,...,...,...,...
84010,systemml,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,+,12,198,0,7,bugfix,14,1,programming
84011,systemml,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,+,13,199,0,8,bugfix,14,1,programming
84012,systemml,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,+,17,201,0,9,bugfix,14,1,programming
84013,systemml,d0c5c5d29d4cc299649573fed3a2e112e828412d,src/main/java/org/apache/sysml/hops/BinaryOp.java,-,3,847,0,0,bugfix,2,10,programming


In [1103]:
from_repos_single_hunk_df_sel.shape

(84015, 12)

In [1104]:
smartshark_single_hunk_df_sel.shape

(95392, 14)

In [1105]:
from_repos_df.shape

(252535, 12)

In [1106]:
smartshark_df_sel.shape

(290805, 14)

#### Join/merge for comparison (single hunk)

In [1107]:
smartshark_vs_from_repos_single_hunk_df_merge = pd.merge(
    smartshark_single_hunk_df_sel, from_repos_single_hunk_df_sel,
    how='outer', on=['project', 'sha', 'file', 'hunk_idx', 'in_hunk_idx'],
    indicator="indicator_column", suffixes=("_shark", "_repos"),
)
smartshark_vs_from_repos_single_hunk_df_merge

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk_shark,in_hunk_idx,...,common_count,has_consensus,pm,line_id,file_line,annotation,n_lines_in_file,n_lines_in_hunk_repos,purpose,indicator_column
0,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,0,0.0,5a82f5ba912063217a889fce,28011971661811126081191190478,2.0,0,...,3.0,True,-,3.0,64.0,bugfix,2.0,1.0,programming,both
1,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,0,0.0,5a82f5ba912063217a889fce,28011971661811126081191190478,2.0,0,...,3.0,True,+,4.0,64.0,bugfix,2.0,1.0,programming,both
2,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,0,0.0,5a82f5ba912063217a889fce,28011971661811126081191190478,2.0,1,...,3.0,True,,,,,,,,left_only
3,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,0,0.0,5a82f5bb912063217a889fd4,28011971680257870154900742100,2.0,0,...,3.0,True,-,3.0,95.0,bugfix,2.0,1.0,programming,both
4,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,0,0.0,5a82f5bb912063217a889fd4,28011971680257870154900742100,2.0,0,...,3.0,True,+,4.0,95.0,bugfix,2.0,1.0,programming,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103090,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,9,...,4.0,True,+,17.0,201.0,bugfix,14.0,1.0,programming,both
103091,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,10,...,4.0,True,,,,,,,,left_only
103092,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,12,...,2.0,False,,,,,,,,left_only
103093,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,13,...,2.0,False,,,,,,,,left_only


In [1108]:
smartshark_vs_from_repos_single_hunk_df_merge[['hunk_idx', 'hunk_idx_hash']].value_counts()

hunk_idx  hunk_idx_hash
0         0.0              98054
Name: count, dtype: int64

#### Analysis: matches

In [1109]:
smartshark_vs_from_repos_single_hunk_df_merge["indicator_column"].value_counts()

indicator_column
both          78974
left_only     19080
right_only     5041
Name: count, dtype: int64

In [1110]:
smartshark_vs_from_repos_single_hunk_df_merge["has_consensus"].value_counts()

has_consensus
True     84170
False    13884
Name: count, dtype: int64

In [1111]:
mask_inner = (smartshark_vs_from_repos_single_hunk_df_merge["indicator_column"] == "both")
mask_inner.sum()

np.int64(78974)

In [1112]:
mask_has_consensus = smartshark_vs_from_repos_single_hunk_df_merge["has_consensus"]
mask_has_consensus.sum()

84170

In [1113]:
mask = (mask_inner & mask_has_consensus)
count_1 = mask.sum()
count_1

np.int64(69983)

In [1114]:
smartshark_vs_from_repos_single_hunk_df_merge.columns

Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk_shark', 'in_hunk_idx',
       'most_common', 'n_reviewers', 'common_count', 'has_consensus', 'pm',
       'line_id', 'file_line', 'annotation', 'n_lines_in_file',
       'n_lines_in_hunk_repos', 'purpose', 'indicator_column'],
      dtype='object')

#### Analysis: annotation

In [1115]:
smartshark_vs_from_repos_single_hunk_df_merge['eq'] = \
    smartshark_vs_from_repos_single_hunk_df_merge['most_common'] == smartshark_vs_from_repos_single_hunk_df_merge['annotation']

In [1116]:
smartshark_vs_from_repos_single_hunk_df_merge_cmp = smartshark_vs_from_repos_single_hunk_df_merge[mask]
smartshark_vs_from_repos_single_hunk_df_merge_cmp

Unnamed: 0,project,issue,sha,file,hunk_idx,hunk_idx_hash,hunk_oid,hunk_oid_num,n_lines_in_hunk_shark,in_hunk_idx,...,has_consensus,pm,line_id,file_line,annotation,n_lines_in_file,n_lines_in_hunk_repos,purpose,indicator_column,eq
0,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,0,0.0,5a82f5ba912063217a889fce,28011971661811126081191190478,2.0,0,...,True,-,3.0,64.0,bugfix,2.0,1.0,programming,both,True
1,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyConvertPom.java,0,0.0,5a82f5ba912063217a889fce,28011971661811126081191190478,2.0,0,...,True,+,4.0,64.0,bugfix,2.0,1.0,programming,both,True
3,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,0,0.0,5a82f5bb912063217a889fd4,28011971680257870154900742100,2.0,0,...,True,-,3.0,95.0,bugfix,2.0,1.0,programming,both,True
4,ant-ivy,IVY-639,001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,src/java/org/apache/ivy/ant/IvyMakePom.java,0,0.0,5a82f5bb912063217a889fd4,28011971680257870154900742100,2.0,0,...,True,+,4.0,95.0,bugfix,2.0,1.0,programming,both,True
6,ant-ivy,IVY-1148,002f8ec7aa01956b282ffd78062ee6fed6ab1eb9,CHANGES.txt,0,0.0,5a82f5b4912063217b889c27,28011971551130661638950657063,1.0,0,...,True,+,3.0,104.0,documentation,1.0,12.0,documentation,both,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103086,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,5,...,True,+,10.0,196.0,bugfix,14.0,1.0,programming,both,True
103087,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,6,...,True,+,11.0,197.0,bugfix,14.0,1.0,programming,both,True
103088,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,7,...,True,+,12.0,198.0,bugfix,14.0,1.0,programming,both,True
103089,systemml,SYSTEMML-2431,f1bf97baf342035764c676b50d361e36e2bbae62,src/main/java/org/apache/sysml/hops/codegen/te...,0,0.0,5b7adf49df629e0ae2f20b9f,28311679294169985112719100831,15.0,8,...,True,+,13.0,199.0,bugfix,14.0,1.0,programming,both,True


In [1117]:
eq_s = smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].value_counts()
eq_s

eq
True     65253
False     4730
Name: count, dtype: int64

In [1118]:
eq_s/eq_s.sum()

eq
True     0.932412
False    0.067588
Name: count, dtype: float64

In [1165]:
df = smartshark_vs_from_repos_single_hunk_df_merge_cmp.groupby('sha')['eq'].agg(['sum', 'count'])
df['ratio'] = df['sum']/df['count']
df

Unnamed: 0_level_0,sum,count,ratio
sha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001e2caa07a1dbfd92fbf86a26f30aaef23be5f5,4,4,1.000000
002f8ec7aa01956b282ffd78062ee6fed6ab1eb9,1,1,1.000000
004124ac5dbf5edbf925078652526267468821e7,31,31,1.000000
00513a091d6cebf1ed4c5e2b6619dc36a4b5bbc6,4,4,1.000000
006d0566af0726b8ca5bdb07e28c4065b95238b6,7,7,1.000000
...,...,...,...
ffc38b1f3b35ddfadb942240b9d2f933ec83d1be,4,5,0.800000
ffc6adf7d493def070e26592bf828accbeecef2d,5,7,0.714286
ffdddff32e7ac6e2cbb5ae8471c1192c69ac6a94,1,1,1.000000
fff6d0f2fcaa034b7772dd1288c9222a6dcbbba0,1,1,1.000000


In [1166]:
df['ratio'].describe(percentiles=[0.25,0.5,0.75,0.9,0.95,0.99,0.999,0.9999,0.99999])

count      2534.000000
mean          0.933641
std           0.189914
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
90%           1.000000
95%           1.000000
99%           1.000000
99.9%         1.000000
99.99%        1.000000
99.999%       1.000000
max           1.000000
Name: ratio, dtype: float64

In [1137]:
print(f"{smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]=}")
print(f"{smartshark_df.shape[0]=}")
print(f"{smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]/smartshark_df.shape[0]=}")

smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]=69983
smartshark_df.shape[0]=290805
smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]/smartshark_df.shape[0]=0.2406526710338543


In [1144]:
smartshark_df[smartshark_df['has_consensus']].shape

(247366, 78)

In [1145]:
print(f"{smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]=}")
print(f"{smartshark_df[smartshark_df['has_consensus']].shape[0]=}")
print(f"{smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]/smartshark_df[smartshark_df['has_consensus']].shape[0]=}")

smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]=69983
smartshark_df[smartshark_df['has_consensus']].shape[0]=247366
smartshark_vs_from_repos_single_hunk_df_merge_cmp.shape[0]/smartshark_df[smartshark_df['has_consensus']].shape[0]=0.28291276893348316


In [1119]:
smartshark_vs_from_repos_single_hunk_df_merge_cmp[['most_common', 'annotation']].value_counts()

most_common    annotation   
test           test             44628
bugfix         bugfix           10799
documentation  documentation     9826
whitespace     bugfix            1476
bugfix         documentation      947
documentation  bugfix             428
test           bugfix             320
documentation  test               255
whitespace     documentation      251
unrelated      other              202
refactoring    bugfix             128
test           documentation      116
documentation  project            105
bugfix         test               104
unrelated      bugfix              79
               test                71
documentation  markup              69
bugfix         other               66
whitespace     test                40
unrelated      project             35
documentation  other                8
test           other                8
               project              7
whitespace     other                5
bugfix         project              4
refactoring    docume

In [1120]:
smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    ~smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].astype(bool)
][[
    'most_common', 'annotation'
]].value_counts()

most_common    annotation   
whitespace     bugfix           1476
bugfix         documentation     947
documentation  bugfix            428
test           bugfix            320
documentation  test              255
whitespace     documentation     251
unrelated      other             202
refactoring    bugfix            128
test           documentation     116
documentation  project           105
bugfix         test              104
unrelated      bugfix             79
               test               71
documentation  markup             69
bugfix         other              66
whitespace     test               40
unrelated      project            35
documentation  other               8
test           other               8
               project             7
whitespace     other               5
bugfix         project             4
refactoring    documentation       4
unrelated      documentation       2
Name: count, dtype: int64

In [1138]:
ss = smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    ~smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].astype(bool) &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] != 'whitespace') &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] != 'refactoring') &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] != 'unrelated')
][[
    'most_common', 'annotation'
]].value_counts()
print(ss.sum())
ss

2437


most_common    annotation   
bugfix         documentation    947
documentation  bugfix           428
test           bugfix           320
documentation  test             255
test           documentation    116
documentation  project          105
bugfix         test             104
documentation  markup            69
bugfix         other             66
test           other              8
documentation  other              8
test           project            7
bugfix         project            4
Name: count, dtype: int64

In [1140]:
N = smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    #~smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].astype(bool) &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] != 'whitespace') &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] != 'refactoring') &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] != 'unrelated')
].shape[0]
N

67690

In [1141]:
ss.sum()/N

np.float64(0.036002363716944895)

In [1121]:
smartshark_vs_from_repos_single_hunk_df_merge_cmp.columns

Index(['project', 'issue', 'sha', 'file', 'hunk_idx', 'hunk_idx_hash',
       'hunk_oid', 'hunk_oid_num', 'n_lines_in_hunk_shark', 'in_hunk_idx',
       'most_common', 'n_reviewers', 'common_count', 'has_consensus', 'pm',
       'line_id', 'file_line', 'annotation', 'n_lines_in_file',
       'n_lines_in_hunk_repos', 'purpose', 'indicator_column', 'eq'],
      dtype='object')

In [1122]:
df = smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    ~smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].astype(bool) &
    (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] == 'test')
][[
    'project', 'sha', 'file', 'pm', 'file_line', 'most_common', 'annotation',
]]
df

Unnamed: 0,project,sha,file,pm,file_line,most_common,annotation
4756,ant-ivy,27671a8bcd25de274c664aabfff6778ebc29af52,test/java/org/apache/ivy/util/url/ArtifactoryL...,+,1.0,test,documentation
4757,ant-ivy,27671a8bcd25de274c664aabfff6778ebc29af52,test/java/org/apache/ivy/util/url/ArtifactoryL...,+,2.0,test,documentation
4758,ant-ivy,27671a8bcd25de274c664aabfff6778ebc29af52,test/java/org/apache/ivy/util/url/ArtifactoryL...,+,3.0,test,documentation
4759,ant-ivy,27671a8bcd25de274c664aabfff6778ebc29af52,test/java/org/apache/ivy/util/url/ArtifactoryL...,+,4.0,test,documentation
4760,ant-ivy,27671a8bcd25de274c664aabfff6778ebc29af52,test/java/org/apache/ivy/util/url/ArtifactoryL...,+,5.0,test,documentation
...,...,...,...,...,...,...,...
93572,opennlp,d61bc3e942616a7be5740b4ba805f420bdb84bfb,opennlp-tools/src/test/java/opennlp/tools/ml/m...,+,35.0,test,bugfix
93573,opennlp,d61bc3e942616a7be5740b4ba805f420bdb84bfb,opennlp-tools/src/test/java/opennlp/tools/ml/m...,+,36.0,test,bugfix
93574,opennlp,d61bc3e942616a7be5740b4ba805f420bdb84bfb,opennlp-tools/src/test/java/opennlp/tools/ml/m...,+,37.0,test,bugfix
93575,opennlp,d61bc3e942616a7be5740b4ba805f420bdb84bfb,opennlp-tools/src/test/java/opennlp/tools/ml/m...,+,38.0,test,bugfix


In [1123]:
df.sample(1)

Unnamed: 0,project,sha,file,pm,file_line,most_common,annotation
90306,gora,9fabc724c8b86be6cba1b70098b9b1823434c33e,gora-core/src/test/java/org/apache/gora/persis...,+,32.0,test,bugfix


In [1124]:
#df.loc[62328]

In [1125]:
#df.loc[62328]['file']

https://github.com/apache/commons-math/commit/32d33210a92b1197a6c5a07f19aa25426af72723, line 459 on the target side

-----

```
     */
```

-----

It is comment in a test file

In [1126]:
#df.loc[49776]

In [1127]:
#df.loc[49776]['file']

https://github.com/apache/commons-dbcp/commit/c491c2db84c1a62624f110dc177c12433f157f98#diff-b01b18d51a81076dcc2068e0b137c2d38b11ae7bc2389886799713a8023c7d10

line 57 on the target side

It is empty line in Java code in test file, in a newly added file

In [1128]:
#df.loc[4838]

This is XML file in test

In [1129]:
s = smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    ~smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].astype(bool) &
    (
        (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] == 'whitespace') |
        (smartshark_vs_from_repos_single_hunk_df_merge_cmp['annotation'] == 'whitespace')
    )
][[
    'most_common', 'annotation',
]].value_counts()
s

most_common  annotation   
whitespace   bugfix           1476
             documentation     251
             test               40
             other               5
Name: count, dtype: int64

In [1130]:
s.sum()

np.int64(1772)

In [1131]:
s2 = smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    ~smartshark_vs_from_repos_single_hunk_df_merge_cmp['eq'].astype(bool) &
    (
        (smartshark_vs_from_repos_single_hunk_df_merge_cmp['annotation'] == 'whitespace')
    )
][[
    'most_common', 'annotation',
]].value_counts()
s2

Series([], Name: count, dtype: int64)

In [1132]:
s2.sum()

np.int64(0)

In [1133]:
smartshark_vs_from_repos_single_hunk_df_merge_cmp[
    (
        (smartshark_vs_from_repos_single_hunk_df_merge_cmp['most_common'] == 'whitespace') &
        (smartshark_vs_from_repos_single_hunk_df_merge_cmp['annotation'] == 'whitespace')
    )
].shape

(0, 23)


# ----

In [1134]:
l = [('a', 'A'), ('b', 'B')]
d = {k: v for k, v in l}
d

{'a': 'A', 'b': 'B'}

In [1135]:
for i, jk in {'i': ('j', 'k')}.items():
    j, k = jk
    print(f"{i=}, {j=}, {k=}")

i='i', j='j', k='k'


In [1136]:
d1 = {'a': 'A'}
d2 = { **d1 }
d2['a']=1
d1

{'a': 'A'}