In [1]:
import json
import stat
from pathlib import Path

import pandas as pd

# Comparison with line annotations in the HaPy-Bug dataset

**HaPy-Bug** dataset comprises annotated diff files from three sources. None
of them had previously been subjected to human annotation at the
granular (line-by-line) level.

$D_{BIP}$: **BugsInPy** subset is an extension of dataset of 496 real
bugs proposed in 
_"[BugsInPy: a database of existing bugs in Python programs to enable controlled testing and debugging studies][BugsInPy]"_ (2020).
This subset focuses on bugs in source code
and excludes issues related to configurations, build scripts, docu-
mentation, and test cases. It also requires bugs to be reproducible,
i.e. at least one test case from the fixed version must fail with the
faulty version. Only changes involving isolated bugs are included.

$D_{CVE}$: **Python CVE** and $D_{CRAWL}$: **Crawled Python CVE** are
new custom made, specialized collections of Python-related bugs
sourced from the [CVE DB](https://cve.mitre.org/) and projects git repositories.

$D_{CVE}$ comprises bugs identified through a comprehensive full-text search
of CVE DB. This subset was refined by selecting bugs with direct
links to source code fixes that involved modifications to Python code.

$D_{CRAWL}$ is a subset created by scanning repositories of most
popular Python projects for commits that have a CVE id pattern
inside. Each bug found was cross-referenced with CVE DB.

[BugsInPy]: https://doi.org/10.1145/3368089.3417943

## $D_{BIP}$: BugsInPy subset of HaPy-Bug dataset

Here all entries are **single diff**

### Experiments extracting data for a single entry (single bug)

> Larger outputs are stored collapsed

In [2]:
annotator_json = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.v2.json'

In [3]:
with open(annotator_json, mode='r') as json_fp:
    annotator_data = json.load(json_fp)

In [4]:
annotator_data.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [5]:
annotator_data['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [6]:
hapybug_json = '/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.json'

In [7]:
with open(hapybug_json, mode='r') as json_fp:
    hapybug_data = json.load(json_fp)

In [8]:
hapybug_data.keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [9]:
annotator_data['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

In [10]:
hapybug_data['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '+': [{'id': 85, 'type': 'bug(fix)'}],
 '-': [{'id': 85, 'type': 'bug(fix)'}]}

In [11]:
hapybug_data['tests/test_generate_context.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'test',
 '+': [{'id': 111, 'type': 'test'},
  {'id': 112, 'type': 'test'},
  {'id': 113, 'type': 'test'},
  {'id': 114, 'type': 'test'},
  {'id': 115, 'type': 'test'},
  {'id': 116, 'type': 'test'},
  {'id': 117, 'type': 'test'},
  {'id': 118, 'type': 'test'},
  {'id': 119, 'type': 'test'},
  {'id': 120, 'type': 'test'},
  {'id': 121, 'type': 'test'}],
 '-': []}

In [12]:
print(Path('/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/cookiecutter-1/patches/7f6804c4953a18386809f11faf4d86898570debc.diff').read_text())

diff --git a/cookiecutter/generate.py b/cookiecutter/generate.py
index 37365a4..c526b97 100644
--- a/cookiecutter/generate.py
+++ b/cookiecutter/generate.py
@@ -82,7 +82,7 @@ def generate_context(
     context = OrderedDict([])
 
     try:
-        with open(context_file) as file_handle:
+        with open(context_file, encoding='utf-8') as file_handle:
             obj = json.load(file_handle, object_pairs_hook=OrderedDict)
     except ValueError as e:
         # JSON decoding error.  Let's throw a new exception that is more
diff --git a/tests/test-generate-context/non_ascii.json b/tests/test-generate-context/non_ascii.json
new file mode 100644
index 0000000..af0edf6
--- /dev/null
+++ b/tests/test-generate-context/non_ascii.json
@@ -0,0 +1,3 @@
+{
+    "full_name": "éèà"
+}
diff --git a/tests/test_generate_context.py b/tests/test_generate_context.py
index 26e7d4d..69d0148 100644
--- a/tests/test_generate_context.py
+++ b/tests/test_generate_context.py
@@ -108,6 +108,17 @@ def test_def

In [13]:
with open('/mnt/data/CVE/final_bugs_packages.json', mode='r') as json_fp:
    where_labeling_data = json.load(json_fp)

In [14]:
where_labeling_data['cookiecutter-1']

{'rA': 1, 'rB': 1, 'rC': 0, 'rD': 1, 'pA': 2, 'pB': 4, 'pC': 1, 'pD': 3}

In [15]:
label_studio_json_1 = '/mnt/data/HaPy-Bug/annotated_data/D_4_3.json'

In [16]:
with open(label_studio_json_1, mode='r') as json_fp:
    label_studio_data_1 = json.load(json_fp)

In [17]:
[elem['annotations'][0]['result'][3]['value'] for elem in label_studio_data_1]

[{'hyperlinks': [{'url': 'http://lists.fedoraproject.org/pipermail/package-announce/2013-May/106220.html',
    'dates': {'min': '2013-01-01', 'max': '2020-05-24'},
    'labels': ['lists.fedoraproject.org',
     'lists.fedoraproject.org/pipermail',
     'lists.fedoraproject.org/pipermail/package-announce',
     'lists.fedoraproject.org/pipermail/package-announce/2013-May']},
   {'url': 'http://lists.fedoraproject.org/pipermail/package-announce/2013-May/105916.html',
    'dates': {'min': '2001-05-22', 'max': '2013-05-14'},
    'labels': ['lists.fedoraproject.org',
     'lists.fedoraproject.org/pipermail',
     'lists.fedoraproject.org/pipermail/package-announce',
     'lists.fedoraproject.org/pipermail/package-announce/2013-May']},
   {'url': 'http://rhn.redhat.com/errata/RHSA-2013-0806.html',
    'dates': {'min': '2012-02-04', 'max': '2013-05-09'},
    'labels': ['Vendor Advisory']},
   {'url': 'https://bugs.launchpad.net/keystone/+bug/1172195',
    'dates': {'min': '2013-01-01', 'max':

### Using the collective.{csv,json}, generated by Paper.ipynb

In [18]:
collective_dir = '../../data/experiments/HaPy-Bug/'
list(Path(collective_dir).glob('*'))

[PosixPath('../../data/experiments/HaPy-Bug/run_annotation_bugsinpy_repos.sh'),
 PosixPath('../../data/experiments/HaPy-Bug/bip_blame.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/repositories.json'),
 PosixPath('../../data/experiments/HaPy-Bug/crawl_blame.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/collective.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/cve_blame.csv')]

In [19]:
%ls -l '../../data/experiments/HaPy-Bug/'

total 63788
-rw-r--r-- 1 jnareb jnareb  2558150 Nov 27 15:58 bip_blame.csv
-rw-r--r-- 1 jnareb jnareb 50424028 Nov 27 15:58 collective.csv
-rw-r--r-- 1 jnareb jnareb  6895847 Nov 27 15:58 crawl_blame.csv
-rw-r--r-- 1 jnareb jnareb  5385717 Nov 27 15:58 cve_blame.csv
-rw-r--r-- 1 jnareb jnareb    15132 Nov 27 15:58 repositories.json
-rwxr-xr-x 1 jnareb jnareb    27120 Nov 27 15:58 [0m[01;32mrun_annotation_bugsinpy_repos.sh[0m*


In [20]:
collective_csv = Path(collective_dir) / 'collective.csv'
collective_csv

PosixPath('../../data/experiments/HaPy-Bug/collective.csv')

In [21]:
collective_df = pd.read_csv(collective_csv, index_col=0)
collective_df.index = collective_df.index.rename(name='')
collective_df

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U1,False,cve,CVE-2020-10289
1,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137,bug(fix),U1,False,cve,CVE-2020-10289
2,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103,bug(fix),U1,False,cve,CVE-2020-10289
3,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137,bug(fix),U1,False,cve,CVE-2020-10289
4,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U2,False,cve,CVE-2020-10289
...,...,...,...,...,...,...,...,...,...,...,...
391913,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,361,bug(fix),U2,True,cve,CVE-2018-16876
391914,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,362,bug(fix),U2,True,cve,CVE-2018-16876
391915,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,363,bug(fix),U2,True,cve,CVE-2018-16876


In [22]:
collective_df['ds'].value_counts()

ds
crawl         146366
cve           125176
bugs-in-py    120376
Name: count, dtype: int64

In [23]:
collective_df_manual = collective_df[collective_df['auto'] == False]
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

### Running annotation on BugsInPy dataset

The annotation data was generated using the following command:

```console
diff-annotate \
    --purpose-to-annotation=data \
    --purpose-to-annotation=documentation \
    --purpose-to-annotation=markup \
    --purpose-to-annotation=other \
    --purpose-to-annotation=project \
    --purpose-to-annotation=test \
    dataset \
    --output-prefix=/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug \
    /mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/
```

And as can be seen, it is present in `/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/`

In [24]:
bugsinpy_annotated_from_dataset_dir = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/'

In [25]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/

[0m[01;34mansible-1[0m/       [01;34mkeras-40[0m/       [01;34mpandas-125[0m/  [01;34mpandas-64[0m/    [01;34mthefuck-12[0m/
[01;34mansible-10[0m/      [01;34mkeras-41[0m/       [01;34mpandas-126[0m/  [01;34mpandas-65[0m/    [01;34mthefuck-13[0m/
[01;34mansible-11[0m/      [01;34mkeras-42[0m/       [01;34mpandas-127[0m/  [01;34mpandas-66[0m/    [01;34mthefuck-14[0m/
[01;34mansible-12[0m/      [01;34mkeras-43[0m/       [01;34mpandas-128[0m/  [01;34mpandas-67[0m/    [01;34mthefuck-15[0m/
[01;34mansible-13[0m/      [01;34mkeras-44[0m/       [01;34mpandas-129[0m/  [01;34mpandas-68[0m/    [01;34mthefuck-16[0m/
[01;34mansible-14[0m/      [01;34mkeras-45[0m/       [01;34mpandas-13[0m/   [01;34mpandas-69[0m/    [01;34mthefuck-17[0m/
[01;34mansible-15[0m/      [01;34mkeras-5[0m/        [01;34mpandas-130[0m/  [01;34mpandas-7[0m/     [01;34mthefuck-18[0m/
[01;34mansible-16[0m/      [01;34mkeras-6[0m/        [01;34mpandas-

In [26]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/

7f6804c4953a18386809f11faf4d86898570debc.v2.json


In [27]:
example_repo = 'cookiecutter'
example_bug = 'cookiecutter-1'

example_path = next(Path(bugsinpy_annotated_from_dataset_dir).joinpath(example_bug, 'annotation').glob('*.json'))
example_path

PosixPath('/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.v2.json')

In [28]:
with open(example_path, mode='r') as json_fp:
    example_data_from_dataset = json.load(json_fp)

type(example_data_from_dataset)

dict

In [29]:
example_data_from_dataset.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [30]:
example_data_from_dataset['commit_metadata']

{'id': '7f6804c4953a18386809f11faf4d86898570debc'}

In [31]:
example_data_from_dataset['diff_metadata']

{'n_files': 3,
 'hunk_span_src': 11,
 'hunk_span_dst': 24,
 'n_hunks': 3,
 'n_lines_added': 15,
 'n_lines_removed': 1,
 'n_lines_all': 28,
 'n_mod': 1,
 'n_groups': 3,
 'patch_size': 15,
 'n_added_files': 1,
 'n_add': 14}

In [32]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [33]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

### Extracting commit ids from BugsInPy dataset

For each bug in **BugsInPy** dataset we want repository and commit id, to be able to use more powerful `diff-annotate from-repo`, rather than `diff-annotate dataset`.

In [34]:
bugsinpy_dir = '/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/'

In [35]:
repo_commits = {}

for bug_dir in Path(bugsinpy_dir).iterdir():
    repo_name = bug_dir.name.rsplit('-', maxsplit=1)[0]

    #print(f"{bug_dir.name=}, {repo_name=}")
    if repo_name not in repo_commits:
        repo_commits[repo_name] = { 'commits': [], 'bugs': [] }

    repo_commits[repo_name]['bugs'].append(bug_dir.name)
    
    for diff_file in bug_dir.joinpath('patches').glob('*.diff'):
        #print(f"  {diff_file.stem=}")
        repo_commits[repo_name]['commits'].append(diff_file.stem)

repo_commits['cookiecutter']

{'commits': ['7f6804c4953a18386809f11faf4d86898570debc',
  '7129d474206761a6156925db78eee4b62a0e3944',
  '90434ff4ea4477941444f1e83313beb414838535',
  '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'],
 'bugs': ['cookiecutter-1',
  'cookiecutter-3',
  'cookiecutter-2',
  'cookiecutter-4']}

In [36]:
repo_commits.keys()

dict_keys(['pandas', 'thefuck', 'tornado', 'black', 'youtube-dl', 'spacy', 'keras', 'ansible', 'scrapy', 'fastapi', 'luigi', 'matplotlib', 'tqdm', 'sanic', 'cookiecutter', 'httpie', 'PySnooper'])

Find where repositories were cloned to (locally):

In [37]:
repositories_json = '../../data/experiments/HaPy-Bug/repositories.json'
%ls -l '../../data/experiments/HaPy-Bug/repositories.json'

-rw-r--r-- 1 jnareb jnareb 15132 Nov 27 15:58 ../../data/experiments/HaPy-Bug/repositories.json


In [38]:
with open(repositories_json, mode='r') as json_fp:
    repositories_data = json.load(json_fp)

repositories_data[:3]

[{'project': 'pandas',
  'repository_url': 'https://github.com/pandas-dev/pandas',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/pandas'},
 {'project': 'ansible',
  'repository_url': 'https://github.com/ansible/ansible',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/ansible'},
 {'project': 'black',
  'repository_url': 'https://github.com/psf/black',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/black'}]

In [39]:
repositories_map = {
    elem['project']: {'url': elem['repository_url'], 'path': elem['repository_path'] }
    for elem in repositories_data
}

repositories_map['cookiecutter']

{'url': 'https://github.com/cookiecutter/cookiecutter',
 'path': '/mnt/data/python_bug_localization_data/repositories/cookiecutter'}

### Running annotation on BugsInPy repos

In [40]:
script_file = '../../run_annotation_bugsinpy_repos.sh'

In [41]:
file_purpose_list = [
    "data",
    "documentation",
    "markup",
    "other",
    "project",
    "test",
]

with open(script_file, 'wt') as fp:
    print('#!/usr/bin/sh', file=fp)
    print('', file=fp)
    print('echo "running annotations on BugsInPy repos for BugsInPy buggy commits"', file=fp)
    print('', file=fp)

Path(script_file).chmod(0o755)  # 0755/-rwxr-xr-x

In [42]:
for repo_name, repo_data in repo_commits.items():
    print(f"{repo_name}:")
    cmd_str = ''.join([
        "diff-annotate ",
        *[f"--purpose-to-annotation={file_purpose} " for file_purpose in file_purpose_list],
        "from-repo ",
        f"--output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/{repo_name} ",
        f"{repositories_map[repo_name]['path']} --no-walk=sorted {' '.join(repo_data['commits'])}",
    ])
    print("  arg_length <=", len(cmd_str))
    
    with open(script_file, 'at') as fp:
        print(f"# {repo_name}", file=fp)
        print(cmd_str, file=fp)
    
    print("")

pandas:
  arg_length <= 7267

thefuck:
  arg_length <= 1693

tornado:
  arg_length <= 1037

black:
  arg_length <= 1320

youtube-dl:
  arg_length <= 2150

spacy:
  arg_length <= 787

keras:
  arg_length <= 2222

ansible:
  arg_length <= 1119

scrapy:
  arg_length <= 2019

fastapi:
  arg_length <= 1037

luigi:
  arg_length <= 1730

matplotlib:
  arg_length <= 1494

tqdm:
  arg_length <= 744

sanic:
  arg_length <= 582

cookiecutter:
  arg_length <= 555

httpie:
  arg_length <= 584

PySnooper:
  arg_length <= 508



Run for example:

```console
uptime && time diff-annotate \
    --purpose-to-annotation=data \
    --purpose-to-annotation=documentation \
    --purpose-to-annotation=markup \
    --purpose-to-annotation=other \
    --purpose-to-annotation=project \
    --purpose-to-annotation=test \
    from-repo \
    --output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter \
    /mnt/data/python_bug_localization_data/repositories/cookiecutter --no-walk=sorted \
    7f6804c4953a18386809f11faf4d86898570debc 7129d474206761a6156925db78eee4b62a0e3944 \
    90434ff4ea4477941444f1e83313beb414838535 457a1a4e862aab4102b644ff1d2b2e2b5a766b3c
```

The output below as for the run without `--purpose-to-annotation=` parameters

```
 02:36:41 up 289 days,  4:26, 12 users,  load average: 1.08, 1.18, 1.54
Logging to 'diff-annotate.log' file, with log level=WARNING
Computing patch sizes and spreads (# files, # change groups, # spanned lines,...)
Storing annotations in <output_dir>/<commit_id>.json
  with output dir: '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter'
Ensuring that output directory '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter' exists
Generating patches from local Git repo '/mnt/data/python_bug_localization_data/repositories/cookiecutter'
  using `git log -p '--no-walk=sorted' '7f6804c4953a18386809f11faf4d86898570debc' '7129d474206761a6156925db78eee4b62a0e3944' '90434ff4ea4477941444f1e83313beb414838535' '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'`
  took 0.212 seconds (includes parsing unified diffs)
Annotating commits and saving annotated data, for 4 commits
  lexing pre- and post-image file contents, from repo 'cookiecutter'
  using sequential processing
commits: 100%|█████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.38it/s]

real    0m1.720s
user    0m8.406s
sys     0m0.176s
```

Extract the same commit annotated data:

In [43]:
example_repo = 'cookiecutter'
example_commit = '7f6804c4953a18386809f11faf4d86898570debc'

In [44]:
bugsinpy_annotated_from_repo_dir = '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

%ls -1 '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

[0m[01;34mansible[0m/
[01;34mblack[0m/
[01;34mcookiecutter[0m/
[01;34mfastapi[0m/
[01;34mhttpie[0m/
[01;34mkeras[0m/
[01;34mluigi[0m/
[01;34mmatplotlib[0m/
[01;34mpandas[0m/
[01;34mPySnooper[0m/
[01;34msanic[0m/
[01;34mscrapy[0m/
[01;34mspacy[0m/
[01;34mthefuck[0m/
[01;34mtornado[0m/
[01;34mtqdm[0m/
[01;34myoutube-dl[0m/


In [45]:
%ls -1 '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter'

457a1a4e862aab4102b644ff1d2b2e2b5a766b3c.v2.json
7129d474206761a6156925db78eee4b62a0e3944.v2.json
7f6804c4953a18386809f11faf4d86898570debc.v2.json
90434ff4ea4477941444f1e83313beb414838535.v2.json


In [46]:
example_path_2 = Path(bugsinpy_annotated_from_repo_dir).joinpath(example_repo, f"{example_commit}.v2.json")
example_path_2

PosixPath('/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter/7f6804c4953a18386809f11faf4d86898570debc.v2.json')

In [47]:
with open(example_path_2, mode='r') as json_fp:
    example_data_from_repo = json.load(json_fp)

type(example_data_from_repo)

dict

In [48]:
example_data_from_repo.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

There is more commit metadata, because `diff-annotate dataset ...` does not yet try to parse `*.message` files

In [49]:
example_data_from_repo['commit_metadata']

{'id': '7f6804c4953a18386809f11faf4d86898570debc',
 'parents': ['c15633745df6abdb24e02746b82aadb20b8cdf8c'],
 'tree': 'd04faaa47bc47a2f2cda28dcba057ac3865d842e',
 'author': {'author': 'Aurélien Gâteau <mail@agateau.com>',
  'name': 'Aurélien Gâteau',
  'email': 'mail@agateau.com',
  'timestamp': 1590790310,
  'tz_info': '+0200'},
 'committer': {'committer': 'GitHub <noreply@github.com>',
  'name': 'GitHub',
  'email': 'noreply@github.com',
  'timestamp': 1590790310,
  'tz_info': '+0300'},
 'message': 'Fix default values being loaded with wrong encoding on Windows (#1414)\n\nExplicitly set the encoding to utf-8 when reading the context file to\nensure values are correctly loaded.\n\nCo-authored-by: Andrey Shpak <insspb@users.noreply.github.com>\n'}

In [50]:
example_data_from_repo['diff_metadata']

{'n_files': 3,
 'hunk_span_src': 11,
 'hunk_span_dst': 24,
 'n_hunks': 3,
 'n_lines_added': 15,
 'n_lines_removed': 1,
 'n_lines_all': 28,
 'n_mod': 1,
 'n_groups': 3,
 'patch_size': 15,
 'n_added_files': 1,
 'n_add': 14}

In [51]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [52]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

### Creating DataFrame for comparison

In [53]:
collective_df_manual.columns

Index(['id', 'bundle', 'file', 'fcat', 'image', 'line', 'annotation', 'user',
       'auto', 'ds', 'bug'],
      dtype='object')

In [54]:
collective_df_manual.dtypes

id            object
bundle        object
file          object
fcat          object
image         object
line           int64
annotation    object
user          object
auto            bool
ds            object
bug           object
dtype: object

In [55]:
collective_df_manual.head(5)

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U1,False,cve,CVE-2020-10289
1.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137.0,bug(fix),U1,False,cve,CVE-2020-10289
2.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103.0,bug(fix),U1,False,cve,CVE-2020-10289
3.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137.0,bug(fix),U1,False,cve,CVE-2020-10289
4.0,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U2,False,cve,CVE-2020-10289


In [56]:
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

In [57]:
collective_df_manual_bugsinpy = collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']
collective_df_manual_bugsinpy

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
16414,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,37,documentation,U1,False,bugs-in-py,keras-17
16415,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,38,bug(fix),U1,False,bugs-in-py,keras-17
16416,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,beforeChange,37,bug(fix),U1,False,bugs-in-py,keras-17
16417,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,50,test,U1,False,bugs-in-py,keras-17
16418,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,51,test,U1,False,bugs-in-py,keras-17
...,...,...,...,...,...,...,...,...,...,...,...
195909,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,133,test,E1,False,bugs-in-py,pandas-54
195910,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,134,test,E1,False,bugs-in-py,pandas-54
195911,bugs-in-py_pandas-54,A_1_24,pandas/tests/indexes/common.py,test,afterChange,608,test,E1,False,bugs-in-py,pandas-54


In [58]:
collective_df_manual_bugsinpy[collective_df_manual_bugsinpy['bug'] == 'cookiecutter-1']['bundle'].value_counts()

bundle
D_4_3     16
B_5_14    16
A_3_22    16
Name: count, dtype: int64

In [59]:
example_collective = collective_df_manual_bugsinpy[
    (collective_df_manual_bugsinpy['bug'] == 'cookiecutter-1') & 
    (collective_df_manual_bugsinpy['bundle'] == 'D_4_3')
]

example_collective

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
145609.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145610.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145611.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test,U3,False,bugs-in-py,cookiecutter-1
145612.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test,U3,False,bugs-in-py,cookiecutter-1
145613.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test,U3,False,bugs-in-py,cookiecutter-1
145614.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,111.0,test,U3,False,bugs-in-py,cookiecutter-1
145615.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,112.0,documentation,U3,False,bugs-in-py,cookiecutter-1
145616.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,113.0,test,U3,False,bugs-in-py,cookiecutter-1
145617.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,114.0,test,U3,False,bugs-in-py,cookiecutter-1


In [60]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [61]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

In [62]:
example_records = []
dataset = "bugs-in-py"
bug = "cookiecutter-1"

for patched_file, file_data in example_data_from_dataset['changes'].items():
    if patched_file == '/dev/null':
        continue
        
    for pm in list("-+"):
        if pm not in file_data:
            continue

        for line_data in file_data[pm]:
            example_records.append({
                'id': f"{dataset}_{bug}",
                'file': patched_file,
                'fcat': file_data['purpose'],
                'image': 'beforeChange' if pm == '-' else 'afterChange',
                'line': line_data['file_line_no'],
                'annotation': 'bug(fix)' if line_data['type'] == 'code' else line_data['type'],
                'ds': dataset,
                'bug': bug,
            })

example_records[:5]

[{'id': 'bugs-in-py_cookiecutter-1',
  'file': 'cookiecutter/generate.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 85,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'cookiecutter/generate.py',
  'fcat': 'programming',
  'image': 'afterChange',
  'line': 85,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'image': 'afterChange',
  'line': 1,
  'annotation': 'test',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'image': 'afterChange',
  'line': 2,
  'annotation': 'test',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'i

In [63]:
example_df = pd.DataFrame.from_records(example_records)
example_df

Unnamed: 0,id,file,fcat,image,line,annotation,ds,bug
0,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),bugs-in-py,cookiecutter-1
1,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,afterChange,85,bug(fix),bugs-in-py,cookiecutter-1
2,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,bugs-in-py,cookiecutter-1
3,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,bugs-in-py,cookiecutter-1
4,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,bugs-in-py,cookiecutter-1
5,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,111,test,bugs-in-py,cookiecutter-1
6,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,112,test,bugs-in-py,cookiecutter-1
7,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,113,test,bugs-in-py,cookiecutter-1
8,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,114,test,bugs-in-py,cookiecutter-1
9,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,115,test,bugs-in-py,cookiecutter-1


In [64]:
example_df.head(5)

Unnamed: 0,id,file,fcat,image,line,annotation,ds,bug
0,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),bugs-in-py,cookiecutter-1
1,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,afterChange,85,bug(fix),bugs-in-py,cookiecutter-1
2,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,bugs-in-py,cookiecutter-1
3,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,bugs-in-py,cookiecutter-1
4,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,bugs-in-py,cookiecutter-1


In [65]:
example_collective.head(5)

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
145609.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145610.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145611.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test,U3,False,bugs-in-py,cookiecutter-1
145612.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test,U3,False,bugs-in-py,cookiecutter-1
145613.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test,U3,False,bugs-in-py,cookiecutter-1


### Join/merge for comparison

In [66]:
example_collective_sel = example_collective[['ds', 'bug', 'bundle', 'user', 'file', 'fcat', 'image', 'line', 'annotation']]
example_collective_sel.head(5)

Unnamed: 0,ds,bug,bundle,user,file,fcat,image,line,annotation
,,,,,,,,,
145609.0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix)
145610.0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix)
145611.0,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test
145612.0,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test
145613.0,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test


In [67]:
example_df_sel = example_df[['bug', 'file', 'fcat', 'image', 'line', 'annotation']]
example_df_sel.head(5)

Unnamed: 0,bug,file,fcat,image,line,annotation
0,cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,85,bug(fix)
1,cookiecutter-1,cookiecutter/generate.py,programming,afterChange,85,bug(fix)
2,cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,1,test
3,cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,2,test
4,cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,3,test


In [68]:
example_merge_sel = pd.merge(
    example_collective_sel, example_df_sel,
    how='outer', on=['bug', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)
example_merge_sel.head()

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,fcat_auto,annotation_auto,indicator_column
0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,afterChange,85,bug(fix),programming,bug(fix),both
1,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),programming,bug(fix),both
2,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,test,test,both
3,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,test,test,both
4,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,test,test,both


In [69]:
example_merge_sel['fcat_eq'] = example_merge_sel['fcat_hapy'] == example_merge_sel['fcat_auto']
example_merge_sel['annotation_eq'] = example_merge_sel['annotation_hapy'] == example_merge_sel['annotation_auto']

example_merge_sel.head()

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,fcat_auto,annotation_auto,indicator_column,fcat_eq,annotation_eq
0,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,afterChange,85,bug(fix),programming,bug(fix),both,True,True
1,bugs-in-py,cookiecutter-1,D_4_3,U3,cookiecutter/generate.py,programming,beforeChange,85,bug(fix),programming,bug(fix),both,True,True
2,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,1,test,test,test,both,True,True
3,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,2,test,test,test,both,True,True
4,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test-generate-context/non_ascii.json,test,afterChange,3,test,test,test,both,True,True


In [70]:
example_merge_sel[['fcat_eq', 'annotation_eq']].value_counts()

fcat_eq  annotation_eq
True     True             15
         False             1
Name: count, dtype: int64

In [71]:
example_merge_sel[(example_merge_sel['fcat_eq'] == False) | (example_merge_sel['annotation_eq'] == False)]

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,fcat_auto,annotation_auto,indicator_column,fcat_eq,annotation_eq
6,bugs-in-py,cookiecutter-1,D_4_3,U3,tests/test_generate_context.py,test,afterChange,112,documentation,test,test,both,True,False


### Full comparison

In [72]:
sha_to_bug = {}
for repo_data in repo_commits.values():
    for sha, bug in zip(repo_data['commits'], repo_data['bugs']):
        sha_to_bug[sha] = bug

{sha: bug for sha, bug in sha_to_bug.items() if bug.startswith('cookiecutter')}

{'7f6804c4953a18386809f11faf4d86898570debc': 'cookiecutter-1',
 '7129d474206761a6156925db78eee4b62a0e3944': 'cookiecutter-3',
 '90434ff4ea4477941444f1e83313beb414838535': 'cookiecutter-2',
 '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c': 'cookiecutter-4'}

In [73]:
bugsinpy_annotated_from_repo_dir

'/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

In [74]:
records_from_repos = []

dataset = 'bugs-in-py'

for subdir in Path(bugsinpy_annotated_from_repo_dir).iterdir():
    print(f"{subdir.name}")
    count = 0
    n_files = 0
    n_lines = 0

    for json_file in subdir.glob('*.json'):
        sha = json_file.name.split('.', maxsplit=1)[0]
        bug = sha_to_bug[sha]
        #print(f"  {json_file.name} -> {sha=}, {bug=}")
        count += 1

        with open(json_file, mode='r') as json_fp:
            json_data = json.load(json_fp)

        for patched_file, file_data in json_data['changes'].items():
            if patched_file == '/dev/null':
                continue

            n_files += 1
            
            for pm in list("-+"):
                if pm not in file_data:
                    continue

                for line_data in file_data[pm]:
                    n_lines += 1
                    records_from_repos.append({
                        'id': f"{dataset}_{bug}",
                        'ds': dataset,
                        'bug': bug,
                        'sha': sha,
                        'file': patched_file,
                        'fcat': file_data['purpose'],
                        'image': 'beforeChange' if pm == '-' else 'afterChange',
                        'line': line_data['file_line_no'],
                        'annotation': 'bug(fix)' if line_data['type'] == 'code' else line_data['type'],
                    })

    print(f"  {count} commits, {n_files} changed files, {n_lines} changed lines") 

httpie
  5 commits, 13 changed files, 145 changed lines
PySnooper
  3 commits, 8 changed files, 117 changed lines
keras
  45 commits, 107 changed files, 2122 changed lines
pandas
  168 commits, 582 changed files, 7464 changed lines
spacy
  10 commits, 29 changed files, 270 changed lines
tornado
  16 commits, 39 changed files, 575 changed lines
scrapy
  40 commits, 98 changed files, 1196 changed lines
youtube-dl
  43 commits, 100 changed files, 702 changed lines
matplotlib
  27 commits, 64 changed files, 714 changed lines
black
  23 commits, 74 changed files, 1638 changed lines
sanic
  5 commits, 14 changed files, 207 changed lines
cookiecutter
  4 commits, 11 changed files, 108 changed lines
fastapi
  16 commits, 43 changed files, 1415 changed lines
luigi
  33 commits, 70 changed files, 1308 changed lines
tqdm
  9 commits, 19 changed files, 215 changed lines
ansible
  18 commits, 54 changed files, 803 changed lines
thefuck
  32 commits, 72 changed files, 891 changed lines


In [75]:
records_from_repos[:5]

[{'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'CHANGELOG.rst',
  'fcat': 'documentation',
  'image': 'afterChange',
  'line': 30,
  'annotation': 'documentation'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 142,
  'annotation': 'bug(fix)'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 143,
  'annotation': 'bug(fix)'},
 {'id': 'bugs-in-py_httpie-1',
  'ds': 'bugs-in-py',
  'bug': 'httpie-1',
  'sha': '5300b0b490b8db48fac30b5e32164be93dc574b7',
  'file': 'httpie/downloads.py',
  'fcat': 'programming',
  'image': 'afterChange',
  'line': 10,
  'annot

In [76]:
from_repos_df = pd.DataFrame.from_records(records_from_repos)
from_repos_df

Unnamed: 0,id,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,documentation,afterChange,30,documentation
1,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,142,bug(fix)
2,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,143,bug(fix)
3,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,10,bug(fix)
4,bugs-in-py_httpie-1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,139,bug(fix)
...,...,...,...,...,...,...,...,...,...
19885,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,27,bug(fix)
19886,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,28,bug(fix)
19887,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,29,bug(fix)
19888,bugs-in-py_thefuck-9,bugs-in-py,thefuck-9,feb36ede5c518fdc3b6eddf945b2d8b1e2294d15,thefuck/rules/git_push.py,programming,afterChange,30,documentation


In [77]:
collective_df_manual

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U1,False,cve,CVE-2020-10289
1,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137,bug(fix),U1,False,cve,CVE-2020-10289
2,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103,bug(fix),U1,False,cve,CVE-2020-10289
3,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137,bug(fix),U1,False,cve,CVE-2020-10289
4,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U2,False,cve,CVE-2020-10289
...,...,...,...,...,...,...,...,...,...,...,...
195960,cve_CVE-2018-16876,C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,365,bug(fix) + refactoring,U2,False,cve,CVE-2018-16876
195961,cve_CVE-2018-16876,C_5_8,lib/ansible/plugins/connection/ssh.py,programming,beforeChange,335,bug(fix),U2,False,cve,CVE-2018-16876
195962,cve_CVE-2018-16876,C_5_8,lib/ansible/plugins/connection/ssh.py,programming,beforeChange,339,bug(fix),U2,False,cve,CVE-2018-16876


In [78]:
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

In [79]:
collective_df_bugsinpy = collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']
collective_df_bugsinpy

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
16414,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,37,documentation,U1,False,bugs-in-py,keras-17
16415,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,38,bug(fix),U1,False,bugs-in-py,keras-17
16416,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,beforeChange,37,bug(fix),U1,False,bugs-in-py,keras-17
16417,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,50,test,U1,False,bugs-in-py,keras-17
16418,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,51,test,U1,False,bugs-in-py,keras-17
...,...,...,...,...,...,...,...,...,...,...,...
195909,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,133,test,E1,False,bugs-in-py,pandas-54
195910,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,134,test,E1,False,bugs-in-py,pandas-54
195911,bugs-in-py_pandas-54,A_1_24,pandas/tests/indexes/common.py,test,afterChange,608,test,E1,False,bugs-in-py,pandas-54


In [80]:
collective_df_bugsinpy_sel = collective_df_bugsinpy[['ds', 'bug', 'bundle', 'user', 'file', 'fcat', 'image', 'line', 'annotation']]
collective_df_bugsinpy_sel.head(5)

Unnamed: 0,ds,bug,bundle,user,file,fcat,image,line,annotation
,,,,,,,,,
16414.0,bugs-in-py,keras-17,B_6_13,U1,keras/metrics.py,programming,afterChange,37.0,documentation
16415.0,bugs-in-py,keras-17,B_6_13,U1,keras/metrics.py,programming,afterChange,38.0,bug(fix)
16416.0,bugs-in-py,keras-17,B_6_13,U1,keras/metrics.py,programming,beforeChange,37.0,bug(fix)
16417.0,bugs-in-py,keras-17,B_6_13,U1,tests/keras/metrics_test.py,test,afterChange,50.0,test
16418.0,bugs-in-py,keras-17,B_6_13,U1,tests/keras/metrics_test.py,test,afterChange,51.0,test


In [81]:
from_repos_df_sel = from_repos_df[['ds', 'bug', 'sha', 'file', 'fcat', 'image', 'line', 'annotation']]
from_repos_df_sel.head(5)

Unnamed: 0,ds,bug,sha,file,fcat,image,line,annotation
0,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,CHANGELOG.rst,documentation,afterChange,30,documentation
1,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,142,bug(fix)
2,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,beforeChange,143,bug(fix)
3,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,10,bug(fix)
4,bugs-in-py,httpie-1,5300b0b490b8db48fac30b5e32164be93dc574b7,httpie/downloads.py,programming,afterChange,139,bug(fix)


In [82]:
merge_sel = pd.merge(
    collective_df_bugsinpy_sel, from_repos_df_sel,
    how='outer', on=['ds', 'bug', 'file', 'image', 'line'],
    indicator="indicator_column", suffixes=("_hapy", "_auto"),
)

merge_sel['fcat_neq'] = merge_sel['fcat_hapy'] != merge_sel['fcat_auto']
merge_sel['annotation_neq'] = merge_sel['annotation_hapy'] != merge_sel['annotation_auto']

merge_sel[[
    'ds', 'bug', 'sha',
    'bundle', 'user',
    'file', 'fcat_hapy', 'fcat_auto',
    'image', 'line', 'annotation_hapy', 'annotation_auto',
    'fcat_neq', 'annotation_neq'
]].head()

Unnamed: 0,ds,bug,sha,bundle,user,file,fcat_hapy,fcat_auto,image,line,annotation_hapy,annotation_auto,fcat_neq,annotation_neq
0,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,B_6_13,U1,pysnooper/pycompat.py,programming,programming,afterChange,11,other,bug(fix),False,True
1,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,D_2_5,U3,pysnooper/pycompat.py,programming,programming,afterChange,11,bug(fix),bug(fix),False,False
2,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,A_4_21,E1,pysnooper/pycompat.py,programming,programming,afterChange,11,bug(fix),bug(fix),False,False
3,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,B_6_13,U1,pysnooper/tracer.py,programming,programming,afterChange,17,bug(fix) + refactoring,bug(fix),False,True
4,bugs-in-py,PySnooper-1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,D_2_5,U3,pysnooper/tracer.py,programming,programming,afterChange,17,bug(fix),bug(fix),False,False


In [83]:
merge_sel[['fcat_neq', 'annotation_neq']].value_counts()

fcat_neq  annotation_neq
False     False             49091
          True               9274
True      True               1951
          False                89
Name: count, dtype: int64

In [84]:
merge_sel['fcat_neq'].value_counts()

fcat_neq
False    58365
True      2040
Name: count, dtype: int64

In [85]:
merge_sel['annotation_neq'].value_counts()

annotation_neq
False    49180
True     11225
Name: count, dtype: int64

### Analysis of comparison results

Disagreement, as percentage

In [86]:
merge_sel.shape

(60405, 15)

In [87]:
merge_sel['annotation_neq'].value_counts().sum()

np.int64(60405)

In [88]:
merge_sel['annotation_neq'].value_counts()/merge_sel.shape[0]

annotation_neq
False    0.814171
True     0.185829
Name: count, dtype: float64

Analyze what was the source of disagreement

In [90]:
merge_sel[merge_sel['annotation_neq']]['annotation_hapy'].value_counts()

annotation_hapy
documentation             4716
bug(fix) + refactoring    1534
bug(fix)                  1223
test + refactoring        1155
refactoring               1136
test                       947
other                      303
Name: count, dtype: int64

In [97]:
merge_sel[merge_sel['annotation_neq']]['annotation_hapy'].value_counts()/merge_sel.shape[0]

annotation_hapy
documentation             0.078073
bug(fix) + refactoring    0.025395
bug(fix)                  0.020247
test + refactoring        0.019121
refactoring               0.018806
test                      0.015678
other                     0.005016
Name: count, dtype: float64

Let's examine the case for **df['annotation_hapy'] == 'documentation'** and disagreement

In [93]:
df = merge_sel[merge_sel['annotation_neq'] & (merge_sel['annotation_hapy'] == 'documentation')][[
    'bug','bundle','user',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]
df

Unnamed: 0,bug,bundle,user,file,image,line,annotation_hapy,annotation_auto
24,PySnooper-1,B_6_13,U1,tests/test_chinese.py,afterChange,1,documentation,test
25,PySnooper-1,D_2_5,U3,tests/test_chinese.py,afterChange,1,documentation,test
26,PySnooper-1,A_4_21,E1,tests/test_chinese.py,afterChange,1,documentation,test
27,PySnooper-1,B_6_13,U1,tests/test_chinese.py,afterChange,2,documentation,test
28,PySnooper-1,D_2_5,U3,tests/test_chinese.py,afterChange,2,documentation,test
...,...,...,...,...,...,...,...,...
59890,youtube-dl-39,B_1_18,U1,youtube_dl/utils.py,afterChange,1577,documentation,bug(fix)
59891,youtube-dl-39,D_2_5,U3,youtube_dl/utils.py,afterChange,1577,documentation,bug(fix)
60132,youtube-dl-42,D_5_2,U3,youtube_dl/extractor/mtv.py,beforeChange,87,documentation,bug(fix)
60133,youtube-dl-42,B_3_16,U1,youtube_dl/extractor/mtv.py,beforeChange,87,documentation,bug(fix)


In [94]:
df[['annotation_hapy', 'annotation_auto']].value_counts()

annotation_hapy  annotation_auto
documentation    test               2449
                 bug(fix)           1545
                 data                 77
Name: count, dtype: int64

In [98]:
df_2 = merge_sel[
    merge_sel['annotation_neq'] &
    (merge_sel['annotation_hapy'] == 'documentation') &
    (merge_sel['annotation_auto'] == 'test')
][[
    'bug','bundle','user','sha',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]
df_2

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
24,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,tests/test_chinese.py,afterChange,1,documentation,test
25,PySnooper-1,D_2_5,U3,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,tests/test_chinese.py,afterChange,1,documentation,test
26,PySnooper-1,A_4_21,E1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,tests/test_chinese.py,afterChange,1,documentation,test
27,PySnooper-1,B_6_13,U1,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,tests/test_chinese.py,afterChange,2,documentation,test
28,PySnooper-1,D_2_5,U3,56f22f8ffe1c6b2be4d2cf3ad1987fdb66113da2,tests/test_chinese.py,afterChange,2,documentation,test
...,...,...,...,...,...,...,...,...,...
58968,youtube-dl-2,D_6_1,U3,9d6ac71c27b1dfb662c795ef598dbfd0286682da,test/test_InfoExtractor.py,afterChange,497,documentation,test
58969,youtube-dl-2,B_4_15,U1,9d6ac71c27b1dfb662c795ef598dbfd0286682da,test/test_InfoExtractor.py,afterChange,497,documentation,test
58970,youtube-dl-2,A_3_22,E3,9d6ac71c27b1dfb662c795ef598dbfd0286682da,test/test_InfoExtractor.py,afterChange,497,documentation,test
59113,youtube-dl-22,C_4_9,U2,db13c16ef8968613680e2bbc85f373c3e74faf98,test/test_YoutubeDL.py,afterChange,2,documentation,test


In [105]:
df_2[df_2['file'].str.count(r'^test|/test|conftest\.py$|_testing\.py') == 0]

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto


In [107]:
df_2.shape[0]/merge_sel.shape[0]

0.04054300140716828

In [108]:
df_2.sample(4)

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
14401,keras-16,B_6_13,U1,fe38f9dfc8c732a77ac03507b63c79b1d2acfba2,tests/keras/test_sequential_model.py,afterChange,171,documentation,test
1869,ansible-4,A_2_23,E2,18a66e291dad71128a32d662aa808213acefe0e9,test/units/playbook/test_collectionsearch.py,afterChange,27,documentation,test
6407,black-23,A_1_24,E1,6316e293ac30a2837ec20eba289fd28a2a18cf89,tests/python2.py,afterChange,17,documentation,test
23312,luigi-9,D_4_3,U3,b7115974c3deadf77113686248b39567cb67e38f,test/retcodes_test.py,afterChange,176,documentation,test


In all cases for project, files, and lines selected at current run thought the notebook, I got comment or comment-like i.e. docstring.
Here are the results:

```console
repositories/keras$ git show fe38f9dfc8c732a77ac03507b63c79b1d2acfba2:tests/keras/test_sequential_model.py | sed -n '171p'
    # Test serialization
repositories/ansible$ git show 18a66e291dad71128a32d662aa808213acefe0e9:test/units/playbook/test_collectionsearch.py | sed -n '27p'
    """Test that collection name is not templated.
repositories/black$ git show 6316e293ac30a2837ec20eba289fd28a2a18cf89:tests/python2.py | sed -n '17p'
# output
repositories/luigi$ git show b7115974c3deadf77113686248b39567cb67e38f:test/retcodes_test.py | sed -n '176p'
    """
```

Which means that in 4 test cases, it was 2 times comment, 2 times docstring - in a test file

In [109]:
df_3 = merge_sel[
    merge_sel['annotation_neq'] &
    (merge_sel['annotation_hapy'] == 'documentation') &
    (merge_sel['annotation_auto'] == 'bug(fix)')
][[
    'bug','bundle','user','sha',
    'file','image','line',
    'annotation_hapy', 'annotation_auto'
]]
df_3

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
549,ansible-11,D_4_3,U3,52f3ce8a808f943561803bd664e695fed1841fe8,lib/ansible/modules/network/ios/ios_banner.py,afterChange,117,documentation,bug(fix)
550,ansible-11,B_4_15,U1,52f3ce8a808f943561803bd664e695fed1841fe8,lib/ansible/modules/network/ios/ios_banner.py,afterChange,117,documentation,bug(fix)
551,ansible-11,A_5_20,E2,52f3ce8a808f943561803bd664e695fed1841fe8,lib/ansible/modules/network/ios/ios_banner.py,afterChange,117,documentation,bug(fix)
552,ansible-11,D_4_3,U3,52f3ce8a808f943561803bd664e695fed1841fe8,lib/ansible/modules/network/ios/ios_banner.py,afterChange,118,documentation,bug(fix)
553,ansible-11,B_4_15,U1,52f3ce8a808f943561803bd664e695fed1841fe8,lib/ansible/modules/network/ios/ios_banner.py,afterChange,118,documentation,bug(fix)
...,...,...,...,...,...,...,...,...,...
59890,youtube-dl-39,B_1_18,U1,a020a0dc20ced6468ec46214c394f6f360735b1d,youtube_dl/utils.py,afterChange,1577,documentation,bug(fix)
59891,youtube-dl-39,D_2_5,U3,a020a0dc20ced6468ec46214c394f6f360735b1d,youtube_dl/utils.py,afterChange,1577,documentation,bug(fix)
60132,youtube-dl-42,D_5_2,U3,5aafe895fce2a7be9595cb2e56b7bd73a748e6b6,youtube_dl/extractor/mtv.py,beforeChange,87,documentation,bug(fix)
60133,youtube-dl-42,B_3_16,U1,5aafe895fce2a7be9595cb2e56b7bd73a748e6b6,youtube_dl/extractor/mtv.py,beforeChange,87,documentation,bug(fix)


In [110]:
df_3.shape[0]/merge_sel.shape[0]

0.025577352868140054

In [112]:
df_3.sample(5)

Unnamed: 0,bug,bundle,user,sha,file,image,line,annotation_hapy,annotation_auto
18335,keras-42,C_6_7,U2,2f3edf96078d78450b985bdf3bfffe7e0c627169,keras/engine/training.py,afterChange,1946,documentation,bug(fix)
45587,pandas-90,C_3_10,U2,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,165,documentation,bug(fix)
49525,scrapy-17,D_6_1,U3,65c7c05060fd2d1fc161d4904243d5e0b31e202b,scrapy/utils/response.py,beforeChange,50,documentation,bug(fix)
45561,pandas-90,B_6_13,U1,1c3d64bae7c07b5ae1be337e0ebd751385b7ce27,pandas/io/pickle.py,afterChange,115,documentation,bug(fix)
39191,pandas-44,C_5_8,U2,50817487ce5b1a2c4896495509e2b53e22fa3212,pandas/core/indexes/timedeltas.py,afterChange,218,documentation,bug(fix)


- 2f3edf96078d78450b985bdf3bfffe7e0c627169:keras/engine/training.py:1946 - inside very long docstring (should be detected by new annotator for `from-repo` case)
- 1c3d64bae7c07b5ae1be337e0ebd751385b7ce27:pandas/io/pickle.py:165 - code just after the end of the docstring; here the user is _**wrong**_,<br> though the context visible in the UI of Label Studio might be the cause of this mistake
- 1c3d64bae7c07b5ae1be337e0ebd751385b7ce27:pandas/io/pickle.py:115 - inside quite a long docstring (should be detected by new annotator)
- 50817487ce5b1a2c4896495509e2b53e22fa3212:pandas/core/indexes/timedeltas.py:218 - 2-nd line of 3-line docstring (should be detected by new annotator)
- 65c7c05060fd2d1fc161d4904243d5e0b31e202b^:scrapy/utils/response.py:50 - empty line inside just started docstring - one line prior (should be detected by new annotator)

In [113]:
merge_sel[merge_sel['annotation_neq']]['annotation_auto'].value_counts()/merge_sel.shape[0]

annotation_auto
test             0.071732
bug(fix)         0.064101
documentation    0.022184
data             0.001440
project          0.000050
Name: count, dtype: float64

In [114]:
merge_sel[merge_sel['annotation_auto']=='project']

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,sha,fcat_auto,annotation_auto,indicator_column,fcat_neq,annotation_neq
56649,bugs-in-py,tornado-15,A_6_19,E3,MANIFEST.in,project,afterChange,12,bug(fix),ecb3ea7543cc942659faf3d2144853018afa6139,project,project,both,False,True
56650,bugs-in-py,tornado-15,C_2_11,U2,MANIFEST.in,project,afterChange,12,bug(fix),ecb3ea7543cc942659faf3d2144853018afa6139,project,project,both,False,True
56651,bugs-in-py,tornado-15,B_4_15,U1,MANIFEST.in,project,afterChange,12,test,ecb3ea7543cc942659faf3d2144853018afa6139,project,project,both,False,True


In [115]:
merge_sel[merge_sel['annotation_auto']=='data']

Unnamed: 0,ds,bug,bundle,user,file,fcat_hapy,image,line,annotation_hapy,sha,fcat_auto,annotation_auto,indicator_column,fcat_neq,annotation_neq
468,bugs-in-py,ansible-10,A_2_23,E2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,1,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
469,bugs-in-py,ansible-10,C_4_9,U2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,1,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
470,bugs-in-py,ansible-10,B_4_15,U1,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,1,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
471,bugs-in-py,ansible-10,A_2_23,E2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,2,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
472,bugs-in-py,ansible-10,C_4_9,U2,changelogs/fragments/66398-pamd_fix-attributee...,documentation,afterChange,2,documentation,a4b59d021368285490f7cda50c11ac4f7a8030b5,data,data,both,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56839,bugs-in-py,tornado-3,B_4_15,U1,.travis.yml,project,beforeChange,87,documentation,aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True
56840,bugs-in-py,tornado-3,A_5_20,E2,.travis.yml,project,beforeChange,87,documentation,aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True
56841,bugs-in-py,tornado-3,C_6_7,U2,.travis.yml,project,beforeChange,88,bug(fix),aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True
56842,bugs-in-py,tornado-3,B_4_15,U1,.travis.yml,project,beforeChange,88,other,aa622e724f80e0f7fcee369f75d69d1db13d72f2,data,data,both,True,True


# ---