In [21]:
import json
from pathlib import Path

import pandas as pd

# Comparison with line annotations in the HaPy-Bug dataset

**HaPy-Bug** dataset comprises annotated diff files from three sources. None
of them had previously been subjected to human annotation at the
granular (line-by-line) level.

$D_{BIP}$: **BugsInPy** subset is an extension of dataset of 496 real
bugs proposed in 
_"[BugsInPy: a database of existing bugs in Python programs to enable controlled testing and debugging studies][BugsInPy]"_ (2020).
This subset focuses on bugs in source code
and excludes issues related to configurations, build scripts, docu-
mentation, and test cases. It also requires bugs to be reproducible,
i.e. at least one test case from the fixed version must fail with the
faulty version. Only changes involving isolated bugs are included.

$D_{CVE}$: **Python CVE** and $D_{CRAWL}$: **Crawled Python CVE** are
new custom made, specialized collections of Python-related bugs
sourced from the [CVE DB](https://cve.mitre.org/) and projects git repositories.

$D_{CVE}$ comprises bugs identified through a comprehensive full-text search
of CVE DB. This subset was refined by selecting bugs with direct
links to source code fixes that involved modifications to Python code.

$D_{CRAWL}$ is a subset created by scanning repositories of most
popular Python projects for commits that have a CVE id pattern
inside. Each bug found was cross-referenced with CVE DB.

[BugsInPy]: https://doi.org/10.1145/3368089.3417943

## $D_{BIP}$: BugsInPy subset of HaPy-Bug dataset

Here all entries are **single diff**

### Experiments extracting data for a single entry (single bug)

> Larger outputs are stored collapsed

In [75]:
annotator_json = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.v2.json'

In [76]:
with open(annotator_json, mode='r') as json_fp:
    annotator_data = json.load(json_fp)

In [77]:
annotator_data.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [78]:
annotator_data['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [79]:
hapybug_json = '/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.json'

In [80]:
with open(hapybug_json, mode='r') as json_fp:
    hapybug_data = json.load(json_fp)

In [81]:
hapybug_data.keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [82]:
annotator_data['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

In [83]:
hapybug_data['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '+': [{'id': 85, 'type': 'bug(fix)'}],
 '-': [{'id': 85, 'type': 'bug(fix)'}]}

In [84]:
hapybug_data['tests/test_generate_context.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'test',
 '+': [{'id': 111, 'type': 'test'},
  {'id': 112, 'type': 'test'},
  {'id': 113, 'type': 'test'},
  {'id': 114, 'type': 'test'},
  {'id': 115, 'type': 'test'},
  {'id': 116, 'type': 'test'},
  {'id': 117, 'type': 'test'},
  {'id': 118, 'type': 'test'},
  {'id': 119, 'type': 'test'},
  {'id': 120, 'type': 'test'},
  {'id': 121, 'type': 'test'}],
 '-': []}

In [85]:
print(Path('/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/cookiecutter-1/patches/7f6804c4953a18386809f11faf4d86898570debc.diff').read_text())

diff --git a/cookiecutter/generate.py b/cookiecutter/generate.py
index 37365a4..c526b97 100644
--- a/cookiecutter/generate.py
+++ b/cookiecutter/generate.py
@@ -82,7 +82,7 @@ def generate_context(
     context = OrderedDict([])
 
     try:
-        with open(context_file) as file_handle:
+        with open(context_file, encoding='utf-8') as file_handle:
             obj = json.load(file_handle, object_pairs_hook=OrderedDict)
     except ValueError as e:
         # JSON decoding error.  Let's throw a new exception that is more
diff --git a/tests/test-generate-context/non_ascii.json b/tests/test-generate-context/non_ascii.json
new file mode 100644
index 0000000..af0edf6
--- /dev/null
+++ b/tests/test-generate-context/non_ascii.json
@@ -0,0 +1,3 @@
+{
+    "full_name": "éèà"
+}
diff --git a/tests/test_generate_context.py b/tests/test_generate_context.py
index 26e7d4d..69d0148 100644
--- a/tests/test_generate_context.py
+++ b/tests/test_generate_context.py
@@ -108,6 +108,17 @@ def test_def

In [86]:
with open('/mnt/data/CVE/final_bugs_packages.json', mode='r') as json_fp:
    where_labeling_data = json.load(json_fp)

In [87]:
where_labeling_data['cookiecutter-1']

{'rA': 1, 'rB': 1, 'rC': 0, 'rD': 1, 'pA': 2, 'pB': 4, 'pC': 1, 'pD': 3}

In [88]:
label_studio_json_1 = '/mnt/data/HaPy-Bug/annotated_data/D_4_3.json'

In [89]:
with open(label_studio_json_1, mode='r') as json_fp:
    label_studio_data_1 = json.load(json_fp)

In [90]:
[elem['annotations'][0]['result'][3]['value'] for elem in label_studio_data_1]

[{'hyperlinks': [{'url': 'http://lists.fedoraproject.org/pipermail/package-announce/2013-May/106220.html',
    'dates': {'min': '2013-01-01', 'max': '2020-05-24'},
    'labels': ['lists.fedoraproject.org',
     'lists.fedoraproject.org/pipermail',
     'lists.fedoraproject.org/pipermail/package-announce',
     'lists.fedoraproject.org/pipermail/package-announce/2013-May']},
   {'url': 'http://lists.fedoraproject.org/pipermail/package-announce/2013-May/105916.html',
    'dates': {'min': '2001-05-22', 'max': '2013-05-14'},
    'labels': ['lists.fedoraproject.org',
     'lists.fedoraproject.org/pipermail',
     'lists.fedoraproject.org/pipermail/package-announce',
     'lists.fedoraproject.org/pipermail/package-announce/2013-May']},
   {'url': 'http://rhn.redhat.com/errata/RHSA-2013-0806.html',
    'dates': {'min': '2012-02-04', 'max': '2013-05-09'},
    'labels': ['Vendor Advisory']},
   {'url': 'https://bugs.launchpad.net/keystone/+bug/1172195',
    'dates': {'min': '2013-01-01', 'max':

### Using the collective.{csv,json}, generated by Paper.ipynb

In [91]:
collective_dir = '../../data/experiments/HaPy-Bug/'
list(Path(collective_dir).glob('*'))

[PosixPath('../../data/experiments/HaPy-Bug/collective.csv'),
 PosixPath('../../data/experiments/HaPy-Bug/collective.json')]

In [92]:
%ls -l '../../data/experiments/HaPy-Bug/'

total 140368
-rw-r--r-- 1 jnareb jnareb 50424028 Nov 27 00:22 collective.csv
-rw-r--r-- 1 jnareb jnareb 93299694 Nov 27 00:22 collective.json


In [93]:
collective_csv = Path(collective_dir) / 'collective.csv'
collective_csv

PosixPath('../../data/experiments/HaPy-Bug/collective.csv')

In [94]:
collective_df = pd.read_csv(collective_csv, index_col=0)
collective_df.index = collective_df.index.rename(name='')
collective_df

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U1,False,cve,CVE-2020-10289
1,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137,bug(fix),U1,False,cve,CVE-2020-10289
2,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103,bug(fix),U1,False,cve,CVE-2020-10289
3,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137,bug(fix),U1,False,cve,CVE-2020-10289
4,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103,bug(fix),U2,False,cve,CVE-2020-10289
...,...,...,...,...,...,...,...,...,...,...,...
391913,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,361,bug(fix),U2,True,cve,CVE-2018-16876
391914,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,362,bug(fix),U2,True,cve,CVE-2018-16876
391915,cve_CVE-2018-16876,auto_C_5_8,lib/ansible/plugins/connection/ssh.py,programming,afterChange,363,bug(fix),U2,True,cve,CVE-2018-16876


In [95]:
collective_df['ds'].value_counts()

ds
crawl         146366
cve           125176
bugs-in-py    120376
Name: count, dtype: int64

In [96]:
collective_df_manual = collective_df[collective_df['auto'] == False]
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

### Running annotation on BugsInPy dataset

The annotation data was generated using the following command:

```console
diff-annotate dataset \
    --output-prefix=/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug \
    /mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/
```

And as can be seen, it is present in `/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/`

In [97]:
bugsinpy_annotated_from_dataset_dir = '/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/'

In [98]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/

[0m[01;34mansible-1[0m/       [01;34mkeras-40[0m/       [01;34mpandas-125[0m/  [01;34mpandas-64[0m/    [01;34mthefuck-12[0m/
[01;34mansible-10[0m/      [01;34mkeras-41[0m/       [01;34mpandas-126[0m/  [01;34mpandas-65[0m/    [01;34mthefuck-13[0m/
[01;34mansible-11[0m/      [01;34mkeras-42[0m/       [01;34mpandas-127[0m/  [01;34mpandas-66[0m/    [01;34mthefuck-14[0m/
[01;34mansible-12[0m/      [01;34mkeras-43[0m/       [01;34mpandas-128[0m/  [01;34mpandas-67[0m/    [01;34mthefuck-15[0m/
[01;34mansible-13[0m/      [01;34mkeras-44[0m/       [01;34mpandas-129[0m/  [01;34mpandas-68[0m/    [01;34mthefuck-16[0m/
[01;34mansible-14[0m/      [01;34mkeras-45[0m/       [01;34mpandas-13[0m/   [01;34mpandas-69[0m/    [01;34mthefuck-17[0m/
[01;34mansible-15[0m/      [01;34mkeras-5[0m/        [01;34mpandas-130[0m/  [01;34mpandas-7[0m/     [01;34mthefuck-18[0m/
[01;34mansible-16[0m/      [01;34mkeras-6[0m/        [01;34mpandas-

In [99]:
%ls /mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/

7f6804c4953a18386809f11faf4d86898570debc.v2.json


In [100]:
example_repo = 'cookiecutter'
example_bug = 'cookiecutter-1'

example_path = next(Path(bugsinpy_annotated_from_dataset_dir).joinpath(example_bug, 'annotation').glob('*.json'))
example_path

PosixPath('/mnt/data/python-diff-annotator/example_annotations/HaPy-Bug/bugsinpy-dataset/cookiecutter-1/annotation/7f6804c4953a18386809f11faf4d86898570debc.v2.json')

In [101]:
with open(example_path, mode='r') as json_fp:
    example_data_from_dataset = json.load(json_fp)

type(example_data_from_dataset)

dict

In [102]:
example_data_from_dataset.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

In [103]:
example_data_from_dataset['commit_metadata']

{'id': '7f6804c4953a18386809f11faf4d86898570debc'}

In [104]:
example_data_from_dataset['diff_metadata']

{'n_files': 3,
 'hunk_span_src': 11,
 'hunk_span_dst': 24,
 'n_hunks': 3,
 'n_lines_added': 15,
 'n_lines_removed': 1,
 'n_lines_all': 28,
 'n_mod': 1,
 'n_groups': 3,
 'patch_size': 15,
 'n_added_files': 1,
 'n_add': 14}

In [105]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [106]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

### Extracting commit ids from BugsInPy dataset

For each bug in **BugsInPy** dataset we want repository and commit id, to be able to use more powerful `diff-annotate from-repo`, rather than `diff-annotate dataset`.

In [107]:
bugsinpy_dir = '/mnt/data/HaPy-Bug/raw_data/bugsinpy-dataset/'

In [132]:
repo_commits = {}

for bug_dir in Path(bugsinpy_dir).iterdir():
    repo_name = bug_dir.name.rsplit('-', maxsplit=1)[0]

    #print(f"{bug_dir.name=}, {repo_name=}")
    if repo_name not in repo_commits:
        repo_commits[repo_name] = { 'commits': [], 'bugs': [] }

    repo_commits[repo_name]['bugs'].append(bug_dir.name)
    
    for diff_file in bug_dir.joinpath('patches').glob('*.diff'):
        #print(f"  {diff_file.stem=}")
        repo_commits[repo_name]['commits'].append(diff_file.stem)

repo_commits['cookiecutter']

{'commits': ['7f6804c4953a18386809f11faf4d86898570debc',
  '7129d474206761a6156925db78eee4b62a0e3944',
  '90434ff4ea4477941444f1e83313beb414838535',
  '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'],
 'bugs': ['cookiecutter-1',
  'cookiecutter-3',
  'cookiecutter-2',
  'cookiecutter-4']}

In [133]:
repo_commits.keys()

dict_keys(['pandas', 'thefuck', 'tornado', 'black', 'youtube-dl', 'spacy', 'keras', 'ansible', 'scrapy', 'fastapi', 'luigi', 'matplotlib', 'tqdm', 'sanic', 'cookiecutter', 'httpie', 'PySnooper'])

Find where repositories were cloned to (locally):

In [125]:
repositories_json = '../../data/experiments/HaPy-Bug/repositories.json'
%ls -l '../../data/experiments/HaPy-Bug/repositories.json'

-rw-r--r-- 1 jnareb jnareb 15132 Nov 27 02:01 ../../data/experiments/HaPy-Bug/repositories.json


In [126]:
with open(repositories_json, mode='r') as json_fp:
    repositories_data = json.load(json_fp)

repositories_data[:3]

[{'project': 'pandas',
  'repository_url': 'https://github.com/pandas-dev/pandas',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/pandas'},
 {'project': 'ansible',
  'repository_url': 'https://github.com/ansible/ansible',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/ansible'},
 {'project': 'black',
  'repository_url': 'https://github.com/psf/black',
  'repository_path': '/mnt/data/python_bug_localization_data/repositories/black'}]

In [127]:
repositories_map = {
    elem['project']: {'url': elem['repository_url'], 'path': elem['repository_path'] }
    for elem in repositories_data
}

repositories_map['cookiecutter']

{'url': 'https://github.com/cookiecutter/cookiecutter',
 'path': '/mnt/data/python_bug_localization_data/repositories/cookiecutter'}

### Running annotation on BugsInPy repos

In [136]:
for repo_name, repo_data in repo_commits.items():
    print(f"{repo_name}:")
    cmd_str = (
        f"uptime && time diff-annotate from-repo " 
        f"--output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/{repo_name} "
        f"{repositories_map[repo_name]['path']} --no-walk=sorted {' '.join(repo_data['commits'])}"
    )
    print("  arg_length <=", len(cmd_str))
    print(cmd_str)
    print("")

pandas:
  arg_length <= 7093
uptime && time diff-annotate from-repo --output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/pandas /mnt/data/python_bug_localization_data/repositories/pandas --no-walk=sorted ffe6cfdbf82d663c3f77567bde11f1666de1df38 b1c871ce4b5e76b3cffe1ebd4216d36379872352 74dad82827e9b13552df2d6d3fbbeb901821b53f daef69c1366e31c3c49aea6f2e55f577d0c832fd 71d610596ed128055614eb660f13c88168bfe22f 82c9547ddcaf2fd70e00f1368731f14a03bbac88 0b0cd08524e4472eb15835c2b91621dc0a6eeeb0 cb71376385c33270fa1922aec9eb6c49de4336f4 fcf7258c19b0a6a712f33fb0bcefdae426be7e7f 628dfba239865adc09c94108b288bcb60c619950 8dd9fabd2ad9104e747084437b9ad436d5be087a 73d614403759831814ef7ab83ef1e4aaa645b33a 91150d976ac41bd93a0e6516b2090c534f91aff2 b7f061c3d24df943e16918ad3932e767f5639a38 53a0dfd41a65a33dd7b0963734b24c749212e625 dd71064327721c1ec7366000f357b0c08bcec4d2 ca5198a6daa7757e398112a17ccadc9e7d078d96 f61deb962ac0853595a43ad024c482b018d1792b 06ef193a5c1957c0a76e3e88bc7

Run for example:

```console
uptime && time diff-annotate from-repo \
    --output-dir=/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter \
    /mnt/data/python_bug_localization_data/repositories/cookiecutter --no-walk=sorted \
    7f6804c4953a18386809f11faf4d86898570debc 7129d474206761a6156925db78eee4b62a0e3944 \
    90434ff4ea4477941444f1e83313beb414838535 457a1a4e862aab4102b644ff1d2b2e2b5a766b3c
```

```
 02:36:41 up 289 days,  4:26, 12 users,  load average: 1.08, 1.18, 1.54
Logging to 'diff-annotate.log' file, with log level=WARNING
Computing patch sizes and spreads (# files, # change groups, # spanned lines,...)
Storing annotations in <output_dir>/<commit_id>.json
  with output dir: '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter'
Ensuring that output directory '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter' exists
Generating patches from local Git repo '/mnt/data/python_bug_localization_data/repositories/cookiecutter'
  using `git log -p '--no-walk=sorted' '7f6804c4953a18386809f11faf4d86898570debc' '7129d474206761a6156925db78eee4b62a0e3944' '90434ff4ea4477941444f1e83313beb414838535' '457a1a4e862aab4102b644ff1d2b2e2b5a766b3c'`
  took 0.212 seconds (includes parsing unified diffs)
Annotating commits and saving annotated data, for 4 commits
  lexing pre- and post-image file contents, from repo 'cookiecutter'
  using sequential processing
commits: 100%|█████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.38it/s]

real    0m1.720s
user    0m8.406s
sys     0m0.176s
```

Extract the same commit annotated data:

In [137]:
example_repo = 'cookiecutter'
example_commit = '7f6804c4953a18386809f11faf4d86898570debc'

In [139]:
bugsinpy_annotated_from_repo_dir = '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

%ls -1 '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/'

[0m[01;34mcookiecutter[0m/


In [140]:
%ls -1 '/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter'

457a1a4e862aab4102b644ff1d2b2e2b5a766b3c.v2.json
7129d474206761a6156925db78eee4b62a0e3944.v2.json
7f6804c4953a18386809f11faf4d86898570debc.v2.json
90434ff4ea4477941444f1e83313beb414838535.v2.json


In [142]:
example_path_2 = Path(bugsinpy_annotated_from_repo_dir).joinpath(example_repo, f"{example_commit}.v2.json")
example_path_2

PosixPath('/mnt/data/python-diff-annotator/example_annotations/bugsinpy-from-repo/cookiecutter/7f6804c4953a18386809f11faf4d86898570debc.v2.json')

In [143]:
with open(example_path_2, mode='r') as json_fp:
    example_data_from_repo = json.load(json_fp)

type(example_data_from_repo)

dict

In [144]:
example_data_from_repo.keys()

dict_keys(['commit_metadata', 'changes', 'diff_metadata'])

There is more commit metadata, because `diff-annotate dataset ...` does not yet try to parse `*.message` files

In [145]:
example_data_from_repo['commit_metadata']

{'id': '7f6804c4953a18386809f11faf4d86898570debc',
 'parents': ['c15633745df6abdb24e02746b82aadb20b8cdf8c'],
 'tree': 'd04faaa47bc47a2f2cda28dcba057ac3865d842e',
 'author': {'author': 'Aurélien Gâteau <mail@agateau.com>',
  'name': 'Aurélien Gâteau',
  'email': 'mail@agateau.com',
  'timestamp': 1590790310,
  'tz_info': '+0200'},
 'committer': {'committer': 'GitHub <noreply@github.com>',
  'name': 'GitHub',
  'email': 'noreply@github.com',
  'timestamp': 1590790310,
  'tz_info': '+0300'},
 'message': 'Fix default values being loaded with wrong encoding on Windows (#1414)\n\nExplicitly set the encoding to utf-8 when reading the context file to\nensure values are correctly loaded.\n\nCo-authored-by: Andrey Shpak <insspb@users.noreply.github.com>\n'}

In [146]:
example_data_from_repo['diff_metadata']

{'n_files': 3,
 'hunk_span_src': 11,
 'hunk_span_dst': 24,
 'n_hunks': 3,
 'n_lines_added': 15,
 'n_lines_removed': 1,
 'n_lines_all': 28,
 'n_mod': 1,
 'n_groups': 3,
 'patch_size': 15,
 'n_added_files': 1,
 'n_add': 14}

In [148]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [147]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

### Creating DataFrame for comparison

In [153]:
collective_df_manual.columns

Index(['id', 'bundle', 'file', 'fcat', 'image', 'line', 'annotation', 'user',
       'auto', 'ds', 'bug'],
      dtype='object')

In [154]:
collective_df_manual.dtypes

id            object
bundle        object
file          object
fcat          object
image         object
line           int64
annotation    object
user          object
auto            bool
ds            object
bug           object
dtype: object

In [152]:
collective_df_manual.head(5)

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
0.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U1,False,cve,CVE-2020-10289
1.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,afterChange,137.0,bug(fix),U1,False,cve,CVE-2020-10289
2.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,103.0,bug(fix),U1,False,cve,CVE-2020-10289
3.0,cve_CVE-2020-10289,B_6_13,actionlib_tools/scripts/library.py,programming,beforeChange,137.0,bug(fix),U1,False,cve,CVE-2020-10289
4.0,cve_CVE-2020-10289,C_4_9,actionlib_tools/scripts/library.py,programming,afterChange,103.0,bug(fix),U2,False,cve,CVE-2020-10289


In [156]:
collective_df_manual['ds'].value_counts()

ds
crawl         73183
cve           62588
bugs-in-py    60194
Name: count, dtype: int64

In [157]:
collective_df_manual_bugsinpy = collective_df_manual[collective_df_manual['ds'] == 'bugs-in-py']
collective_df_manual_bugsinpy

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
16414,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,37,documentation,U1,False,bugs-in-py,keras-17
16415,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,afterChange,38,bug(fix),U1,False,bugs-in-py,keras-17
16416,bugs-in-py_keras-17,B_6_13,keras/metrics.py,programming,beforeChange,37,bug(fix),U1,False,bugs-in-py,keras-17
16417,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,50,test,U1,False,bugs-in-py,keras-17
16418,bugs-in-py_keras-17,B_6_13,tests/keras/metrics_test.py,test,afterChange,51,test,U1,False,bugs-in-py,keras-17
...,...,...,...,...,...,...,...,...,...,...,...
195909,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,133,test,E1,False,bugs-in-py,pandas-54
195910,bugs-in-py_pandas-54,A_1_24,pandas/tests/dtypes/test_dtypes.py,test,afterChange,134,test,E1,False,bugs-in-py,pandas-54
195911,bugs-in-py_pandas-54,A_1_24,pandas/tests/indexes/common.py,test,afterChange,608,test,E1,False,bugs-in-py,pandas-54


In [159]:
collective_df_manual_bugsinpy[collective_df_manual_bugsinpy['bug'] == 'cookiecutter-1']['bundle'].value_counts()

bundle
D_4_3     16
B_5_14    16
A_3_22    16
Name: count, dtype: int64

In [170]:
example_collective = collective_df_manual_bugsinpy[
    (collective_df_manual_bugsinpy['bug'] == 'cookiecutter-1') & 
    (collective_df_manual_bugsinpy['bundle'] == 'D_4_3')
]

example_collective

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
145609.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145610.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145611.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test,U3,False,bugs-in-py,cookiecutter-1
145612.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test,U3,False,bugs-in-py,cookiecutter-1
145613.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test,U3,False,bugs-in-py,cookiecutter-1
145614.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,111.0,test,U3,False,bugs-in-py,cookiecutter-1
145615.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,112.0,documentation,U3,False,bugs-in-py,cookiecutter-1
145616.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,113.0,test,U3,False,bugs-in-py,cookiecutter-1
145617.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test_generate_context.py,test,afterChange,114.0,test,U3,False,bugs-in-py,cookiecutter-1


In [163]:
example_data_from_dataset['changes'].keys()

dict_keys(['cookiecutter/generate.py', '/dev/null', 'tests/test-generate-context/non_ascii.json', 'tests/test_generate_context.py'])

In [164]:
example_data_from_dataset['changes']['cookiecutter/generate.py']

{'language': 'Python',
 'type': 'programming',
 'purpose': 'programming',
 '-': [{'id': 3,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ')'],
    [71, ['Text'], ' '],
    [72, ['Keyword'], 'as'],
    [74, ['Text'], ' '],
    [75, ['Name'], 'file_handle'],
    [86, ['Punctuation'], ':'],
    [87, ['Text', 'Whitespace'], '\n']]}],
 '+': [{'id': 4,
   'file_line_no': 85,
   'type': 'code',
   'purpose': 'programming',
   'tokens': [[40, ['Text'], '        '],
    [48, ['Keyword'], 'with'],
    [52, ['Text'], ' '],
    [53, ['Name', 'Builtin'], 'open'],
    [57, ['Punctuation'], '('],
    [58, ['Name'], 'context_file'],
    [70, ['Punctuation'], ','],
    [71, ['Text'], ' '],
    [72, ['Name'], 'encoding'],
    [80, ['Operator'], '='],
 

In [172]:
example_records = []
dataset = "bugs-in-py"
bug = "cookiecutter-1"

for patched_file, file_data in example_data_from_dataset['changes'].items():
    if patched_file == '/dev/null':
        continue
        
    for pm in list("-+"):
        if pm not in file_data:
            continue

        for line_data in file_data[pm]:
            example_records.append({
                'id': f"{dataset}_{bug}",
                'file': patched_file,
                'fcat': file_data['purpose'],
                'image': 'beforeChange' if pm == '-' else 'afterChange',
                'line': line_data['file_line_no'],
                'annotation': 'bug(fix)' if line_data['type'] == 'code' else line_data['type'],
                'ds': dataset,
                'bug': bug,
            })

example_records[:5]

[{'id': 'bugs-in-py_cookiecutter-1',
  'file': 'cookiecutter/generate.py',
  'fcat': 'programming',
  'image': 'beforeChange',
  'line': 85,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'cookiecutter/generate.py',
  'fcat': 'programming',
  'image': 'afterChange',
  'line': 85,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'image': 'afterChange',
  'line': 1,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'test',
  'image': 'afterChange',
  'line': 2,
  'annotation': 'bug(fix)',
  'ds': 'bugs-in-py',
  'bug': 'cookiecutter-1'},
 {'id': 'bugs-in-py_cookiecutter-1',
  'file': 'tests/test-generate-context/non_ascii.json',
  'fcat': 'tes

In [167]:
example_df = pd.DataFrame.from_records(example_records)
example_df

Unnamed: 0,id,file,fcat,image,annotation,ds,bug
0,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,bug(fix),bugs-in-py,cookiecutter-1
1,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,afterChange,bug(fix),bugs-in-py,cookiecutter-1
2,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
3,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
4,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
5,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
6,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
7,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
8,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,documentation,bugs-in-py,cookiecutter-1
9,bugs-in-py_cookiecutter-1,tests/test_generate_context.py,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1


In [169]:
example_df.head(5)

Unnamed: 0,id,file,fcat,image,annotation,ds,bug
0,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,beforeChange,bug(fix),bugs-in-py,cookiecutter-1
1,bugs-in-py_cookiecutter-1,cookiecutter/generate.py,programming,afterChange,bug(fix),bugs-in-py,cookiecutter-1
2,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
3,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1
4,bugs-in-py_cookiecutter-1,tests/test-generate-context/non_ascii.json,test,afterChange,bug(fix),bugs-in-py,cookiecutter-1


In [171]:
example_collective.head(5)

Unnamed: 0,id,bundle,file,fcat,image,line,annotation,user,auto,ds,bug
,,,,,,,,,,,
145609.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,afterChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145610.0,bugs-in-py_cookiecutter-1,D_4_3,cookiecutter/generate.py,programming,beforeChange,85.0,bug(fix),U3,False,bugs-in-py,cookiecutter-1
145611.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,1.0,test,U3,False,bugs-in-py,cookiecutter-1
145612.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,2.0,test,U3,False,bugs-in-py,cookiecutter-1
145613.0,bugs-in-py_cookiecutter-1,D_4_3,tests/test-generate-context/non_ascii.json,test,afterChange,3.0,test,U3,False,bugs-in-py,cookiecutter-1


# ---