In [1]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

# Data is produced by the mine-all-pr-comments.py script.
# Keep exactly one DATA_FILENAME active; the commented lines are quick switches
# between the per-language dumps.
DATA_FILENAME = "data/mined-comments-25stars-25prs-JavaScript-TypeScript.json.gz"
# DATA_FILENAME = "data/mined-comments-25stars-25prs-Java.json.gz"
# DATA_FILENAME = "data/mined-comments-25stars-25prs-Go.json.gz"
# DATA_FILENAME = "data/mined-comments-25stars-25prs-Python.json.gz"

# Languages looked for in the file name, in order. 'JavaScript' has to come before
# 'Java': 'Java' is a substring of 'JavaScript', so it would match every JS file name.
SUPPORTED_LANGUAGES = ['Go', 'JavaScript', 'Java', 'Python']

# First supported language whose name occurs in DATA_FILENAME.
LANGUAGE = next((lang for lang in SUPPORTED_LANGUAGES if lang in DATA_FILENAME), None)
if LANGUAGE is None:
    # Fail loudly with the offending file name instead of a bare IndexError.
    raise ValueError(
        f'Cannot infer language from {DATA_FILENAME!r}; '
        f'expected the name to contain one of {SUPPORTED_LANGUAGES}'
    )

In [2]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

def get_comments_df(filename=None):
    """Load mined PR review comments into one flat DataFrame.

    The input file is a gzip-compressed JSON object that maps a repository
    name to a list of comment objects. Every comment becomes one row, and the
    repository it was listed under is stored in an added 'repo' column.

    Args:
        filename: Path of the ``.json.gz`` file to read. When omitted, the
            module-level ``DATA_FILENAME`` is used.

    Returns:
        pandas.DataFrame with one row per comment, in the order the comments
        appear in the file.
    """
    import gzip
    import json

    if filename is None:
        filename = DATA_FILENAME

    # Binary mode: json.load accepts bytes and detects the encoding itself.
    with gzip.open(filename, 'rb') as f:
        comment_map = json.load(f)

    # Build new dicts instead of mutating the loaded ones; the 'repo' key
    # overrides any same-named field a comment might already carry.
    all_comments = [
        {**comment, 'repo': repo}
        for repo, comments in comment_map.items()
        for comment in comments
    ]
    return pd.DataFrame(all_comments)

# Read the whole dump once; every later cell works off this frame.
print(f'Loading data from {DATA_FILENAME} for language {LANGUAGE}...')
comments_df = get_comments_df()

# Overall number of mined review comments (one row each).
print('Total comments: {}'.format(comments_df.shape[0]))

# Number of distinct repositories that contributed at least one comment.
print('Repos with PR comments: {}'.format(comments_df['repo'].unique().size))

Loading data from data/mined-comments-25stars-25prs-JavaScript-TypeScript.json.gz for language JavaScript...
Total comments: 2895875
Repos with PR comments: 15608


In [3]:
# Vanity check: were any of my own review comments mined? ("user" == "pelmers")
print('Comments by me: {}'.format((comments_df['user'] == 'pelmers').sum()))
# Show the first few of them, if any
comments_df.loc[comments_df['user'] == 'pelmers'].head()

Comments by me: 0


Unnamed: 0,html_url,path,line,body,user,diff_hunk,author_association,commit_id,id,repo


In [4]:
import os

# Derive each comment's file extension (with the leading dot) from its path
# and store it as a new 'extension' column on comments_df.
comments_df['extension'] = comments_df['path'].map(lambda path: os.path.splitext(path)[1])

# Count of comments whose 'line' field is missing (NaN).
print('Null lines: {}'.format(comments_df['line'].isna().sum()))

# Ten most common extensions by number of comments.
print(comments_df['extension'].value_counts().iloc[:10])

# Ten repositories with the most comments.
print(comments_df['repo'].value_counts().iloc[:10])

Null lines: 1808550
.ts    1759265
.js    1136610
Name: extension, dtype: int64
salto-io/salto              9497
plotly/plotly.js            9224
vatesfr/xen-orchestra       9083
microsoft/TypeScript        9067
microsoft/vscode            9037
CesiumGS/cesium             8859
firebase/firebase-js-sdk    8854
tinymce/tinymce             8662
Agoric/agoric-sdk           8576
mozilla/addons-frontend     8545
Name: repo, dtype: int64


In [5]:
# Keywords suggesting that a review comment talks about code documentation.
comment_pattern = 'javadoc|jsdoc|docstr|godoc|tsdoc|documenta|comment'

# Compute the keyword mask once and reuse it below. na=False makes a missing
# body count as "no match" instead of leaving NaN in the boolean mask.
mentions_doc = comments_df['body'].str.contains(comment_pattern, case=False, na=False)

print(f'Comments containing the pattern {comment_pattern}, by extension:')
print(comments_df[mentions_doc]['extension'].value_counts())

# Increase column width so the whole url is visible.
# Fully qualified option names: the short forms only work through pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.max_seq_items", 200)

comment_update_verbs = ['fix', 'update', 'address', 'revise', 'modify', 'amend', 'outdate', 'change']

# How many comments contain one of the comment update verbs, and the word comment?
# The verbs are joined into one alternation and matched case-insensitively as
# substrings, so e.g. 'fixed' or 'changes' also count.
mentions_verb = comments_df['body'].str.contains('|'.join(comment_update_verbs), case=False, na=False)
comments_with_verb_df = comments_df[mentions_doc & mentions_verb]

print('Comments containing the pattern and one of the verbs: {}'.format(len(comments_with_verb_df)))
# Guard the ratio so an empty dataset reports 0 instead of raising ZeroDivisionError.
total_comments = len(comments_df)
percentage = 100 * len(comments_with_verb_df) / total_comments if total_comments else 0.0
print('percentage of total: {}'.format(percentage))


Comments containing the pattern javadoc|jsdoc|docstr|godoc|tsdoc|documenta|comment, by extension:
.ts    83477
.js    55430
Name: extension, dtype: int64
Comments containing the pattern and one of the verbs: 24374
percentage of total: 0.8416799758276859


In [6]:
# Break the keyword+verb matches down by the commenter's relationship to the repo.
# The label now names the column that is actually counted (author_association).
print('Comments containing the verb as well, by author association')
print(comments_with_verb_df['author_association'].value_counts())

# From here on comment_pattern is a regex for the comment *syntax* of LANGUAGE,
# to be matched against diff hunks. It replaces the keyword pattern that was
# matched against review text earlier.
# Raw strings keep the backslashes as written; in a normal string '\/' is an
# invalid escape sequence that recent Python versions warn about.
if LANGUAGE == 'Go':
    # Go functions are commented with //
    comment_pattern = r'\/\/'
elif LANGUAGE == 'Java':
    # Java uses /** for method comments
    comment_pattern = r'\/\*\*'
elif LANGUAGE == 'JavaScript':
    # JS: match /* (which also covers /**) or //
    comment_pattern = r'\/\*|\/\/'
elif LANGUAGE == 'Python':
    # Python function comments (docstrings) start with ''' or """
    comment_pattern = '"""|\'\'\''
print(f'Using comment pattern: {comment_pattern}')

# Narrow the keyword+verb matches down to review comments whose diff hunk
# actually contains comment syntax.
comments_with_comment_df = comments_with_verb_df[
    # the diff hunk matches the comment-syntax regex (a missing hunk counts as no match)
    comments_with_verb_df['diff_hunk'].str.contains(comment_pattern, na=False)
    # and the 'line' field of the review comment is present (non-null)
    & comments_with_verb_df['line'].notnull()
]
print(comments_with_comment_df['author_association'].value_counts())
# Disabled stricter filter, kept for reference: only keep comments whose diff hunk
# ENDS on a comment line.
# # For JS and Java, parse the "diff_hunk" field and check that the last line starts with *, //, or /*
# if LANGUAGE == 'JavaScript' or LANGUAGE == 'Java':
#     comments_with_comment_df = comments_with_comment_df[
#         comments_with_comment_df['diff_hunk'].str.split('\n').apply(
#             lambda x: x[-1].strip().startswith('*') or x[-1].strip().startswith('//') or x[-1].strip().startswith('/*')
#         )
#     ]
# # For Go, only keep examples where the last line of diff_hunk starts with //
# if LANGUAGE == 'Go':
#     comments_with_comment_df = comments_with_comment_df[
#         comments_with_comment_df['diff_hunk'].str.split('\n').apply(
#             lambda x: x[-1].strip().startswith('//')
#         )
#     ]

import json
import random

out_path = f'data/{LANGUAGE}-comments-with-comment.json'

# Fixed seed so that re-running the notebook writes the records in the same order.
SHUFFLE_SEED = 0

# Dump the list to json in shuffled order.
# A dedicated Random instance is used so the global random state is left alone.
records = comments_with_comment_df.to_dict('records')
random.Random(SHUFFLE_SEED).shuffle(records)
with open(out_path, 'w') as f:
    json.dump(records, f, indent=2)

print(f'Saved {len(comments_with_comment_df)} comments to {out_path}')

# Show the retained comments (pandas truncates the rendering of long frames)
comments_with_comment_df


Comments containing the verb as well, by extension
CONTRIBUTOR     12227
MEMBER           7417
COLLABORATOR     3209
NONE             1032
OWNER             489
Name: author_association, dtype: int64
Using comment pattern: \/\*|\/\/
CONTRIBUTOR     2537
MEMBER          1706
COLLABORATOR     698
NONE             319
OWNER             95
Name: author_association, dtype: int64
Saved 5355 comments to data/JavaScript-comments-with-comment.json


Unnamed: 0,html_url,path,line,body,user,diff_hunk,author_association,commit_id,id,repo,extension
83,https://github.com/airbnb/javascript/pull/618#discussion_r47428523,packages/eslint-config-airbnb/rules/react.js,14.0,those changes are great but i also meant URLs in comments inline in `react.js`\n,ljharb,"@@ -9,7 +9,7 @@ module.exports = {\n // Prevent missing displayName in a React component definition\n 'react/display-name': 0,\n // Enforce boolean attributes notation in JSX\n- 're...",COLLABORATOR,d02a50637763c40dc50af2c033f288f4c7643b41,47428523,airbnb/javascript,.js
490,https://github.com/twbs/bootstrap/pull/12037#discussion_r8581325,Gruntfile.js,321.0,descriptive comment needs to be updated\n,cvrebert,"@@ -306,10 +323,10 @@ module.exports = function (grunt) {\n grunt.registerTask('dist-css', ['less', 'csscomb', 'usebanner']);\n \n // Fonts distribution task.",COLLABORATOR,05cc208333cdffd95929b10e84ef5a2519dc52c7,8581325,twbs/bootstrap,.js
2756,https://github.com/facebook/react/pull/75#discussion_r4649857,src/eventPlugins/TextChangeEventPlugin.js,143.0,"Add a comment about how `propertychange` does not bubble so we cannot listen to this using top-level event delegation and have to enqueue and process manually. (By the way, @jordow, can you explai...",yungsters,"@@ -0,0 +1,204 @@\n+/**\n+ * Copyright 2013 Facebook, Inc.\n+ *\n+ * Licensed under the Apache License, Version 2.0 (the ""License"");\n+ * you may not use this file except in compliance with the Li...",CONTRIBUTOR,0dc08c211598ac26ec33ea395daf9c08754eadbf,4649857,facebook/react,.js
2937,https://github.com/facebook/react/pull/378#discussion_r6636597,src/core/ReactDOMIDOperations.js,146.0,"The comment for this method is no longer correct, can you fix it?\n",petehunt,"@@ -143,7 +143,7 @@ var ReactDOMIDOperations = {\n var node = ReactMount.getNode(id);\n // HACK: IE8- normalize whitespace in innerHTML, removing leading spaces.\n // @see quirksmode.o...",CONTRIBUTOR,3ca507d73ffc07087ed4e95e010a101298f78146,6636597,facebook/react,.js
2983,https://github.com/facebook/react/pull/372#discussion_r6907787,src/dom/components/__tests__/ReactDOMSelect-test.js,145.0,One interesting additional thing we should do here is actually just change the object like you did above and call forceUpdate to ensure that works. So long as we aren't converting `value` to a str...,zpao,"@@ -116,6 +116,35 @@ describe('ReactDOMSelect', function() {\n expect(node.options[2].selected).toBe(false); // gorilla\n });\n \n+ it('should allow setting `value` with `objectToString`',...",MEMBER,91937f9888b79c5c2d85d8ea7f2cb1d5e7cd6ab0,6907787,facebook/react,.js
...,...,...,...,...,...,...,...,...,...,...,...
2892754,https://github.com/zowe/imperative/pull/937#discussion_r1146099390,packages/security/src/CredentialManagerOverride.ts,35.0,"Zowe Explorer does not look for any specific value in the ""CredentialManager"" property, just checks whether it is truthy or falsy. However, the current version of ZE does store the value `@zowe/cl...",t1m0thyj,"@@ -0,0 +1,231 @@\n+/*\n+* This program and the accompanying materials are made available under the terms of the\n+* Eclipse Public License v2.0 which accompanies this distribution, and is availab...",MEMBER,401f82c9a6aec6b3ca50f95f97e981ee264f80d6,1146099390,zowe/imperative,.ts
2893651,https://github.com/project-koku/koku-ui/pull/2363#discussion_r841733684,src/utils/format.ts,210.0,i wonder if the name of this should be changed to be more descriptive rather than having to add comments wherever it is being used.. maybe `normalizeToUsdDecimalFormat` even,gitdallas,"@@ -206,7 +206,7 @@ export const isPercentageFormatValid = (value: string) => {\n };\n \n // Some locales have a comma decimal separator (e.g., ""1.234,56"" in German is ""1,234.56"" in USD).\n-// Thi...",COLLABORATOR,2269c6d184e770eb8db364355f2aa6326bf37225,841733684,project-koku/koku-ui,.ts
2894273,https://github.com/equinor/webviz-subsurface-components/pull/645#discussion_r750356317,react/src/lib/components/DeckGLMap/utils/continuousLegend.ts,32.0,Changes as per the comment,shruthirai,"@@ -1,5 +1,86 @@\n-import { scaleSequential, interpolateViridis, ScaleSequential } from ""d3"";\n+import { color } from ""d3-color"";\n+import { interpolateRgb } from ""d3-interpolate"";\n+// eslint-dis...",COLLABORATOR,d78b0424e74159a0673300385d2fdd4c7ec5aaf0,750356317,equinor/webviz-subsurface-components,.ts
2894310,https://github.com/equinor/webviz-subsurface-components/pull/1049#discussion_r896594229,react/src/lib/components/DeckGLMap/layers/intersection/unfoldedPathLayer.ts,23.0,Or use this inject to modify the position in clip space.\r\nComment line 20 if using inject,shadab-skhan,"@@ -0,0 +1,31 @@\n+import { PathLayer, PathLayerProps } from ""@deck.gl/layers"";\n+import { Feature } from ""geojson"";\n+import unfoldedPathShaderVsGlsl from ""./unfoldedPathShader.vs.glsl"";\n+\n+int...",COLLABORATOR,976154238961d7a194bdbf3584cdd18754eb7f70,896594229,equinor/webviz-subsurface-components,.ts


In [7]:
# Ten repositories with the most retained comments, most frequent first
comments_with_comment_df['repo'].value_counts().iloc[:10]

zulip/zulip-mobile                      57
Agoric/agoric-sdk                       46
firebase/firebase-js-sdk                43
salto-io/salto                          35
eslint/eslint                           34
google/blockly                          34
OfficeDev/microsoft-teams-library-js    27
iTwin/itwinjs-core                      27
microsoft/vscode-jupyter                25
microsoft/TypeScript                    25
Name: repo, dtype: int64