# Exploratory Data Analysis for repo issues - Jest

In [185]:
from itertools import compress
from ast import literal_eval
import re


import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

import matplotlib.pyplot as plt



%matplotlib inline


In [86]:
df_orig = pd.read_csv('data/repo_issue_summary_jest.csv')

Cast columns to appropriate data type

In [87]:
# --- Cast date columns to datetime
cols_date = ['createdAt', 'closedAt', 'firstCommentCreatedAt']
df_orig[cols_date] = df_orig[cols_date].apply(pd.to_datetime)

# --- Cast labels column to list
df_orig['labels'] = df_orig['labels'].apply(literal_eval)

# --- Fill na of the text columns with ''
cols_text = ['title', 'contents']
df_orig[cols_text] = df_orig[cols_text].fillna('')

In [88]:
df_orig.head()

Unnamed: 0,issue_id,title,contents,authorLogin,authorAssociation,createdAt,closed,closedAt,closedDuration_days,milestone,...,label_:boom: Regression,label_Wontfix,label_On Hold,label_Has Bounty,label_Coverage,label_Bug Report,label_Confirmed,label_Documentation :book:,label_:bug: Bug,label_Needs Triage
0,3,Make global `jest` command line tool like webp...,"webpack can be installed globally, but it dele...",sophiebits,CONTRIBUTOR,2014-05-14 17:17:25+00:00,True,2014-05-16 00:15:53+00:00,1.29,,...,False,False,False,False,False,False,False,False,False,False
1,4,Have a mode to print out a line for each test ...,More fun that way (instead of just the filename).,sophiebits,CONTRIBUTOR,2014-05-14 17:26:17+00:00,True,2015-09-21 23:04:12+00:00,495.23,,...,False,False,False,False,False,False,False,False,False,False
2,5,Use jasmine's `toHaveBeenCalled` rather than `...,13:23 jeffmo: what's the reason behind using ...,chenglou,CONTRIBUTOR,2014-05-14 17:26:37+00:00,True,2016-04-12 01:45:53+00:00,698.35,,...,False,False,False,False,False,False,False,False,False,False
3,6,Expose a `--help`,,chenglou,CONTRIBUTOR,2014-05-14 17:56:10+00:00,True,2014-05-15 07:52:07+00:00,0.58,,...,False,False,False,False,False,False,False,False,False,False
4,7,Add a `--watch`,,chenglou,CONTRIBUTOR,2014-05-14 17:57:45+00:00,True,2015-10-16 00:18:52+00:00,519.26,,...,False,False,False,False,False,False,False,False,False,False


In [89]:
df_orig.shape

(4530, 51)

In [90]:
df_orig['labels'].apply(lambda x: len(x) > 0).sum()

1536

In [91]:
df_orig.groupby(['authorLogin', 'authorAssociation'])['issue_id'].count().sort_values(ascending=False)

authorLogin          authorAssociation
cpojer               CONTRIBUTOR          83
aaronabramov         MEMBER               69
gaearon              MEMBER               43
SimenB               COLLABORATOR         40
thymikee             COLLABORATOR         35
hramos               CONTRIBUTOR          30
kentcdodds           CONTRIBUTOR          25
probablyup           CONTRIBUTOR          21
rogeliog             COLLABORATOR         17
ide                  CONTRIBUTOR          16
hgezim               CONTRIBUTOR          14
Daniel15             MEMBER               14
dandv                CONTRIBUTOR          13
StephanBijzitter     CONTRIBUTOR          13
mjesun               CONTRIBUTOR          13
ColCh                CONTRIBUTOR          13
blainekasten         CONTRIBUTOR          12
binarykitchen        NONE                 12
segrey               NONE                 11
pedrottimark         COLLABORATOR         11
unional              NONE                 10
awei01          

In [92]:
all_cols = df_orig.columns.tolist()

# https://stackoverflow.com/questions/18665873/filtering-a-list-based-on-a-list-of-booleans
cols_labels = list(compress(all_cols, [x.startswith('label_') for x in all_cols]))

In [93]:
df_orig[cols_labels].sum().sort_values(ascending=False)

label_:bug: Bug                             346
label_Help Wanted                           292
label_:rocket: Feature Request              231
label_Needs Triage                          209
label_Needs Repro                           201
label_good first issue                      192
label_Bug Report                            178
label_:rocket: Enhancement                  144
label_Needs More Info :man_shrugging:       132
label_Documentation :book:                  118
label_Confirmed                             118
label_Discussion                             83
label_New API proposal                       42
label_Question                               41
label_Windows                                40
label_:boom: Regression                      37
label_Website                                15
label_Hi-Pri                                 14
label_UX                                     14
label_Wontfix                                 8
label_Infrastructure :hammer_and_wrench:

In [94]:
cols_label_bug = [
    'label_:bug: Bug',
    'label_good first issue',
    'label_Bug Report',
    'label_:boom: Regression',
    'label_Infrastructure :hammer_and_wrench:',
    'label_Upstream Bug',
]

df_label_bugs = df_orig[df_orig[cols_label_bug].sum(axis=1) > 0]


In [95]:
df_label_bugs.groupby(pd.Grouper(key='createdAt', freq='Y'))['issue_id'].count()

createdAt
2014-12-31 00:00:00+00:00      8
2015-12-31 00:00:00+00:00      2
2016-12-31 00:00:00+00:00     43
2017-12-31 00:00:00+00:00    133
2018-12-31 00:00:00+00:00    181
2019-12-31 00:00:00+00:00    368
Freq: A-DEC, Name: issue_id, dtype: int64

In [96]:
df_label_bugs

Unnamed: 0,issue_id,title,contents,authorLogin,authorAssociation,createdAt,closed,closedAt,closedDuration_days,milestone,...,label_:boom: Regression,label_Wontfix,label_On Hold,label_Has Bounty,label_Coverage,label_Bug Report,label_Confirmed,label_Documentation :book:,label_:bug: Bug,label_Needs Triage
64,102,Jest does not respect NODE_PATH,In our project we do not use any relative path...,iamrandys,NONE,2014-08-03 19:06:51+00:00,True,2015-02-19 22:19:17+00:00,200.13,,...,False,False,False,False,False,False,False,False,True,False
67,106,modulePathIgnorePatterns does not work for nod...,When trying to configure node's util and event...,iamrandys,NONE,2014-08-05 18:11:02+00:00,True,2016-03-03 08:54:00+00:00,575.61,,...,False,False,False,False,False,False,False,False,True,False
68,107,"Jest does not support jasmine ""Runner beforeEach""",beforeEach statements outside of describe stat...,iamrandys,NONE,2014-08-05 18:15:33+00:00,True,2015-11-19 00:12:02+00:00,470.25,,...,False,False,False,False,False,False,False,False,True,False
72,112,Add possibility to define module directories.,"Using webpack and defining paths with the ""mod...",Chrazy,NONE,2014-08-07 06:31:21+00:00,True,2016-05-20 19:57:57+00:00,652.56,,...,False,False,False,False,False,False,False,False,True,False
73,114,Preprocessor + coffee + source maps?,Hello guys. Thanks for making an awesome tool ...,gothy,NONE,2014-08-07 13:14:10+00:00,True,2015-10-16 00:35:00+00:00,434.47,,...,False,False,False,False,False,False,False,False,True,False
79,124,`mockReturnValue` doesn't work with constructors,To workaround this I used mockImpl but it seem...,amasad,CONTRIBUTOR,2014-09-02 19:53:32+00:00,True,2016-03-27 21:01:21+00:00,572.05,,...,False,False,False,False,False,False,False,False,True,False
81,128,"Requiring jsdom throws ""Unexpected token ILLEGAL""",I'm trying to write tests for a module that us...,Daniel15,MEMBER,2014-09-08 04:40:22+00:00,True,2014-12-03 17:29:52+00:00,86.53,,...,False,False,False,False,False,False,False,False,True,False
82,129,Cannot mock htmlparser2,version of nodejs : 10.31\nversion of jest : 0...,jasper-lyons,NONE,2014-09-10 14:58:50+00:00,True,2015-10-16 00:38:31+00:00,400.40,,...,False,False,False,False,False,False,False,False,True,False
219,357,CLI --testPathPattern option?,"Hi, I'm hoping someone with more knowledge can...",McNouvion,CONTRIBUTOR,2015-05-07 18:51:20+00:00,True,2015-05-11 21:52:02+00:00,4.13,,...,False,False,False,False,False,False,False,False,True,False
333,554,Enable jest mocking to work properly with npm3.,Seeing this issue. Not sure where to begin wit...,jezen,NONE,2015-10-16 17:17:48+00:00,True,2016-02-22 02:16:49+00:00,128.37,,...,False,False,False,False,False,False,False,False,True,False


In [97]:
cols_feat_req = [
    'label_:rocket: Feature Request',
    'label_:rocket: Enhancement',
    'label_New API proposal',
]

df_label_feat = df_orig[df_orig[cols_feat_req].sum(axis=1) > 0]


In [98]:
df_label_feat.groupby(pd.Grouper(key='createdAt', freq='Y'))['issue_id'].count()

createdAt
2014-12-31 00:00:00+00:00      7
2015-12-31 00:00:00+00:00      1
2016-12-31 00:00:00+00:00     51
2017-12-31 00:00:00+00:00    125
2018-12-31 00:00:00+00:00    121
2019-12-31 00:00:00+00:00     96
Freq: A-DEC, Name: issue_id, dtype: int64

## Word counts

### Steps:

1. Cleaning to remove irrelevant items, such as HTML tags
2. Normalizing by converting to all lowercase and removing punctuation
3. Splitting text into words or tokens
4. Removing words that are too common, also known as stop words
5. Identifying different parts of speech and named entities
6. Converting words into their dictionary forms, using stemming and lemmatization

In [165]:
issue_text = (df_orig['title'] + ' ' + df_orig['contents']).tolist()

#### 2. Normalization

In [166]:
issue_words = [word_tokenize(x) for x in issue_text]

In [169]:
# Case Normalization

issue_words = [[x.lower() for x in words] for words in issue_words]


In [170]:
# Remove punctuation characters
issue_words = [[x for x in words if len(re.findall(r"^\w+", x)) > 0] for words in issue_words]

In [171]:
issue_words[0]

['make',
 'global',
 'jest',
 'command',
 'line',
 'tool',
 'like',
 'webpack',
 'webpack',
 'can',
 'be',
 'installed',
 'globally',
 'but',
 'it',
 'delegates',
 'to',
 'a',
 'locally-installed',
 'version',
 'if',
 'one',
 'exists',
 'https',
 'l8-l14',
 'perhaps',
 'we',
 'should',
 'consider',
 'the',
 'same',
 'so',
 'that',
 'people',
 'can',
 'use',
 'the',
 'jest',
 'command',
 'easily',
 'instead',
 'of',
 'npm',
 'test']

In [173]:
issue_words = [[w for w in words if w not in stopwords.words("english")] for words in issue_words]

In [174]:
issue_words

[['make',
  'global',
  'jest',
  'command',
  'line',
  'tool',
  'like',
  'webpack',
  'webpack',
  'installed',
  'globally',
  'delegates',
  'locally-installed',
  'version',
  'one',
  'exists',
  'https',
  'l8-l14',
  'perhaps',
  'consider',
  'people',
  'use',
  'jest',
  'command',
  'easily',
  'instead',
  'npm',
  'test'],
 ['mode', 'print', 'line', 'test', 'run', 'fun', 'way', 'instead', 'filename'],
 ['use',
  'jasmine',
  'tohavebeencalled',
  'rather',
  'tobecalled',
  '13:23',
  'jeffmo',
  'reason',
  'behind',
  'using',
  'tobecalled',
  'rather',
  'jasmine',
  'tohavebeencalled',
  '13:24',
  'chenglou',
  'good',
  'question',
  'tobecalled/tohavebeencalled',
  'wondered',
  'layover',
  'jst',
  'really',
  'suspect',
  'leave',
  'support',
  'using',
  'regular',
  'jasmine',
  'spies',
  'wanted'],
 ['expose', 'help'],
 ['add', 'watch'],
 ['test',
  'directory',
  'need',
  '__tests__',
  'feel',
  'like',
  'lot',
  'node',
  'projects',
  'use',
  'lib

In [179]:
# --- Part of speech tagging
pv_tags_words = [pos_tag(words) for words in issue_words]

In [181]:
pv_tags_words[0]

[('make', 'VB'),
 ('global', 'JJ'),
 ('jest', 'NN'),
 ('command', 'NN'),
 ('line', 'NN'),
 ('tool', 'NN'),
 ('like', 'IN'),
 ('webpack', 'NN'),
 ('webpack', 'NN'),
 ('installed', 'VBD'),
 ('globally', 'RB'),
 ('delegates', 'VBZ'),
 ('locally-installed', 'JJ'),
 ('version', 'NN'),
 ('one', 'CD'),
 ('exists', 'VBZ'),
 ('https', 'JJR'),
 ('l8-l14', 'JJ'),
 ('perhaps', 'RB'),
 ('consider', 'VB'),
 ('people', 'NNS'),
 ('use', 'VBP'),
 ('jest', 'JJ'),
 ('command', 'NN'),
 ('easily', 'RB'),
 ('instead', 'RB'),
 ('npm', 'JJ'),
 ('test', 'NN')]

In [182]:
def get_wordnet_pos(tag):
    ''' 
    Get a TreeBank tag from the specified WordNet part of speech name
    Args:
    tag: string. WordNet part of speech name.
    Returns:
    A corresponding TreeBank tag
    '''

    treebank_tag = ''
    # Refer to 
    # https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

    if tag.startswith('J'):
        # Adjective
        treebank_tag = wordnet.ADJ

    elif tag.startswith('V'):
        # Verb
        treebank_tag = wordnet.VERB

    elif tag.startswith('N'):
        # Noun
        treebank_tag = wordnet.NOUN

    elif tag.startswith('R'):
        # Adverb
        treebank_tag = wordnet.ADV

    else:
        # Use Noun as a default output if none of above matches
        treebank_tag = wordnet.NOUN

    return treebank_tag

In [198]:
pv_tags_words[2]

[('use', 'NN'),
 ('jasmine', 'NN'),
 ('tohavebeencalled', 'VBD'),
 ('rather', 'RB'),
 ('tobecalled', 'JJ'),
 ('13:23', 'CD'),
 ('jeffmo', 'NN'),
 ('reason', 'NN'),
 ('behind', 'IN'),
 ('using', 'VBG'),
 ('tobecalled', 'VBD'),
 ('rather', 'RB'),
 ('jasmine', 'NN'),
 ('tohavebeencalled', 'VBD'),
 ('13:24', 'CD'),
 ('chenglou', 'NN'),
 ('good', 'JJ'),
 ('question', 'NN'),
 ('tobecalled/tohavebeencalled', 'VBD'),
 ('wondered', 'VBD'),
 ('layover', 'RB'),
 ('jst', 'JJ'),
 ('really', 'RB'),
 ('suspect', 'JJ'),
 ('leave', 'VBP'),
 ('support', 'NN'),
 ('using', 'VBG'),
 ('regular', 'JJ'),
 ('jasmine', 'NN'),
 ('spies', 'NNS'),
 ('wanted', 'VBD')]

In [209]:
%%time

lemmatizer = WordNetLemmatizer()

# Get current text
final_clean_tokens = []
for cur_pv_tags_words in pv_tags_words:

    cur_clean_tokens = []
    for cur_tag in cur_pv_tags_words:

        cur_text = cur_tag[0]

        # Get a corresponding part of speech that will be used with the lemmatizer
        w_tag = get_wordnet_pos(cur_tag[1])

        # lemmatize the text with pos and append it to clean_tokens
        clean_tok = lemmatizer.lemmatize(cur_text, w_tag)
        cur_clean_tokens.append(clean_tok)
        
    final_clean_tokens.append(cur_clean_tokens)

CPU times: user 1.95 s, sys: 14.7 ms, total: 1.97 s
Wall time: 1.99 s


In [216]:
final_clean_tokens

[['make',
  'global',
  'jest',
  'command',
  'line',
  'tool',
  'like',
  'webpack',
  'webpack',
  'instal',
  'globally',
  'delegate',
  'locally-installed',
  'version',
  'one',
  'exist',
  'https',
  'l8-l14',
  'perhaps',
  'consider',
  'people',
  'use',
  'jest',
  'command',
  'easily',
  'instead',
  'npm',
  'test'],
 ['mode', 'print', 'line', 'test', 'run', 'fun', 'way', 'instead', 'filename'],
 ['use',
  'jasmine',
  'tohavebeencalled',
  'rather',
  'tobecalled',
  '13:23',
  'jeffmo',
  'reason',
  'behind',
  'use',
  'tobecalled',
  'rather',
  'jasmine',
  'tohavebeencalled',
  '13:24',
  'chenglou',
  'good',
  'question',
  'tobecalled/tohavebeencalled',
  'wonder',
  'layover',
  'jst',
  'really',
  'suspect',
  'leave',
  'support',
  'use',
  'regular',
  'jasmine',
  'spy',
  'want'],
 ['expose', 'help'],
 ['add', 'watch'],
 ['test',
  'directory',
  'need',
  '__tests__',
  'feel',
  'like',
  'lot',
  'node',
  'project',
  'use',
  'lib',
  'test',
  '

In [147]:
issue_text

['Make global `jest` command line tool like webpack? webpack can be installed globally, but it delegates to a locally-installed version if one exists:\nhttps://github.com/webpack/webpack/blob/7e08847df59f9bed1854ce660946b285055c73d6/bin/webpack.js#L8-L14\nPerhaps we should consider the same so that people can use the jest command easily instead of npm test.',
 "Have a mode to print out a line for each test that's run More fun that way (instead of just the filename).",
 "Use jasmine's `toHaveBeenCalled` rather than `toBeCalled` 13:23  jeffmo: what's the reason behind using toBeCalled rather than jasmine's toHaveBeenCalled?\n13:24 <•jeffmo> chenglou: good question on toBeCalled/toHaveBeenCalled — I’ve wondered that myself (it’s just layover from jst really). I suspect it was to leave support for using regular jasmine spies if you wanted to",
 'Expose a `--help` ',
 'Add a `--watch` ',
 'Does the test directory need to be __tests__? I feel like a lot of node projects use lib and test as t

In [219]:
final_joined_text = ' '.join([' '.join(words) for words in issue_words])

In [221]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
stopwords = set(STOPWORDS)

ModuleNotFoundError: No module named 'wordcloud'