In [1]:
import pickle
import re
from collections import defaultdict
import operator
import spacy
from spacy.symbols import ORTH
from spacymoji import Emoji
from spacy_cld import LanguageDetector
from textacy import preprocess
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from spacy.tokens import Token

Load unpreprocessed issue comments

In [3]:
# Load the raw issue comments and the cluster assignment from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
# Context managers make sure the file handles are closed after loading.
with open("issue_commments.p", "rb") as comments_file:
    clusterdata = pickle.load(comments_file)
with open("data/final_partition", "rb") as partition_file:
    final_partition = pickle.load(partition_file)

### Defining the pipeline for preprocessing the issue comments
- This mainly deals with removing GitHub markup
- We replace code snippets with the generic string "CODESAMPLE"
- We replace dates with the generic string "DATE_TIME"
- We replace user mentions with the generic string "USER_MENTION"
- We replace hash references with the generic string "HASHREF"
- We replace full SHAs with the generic string "FULLSHA"
- We replace individual commit ids with the generic string "C_ID"
- We escape emojis so that each one can be treated as a single token
- We replace URLs with the generic string 'URL'
- We replace e-mail addresses with the generic string 'EMAIL'
- We replace phone numbers with the generic string 'PHONE'
- We remove quoted lines (lines starting with '>')



In [2]:
# %load ../wordembeddings/scripts/preprocess_mongo_31007.py


# Emoji markup strings used by escape_emojis and by the tokenizer special cases.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("../wordembeddings/scripts/data/gitemojis_list.p", "rb") as gitemojis_file:
    GITEMOJIS = pickle.load(gitemojis_file)
def escape_emojis(text):
    """
    Surround every GITEMOJIS entry found in ``text`` with spaces so that it
    stands apart as a single token, then collapse all whitespace runs to one
    space.
    """
    padded = text
    for shortcode in GITEMOJIS:
        # str.replace is a no-op when the shortcode does not occur.
        padded = padded.replace(shortcode, ' ' + shortcode + ' ')
    return ' '.join(padded.split())

# Regular expressions for the markup handled by the helper functions below.
# All patterns are raw strings so that regex escapes such as \s, \* and \[ are
# not interpreted (and warned about) as invalid Python string escapes.
HASHREF = re.compile(r"#[0-9]+")                 # "#123" references
USER_MENTION = re.compile(r"@[0-9a-zA-Z_-]+")    # "@user" mentions
CODEREF = re.compile(r"```[\s\S]+?```")          # fenced code, may span lines
CODEREF2 = re.compile(r"`.*?`")                  # inline code on a single line
BOLD = re.compile(r"\*\*(.*?)\*\*")              # **bold**
BOLD2 = re.compile(r"__(.*?)__")                 # __bold__
ITAL = re.compile(r"\*(.*?)\*")                  # *italic*
# The underscore variant of italics is intentionally left disabled.
#ITAL2= re.compile(r"_(.*?)_")
HEADER = re.compile(r"#+? ")                     # one or more '#' followed by a space
C_ID = re.compile(r"commit [0-9a-f]{5,40}")      # "commit <hex id>"
DATE_TIME = re.compile(r"(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun), "
                       r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [0-9]{1,2}, [0-9]{4} "
                       r"at [0-9]{1,2}:[0-9]{2} (?:AM|PM)")
FULLSHA = re.compile(r"[0-9a-f]{40}")            # 40 lowercase hex characters
URL_MARKUP = re.compile(r"\[(.*?)\]\((.*?)\)")   # [label](target)


def strip_url_markup(text):
    """Replace every URL_MARKUP match (``[label](target)``) with its label."""
    return URL_MARKUP.sub(r"\1", text)

def repl_sha(text):
    """Substitute the placeholder "FULLSHA" for every FULLSHA match in ``text``."""
    placeholder = "FULLSHA"
    return re.sub(FULLSHA, placeholder, text)

def strip_hash(text):
    """Delete every HEADER match (header markup) from ``text``."""
    return re.sub(HEADER, "", text)

def repl_commit_id(text):
    """Substitute the placeholder "C_ID" for every C_ID match in ``text``."""
    placeholder = "C_ID"
    return re.sub(C_ID, placeholder, text)

def strip_references(text):
    """Substitute the placeholder "HASHREF" for every HASHREF match in ``text``."""
    placeholder = "HASHREF"
    return re.sub(HASHREF, placeholder, text)

def strip_code(text):
    """
    Substitute "CODESAMPLE" for code snippets: CODEREF matches are replaced
    first, then CODEREF2 matches in the result.
    """
    for code_pattern in (CODEREF, CODEREF2):
        text = code_pattern.sub("CODESAMPLE", text)
    return text
    
def strip_bold(text):
    """Keep only the inner text of BOLD matches, then of BOLD2 matches."""
    for bold_pattern in (BOLD, BOLD2):
        text = bold_pattern.sub(r"\1", text)
    return text

def strip_italic(text):
    """Keep only the inner text of ITAL matches (the ITAL2 variant is disabled)."""
    return ITAL.sub(r"\1", text)

def repl_date(text):
    """Substitute the placeholder "DATE_TIME" for every DATE_TIME match in ``text``."""
    return re.sub(DATE_TIME, "DATE_TIME", text)
def repl_user_mention(text):
    """Substitute the placeholder "USER_MENTION" for every USER_MENTION match."""
    return re.sub(USER_MENTION, "USER_MENTION", text)


# Token texts that terminate a sentence.
SENT_ENDS = [u".", u"!", u"?"]


def tokenize_sentence_split(text, nlp):
    """
    Tokenize ``text`` line by line and yield one space-joined string per sentence.

    Each line is whitespace-normalised and passed to ``nlp``. When the document
    has more than one token and its ``doc._.language_scores['en']`` exceeds 0.8,
    the tokens are emitted sentence by sentence (a sentence ends at a token whose
    text is in SENT_ENDS). Consecutive tokens of one named entity
    (``ent_iob_`` 'B' followed by 'I' tokens) are merged into a single token
    joined by underscores. Any other line is yielded unchanged as ``doc.text``.

    Parameters:
        text: string, possibly containing several lines.
        nlp: callable returning a document that supports ``len``, iteration over
            tokens with ``text`` and ``ent_iob_`` attributes, ``doc.text`` and
            ``doc._.language_scores``.

    Yields:
        str: one tokenized sentence, or the untouched line text.
    """
    for line in text.split("\n"):
        line = " ".join(line.split())
        doc = nlp(line)
        is_english = (len(doc) > 1
                      and 'en' in doc._.language_scores
                      and doc._.language_scores['en'] > 0.8)
        if not is_english:
            yield doc.text
            continue

        tok_acc = []
        entity_parts = []  # token texts of the entity currently being collected
        for tok in doc:
            iob = tok.ent_iob_
            if iob == 'I' and entity_parts:
                entity_parts.append(tok.text)
            elif iob in ('B', 'I'):
                # A new entity starts: flush the previous one first so that two
                # directly adjacent entities are both kept. An 'I' without an
                # open entity is treated as the start of one.
                if entity_parts:
                    tok_acc.append('_'.join(entity_parts))
                entity_parts = [tok.text]
            else:
                # Outside any entity ('O', or no entity annotation at all).
                if entity_parts:
                    tok_acc.append('_'.join(entity_parts))
                    entity_parts = []
                tok_acc.append(tok.text)

            if tok.text in SENT_ENDS:
                yield " ".join(tok_acc)
                tok_acc = []

        # Flush an entity that ends the line, even when nothing else is pending.
        if entity_parts:
            tok_acc.append('_'.join(entity_parts))
        if tok_acc:
            yield " ".join(tok_acc)
            


def clean_lines(txt):
    """
    Yield the stripped lines of ``txt``, skipping quoted lines (those starting
    with '>') and removing a leading "- " list marker.
    """
    for raw_line in txt.split(u"\n"):
        stripped = raw_line.strip()
        if stripped.startswith('>'):
            # quoted text is excluded
            continue
        yield stripped[2:] if stripped.startswith('- ') else stripped


def pre_filter(text):
    """
    Normalise a raw comment body before tokenization.

    Fixes broken unicode, strips link/header/bold/italic markup and replaces
    issue references, code, commit ids, SHAs, URLs, e-mail addresses, phone
    numbers, user mentions and dates with generic placeholder strings.

    Parameters:
        text: raw comment text.

    Returns:
        The filtered text.
    """
    text = preprocess.fix_bad_unicode(text, normalization='NFC')
    text = strip_url_markup(text)
    text = strip_hash(text)
    text = strip_references(text)
    text = strip_code(text)
    text = strip_bold(text)
    text = strip_italic(text)
    # Second pass over link markup; the result used to be assigned to a
    # misspelled variable ("test") and was therefore discarded.
    text = strip_url_markup(text)
    text = repl_commit_id(text)
    # str.replace returns a new string; the result used to be thrown away.
    text = text.replace('\r', '')
    text = repl_sha(text)
    text = preprocess.replace_urls(text, replace_with='URL')
    text = preprocess.replace_emails(text, replace_with='EMAIL')
    text = preprocess.replace_phone_numbers(text, replace_with='PHONE')
    text = repl_user_mention(text)
    text = repl_date(text)
    return text




def extract_text(content, nlp):
    """
    Run ``content`` through pre_filter and clean_lines, tokenize every
    remaining line with tokenize_sentence_split and return all resulting
    sentences joined by newlines.
    """
    filtered = pre_filter(content)
    return u"\n".join(
        sentence
        for cleaned_line in clean_lines(filtered)
        for sentence in tokenize_sentence_split(cleaned_line, nlp)
    )


# Set up the language pipeline: the small English model without the parser,
# the emoji component first, and the language detector last.
nlp = spacy.load('en_core_web_sm', disable=['parser'])
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

# Add a tokenizer special case for every GitHub emoji string so that it is kept
# as one token. The loop variable has its own name so that it no longer
# rebinds `emoji`, the pipeline component created above.
for git_emoji in GITEMOJIS:
    nlp.tokenizer.add_special_case(git_emoji, [{ORTH: git_emoji}])






Preprocess issue comments

In [5]:
# Preprocess every comment and collect the results per cluster label.
# final_partition maps a key of clusterdata to its cluster label.
clustertexts_processed = defaultdict(list)
skipped_comments = 0  # comments whose preprocessing raised an error
for k, v in final_partition.items():
    print(k)
    comms = clusterdata[k]
    for obj in comms:
        try:
            processed = extract_text(obj['comment']['body'], nlp)
        except Exception:
            # Best effort: skip comments that cannot be processed, but count
            # them. Catching Exception (not a bare except) keeps Ctrl-C /
            # kernel interrupts working during this long-running loop.
            skipped_comments += 1
            continue
        clustertexts_processed[v].append(processed)
print("skipped comments:", skipped_comments)
    

52584449
3
10723331
4399108
4235272
11173910
33357857
44892207
90159
7995445
21889082
53223490
53928010
7585866
12902477
48595026
12656723
42057813
139365
9658471
28999793
24273012
39510136
52633720
42287226
8921209
44761210
29745276
43810956
48267408
4923539
23707802
6758559
43450528
24518816
29073574
10076333
10395822
55148722
18268343
31342779
43155644
21676222
49471687
11346122
229588
36282585
48611546
13385950
8478942
8423
16618
250
41217
25583877
29884685
6357264
23068950
39223584
11346209
5030178
16680
11182376
26525996
1884461
49537328
35848503
24888
39239993
17359160
43106625
1958216
28827979
52060492
47612236
50708814
22069585
46154067
11526484
36798817
7922024
8487275
367
55722354
51151223
48718204
11510142
51724671
46776704
56058245
1900934
21250443
4260235
34251149
43508115
49734036
49555
565654
1933719
411
46154148
29172140
10985905
49021366
1016248
28385730
57745865
37298635
631244
29172179
42287572
52109783
10387929
55869913
1901021
28770787
35611113
19939821
3654127
43

25112179
41029236
13545078
30568055
39693951
51875458
44789386
13078154
40898190
1879700
42012310
46927512
49819293
25030306
806576
51957425
17436346
26324669
47877823
11538112
25161410
11975
11677384
19828425
16871114
3829455
6074063
41529042
1486549
12275415
22286039
913115
52547291
47574759
11456233
28765931
5328622
55406319
46059253
19394305
27012868
12049
42856215
1871639
44945177
48738073
52760350
44199713
45838120
13668138
25448235
33787694
13356854
35098425
34352953
14282562
13668163
52531043
52612971
31879021
25087855
30994287
27733884
50196352
24129416
51220368
31526801
42684306
5410707
20374
4280217
30756766
15167392
19779488
56364965
50786219
40964011
47058864
45699002
4517819
10514366
176067
56487879
56913863
44560333
20429
51040209
17526738
16945107
45412309
33476567
31633368
32083928
47771608
42209244
12253
32600028
20910048
49663969
16207844
22073318
48844776
10325993
55644136
33656812
50745326
16183278
26439665
35393532
29536261
4190213
42766347
4109
10260494
8114189
4

25812210
54967540
17923317
54492407
6561022
228609
7372037
392460
113941
3530012
35650845
12811549
27942185
5504302
5373251
212297
24304973
10419533
47570255
8019285
531800
45604195
1351013
19488108
53362028
51846516
43883894
5291383
44989822
46718341
31972747
57802126
39361935
5692816
36863379
12500376
39009692
1924510
48635295
38469028
13008298
28974506
16137647
41803184
40058290
44662196
8117689
50339257
17604028
36494791
5307852
31743440
2997720
48603
24028
56360413
25558494
18652638
44391905
5856741
1957357
35945975
31448567
15171065
50830840
19774971
10075641
40091134
47496703
27844096
20160011
3325454
663065
14638623
42253868
26492463
29089332
51379765
30604861
1457727
52411971
27958852
28687941
13647428
45620811
19807820
37068365
36249166
13631058
3186265
57130585
36699739
11664989
14351968
44916321
8592995
12770920
16793202
48463475
45547124
24804980
39943798
31964791
30047859
7803
9756291
7812
10436229
40580
29343370
58072716
38821519
7826
27524757
18521760
5095078
5643949
46

5656305
51883766
50286327
17354488
5828347
12807938
1593092
22212363
46034700
44502807
33656599
5017372
9539357
21180189
30691102
52891428
10088234
44854
34230077
5312326
46763856
17641298
3936090
12291934
46591842
46329699
35917675
48361326
69488
35671926
17846135
17190777
18575227
25087867
11743112
54439817
8793995
51974027
11038611
46452641
290724
3690409
51318700
48271276
31059894
4023
17248186
52596666
27299787
39677902
987096
24981465
28086245
41668582
46780400
46436347
46116860
30470142
31690758
48803847
21368840
27480077
51499022
55209997
978965
18452508
38899743
1912872
45166634
14028844
44462124
29216814
39604273
52375601
10604596
9916474
27267131
42184770
36343894
37326938
45461601
28774507
10866795
17657968
21123204
26030213
45183123
47476888
46731417
11202713
41136289
12455
4267
12185772
33321156
24056007
5320906
6189264
15601873
36253906
31428832
30519527
10522859
11186411
26325230
10014963
27382004
42791161
43102468
27398408
11637006
27398422
35377440
45510945
49271074
5

51032399
49582416
6363476
784726
17598807
48089440
51190112
8421731
9079147
10496365
42723696
156018
20930939
29788539
384
6957448
27404682
32891279
52709789
1356204
47702447
6977976
19098044
21703100
31926720
6977984
57721282
7944641
39813572
52550087
44155339
12653008
20320721
5474773
24130005
1970646
295386
922075
9994717
209373
30329311
24836573
43952611
32100853
37369
9660928
1763841
33081859
9730565
13951511
19706398
47002145
6828578
15143464
17650217
8063534
25868847
3539505
11629117
3410497
43587
203334
15792710
76384
42836589
9478765
5118576
22047350
13386365
19753597
6382221
30884497
7809681
3992211
15684243
50707092
8444565
9659036
50924189
37898910
68256
23480992
20859553
37108388
46473900
13345454
123568
35222200
3228357
12960458
50180811
19833548
15082189
84691
20161239
3181271
1690332
6832863
34257638
43731687
42711787
4846
54024942
36594417
3648248
11002
2988811
46195471
9511697
25152274
33555
29029145
34183961
21276
7977767
36596522
47614775
2472759
25419
22178639
1413

51322777
29659035
36681628
12224413
47402909
39440287
6045
39835551
35680162
39835554
40992670
44083117
35102640
37199795
19273658
35102658
20791251
7434196
55795668
33873878
12842974
45127647
32417762
14012387
4691948
11814893
45268976
3475441
42418160
13451254
48496637
36364289
1368068
19363858
38010901
55064603
12142620
38617119
39571495
36921385
48252969
28399662
21844020
33945653
15435832
11501635
9723975
43081
20334668
45819988
8452180
10158183
28270696
43113
11665531
6270
9715849
32307337
22065294
10563729
46258325
22085783
8622231
49479841
20244649
4634794
39823536
45656242
21532851
42234038
46682298
22051004
50268349
24205506
30578884
53563598
6965455
8499420
49064159
29042912
28612834
22044902
11780326
1306869
21481718
6435067
28621059
9672983
47472936
18475307
22860078
25794866
48550199
15040826
37257530
21391687
53589
192858
38832479
44616032
809314
32022897
8251762
13461875
8407416
51474808
28811652
11463055
289168
16501144
37179808
11579809
50756002
33169827
54798764
2963

In [3]:
# Cached result of the preprocessing cell above; uncomment the dump to refresh it.
#pickle.dump(clustertexts_processed, open("clustertexts_processed.p", "wb"))
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("clustertexts_processed.p", "rb") as processed_file:
    clustertexts_processed = pickle.load(processed_file)

In [4]:
# Tokens that are dropped before counting word frequencies: function words,
# punctuation, whitespace, small numbers, HTML table/paragraph tags and the
# placeholder strings 'codesample' and 'user_mention'.
STOP_WORDS = set(
    """
    a about above across after afterwards again against all almost alone along
    already also although always am among amongst amount an and another any
    anyhow anything anyway anywhere are around as at
    back be became because become becomes becoming been before beforehand
    behind being below beside besides between beyond both bottom but by
    ca call can cannot could
    did do does doing done down due during
    each eight either eleven else elsewhere empty enough even ever every
    everything everywhere except
    few fifteen fifty first five for former formerly forty four from front
    full further
    get give go
    had has have hence here hereafter hereby herein hereupon how however hundred
    if in indeed into is it its itself
    just
    keep
    last latter latterly least less
    made make many may meanwhile might more moreover most mostly move much must
    n't name namely neither never nevertheless next nine no none noone nor not
    nothing now nowhere
    of off often on once one only onto or other others otherwise out over
    part per perhaps please put
    quite
    rather re really regarding
    same say see seem seemed seeming seems serious several show side since six
    sixty so some somehow someone something sometime sometimes somewhere still
    such
    take ten than that the then thence there thereafter thereby therefore
    therein thereupon these third this those though three through throughout
    thru thus to too toward towards twelve twenty two
    under unless until up upon us used using
    various very via
    was were what whatever when whence whenever where whereafter whereas
    whereby wherein whereupon wherever whether which while whither who whoever
    whole whom whose
    why will with within without would yet
    """.split()
) | set(
    # HTML tag remnants, contraction fragments and placeholder strings
    """
    /tr /tbody /table table thead /th /thead tbody p /p td tr th
    m cc ve codesample user_mention response&lt;modeltype&gt
    """.split()
) | {
    # punctuation and whitespace tokens
    '.', '-', '!', '?', '_', '#', '\n', '\r', '\n\n', ',', '<', '>', ':',
    '[', '(', ';', ']', ')', '{', '}', '"', "'", '%', '=', '–', "'s",
} | {str(number) for number in range(11)}  # '0' .. '10'

In [5]:


def stop_words_getter(token):
    """Return True when the token's lowercase form or its lemma is in STOP_WORDS."""
    if token.lower_ in STOP_WORDS:
        return True
    return token.lemma_ in STOP_WORDS


# Expose the check as the custom attribute ``token._.is_stop``; force=True
# overwrites an extension of the same name that is already registered.
Token.set_extension('is_stop', getter=stop_words_getter, force=True)

def mylemma(token):
    """
    Return the token's lemma, except for the '-PRON-' placeholder lemma, where
    the lowercased token text is returned instead.
    """
    lemma = token.lemma_
    return token.text.lower() if lemma == '-PRON-' else lemma

def lemmatization(texts):
    """
    Lemmatize every text in ``texts`` with the module-level ``nlp`` pipeline.

    Tokens flagged by ``token._.is_stop`` are left out. The index is printed
    for every 1000th text as a progress indicator.

    Returns:
        A list with one list of lemmas (see mylemma) per input text.
    """
    lemmatized = []
    for index, sentence in enumerate(texts):
        if index % 1000 == 0:
            print(index)
        lemmatized.append(
            [mylemma(tok) for tok in nlp(sentence) if not tok._.is_stop]
        )
    return lemmatized

In [6]:
# Lemmatize the preprocessed comments of every cluster.
data_lemmatized = {
    cluster_id: lemmatization(cluster_texts)
    for cluster_id, cluster_texts in clustertexts_processed.items()
}

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000


In [2]:
#pickle.dump(data_lemmatized, open("datalemmatized_simplestopwordlist.p", "wb"))  
#data_lemmatized = pickle.load(open("datalemmatized_simplestopwordlist.p", "rb"))  

Create dictionary with sorted word frequencies for each cluster

In [3]:
def default_tokenizer(string):
    """Split ``string`` on runs of whitespace and return the list of pieces."""
    pieces = string.split()
    return pieces

def freq_dist(data):
    """
    Count unigram frequencies over newline-separated documents.

    ``data`` is split on newlines; every resulting line is one document for a
    CountVectorizer that tokenizes with default_tokenizer and keeps terms whose
    document frequency is at least ``min_df=0.0005``.

    Parameters:
        data: string with one document per line.

    Returns:
        Counter mapping each kept term to its total count over all documents.
    """
    ngram_vectorizer = CountVectorizer(tokenizer=default_tokenizer, ngram_range=(1, 1), min_df=0.0005)
    X = ngram_vectorizer.fit_transform(data.split('\n'))
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older releases.
    feature_names = getattr(ngram_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = ngram_vectorizer.get_feature_names
    vocab = list(feature_names())
    # Column sums of the document-term matrix, flattened to a 1-d array.
    counts = X.sum(axis=0).A1
    return Counter(dict(zip(vocab, counts)))


# Word frequencies per cluster (the loop expects cluster labels 0..4 as keys of
# data_lemmatized).
freq_dicts = []
for i in range(5):
    # freq_dist treats every line as one document, so the comments have to be
    # joined with newlines. Joining them with spaces turned each cluster into
    # a single document, which made the min_df filter inside freq_dist a no-op.
    cluster_documents = [" ".join(items) for items in data_lemmatized[i]]
    freq_dict = freq_dist("\n".join(cluster_documents))
    freq_dicts.append(freq_dict)

# (word, count) pairs per cluster, most frequent first.
sorted_dicts = [sorted(fd.items(), key=operator.itemgetter(1), reverse=True) for fd in freq_dicts]

# Remove some additional stop words that have not been captured by the previous data cleansing.
# The backslash token is written as "\\" so that the literal contains no invalid
# escape sequence; the resulting list is the same as before.
additional_stopwords="r2 nt de sp etc src lot ref x r1 20 /blockquote></details ﹠ una por con como si un se y la b d h2:0 timeout('timed r … /i></summary><blockquote details><summary><i 26 。 25 joão fullsha status](url this_pull_request 2011 2012 2013 2014 2015 2016 2017 2018 2019 01 02 03 04 05 06 07 08 09 → | + /usr * -- --- ... .. * $ ` lo para el que es en election hashref url 00_ 11 12 13 \\ / georgios josé ioannis konstantinos oct nov dec john jose maria dimitrios mohamed mohammad nikolaos fernando antónio antonio luis vasileios carlos abdul luís santos manuel pedro athanasios christos 11168 ".split()
# Set lookup instead of scanning the list once per (word, count) pair.
additional_stopword_set = set(additional_stopwords)
sorted_dicts_stop = [[pair for pair in sd if pair[0] not in additional_stopword_set] for sd in sorted_dicts]
#pickle.dump(sorted_dicts_stop,open("data/sorted_wordfrequencies.p", "wb"))
#pickle.dump(freq_dicts, open("data/freq_dicts.p", "wb"))