In [254]:
import json
import os
from keras.preprocessing.text import Tokenizer
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from typing import List, Dict
from collections import OrderedDict
import itertools

In [255]:
cluster_header, tokens_header, message_header = "cluster", "tokens", "message"
cluster1, cluster2, cluster3, cluster4, cluster5 = "NoInfo", "Test", "Bomb", "Npe", "Other"

In [258]:
def load_and_filter_data(input_file: str):
    with open(input_file, 'r') as file:
        data = json.load(file)

    elements_to_delete = []
    for each in data:
        if each[message_header] == "(no message)" or \
        each[message_header].startswith("This commit was manufactured by cvs2svn"):
            elements_to_delete.append(each)
    for e in elements_to_delete:
        data.remove(e)

    return data


def cluster_all_messages(raw_data: List[Dict]) -> DataFrame:
    result = []
    deleted_tokens, added_tokens = "deletedTokens", "addedTokens"
    other_cluster_member_number = 0
    for current in raw_data:
        if "cleanup" in current[message_header] or "minor" in current[message_header] or \
            current[message_header] == "fix" or current[message_header] == "fixes" or \
            current[message_header] == "several fixes" or current[message_header] == "minor fixes" or \
            current[message_header] == "yet another inference bug" or \
            current[message_header] == "some improvements (igor e. mikhailuk)" or \
            current[message_header] == "no changes actually" or \
            current[message_header] == "clean up" or current[message_header] == "cosmetics":
            
            cluster = cluster1
        elif "test" in current[message_header]:
            cluster = cluster2
        elif "bomb" in current[message_header]:
            cluster = cluster3
        elif "npe" in current[message_header] or "null" in current[message_header]:
            cluster = cluster4
        else:
            if other_cluster_member_number >= 40_000:
                continue
            cluster = cluster5
            other_cluster_member_number += 1
            
        
        if current[deleted_tokens]:  # list with tokens not empty
            result.append({cluster_header: cluster,
                           tokens_header: current[deleted_tokens],
                           message_header: current[message_header]
                           })
        if current[added_tokens]:  # list with tokens not empty
            result.append({cluster_header: cluster,
                       tokens_header: current[added_tokens],
                       message_header: current[message_header]
                       })
    return DataFrame(result)

## All tokens 
### except `"{", "}", "(", ")", "[", "]", ";", ".", ","`

In [259]:
################### get data; IDEA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "intellij"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
df = cluster_all_messages(raw_data)

In [260]:
print(f"total shape \t\t{df.shape}")

total_size = df.shape[0]

size1 = (df[cluster_header] == cluster1).sum()
size2 = (df[cluster_header] == cluster2).sum()
size3 = (df[cluster_header] == cluster3).sum()
size4 = (df[cluster_header] == cluster4).sum()
size5 = (df[cluster_header] == cluster5).sum()

print(f"size cluster1 \t{cluster1}\t{size1}\t%{(size1/total_size)*100}")
print(f"size cluster2 \t{cluster2}\t{size2}\t%{(size2/total_size)*100}")
print(f"size cluster3 \t{cluster3}\t{size3}\t%{(size3/total_size)*100}")
print(f"size cluster4 \t{cluster4}\t{size4}\t%{(size4/total_size)*100}")
print(f"size cluster5 \t{cluster5}\t{size5}\t%{(size5/total_size)*100}")

total shape 		(63516, 3)
size cluster1 	NoInfo	4048	%6.37319730461616
size cluster2 	Test	3980	%6.26613766609988
size cluster3 	Bomb	423	%0.6659739278292084
size cluster4 	Npe	647	%1.0186409723534229
size cluster5 	Other	54418	%85.67605012910133


In [262]:
################### split data ###################

msg_train , msg_test, y_train, y_test = train_test_split(df[message_header], df[cluster_header],
                                                         test_size=0.3, random_state=142)

In [295]:
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(msg_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.95,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [296]:
################### classififcation results ###################
predicted = text_clf.predict(msg_test)
np.mean(predicted == y_test)    
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

        Bomb       0.98      0.45      0.62       135
      NoInfo       0.98      0.91      0.94      1183
         Npe       1.00      0.59      0.74       193
       Other       0.98      1.00      0.99     16289
        Test       0.94      0.84      0.89      1255

    accuracy                           0.97     19055
   macro avg       0.97      0.76      0.83     19055
weighted avg       0.97      0.97      0.97     19055



In [297]:
import warnings
warnings.filterwarnings("ignore")

In [298]:
def invert_dict(input_dict: Dict[str, int]) -> Dict[int, List[str]]:
    result = dict()

    for key, value in input_dict.items():
        if value not in result:
            result[value] = [key]
        else:
            result[value].append(key)

    return result

def get_top_popular_words_per_class(raw_data: List[Dict]) -> DataFrame:
    result = {}
    for current in raw_data:
        if "cleanup" in current[message_header] or "minor" in current[message_header] or \
            current[message_header] == "fix" or current[message_header] == "fixes" or \
            current[message_header] == "several fixes" or current[message_header] == "minor fixes" or \
            current[message_header] == "yet another inference bug" or \
            current[message_header] == "some improvements (igor e. mikhailuk)" or \
            current[message_header] == "no changes actually" or \
            current[message_header] == "clean up" or current[message_header] == "cosmetics":

            cluster = cluster1
        elif "test" in current[message_header]:
            cluster = cluster2
        elif "bomb" in current[message_header]:
            cluster = cluster3
        elif "npe" in current[message_header] or "null" in current[message_header]:
            cluster = cluster4
        else:
            cluster = cluster5
            
        if cluster not in result:
            result[cluster] = []
        else:
            result[cluster].append(current[message_header])
        
    for cur_cluster in [cluster1, cluster2, cluster3, cluster4, cluster5]:
        print(f"Current cluster = {cur_cluster}")
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(result[cur_cluster])
        counts_vs_word = invert_dict(tokenizer.word_counts)
        counts_vs_word_sorted = OrderedDict(sorted(counts_vs_word.items(), reverse=True))
        top_popular_words = dict(itertools.islice(counts_vs_word_sorted.items(), 0, 25))
        print(top_popular_words)
        print()
        
get_top_popular_words_per_class(raw_data)   

Current cluster = NoInfo
{56: ['minor'], 54: ['cleanup'], 51: ['some'], 40: ['cmpletion', 'and', 'xmlmeta', 'enchantment'], 21: ['to'], 20: ['improvements'], 11: ['code', 'of'], 10: ['for', 'fixes', 'imporved', 'performance', 'formatter', 'smartencodinginputstream', 'file', 'reading', 'routines', 'implementation'], 4: ['fix', 'changes', 'vf'], 3: ['added', 'methods', 'internal', 'use', 'information', 'optimization'], 2: ['cosmetics'], 1: ['corrected', 'bug', 'in', 'example', 'fixed', 'issue', 'with', 'length', 'an', 'array', 'memory', 'leak', 'change', 'fabrique', 'update', 'version', '0', '3', 'display', 'purposes']}

Current cluster = Test
{401: ['tests'], 362: ['test'], 264: ['replace'], 263: ['to'], 262: ['testing'], 247: ['support'], 244: ['the'], 242: ['first'], 238: ['enable'], 237: ['statement', 'expression', 'implementation', 'used'], 188: ['for'], 116: ['fixed'], 112: ['added'], 101: ['of'], 100: ['in'], 99: ['and'], 78: ['xml'], 75: ['completion'], 72: ['initial'], 70: ['new

In [299]:
# cluster1, cluster2, cluster3, cluster4, cluster5 = "NoInfo", "Test", "Bomb", "Npe", "Other"
candidate1 = "cleanup"
candidate2 = "test"
candidate3 = "timebomb"
candidate4 = "npe fix and null checks removed"
candidate5 = "refactoring and ideadev fix"

In [300]:
################### bleu score; IDEA ###################
total_score = 0.
bad_score_number = 0
cluster_vs_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}
cluster_vs_bad_score_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}

for msg, cluster in zip(msg_test, predicted):
    cluster_vs_number[cluster] += 1
    
    if cluster == cluster1:
        candidate = candidate1
    elif cluster == cluster2:
        candidate = candidate2
    elif cluster == cluster3:
        candidate = candidate3
    elif cluster == cluster4:
        candidate = candidate4
    else:
        candidate = candidate5
    
    score = sentence_bleu([msg], candidate, weights=((1./len(candidate),) * len(candidate)))
        
    if score < 0.001:
        bad_score_number += 1
        cluster_vs_bad_score_number[cluster] += 1
#         if cluster == cluster4:
#             print("Score: {:.6f}; Candidate: {:>16}; Msg: {}".format(score, candidate, msg))
    total_score += score

print()
print(f"Result = {total_score / len(msg_test)}")
print(f"Number of bad score {bad_score_number}; test size {len(msg_test)}")
print(f"Elements number per cluster {cluster_vs_number}")
print(f"Elements number with bad score per cluster {cluster_vs_bad_score_number}")


Result = 0.037946796158592935
Number of bad score 17569; test size 19055
Elements number per cluster {'NoInfo': 1102, 'Test': 1124, 'Bomb': 62, 'Npe': 113, 'Other': 16654}
Elements number with bad score per cluster {'NoInfo': 315, 'Test': 437, 'Bomb': 50, 'Npe': 113, 'Other': 16654}


In [269]:
################### get data; AURORA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "aurora"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
aurora = cluster_all_messages(raw_data)

In [313]:
print(f"Aurora sample number \t\t{aurora.shape}")

total_size = aurora.shape[0]

size1 = (aurora[cluster_header] == cluster1).sum()
size2 = (aurora[cluster_header] == cluster2).sum()
size3 = (aurora[cluster_header] == cluster3).sum()
size4 = (aurora[cluster_header] == cluster4).sum()
size5 = (aurora[cluster_header] == cluster5).sum()

print(f"size cluster1 \t{cluster1}\t{size1}\t%{(size1/total_size)*100}")
print(f"size cluster2 \t{cluster2}\t{size2}\t%{(size2/total_size)*100}")
print(f"size cluster3 \t{cluster3}\t{size3}\t%{(size3/total_size)*100}")
print(f"size cluster4 \t{cluster4}\t{size4}\t%{(size4/total_size)*100}")
print(f"size cluster5 \t{cluster5}\t{size5}\t%{(size5/total_size)*100}")

Aurora sample number 		(19453, 3)
size cluster1 	NoInfo	140	%0.7196833393306945
size cluster2 	Test	1332	%6.847272914203464
size cluster3 	Bomb	60	%0.3084357168560119
size cluster4 	Npe	169	%0.868760602477767
size cluster5 	Other	17752	%91.25584742713207


In [303]:
################### split data ###################

a_msg_train , a_msg_test, a_y_train, a_y_test = train_test_split(aurora[message_header], aurora[cluster_header],
                                                                 test_size=0.3, random_state=742)

In [304]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.23      0.38        13
      NoInfo       0.78      0.89      0.83        47
         Npe       1.00      0.29      0.45        51
       Other       0.94      0.96      0.95      5303
        Test       0.47      0.41      0.44       422

    accuracy                           0.91      5836
   macro avg       0.84      0.56      0.61      5836
weighted avg       0.91      0.91      0.91      5836



In [305]:
################### bleu score; AURORA ###################
total_score = 0.
bad_score_number = 0
cluster_vs_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}
cluster_vs_bad_score_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}

for msg, cluster in zip(a_msg_test, a_predicted):
    cluster_vs_number[cluster] += 1
    
    if cluster == cluster1:
        candidate = candidate1
    elif cluster == cluster2:
        candidate = candidate2
    elif cluster == cluster3:
        candidate = candidate3
    elif cluster == cluster4:
        candidate = candidate4
    else:
        candidate = candidate5
    
    score = sentence_bleu([msg], candidate, weights=((1./len(candidate),) * len(candidate)))
        
    if score < 0.001:
        bad_score_number += 1
        cluster_vs_bad_score_number[cluster] += 1
#         if cluster == cluster4:
#             print("Score: {:.6f}; Candidate: {:>16}; Msg: {}".format(score, candidate, msg))
    total_score += score

print()
print(f"Result = {total_score / len(a_msg_test)}")
print(f"Number of bad score {bad_score_number}; test size {len(a_msg_test)}")
print(f"Elements number per cluster {cluster_vs_number}")
print(f"Elements number with bad score per cluster {cluster_vs_bad_score_number}")


Result = 0.014686711258301514
Number of bad score 5653; test size 5836
Elements number per cluster {'NoInfo': 54, 'Test': 365, 'Bomb': 3, 'Npe': 15, 'Other': 5399}
Elements number with bad score per cluster {'NoInfo': 30, 'Test': 209, 'Bomb': 0, 'Npe': 15, 'Other': 5399}


In [306]:
################### fit on aurora dataset ###################
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(a_msg_train, a_y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.95,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [307]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.54      0.70        13
      NoInfo       1.00      0.51      0.68        47
         Npe       1.00      0.37      0.54        51
       Other       0.95      1.00      0.98      5303
        Test       0.99      0.52      0.68       422

    accuracy                           0.95      5836
   macro avg       0.99      0.59      0.72      5836
weighted avg       0.96      0.95      0.95      5836



In [308]:
################### bleu score; AURORA ###################
total_score = 0.
bad_score_number = 0
cluster_vs_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}
cluster_vs_bad_score_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}

for msg, cluster in zip(a_msg_test, a_predicted):
    cluster_vs_number[cluster] += 1
    
    if cluster == cluster1:
        candidate = candidate1
    elif cluster == cluster2:
        candidate = candidate2
    elif cluster == cluster3:
        candidate = candidate3
    elif cluster == cluster4:
        candidate = candidate4
    else:
        candidate = candidate5
    
    score = sentence_bleu([msg], candidate, weights=((1./len(candidate),) * len(candidate)))
        
    if score < 0.001:
        bad_score_number += 1
        cluster_vs_bad_score_number[cluster] += 1
#         if cluster == cluster4:
#             print("Score: {:.6f}; Candidate: {:>16}; Msg: {}".format(score, candidate, msg))
    total_score += score

print()
print(f"Result = {total_score / len(a_msg_test)}")
print(f"Number of bad score {bad_score_number}; test size {len(a_msg_test)}")
print(f"Elements number per cluster {cluster_vs_number}")
print(f"Elements number with bad score per cluster {cluster_vs_bad_score_number}")


Result = 0.01288976238844559
Number of bad score 5704; test size 5836
Elements number per cluster {'NoInfo': 24, 'Test': 224, 'Bomb': 7, 'Npe': 19, 'Other': 5562}
Elements number with bad score per cluster {'NoInfo': 1, 'Test': 115, 'Bomb': 7, 'Npe': 19, 'Other': 5562}


In [309]:
################### fit on idea and then on aurora ###################
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(msg_train, y_train)
text_clf.fit(a_msg_train, a_y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.95,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [310]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.54      0.70        13
      NoInfo       1.00      0.51      0.68        47
         Npe       1.00      0.37      0.54        51
       Other       0.95      1.00      0.98      5303
        Test       0.99      0.52      0.68       422

    accuracy                           0.95      5836
   macro avg       0.99      0.59      0.72      5836
weighted avg       0.96      0.95      0.95      5836



In [311]:
################### bleu score; AURORA ###################
total_score = 0.
bad_score_number = 0
cluster_vs_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}
cluster_vs_bad_score_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}

for msg, cluster in zip(a_msg_test, a_predicted):
    cluster_vs_number[cluster] += 1
    
    if cluster == cluster1:
        candidate = candidate1
    elif cluster == cluster2:
        candidate = candidate2
    elif cluster == cluster3:
        candidate = candidate3
    elif cluster == cluster4:
        candidate = candidate4
    else:
        candidate = candidate5
    
    score = sentence_bleu([msg], candidate, weights=((1./len(candidate),) * len(candidate)))
        
    if score < 0.001:
        bad_score_number += 1
        cluster_vs_bad_score_number[cluster] += 1
#         if cluster == cluster4:
#             print("Score: {:.6f}; Candidate: {:>16}; Msg: {}".format(score, candidate, msg))
    total_score += score

print()
print(f"Result = {total_score / len(a_msg_test)}")
print(f"Number of bad score {bad_score_number}; test size {len(a_msg_test)}")
print(f"Elements number per cluster {cluster_vs_number}")
print(f"Elements number with bad score per cluster {cluster_vs_bad_score_number}")


Result = 0.01288976238844559
Number of bad score 5704; test size 5836
Elements number per cluster {'NoInfo': 24, 'Test': 224, 'Bomb': 7, 'Npe': 19, 'Other': 5562}
Elements number with bad score per cluster {'NoInfo': 1, 'Test': 115, 'Bomb': 7, 'Npe': 19, 'Other': 5562}


In [312]:
################### bleu score; IDEA ###################
total_score = 0.
bad_score_number = 0
cluster_vs_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}
cluster_vs_bad_score_number = {cluster1: 0, cluster2: 0, cluster3: 0, cluster4: 0, cluster5: 0}

for msg, cluster in zip(msg_test, predicted):
    cluster_vs_number[cluster] += 1
    
    if cluster == cluster1:
        candidate = candidate1
    elif cluster == cluster2:
        candidate = candidate2
    elif cluster == cluster3:
        candidate = candidate3
    elif cluster == cluster4:
        candidate = candidate4
    else:
        candidate = candidate5
    
    score = sentence_bleu([msg], candidate, weights=((1./len(candidate),) * len(candidate)))
        
    if score < 0.001:
        bad_score_number += 1
        cluster_vs_bad_score_number[cluster] += 1
#         if cluster == cluster4:
#             print("Score: {:.6f}; Candidate: {:>16}; Msg: {}".format(score, candidate, msg))
    total_score += score

print()
print(f"Result = {total_score / len(msg_test)}")
print(f"Number of bad score {bad_score_number}; test size {len(msg_test)}")
print(f"Elements number per cluster {cluster_vs_number}")
print(f"Elements number with bad score per cluster {cluster_vs_bad_score_number}")


Result = 0.037946796158592935
Number of bad score 17569; test size 19055
Elements number per cluster {'NoInfo': 1102, 'Test': 1124, 'Bomb': 62, 'Npe': 113, 'Other': 16654}
Elements number with bad score per cluster {'NoInfo': 315, 'Test': 437, 'Bomb': 50, 'Npe': 113, 'Other': 16654}


## Only identifiers

In [None]:
################### get data; IDEA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "intellij"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs_identifiers.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
df = cluster_all_messages(raw_data)