In [68]:
import json
import os
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from typing import List, Dict

In [20]:
cluster_header, tokens_header, message_header = "cluster", "tokens", "message"
cluster1, cluster2, cluster3, cluster4, cluster5 = "NoInfo", "Test", "Bomb", "Npe", "Other"

In [85]:
def load_and_filter_data(input_file: str):
    with open(input_file, 'r') as file:
        data = json.load(file)

    elements_to_delete = []
    for each in data:
        if each[message_header] == "(no message)":
            elements_to_delete.append(each)
    for e in elements_to_delete:
        data.remove(e)

    return data


def cluster_all_messages(raw_data: List[Dict]) -> DataFrame:
    result = []
    deleted_tokens, added_tokens = "deletedTokens", "addedTokens"
    for current in raw_data:
        if "cleanup" in current[message_header] or "minor" in current[message_header] or \
            current[message_header] == "fix" or current[message_header] == "fixes" or \
            current[message_header] == "several fixes" or current[message_header] == "minor fixes" or \
            current[message_header] == "yet another inference bug" or \
            current[message_header] == "some improvements (igor e. mikhailuk)" or \
            current[message_header] == "no changes actually" or \
            current[message_header] == "clean up" or current[message_header] == "cosmetics":
            
            cluster = cluster1
 
        elif "test" in current[message_header]:
            cluster = cluster2
        elif "bomb" in current[message_header]:
            cluster = cluster3
        elif "npe" in current[message_header] or "null" in current[message_header]:
            cluster = cluster4
        else:
            cluster = cluster5
        
        if current[deleted_tokens]:  # list with tokens not empty
            result.append({cluster_header: cluster,
                           tokens_header: current[deleted_tokens],
                           message_header: current[message_header]
                           })
        if current[added_tokens]:  # list with tokens not empty
            result.append({cluster_header: cluster,
                       tokens_header: current[added_tokens],
                       message_header: current[message_header]
                       })

    return DataFrame(result)

In [31]:
################### get data; IDEA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "intellij"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
df = cluster_all_messages(raw_data)
print(df)

       cluster                                            message  \
0        Other  don't retrieve method separators for binary fi...   
1        Other  don't retrieve method separators for binary fi...   
2        Other  force refresh properties on roots change (IDEA...   
3        Other  force refresh properties on roots change (IDEA...   
4       NoInfo                                            cleanup   
5       NoInfo                                            cleanup   
6       NoInfo                                            cleanup   
7       NoInfo                                            cleanup   
8       NoInfo                                            cleanup   
9       NoInfo                                            cleanup   
10      NoInfo                                            cleanup   
11      NoInfo                                            cleanup   
12       Other  avoid showing error twice on saving combobox p...   
13       Other  avoid showing erro

In [81]:
print(f"total shape \t\t{df.shape}")

total_size = df.shape[0]

size1 = (df[cluster_header] == cluster1).sum()
size2 = (df[cluster_header] == cluster2).sum()
size3 = (df[cluster_header] == cluster3).sum()
size4 = (df[cluster_header] == cluster4).sum()
size5 = (df[cluster_header] == cluster5).sum()

print(f"size cluster1 \t{cluster1}\t{size1}\t%{(size1/total_size)*100}")
print(f"size cluster2 \t{cluster2}\t{size2}\t%{(size2/total_size)*100}")
print(f"size cluster3 \t{cluster3}\t{size3}\t%{(size3/total_size)*100}")
print(f"size cluster4 \t{cluster4}\t{size4}\t%{(size4/total_size)*100}")
print(f"size cluster5 \t{cluster5}\t{size5}\t%{(size5/total_size)*100}")

total shape 		(101026, 3)
size cluster1 	NoInfo	4025	%3.9841228990556887
size cluster2 	Test	3711	%3.6733118207194186
size cluster3 	Bomb	421	%0.4167244075782472
size cluster4 	Npe	643	%0.6364698196503871
size cluster5 	Other	92226	%91.28937105299626


In [75]:
################### split data ###################

msg_train , msg_test, y_train, y_test = train_test_split(df[message_header], df[cluster_header],
                                                         test_size=0.3, random_state=42)

In [76]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(msg_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [77]:
################### classififcation results ###################
predicted = text_clf.predict(msg_test)
np.mean(predicted == y_test)    
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.20      0.33       123
      NoInfo       0.97      0.76      0.85      1203
         Npe       1.00      0.38      0.55       194
       Other       0.97      1.00      0.99     27695
        Test       0.90      0.71      0.79      1093

    accuracy                           0.97     30308
   macro avg       0.97      0.61      0.70     30308
weighted avg       0.97      0.97      0.97     30308



In [78]:
import warnings
warnings.filterwarnings("ignore")

In [79]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(msg_test)}")

Result = 0.22162699056501112


In [83]:
################### get data; AURORA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "aurora"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
aurora = cluster_all_messages(raw_data)
print(aurora)

       cluster                                            message  \
0         Test                                               test   
1         Test                                               test   
2        Other  This commit was manufactured by cvs2svn to cre...   
3        Other  This commit was manufactured by cvs2svn to cre...   
4        Other                                             import   
5        Other                                             import   
6         Test                                               test   
7         Test                                               test   
8        Other                                               init   
9        Other                                               init   
10       Other                                               init   
11       Other                                No changes actually   
12       Other                                No changes actually   
13       Other                    

In [86]:
print(f"total shape \t\t{aurora.shape}")

total_size = aurora.shape[0]

size1 = (aurora[cluster_header] == cluster1).sum()
size2 = (aurora[cluster_header] == cluster2).sum()
size3 = (aurora[cluster_header] == cluster3).sum()
size4 = (aurora[cluster_header] == cluster4).sum()
size5 = (aurora[cluster_header] == cluster5).sum()

print(f"size cluster1 \t{cluster1}\t{size1}\t%{(size1/total_size)*100}")
print(f"size cluster2 \t{cluster2}\t{size2}\t%{(size2/total_size)*100}")
print(f"size cluster3 \t{cluster3}\t{size3}\t%{(size3/total_size)*100}")
print(f"size cluster4 \t{cluster4}\t{size4}\t%{(size4/total_size)*100}")
print(f"size cluster5 \t{cluster5}\t{size5}\t%{(size5/total_size)*100}")

total shape 		(172323, 3)
size cluster1 	NoInfo	138	%0.08008217127139151
size cluster2 	Test	34255	%19.878367948561714
size cluster3 	Bomb	60	%0.03481833533538762
size cluster4 	Npe	169	%0.09807164452800844
size cluster5 	Other	137701	%79.9086599003035


In [87]:
################### split data ###################

a_msg_train , a_msg_test, a_y_train, a_y_test = train_test_split(aurora[message_header], aurora[cluster_header],
                                                                 test_size=0.3, random_state=42)

In [88]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       0.00      0.00      0.00        20
      NoInfo       0.94      0.71      0.81        41
         Npe       1.00      0.02      0.04        48
       Other       0.80      1.00      0.89     41234
        Test       0.81      0.01      0.02     10354

    accuracy                           0.80     51697
   macro avg       0.71      0.35      0.35     51697
weighted avg       0.80      0.80      0.71     51697



In [89]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.030793871912447895


In [90]:
################### fit on aurora dataset ###################
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(a_msg_train, a_y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [91]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.60      0.75        20
      NoInfo       0.86      0.76      0.81        41
         Npe       0.21      0.77      0.33        48
       Other       1.00      1.00      1.00     41234
        Test       1.00      0.98      0.99     10354

    accuracy                           0.99     51697
   macro avg       0.81      0.82      0.77     51697
weighted avg       1.00      0.99      0.99     51697



In [92]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.030793871912447895


In [93]:
################### fit on idea and then on aurora ###################
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(msg_train, y_train)
text_clf.fit(a_msg_train, a_y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [94]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.60      0.75        20
      NoInfo       0.86      0.76      0.81        41
         Npe       0.21      0.77      0.33        48
       Other       1.00      1.00      1.00     41234
        Test       1.00      0.98      0.99     10354

    accuracy                           0.99     51697
   macro avg       0.81      0.82      0.77     51697
weighted avg       1.00      0.99      0.99     51697



In [95]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.030793871912447895


In [96]:
################### bleu score; change cluster1 ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "fix"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.03071389456281856
