In [68]:
import json
import os
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from typing import List, Dict

In [20]:
cluster_header, tokens_header, message_header = "cluster", "tokens", "message"
cluster1, cluster2, cluster3, cluster4, cluster5 = "NoInfo", "Test", "Bomb", "Npe", "Other"

In [159]:
def load_and_filter_data(input_file: str):
    with open(input_file, 'r') as file:
        data = json.load(file)

    elements_to_delete = []
    for each in data:
        if each[message_header] == "(no message)":
            elements_to_delete.append(each)
    for e in elements_to_delete:
        data.remove(e)

    return data


def cluster_all_messages(raw_data: List[Dict]) -> DataFrame:
    result = []
    deleted_tokens, added_tokens = "deletedTokens", "addedTokens"
    for current in raw_data:
        if "cleanup" in current[message_header] or "minor" in current[message_header] or \
            current[message_header] == "fix" or current[message_header] == "fixes" or \
            current[message_header] == "several fixes" or current[message_header] == "minor fixes" or \
            current[message_header] == "yet another inference bug" or \
            current[message_header] == "some improvements (igor e. mikhailuk)" or \
            current[message_header] == "no changes actually" or \
            current[message_header] == "clean up" or current[message_header] == "cosmetics":
            
            cluster = cluster1
        elif "test" in current[message_header] and \
            current[message_header] != "This commit was manufactured by cvs2svn to create branch 'testing'." and \
            current[message_header] != "This commit was manufactured by cvs2svn to create branch 'test'." and \
            current[message_header] != "This commit was manufactured by cvs2svn to create branch 'test_for_ven'.":

            cluster = cluster2
        elif "bomb" in current[message_header]:
            cluster = cluster3
        elif "npe" in current[message_header] or "null" in current[message_header]:
            cluster = cluster4
        else:
            cluster = cluster5
        
        if current[deleted_tokens]:  # list with tokens not empty
            result.append({cluster_header: cluster,
                           tokens_header: current[deleted_tokens],
                           message_header: current[message_header]
                           })
        if current[added_tokens]:  # list with tokens not empty
            result.append({cluster_header: cluster,
                       tokens_header: current[added_tokens],
                       message_header: current[message_header]
                       })
    return DataFrame(result)

In [160]:
################### get data; IDEA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "intellij"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
df = cluster_all_messages(raw_data)

In [81]:
print(f"total shape \t\t{df.shape}")

total_size = df.shape[0]

size1 = (df[cluster_header] == cluster1).sum()
size2 = (df[cluster_header] == cluster2).sum()
size3 = (df[cluster_header] == cluster3).sum()
size4 = (df[cluster_header] == cluster4).sum()
size5 = (df[cluster_header] == cluster5).sum()

print(f"size cluster1 \t{cluster1}\t{size1}\t%{(size1/total_size)*100}")
print(f"size cluster2 \t{cluster2}\t{size2}\t%{(size2/total_size)*100}")
print(f"size cluster3 \t{cluster3}\t{size3}\t%{(size3/total_size)*100}")
print(f"size cluster4 \t{cluster4}\t{size4}\t%{(size4/total_size)*100}")
print(f"size cluster5 \t{cluster5}\t{size5}\t%{(size5/total_size)*100}")

total shape 		(101026, 3)
size cluster1 	NoInfo	4025	%3.9841228990556887
size cluster2 	Test	3711	%3.6733118207194186
size cluster3 	Bomb	421	%0.4167244075782472
size cluster4 	Npe	643	%0.6364698196503871
size cluster5 	Other	92226	%91.28937105299626


In [102]:
################### split data ###################

msg_train , msg_test, y_train, y_test = train_test_split(df[message_header], df[cluster_header],
                                                         test_size=0.3, random_state=442)

In [135]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(msg_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [136]:
################### classififcation results ###################
predicted = text_clf.predict(msg_test)
np.mean(predicted == y_test)    
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.23      0.37       131
      NoInfo       0.96      0.77      0.85      1154
         Npe       1.00      0.31      0.47       204
       Other       0.97      1.00      0.98     27728
        Test       0.90      0.68      0.78      1091

    accuracy                           0.97     30308
   macro avg       0.97      0.60      0.69     30308
weighted avg       0.97      0.97      0.97     30308



In [116]:
import warnings
warnings.filterwarnings("ignore")

In [137]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    if candidate == "smth was changed":
        score = sentence_bleu([msg], candidate, weights=(1./3, 1./3, 1./3, 0))
    else:
        score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
        
#     if score > 0.0001:
#         print("Score: {:.2f}; Candidate: {:>16} Msg: {}".format(score, candidate, msg))
    total_score += score

print()
print(f"Result = {total_score / len(msg_test)}")


Result = 0.03583387395378652


In [138]:
################### get data; AURORA ###################
parent_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
git_dir_name = "aurora"
git_dir = os.path.join(parent_dir, git_dir_name)
json_with_diffs = f"{git_dir_name}_diff_blobs.json"
json_with_diffs = os.path.join(parent_dir, json_with_diffs)

raw_data = load_and_filter_data(json_with_diffs)
aurora = cluster_all_messages(raw_data)
#print(aurora)
# This commit was manufactured by cvs2svn to create branch 'testing'.
# This commit was manufactured by cvs2svn to create branch 'test'.
# This commit was manufactured by cvs2svn to create branch 'test_for_ven'.

In [139]:
print(f"total shape \t\t{aurora.shape}")

total_size = aurora.shape[0]

size1 = (aurora[cluster_header] == cluster1).sum()
size2 = (aurora[cluster_header] == cluster2).sum()
size3 = (aurora[cluster_header] == cluster3).sum()
size4 = (aurora[cluster_header] == cluster4).sum()
size5 = (aurora[cluster_header] == cluster5).sum()

print(f"size cluster1 \t{cluster1}\t{size1}\t%{(size1/total_size)*100}")
print(f"size cluster2 \t{cluster2}\t{size2}\t%{(size2/total_size)*100}")
print(f"size cluster3 \t{cluster3}\t{size3}\t%{(size3/total_size)*100}")
print(f"size cluster4 \t{cluster4}\t{size4}\t%{(size4/total_size)*100}")
print(f"size cluster5 \t{cluster5}\t{size5}\t%{(size5/total_size)*100}")

total shape 		(172323, 3)
size cluster1 	NoInfo	138	%0.08008217127139151
size cluster2 	Test	1241	%0.7201592358536005
size cluster3 	Bomb	60	%0.03481833533538762
size cluster4 	Npe	169	%0.09807164452800844
size cluster5 	Other	170715	%99.06686861301162


In [141]:
################### split data ###################

a_msg_train , a_msg_test, a_y_train, a_y_test = train_test_split(aurora[message_header], aurora[cluster_header],
                                                                 test_size=0.3, random_state=742)

In [142]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       0.00      0.00      0.00        17
      NoInfo       0.83      0.65      0.73        31
         Npe       0.00      0.00      0.00        45
       Other       0.99      1.00      1.00     51225
        Test       0.75      0.24      0.36       379

    accuracy                           0.99     51697
   macro avg       0.52      0.38      0.42     51697
weighted avg       0.99      0.99      0.99     51697



In [144]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    if candidate == "smth was changed":
        score = sentence_bleu([msg], candidate, weights=(1./3, 1./3, 1./3, 0))
    else:
        score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.03056921293487318


In [145]:
################### fit on aurora dataset ###################
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(a_msg_train, a_y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [146]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.41      0.58        17
      NoInfo       1.00      0.68      0.81        31
         Npe       0.85      0.78      0.81        45
       Other       1.00      0.98      0.99     51225
        Test       0.23      0.76      0.35       379

    accuracy                           0.98     51697
   macro avg       0.82      0.72      0.71     51697
weighted avg       0.99      0.98      0.98     51697



In [148]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
#     if candidate == "smth was changed":
#         score = sentence_bleu([msg], candidate, weights=(1./3, 1./3, 1./3, 0))
#     else:
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.03056921293487318


In [149]:
################### fit on idea and then on aurora ###################
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(msg_train, y_train)
text_clf.fit(a_msg_train, a_y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [150]:
################### classififcation results ###################
a_predicted = text_clf.predict(a_msg_test)
np.mean(a_predicted == a_y_test)    
print(metrics.classification_report(a_y_test, a_predicted))

              precision    recall  f1-score   support

        Bomb       1.00      0.41      0.58        17
      NoInfo       1.00      0.68      0.81        31
         Npe       0.85      0.78      0.81        45
       Other       1.00      0.98      0.99     51225
        Test       0.23      0.76      0.35       379

    accuracy                           0.98     51697
   macro avg       0.82      0.72      0.71     51697
weighted avg       0.99      0.98      0.98     51697



In [152]:
################### bleu score ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "cleanup"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    if candidate == "smth was changed":
        score = sentence_bleu([msg], candidate, weights=(1./3, 1./3, 1./3, 0))
    else:
        score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.03056921293487318


In [153]:
################### bleu score; change cluster1 ###################
total_score = 0.
for msg, cluster in zip(a_msg_test, predicted):
    if cluster == cluster1:
        candidate = "fix"
    elif cluster == cluster2:
        candidate = "test"
    elif cluster == cluster3:
        candidate = "bomb"
    elif cluster == cluster4:
        candidate = "npe"
    else:
        candidate = "smth was changed"
    
    score = sentence_bleu([msg], candidate, weights=(1, 0, 0, 0))
    total_score += score

print(f"Result = {total_score / len(a_msg_test)}")

Result = 0.030483569957176968
