In [2]:
import collections
from collections import Counter
import csv
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
import string
from typing import Dict, List, Tuple, Optional

In [3]:
def get_sentence_tokens_and_tags(sentence: str) -> List[Tuple[str, str]]:
    sentence_without_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(sentence_without_punctuation)
    return nltk.pos_tag(tokens)

def get_sentence_tags(sentence: str) -> List[str]:
    tokens: List[Tuple[str, str]] = get_sentence_tokens_and_tags(sentence)
#     print(tokens)
    only_tags: List[str] = []
    
    for _, tag in tokens:
        only_tags.append(tag)
#     print(only_tags)
    return only_tags

In [4]:
# test for tagging 

print(get_sentence_tags("This commit was manufactured by cvs2svn to create branch 'lesya'"))
print(get_sentence_tags("fixes: sorting of resource bundle properties in UI Designer; extra method invocation when finding classes in Debugger; resource bundles in UI designer can be used even if they are located in non-default package"))

['DT', 'NN', 'VBD', 'VBN', 'IN', 'NN', 'TO', 'VB', 'NN', 'NN']
['NNS', 'VBG', 'IN', 'NN', 'NN', 'NNS', 'IN', 'NNP', 'NNP', 'JJ', 'NN', 'NN', 'WRB', 'VBG', 'NNS', 'IN', 'NNP', 'NN', 'NNS', 'IN', 'NNP', 'NN', 'MD', 'VB', 'VBN', 'RB', 'IN', 'PRP', 'VBP', 'VBN', 'IN', 'NN', 'NN']


In [5]:
# get dataset for actual messages
data_file = "/Users/natalia.murycheva/Documents/code2seq/result.aurora.correct.csv"
commit_vs_message: Dict[str, str] = {}
commit_vs_tags: Dict[str, List[str]] = {}
commit_index = 0
actual_message_index = 3
i = 0   

with open(data_file, newline='') as f:
    reader = csv.reader(f, delimiter='^')
    is_first = True
    last_commit = ""
    
    for row in reader:
        if is_first:
            last_commit = row[commit_index]
            is_first = False
            
        if row[commit_index] != "":
            last_commit = row[commit_index]
        
            if row[actual_message_index] != "":
                commit_vs_message[last_commit] = row[actual_message_index]
                tags = get_sentence_tags(row[actual_message_index])
                if tags:                    
                    i += 1
                    commit_vs_tags[last_commit] = tags
                    
print(f"Number of messages = {i}")

Number of messages = 16923


In [6]:
len(commit_vs_tags)

16923

In [7]:
tags_dataset: List[List[str]] = []
    
for key, value in commit_vs_tags.items():
    tags_dataset.append(value)

In [8]:
print(tags_dataset[:10])

[['NN'], ['NN'], ['NNS'], ['NN'], ['NN'], ['DT', 'NN', 'VBD', 'VBN', 'IN', 'NN', 'TO', 'VB', 'NN', 'NN'], ['NN'], ['NN'], ['NN'], ['NN']]


# Bigram Language Model

In [9]:
from nltk.lm import MLE

n = 3
train, padded = padded_everygram_pipeline(n, tags_dataset)
model = MLE(n)

In [10]:
for i, a in enumerate(train):
    print(list(a))
    if i > 15:
        break

[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NNS',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NNS'), ('NNS', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NNS'), ('<s>', 'NNS', '</s>'), ('NNS', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('DT',), ('NN',), ('VBD',), ('VBN',), ('IN'

In [11]:
model.fit(train, padded)

In [12]:
# model.score('is', 'language'.split())  # P('is'|'language')
# model.score('never', 'language is'.split())  # P('never'|'language is')

print(model.score('NN', 'IN'.split()))
print(model.score('VBN', 'VBD'.split()))
print(model.score('VBN', 'NN VBD'.split()))

0.40547703180212014
0.0525092936802974
0.2515923566878981


In [13]:
print(model.counts['NN'])
print(model.counts['<s>'])
print(model.counts['</s>'])

16873
33812
33812


# Filter predicted tags

In [14]:
# get data
commit_vs_predicted_message: Dict[str, List[List[Tuple[str, str]]]] = collections.defaultdict(list)
    
commit_index = 0
predicted_message_index = 5
    

code2seq_commits_all = set()
with open(data_file, newline='') as f:
    reader = csv.reader(f, delimiter='^')
    is_first = True
    last_commit = ""
    
    for row in reader:
        if is_first:
            last_commit = row[commit_index]
            code2seq_commits_all.add(last_commit)
            is_first = False
            
        if row[commit_index] != "":
            last_commit = row[commit_index]
        
        predicted_message = row[predicted_message_index]
        if predicted_message != "":
            tokens_and_tags = get_sentence_tokens_and_tags(predicted_message)
            if tokens_and_tags:
                code2seq_commits_all.add(last_commit)
                commit_vs_predicted_message[last_commit].append(tokens_and_tags)

In [15]:
len(commit_vs_predicted_message)

19036

In [16]:
j = 0
for key, value in commit_vs_predicted_message.items():
    j += 1
    if j > 5:
        break
    print(value)

[[('first', 'RB'), ('replace', 'VB'), ('statement', 'NN'), ('expression', 'NN'), ('implementation', 'NN'), ('used', 'VBN'), ('to', 'TO'), ('enable', 'VB'), ('the', 'DT'), ('testing', 'NN')], [('first', 'RB'), ('replace', 'VB'), ('statement', 'NN'), ('expression', 'NN'), ('implementation', 'NN'), ('used', 'VBN'), ('to', 'TO'), ('enable', 'VB'), ('the', 'DT'), ('testing', 'NN')]]
[[('add', 'VB'), ('support', 'NN'), ('for', 'IN'), ('different', 'JJ'), ('license', 'NN'), ('managers', 'NNS'), ('ui', 'VBP'), ('bug', 'NN')]]
[[('fixed', 'VBN'), ('issue', 'NN'), ('with', 'IN'), ('not', 'RB'), ('external', 'JJ'), ('up', 'RP'), ('in', 'IN'), ('xml', 'NN')], [('added', 'VBD'), ('css', 'NN'), ('xhtml', 'NNP'), ('binding', 'VBG'), ('second', 'JJ'), ('cut', 'NN'), ('specified', 'VBN'), ('if', 'IN')], [('added', 'JJ'), ('usage', 'NN'), ('view', 'NN'), ('to', 'TO'), ('openapi', 'VB'), ('bundle', 'JJ'), ('been', 'VBN'), ('check', 'VB'), ('start', 'NN')]]
[[('fixes', 'NNS'), ('sorting', 'VBG'), ('of', '

In [20]:
def get_proba(tag: str, resulted_token_and_tags: List[Tuple[str, str]]) -> float:
    global model
    if len(resulted_token_and_tags) >= 2:
        prev_tag = resulted_token_and_tags[-1][1]
        prev_prev_tag = resulted_token_and_tags[-2][1]

        bigram_proba = model.score(tag, prev_tag)
        threegram_proba = model.score(tag, [prev_prev_tag, prev_tag])

        return bigram_proba + threegram_proba
    elif len(resulted_token_and_tags) == 1:
        prev_tag = resulted_token_and_tags[-1][1]
        return model.score(tag, prev_tag)
    else:
        return 1.

    
def delete_serial_same_tokens(tokens_and_tags: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    results_tokens = []

    prev = tokens_and_tags[0][0]
    results_tokens.append(tokens_and_tags[0])
    for token, tag in tokens_and_tags[1:]:
        if token != prev:
            results_tokens.append((token, tag))
            prev = token
    return results_tokens

def filter_tags_by_probability(tokens_and_tags: List[Tuple[str, str]], threshold=0.20) -> str:
    resulted_tuples: List[Tuple[str, str]] = []
    tokens_and_tags_no_d = delete_serial_same_tokens(tokens_and_tags)
    size = len(tokens_and_tags_no_d)
    
    if size > 1:
        mid = size // 2 
        for i in range(mid):
            resulted_tuples.append(tokens_and_tags_no_d[i])

        for token, tag in tokens_and_tags_no_d[mid:]:
            proba = get_proba(tag, resulted_tuples)
            if proba > threshold:
                resulted_tuples.append((token, tag))
        return [token for token, tag in resulted_tuples]
    else:
        return [token for token, tag in tokens_and_tags]

In [21]:
j = 0
for commit in commit_vs_predicted_message.keys():
    if commit_vs_predicted_message[commit]:
        j += 1

print(j)

19036


In [22]:
filtered_predicted_messages: Dict[str, List[List[str]]] = collections.defaultdict(list)

for commit in commit_vs_predicted_message.keys():    
    all_messages = commit_vs_predicted_message[commit]
    for message in all_messages:
        filtered_message = filter_tags_by_probability(message)
        filtered_predicted_messages[commit].append(filtered_message)

In [23]:
len(filtered_predicted_messages)

19036

In [25]:
len(commit_vs_predicted_message)

19036

# Merge all messages in one

In [271]:
def get_common_message_prefix(messages: List[List[str]]) -> Optional[List[str]]:
    result_tokens = []
    max_size = max([len(message) for message in messages])
    
    for i in range(max_size):
        tokens_slice = []
        for message in messages:
            try:
                tokens_slice.append(message[i])
            except IndexError:
                pass
        counter = Counter(tokens_slice)
        result_tokens.append(max(counter, key=counter.get))

        if result_tokens in messages:
            return result_tokens
    return None

def leave_only_one_message_per_commit(messages: List[List[str]]) -> str:
    messages_str = [" ".join([token for token in message]) for message in messages]
    unique_messages = set(messages_str)
    if len(messages_str) > len(unique_messages):
        counter = Counter(messages_str)
        max_message = max(counter, key=counter.get)
        number_of_dublicates = counter[max_message]
        if number_of_dublicates > 1:
            return max_message
    
    common_message: List[str] = get_common_message_prefix(messages)
    if common_message:
        return " ".join(common_message)
 
    return "massive changes"
        
# tests
a = [['fixed', 'another', 'issue', 'with', 'xml'],
    ['scr', 'num', 'get', 'file'],
    ['add', 'source', 'specification'],
    ['fixed', 'another', 'issue', 'with', 'xml', 'url'],
    ['add', 'children', 'profile'],
    ['add', 'default', 'hint'],
    ['add', 'default'],
    ['add', 'default', 'profile', 'hint', 'completion'],
    ['add', 'default', 'hint']]
print(f"max = {leave_only_one_message_per_commit(a)}")
# for commit in filtered_predicted_messages.keys():
        

max = add default hint


In [272]:
one_message_vs_commit: List[Dict[str, str]] = []
tmp_commits = set()

for commit in filtered_predicted_messages.keys():
    tmp_commits.add(commit)
    one_message_vs_commit.append({"commit" : commit,
                                  "message": leave_only_one_message_per_commit(filtered_predicted_messages[commit])})

In [273]:
len(one_message_vs_commit)

19036

In [274]:
len(code2seq_commits_all & tmp_commits)

19036

In [275]:
len(log_commits_all & tmp_commits)

19036

## Write result

In [277]:
csv_out_file = "/Users/natalia.murycheva/PycharmProjects/gitCommitMessageCollector/" \
               "naive_bayes/filtered.code2seq.result.csv"

csv_columns = ["commit", "message"]
try:
    with open(csv_out_file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns, )
        writer.writeheader()
        for data in one_message_vs_commit:
            writer.writerow(data)
except IOError:
    print("I/O error") 

In [278]:
log = "/Users/natalia.murycheva/Documents/"\
      "gitCommitMessageCollectorStorage/gcm_aurora_com_com_msg_author_date.log"
SEPARATOR = "THIS_STRING_WILL_NEVER_APPEAR_IN_DATASET_AND_IT_WILL_BE_USED_AS_SEPARATOR"

log_commits_all = set()
with open(log, 'r') as full_log_file:
    for line in full_log_file:
        if line.startswith("parent_commit_file_hash"):
            continue
        i += 1
        line_list = line.split(SEPARATOR)
        commit, message = line_list[1], line_list[4]
        log_commits_all.add(commit)


In [279]:
len(code2seq_commits_all)

19037

In [280]:
len(code2seq_commits_all & log_commits_all)

19037

In [281]:
len(log_commits_all)

37077