In [182]:
import collections
from collections import Counter
import csv
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
import string
from typing import Dict, List, Tuple, Optional

In [107]:
def get_sentence_tokens_and_tags(sentence: str) -> List[Tuple[str, str]]:
    sentence_without_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(sentence_without_punctuation)
    return nltk.pos_tag(tokens)

def get_sentence_tags(sentence: str) -> List[str]:
    tokens: List[Tuple[str, str]] = get_sentence_tokens_and_tags(sentence)
#     print(tokens)
    only_tags: List[str] = []
    
    for _, tag in tokens:
        only_tags.append(tag)
#     print(only_tags)
    return only_tags

In [103]:
# test for tagging 

print(get_sentence_tags("This commit was manufactured by cvs2svn to create branch 'lesya'"))
print(get_sentence_tags("fixes: sorting of resource bundle properties in UI Designer; extra method invocation when finding classes in Debugger; resource bundles in UI designer can be used even if they are located in non-default package"))

[('This', 'DT'), ('commit', 'NN'), ('was', 'VBD'), ('manufactured', 'VBN'), ('by', 'IN'), ('cvs2svn', 'NN'), ('to', 'TO'), ('create', 'VB'), ('branch', 'NN'), ('lesya', 'NN')]
['DT', 'NN', 'VBD', 'VBN', 'IN', 'NN', 'TO', 'VB', 'NN', 'NN']
['DT', 'NN', 'VBD', 'VBN', 'IN', 'NN', 'TO', 'VB', 'NN', 'NN']
[('fixes', 'NNS'), ('sorting', 'VBG'), ('of', 'IN'), ('resource', 'NN'), ('bundle', 'NN'), ('properties', 'NNS'), ('in', 'IN'), ('UI', 'NNP'), ('Designer', 'NNP'), ('extra', 'JJ'), ('method', 'NN'), ('invocation', 'NN'), ('when', 'WRB'), ('finding', 'VBG'), ('classes', 'NNS'), ('in', 'IN'), ('Debugger', 'NNP'), ('resource', 'NN'), ('bundles', 'NNS'), ('in', 'IN'), ('UI', 'NNP'), ('designer', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('even', 'RB'), ('if', 'IN'), ('they', 'PRP'), ('are', 'VBP'), ('located', 'VBN'), ('in', 'IN'), ('nondefault', 'NN'), ('package', 'NN')]
['NNS', 'VBG', 'IN', 'NN', 'NN', 'NNS', 'IN', 'NNP', 'NNP', 'JJ', 'NN', 'NN', 'WRB', 'VBG', 'NNS', 'IN', 'NNP', 

In [109]:
# get dataset for actual messages
data_file = "/Users/natalia.murycheva/Documents/code2seq/result.aurora.correct.csv"
commit_vs_message: Dict[str, str] = {}
commit_vs_tags: Dict[str, List[str]] = {}
commit_index = 0
actual_message_index = 3
i = 0   

with open(data_file, newline='') as f:
    reader = csv.reader(f, delimiter='^')
    is_first = True
    last_commit = ""
    
    for row in reader:
        if is_first:
            last_commit = row[commit_index]
            is_first = False
            
        if row[commit_index] != "":
            last_commit = row[commit_index]
        
            if row[actual_message_index] != "":
                commit_vs_message[last_commit] = row[actual_message_index]
                tags = get_sentence_tags(row[actual_message_index])
                if tags:                    
                    i += 1
                    commit_vs_tags[last_commit] = tags
                    
print(f"Number of messages = {i}")

Number of messages = 16923


In [110]:
len(commit_vs_tags)

16923

In [111]:
tags_dataset: List[List[str]] = []
    
for key, value in commit_vs_tags.items():
    tags_dataset.append(value)

In [112]:
print(tags_dataset[:10])

[['NN'], ['NN'], ['NNS'], ['NN'], ['NN'], ['DT', 'NN', 'VBD', 'VBN', 'IN', 'NN', 'TO', 'VB', 'NN', 'NN'], ['NN'], ['NN'], ['NN'], ['NN']]


# Bigram Language Model

In [113]:
from nltk.lm import MLE

n = 3
train, padded = padded_everygram_pipeline(n, tags_dataset)
model = MLE(n)

In [114]:
for i, a in enumerate(train):
    print(list(a))
    if i > 15:
        break

[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NNS',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NNS'), ('NNS', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NNS'), ('<s>', 'NNS', '</s>'), ('NNS', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('NN',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'NN'), ('NN', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'NN'), ('<s>', 'NN', '</s>'), ('NN', '</s>', '</s>')]
[('<s>',), ('<s>',), ('DT',), ('NN',), ('VBD',), ('VBN',), ('IN'

In [115]:
model.fit(train, padded)

In [116]:
# model.score('is', 'language'.split())  # P('is'|'language')
# model.score('never', 'language is'.split())  # P('never'|'language is')

print(model.score('NN', 'IN'.split()))
print(model.score('VBN', 'VBD'.split()))
print(model.score('VBN', 'NN VBD'.split()))

0.40547703180212014
0.0525092936802974
0.2515923566878981


In [117]:
print(model.counts['NN'])
print(model.counts['<s>'])
print(model.counts['</s>'])

16873
33812
33812


# Filter predicted tags

In [125]:
# get data
commit_vs_predicted_message: Dict[str, List[List[Tuple[str, str]]]] = collections.defaultdict(list)
    
commit_index = 0
predicted_message_index = 5
    

with open(data_file, newline='') as f:
    reader = csv.reader(f, delimiter='^')
    is_first = True
    last_commit = ""
    
    for row in reader:
        if is_first:
            last_commit = row[commit_index]
            is_first = False
            
        if row[commit_index] != "":
            last_commit = row[commit_index]
        
        predicted_message = row[predicted_message_index]
        if predicted_message != "":
            tokens_and_tags = get_sentence_tokens_and_tags(predicted_message)
            if tokens_and_tags:
                commit_vs_predicted_message[last_commit].append(tokens_and_tags)

In [127]:
j = 0
for key, value in commit_vs_predicted_message.items():
    j += 1
    if j > 5:
        break
    print(value)

[[('first', 'RB'), ('replace', 'VB'), ('statement', 'NN'), ('expression', 'NN'), ('implementation', 'NN'), ('used', 'VBN'), ('to', 'TO'), ('enable', 'VB'), ('the', 'DT'), ('testing', 'NN')], [('first', 'RB'), ('replace', 'VB'), ('statement', 'NN'), ('expression', 'NN'), ('implementation', 'NN'), ('used', 'VBN'), ('to', 'TO'), ('enable', 'VB'), ('the', 'DT'), ('testing', 'NN')]]
[[('add', 'VB'), ('support', 'NN'), ('for', 'IN'), ('different', 'JJ'), ('license', 'NN'), ('managers', 'NNS'), ('ui', 'VBP'), ('bug', 'NN')]]
[[('fixed', 'VBN'), ('issue', 'NN'), ('with', 'IN'), ('not', 'RB'), ('external', 'JJ'), ('up', 'RP'), ('in', 'IN'), ('xml', 'NN')], [('added', 'VBD'), ('css', 'NN'), ('xhtml', 'NNP'), ('binding', 'VBG'), ('second', 'JJ'), ('cut', 'NN'), ('specified', 'VBN'), ('if', 'IN')], [('added', 'JJ'), ('usage', 'NN'), ('view', 'NN'), ('to', 'TO'), ('openapi', 'VB'), ('bundle', 'JJ'), ('been', 'VBN'), ('check', 'VB'), ('start', 'NN')]]
[[('fixes', 'NNS'), ('sorting', 'VBG'), ('of', '

In [156]:
def get_proba(tag: str, resulted_token_and_tags: List[Tuple[str, str]]) -> float:
    global model
    if len(resulted_token_and_tags) >= 2:
        prev_tag = resulted_token_and_tags[-1][1]
        prev_prev_tag = resulted_token_and_tags[-2][1]

        bigram_proba = model.score(tag, prev_tag)
        threegram_proba = model.score(tag, [prev_prev_tag, prev_tag])

        return bigram_proba + threegram_proba
    else:
        prev_tag = resulted_token_and_tags[-1][1]
        return model.score(tag, prev_tag)

    
def delete_serial_same_tokens(tokens_and_tags: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    results_tokens = []

    prev = tokens_and_tags[0][0]
    results_tokens.append(tokens_and_tags[0])
    for token, tag in tokens_and_tags[1:]:
        if token != prev:
            results_tokens.append((token, tag))
            prev = token

    return results_tokens

def filter_tags_by_probability(tokens_and_tags: List[Tuple[str, str]], threshold=0.20) -> str:
    resulted_tuples: List[Tuple[str, str]] = []
#     print(tokens_and_tags)
    tokens_and_tags = delete_serial_same_tokens(tokens_and_tags)
#     print(tokens_and_tags)
    size = len(tokens_and_tags)
    mid = size // 2   
    for i in range(mid):
        resulted_tuples.append(tokens_and_tags[i])
    
    for token, tag in tokens_and_tags[mid:]:
        proba = get_proba(tag, resulted_tuples)
        if proba > threshold:
            resulted_tuples.append((token, tag))
    print([token for token, tag in resulted_tuples])        
    return [token for token, tag in resulted_tuples]
        

In [157]:
filtered_predicted_messages: Dict[str, List[List[str]]] = collections.defaultdict(list)

for commit in commit_vs_predicted_message.keys():
    for token_and_tags in commit_vs_predicted_message[commit]:
        filtered_predicted_messages[commit].append(filter_tags_by_probability(token_and_tags))

['first', 'replace', 'statement', 'expression', 'implementation']
['first', 'replace', 'statement', 'expression', 'implementation']
['add', 'support', 'for', 'different', 'license', 'bug']
['fixed', 'issue', 'with', 'not']
['added', 'css', 'xhtml', 'binding', 'cut', 'if']
['added', 'usage', 'view', 'to', 'openapi', 'bundle']
['fixes', 'sorting', 'of', 'resource', 'bundle']
['fixed', 'read', 'access', 'allowed']
['fixes', 'sorting', 'of', 'resource', 'bundle']
['sh', 'mem', 'for', 'j', 'num']
['fixes', 'sorting', 'of', 'resource', 'bundle']
['fixes', 'sorting', 'of', 'resource', 'bundle']
['fixes', 'in', 'tests', 'tei']
['scr', 'num', 'helper']
['fixes', 'in', 'tests']
['tabs', 'refactored', 'to', 'outer', 'space', 'custom']
['rolled', 'back', 'stathik', 'changes', 'of', 'css']
['tree', 'node', 'extends', 'node', 'for', 'css']
['tree', 'node', 'extends', 'node', 'for', 'css']
['implementation', 'of', 'document', 'guarded', 'required', 'by', 'compu', 'ware']
['implementation', 'of', 'doc

['new', 'tree', 'structure', 'schema']
['new', 'tree', 'structure', 'schema']
['new', 'tree', 'structure', 'schema']
['new', 'tree', 'structure', 'jsp']
['fixed', 'issue', 'with', 'lexical', 'highlighting', 'in']
['new', 'tree', 'structure', 'into', 'code', 'jspx']
['new', 'tree', 'structure', 'schema']
['new', 'tree', 'structure', 'into', 'jspx', 'fix']
['new', 'tree', 'structure', 'into', 'jspx', 'fix']
['new', 'tree', 'structure', 'schema']
['new', 'tree', 'structure', 'into']
['new', 'tree', 'structure', 'jspx']
['new', 'tree', 'structure', 'into']
['tag', 'begin', 'end', 'navigation', 'highlighting']
['tag', 'begin', 'end', 'navigation']
['tag', 'begin', 'end']
['tag', 'begin', 'end', 'navigation', 'highlighting']
['tag', 'begin', 'end', 'navigation']
['tag', 'begin', 'end']
['new', 'tree', 'structure', 'into', 'jspx']
['new', 'tree', 'structure', 'in', 'xml']
['new', 'parser', 'added']
['new', 'tree', 'structure', 'into', 'closing']
['new', 'tree', 'structure', 'into', 'jspx']
['

['scr', 'num', 'and', 'root', 'impl']
['generics', 'resolve', 'with', 'substitutor', 'performance']
['fixed', 'issue', 'with', 'npes', 'accessing', 'after', 'async', 'processing']
['fixed', 'issue', 'with', 'npes', 'accessing', 'after', 'async', 'processing']
['fixed', 'issue', 'with', 'not', 'running', 'expand']
['scr', 'num', 'to', 'project', 'component']
['fixed', 'issue', 'with', 'truncation', 'of', 'number', 'map']
['scr', 'num', 'helper', 'get']
['scr', 'num', 'get']
['fixed', 'issue', 'with', 'npes', 'of', 'number', 'count']
['scr', 'num', 'and', 'includes']
['scr', 'num', 'tag']
['scr', 'num', 'not']
['scr', 'num', 'helper', 'get']
['scr', 'num', 'helper']
['refactoring', 'of', 'async', 'processing', 'interface', 'of', 'usageview', 'order']
['scr', 'num', 'get']
['scr', 'num', 'to', 'project', 'component']
['scr', 'num', 'and', 'helper', 'completion']
['scr', 'num', 'to', 'project', 'global']
['fixed', 'incorrect', 'replacement', 'there']
['scr', 'num', 'and', 'helper', 'comple

['xml', 'notation', 'parsing', 'doctype']
['performance', 'leak', 'fixed']
['test', 'fixed', 'option', 'in', 'method', 'access', 'in']
['performance', 'leak', 'fixed']
['new', 'tests']
['new', 'tests', 'for', 'jsp']
['performance', 'leak', 'fixed']
['performance', 'leak', 'fixed']
['performance', 'leak', 'fixed']
['parser', 'state', 'interface', 'changed']
['new', 'tests']
['performance', 'leak', 'fixed']
['xml', 'notation', 'parsing', 'doctype']
['new', 'tests', 'for', 'jsp']
['performance', 'leak', 'fixed']
['new', 'tests']
['parser', 'state', 'interface', 'changed']
['new', 'tests', 'for', 'method']
['performance', 'leak', 'fixed']
['splitters', 'of', 'wrap', 'designer']
['extracted', 'abstract', 'tree', 'node', 'and', 'optimized', 'project', 'view']
['visibility', 'comparator', 'fixed']
['visibility', 'comparator', 'fixed', 'bugs']
['visibility', 'comparator', 'fixed']
['visibility', 'comparator', 'fixed']
['visibility', 'comparator', 'fixed', 'impl', 'in', 'cache']
['splitters', '

['fixed', 'one', 'more', 'html', 'tag']
['scr', 'num']
['scr', 'num']
['num', 'of', 'wrap', 'done']
['num', 'of', 'wrap', 'done']
['scr', 'num', 'keep', 'option']
['num', 'of', 'wrap', 'num']
['scr', 'num', 'option']
['scr', 'num']
['scr']
['new', 'xml', 'structure']
['scr', 'num']
['num', 'of', 'wrap', 'done']
['fixed', 'startin', 'write', 'action', 'produce']
['scr', 'num', 'of', 'completion', 'file']
['tree', 'node', 'extends']
['tree', 'node', 'extends']
['first', 'replace', 'statement', 'expression', 'implementation']
['added', 'usage', 'view', 'to', 'openapi']
['first', 'replace', 'statement', 'expression', 'implementation']
['killing', 'deprecated', 'diff', 'api', 'find']
['fixed', 'non', 'code', 'num']
['first', 'removed', 'statement', 'expression', 'implementation']
['fixed', 'non', 'compilable', 'code']
['fixed', 'non', 'compilable', 'code', 'util']
['fixed', 'non', 'compilable', 'code']
['killing', 'deprecated', 'diff', 'api', 'find']
['fixed', 'non', 'compilable', 'code']
[

['fixed', 'npe', 'scr', 'num']
['fixed', 'incorrect', 'replacement', 'is', 'find']
['find', 'usages', 'for', 'style']
['scr', 'num', 'completion', 'corrected']
['scr', 'num', 'tag']
['fixed', 'process', 'declarations', 'is', 'for']
['show', 'applied', 'styles', 'action']
['test', 'refactored']
['scr', 'num', 'not', 'live']
['significant', 'ui', 'tweaks', 'first', 'cut']
['fixed', 'incorrect', 'replacement']
['fixed', 'find', 'declarations', 'on']
['fixed', 'issue', 'with', 'id']
['fixed', 'npe', 'scr', 'num']
['generics', 'resolve', 'with', 'substitutor', 'custom']
['refactoring', 'of', 'async', 'processing', 'interface', 'of', 'usageview', 'order']
['fixed', 'npe', 'on', 'null', 'file']
['scr', 'num']
['scr', 'num']
['fixed', 'incorrect', 'declarations', 'with']
['fixed', 'process', 'declarations', 'is', 'problems']
['scope', 'id', 'in', 'usage', 'view', 'title']
['fixed', 'issue', 'with', 'id']
['fixed', 'npe', 'on']
['fixed', 'issue', 'with', 'incorrect', 'replacement', 'on', 'async

['tree', 'node', 'extends', 'node', 'for']
['tree', 'node', 'extends', 'node', 'for']
['tree', 'node', 'extends']
['removed', 'interrupted', 'check', 'in', 'aquire', 'html']
['visibility', 'comparator', 'fixed']
['fixed', 'issue', 'with', 'tag', 'num', 'line']
['tree', 'node', 'extends', 'node', 'for']
['fixed', 'issue', 'with', 'id', 'dependent']
['tree', 'node', 'extends', 'node', 'for', 'css']
['scr', 'num', 'tag', 'structure']
['tree', 'node', 'extends']
['tree', 'node', 'extends', 'node', 'for']
['scr', 'num', 'tag', 'structure']
['tree', 'node', 'extends', 'node', 'for']
['tree', 'node', 'extends', 'node', 'for', 'css']
['generics', 'resolve', 'with', 'substitutor', 'performance']
['fixed', 'issue', 'with', 'id', 'dependent']
['tree', 'node', 'extends', 'node', 'for']
['tree', 'node', 'extends', 'node', 'for', 'css']
['removed', 'interrupted', 'check', 'in', 'aquire', 'html']
['tree', 'node', 'extends', 'node', 'for']
['tree', 'node', 'extends', 'node', 'for']
['fixed', 'issue', 

['supported', 'import', 'keyword', 'completion', 'fixed']
['tree', 'node', 'extends', 'node', 'for']
['tree', 'node', 'extends', 'node', 'for']
['visibility', 'comparator']
['visibility', 'comparator', 'fixed']
['support', 'for', 'multiple', 'xml', 'schemas']
['added', 'support', 'for', 'dag', 'type']
['first', 'replace', 'statement', 'expression', 'implementation']
['added', 'support', 'for', 'dag', 'type']
['scr', 'num', 'of', 'lookahead']
['scr', 'num', 'of', 'wrap', 'place']
['scr', 'idea', 'num', 'to', 'expiring', 'interface', 'as', 'get']
['scr', 'idea', 'num', 'to']
['new', 'ant', 'support', 'not']
['new', 'xml', 'structure']
['fix', 'for', 'num', 'positions', 'outside', 'file', 'body']
['first', 'chunk', 'of', 'make', 'optimization', 'field']
['fixing', 'num', 'leaks', 'if']
['fix', 'for', 'igor', 'at', 'body']
['tree', 'node', 'extends', 'node', 'for']
['visibility', 'comparator', 'fixed', 'case']
['tree', 'node', 'extends', 'node', 'for']
['removed', 'by', 'imports', 'in', 'a

IndexError: list index out of range

# Merge all messages in one

In [189]:
def get_common_message_prefix(messages: List[List[str]]) -> Optional[List[str]]:
    result_tokens = []
    max_size = max([len(message) for message in messages])
    
    for i in range(max_size):
        tokens_slice = []
        for message in messages:
            try:
                tokens_slice.append(message[i])
            except IndexError:
                pass
        counter = Counter(tokens_slice)
        result_tokens.append(max(counter, key=counter.get))

        if result_tokens in messages:
            return result_tokens
    return None

def leave_only_one_message_per_commit(messages: List[List[str]]) -> str:
    messages_str = [" ".join([token for token in message]) for message in messages]
    unique_messages = set(messages_str)
    if len(messages_str) > len(unique_messages):
        counter = Counter(messages_str)
        max_message = max(counter, key=counter.get)
        number_of_dublicates = counter[max_message]
        if number_of_dublicates > 1:
            return max_message
    
    common_message: List[str] = get_common_message_prefix(messages)
    if common_message:
        return " ".join(common_message)
 
    return "massive changes"
        
# tests
a = [['fixed', 'another', 'issue', 'with', 'xml'],
    ['scr', 'num', 'get', 'file'],
    ['add', 'source', 'specification'],
    ['fixed', 'another', 'issue', 'with', 'xml', 'url'],
    ['add', 'children', 'profile'],
    ['add', 'default', 'hint'],
    ['add', 'default'],
    ['add', 'default', 'profile', 'hint', 'completion'],
    ['add', 'default', 'hint']]
print(f"max = {leave_only_one_message_per_commit(a)}")
# for commit in filtered_predicted_messages.keys():
        

max = add default hint


In [193]:
one_message_vs_commit: List[Dict[str, str]] = []

for commit in filtered_predicted_messages.keys():
    one_message_vs_commit.append({"commit" : commit,
                                  "message": leave_only_one_message_per_commit(filtered_predicted_messages[commit])})

In [192]:
# k = 0
# for commit, message in one_message_vs_commit:
#     if message == "massive changes":
#         k += 1

# print(f"Number of MCH {k}, {k / len(one_message_vs_commit.keys())}")

Number of MCH 17, 0.02936096718480138


## Write result

In [195]:
csv_out_file = "/Users/natalia.murycheva/Documents/code2seq/filtered.code2seq.result.csv"

csv_columns = ["commit", "message"]
try:
    with open(csv_out_file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in one_message_vs_commit:
            writer.writerow(data)
except IOError:
    print("I/O error") 