In [251]:
import json
import os
from keras.preprocessing.text import Tokenizer
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from pandas import DataFrame
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from typing import List, Dict, Set
from collections import OrderedDict
import itertools

In [2]:
def load_data():
    data_dir = "/Users/natalia.murycheva/Documents/gitCommitMessageCollectorStorage"
    input_file_1 = os.path.join(data_dir, "aurora_diff_blobs_identifiers_all_aurora_1.json")
    input_file_2 = os.path.join(data_dir, "aurora_diff_blobs_identifiers_all_aurora_2.json")
    
    with open(input_file_1, 'r') as f:
        data_1 = json.load(f)
    with open(input_file_2, 'r') as f:
        data_2 = json.load(f)

    data_1.extend(data_2)
    return data_1

commits_per_file = load_data()

In [3]:
len(commits_per_file)
for i, m in enumerate(commits_per_file):
    print(m)
    if i > 400:
        break

{'commit': '433c115eb5577eba8efcc7ed34adaeb59eba5269', 'filePath': 'tools/tools/ant-runner/src/jetbrains/buildServer/agent/ant/AntBuildRunner.java', 'addedTokens': ['/*\nchanged\n * Copyright (c) 2005 Your Corporation. All Rights Reserved.\n */'], 'deletedTokens': ['/*\n * Copyright (c) 2005 Your Corporation. All Rights Reserved.\n */'], 'message': 'test'}
{'commit': 'fd66e8f7ae2bef9487be3c0d98e7b2a74126cfe4', 'filePath': 'tools/tools/ant-runner/src/build-agent-plugin.xml', 'addedTokens': ['container', '\n\n  ', 'component', 'class', '"jetbrains.buildServer.agent.ant.AntBuildRunner"', '/>', '\n\n', '/', 'container', '\n'], 'deletedTokens': [], 'message': "This commit was manufactured by cvs2svn to create branch 'lesya'."}
{'commit': '11cf93493eab7650c9138cc2da25c3981bd8bdca', 'filePath': 'tools/tools/ant-runner/src/build-agent-plugin.xml', 'addedTokens': ['container', '\n\n  ', 'component', 'class', '"jetbrains.buildServer.agent.ant.AntBuildRunner"', '/>', '\n\n', '/', 'container', '\n

In [5]:
commit = "commit"
message = "message"
filePath = "filePath"
addedTokens = "addedTokens"
deletedTokens = "deletedTokens"

In [16]:
# union all changed files during one commit in one commit

commits: Dict[str, Dict] = {}
for changed_file in commits_per_file:
    if changed_file[commit] in commits.keys():
        cur_com = commits[changed_file[commit]]
        cur_com[filePath].append(changed_file[filePath])
        cur_com[addedTokens] |= set(changed_file[addedTokens])
        cur_com[deletedTokens] |= set(changed_file[deletedTokens])
    else:
        commits[changed_file[commit]] = {commit: changed_file[commit],
                                         message: changed_file[message],
                                         filePath: [changed_file[filePath]],
                                         addedTokens: set(changed_file[addedTokens]),
                                         deletedTokens: set(changed_file[deletedTokens])}

In [17]:
print(len(commits))

21984


# Analyze commit messages

In [27]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalia.murycheva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natalia.murycheva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/natalia.murycheva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
msgs = [info[message] for c, info in commits.items()]

In [20]:
for i, m in enumerate(msgs):
    print(m)
    if i > 20:
        break

test
This commit was manufactured by cvs2svn to create branch 'lesya'.
import
test
init

No changes actually
No changes actually
No changes actually
No changes actually
saving user templates in configuration
fixing improper usage of optional debug API (location.method().isObsolete()). When debugging JRockit VM versions 1.4.x, all methods were shown as 'obsolete'
setModel is called after init of myMemberInfoModel (to fix JDK 1.5 NPE)
no message
fixes: sorting of resource bundle properties in UI Designer; extra method invocation when finding classes in Debugger; resource bundles in UI designer can be used even if they are located in non-default package
MacOS debugger/mousewheel bug worked around
no message
Opening page for tomcat 4.0.6 fixed
OpenSource and 1-Year Educational license types supported.
fix SCR 42003
no message
no message


In [21]:
# filter messages
messages = [m for m in msgs if m != "no message" and m != "*** empty log message ***" and not m.startswith("This commit was manufactured by")]

In [22]:
for i, m in enumerate(messages):
    print(m)
    if i > 20:
        break

test
import
test
init

No changes actually
No changes actually
No changes actually
No changes actually
saving user templates in configuration
fixing improper usage of optional debug API (location.method().isObsolete()). When debugging JRockit VM versions 1.4.x, all methods were shown as 'obsolete'
setModel is called after init of myMemberInfoModel (to fix JDK 1.5 NPE)
fixes: sorting of resource bundle properties in UI Designer; extra method invocation when finding classes in Debugger; resource bundles in UI designer can be used even if they are located in non-default package
MacOS debugger/mousewheel bug worked around
Opening page for tomcat 4.0.6 fixed
OpenSource and 1-Year Educational license types supported.
fix SCR 42003
added test case for completion from custom tag lib
fixed NPE
small change to remove duplicated strings
first working test for jspx highlighting
fix


In [30]:
# punctuation

without_punctuation_messages = []
for m in messages:
    without_punctuation_messages.append(m.translate(str.maketrans('', '', string.punctuation)))

In [31]:
# tokenization
tokenized_messages = []

for m in without_punctuation_messages:
    tokenized = word_tokenize(m)
    if tokenized:
        tokenized_messages.append(tokenized)    

In [32]:
# remove stop-words

stop_words = set(stopwords.words('english'))
without_stop_words_messages = []

for m in tokenized_messages:
    without_stop_words_messages.append([word for word in m if not word in stop_words])

In [33]:
# split tokens
def split_commit_message(message: str) -> List[str]:
    message = re.sub(r'[-+]?[0-9]*\.?[0-9]+', '<num>', message)
    message_tokenized = re.findall(r"[A-Z]*[a-z]*|<num>", message)
    return [x.lower() for x in message_tokenized if x != '']

splitted_messages = []
for m in without_stop_words_messages:
    splitted_messages.append(split_commit_message(" ".join(m)))

In [34]:
for i, m in enumerate(splitted_messages):
    print(m)
    if i > 20:
        break

['test']
['import']
['test']
['init']
['no', 'changes', 'actually']
['no', 'changes', 'actually']
['no', 'changes', 'actually']
['no', 'changes', 'actually']
['saving', 'user', 'templates', 'configuration']
['fixing', 'improper', 'usage', 'optional', 'debug', 'api', 'locationmethodis', 'obsolete', 'when', 'debugging', 'jrockit', 'vm', 'versions', '<num>', 'x', 'methods', 'shown', 'obsolete']
['set', 'model', 'called', 'init', 'my', 'member', 'info', 'model', 'fix', 'jdk', '<num>', 'npe']
['fixes', 'sorting', 'resource', 'bundle', 'properties', 'ui', 'designer', 'extra', 'method', 'invocation', 'finding', 'classes', 'debugger', 'resource', 'bundles', 'ui', 'designer', 'used', 'even', 'located', 'nondefault', 'package']
['mac', 'os', 'debuggermousewheel', 'bug', 'worked', 'around']
['opening', 'page', 'tomcat', '<num>', 'fixed']
['open', 'source', '<num>', 'year', 'educational', 'license', 'types', 'supported']
['fix', 'scr', '<num>']
['added', 'test', 'case', 'completion', 'custom', 'ta

In [35]:
# lemmatization
lemmatizer = WordNetLemmatizer()
lematized_messages = []

for m in splitted_messages:
    lematized_messages.append([lemmatizer.lemmatize(word, pos="v") for word in m])

In [36]:
for i, m in enumerate(lematized_messages):
    print(m)
    if i > 20:
        break

['test']
['import']
['test']
['init']
['no', 'change', 'actually']
['no', 'change', 'actually']
['no', 'change', 'actually']
['no', 'change', 'actually']
['save', 'user', 'templates', 'configuration']
['fix', 'improper', 'usage', 'optional', 'debug', 'api', 'locationmethodis', 'obsolete', 'when', 'debug', 'jrockit', 'vm', 'versions', '<num>', 'x', 'methods', 'show', 'obsolete']
['set', 'model', 'call', 'init', 'my', 'member', 'info', 'model', 'fix', 'jdk', '<num>', 'npe']
['fix', 'sort', 'resource', 'bundle', 'properties', 'ui', 'designer', 'extra', 'method', 'invocation', 'find', 'class', 'debugger', 'resource', 'bundle', 'ui', 'designer', 'use', 'even', 'locate', 'nondefault', 'package']
['mac', 'os', 'debuggermousewheel', 'bug', 'work', 'around']
['open', 'page', 'tomcat', '<num>', 'fix']
['open', 'source', '<num>', 'year', 'educational', 'license', 'type', 'support']
['fix', 'scr', '<num>']
['add', 'test', 'case', 'completion', 'custom', 'tag', 'lib']
['fix', 'npe']
['small', 'chan

In [38]:
# top words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lematized_messages)

def invert_dict(input_dict: Dict[str, int]) -> Dict[int, List[str]]:
    result = dict()

    for key, value in input_dict.items():
        if value not in result:
            result[value] = [key]
        else:
            result[value].append(key)

    return result

counts_vs_word = invert_dict(tokenizer.word_counts)
counts_vs_word_sorted = OrderedDict(sorted(counts_vs_word.items(), reverse=True))
top_popular_words = dict(itertools.islice(counts_vs_word_sorted.items(), 0, 55))
print(top_popular_words)

{1191: ['fix'], 497: ['<num>'], 273: ['test'], 267: ['scr'], 199: ['npe'], 185: ['completion'], 184: ['add'], 154: ['change'], 153: ['file'], 143: ['issue'], 137: ['support'], 118: ['xml'], 105: ['class'], 103: ['new'], 102: ['bug'], 97: ['type', 'tag'], 88: ['cache'], 87: ['remove'], 85: ['jsp'], 82: ['resolve'], 77: ['case'], 73: ['back'], 69: ['null'], 68: ['name'], 64: ['search'], 63: ['css'], 61: ['find', 'html'], 59: ['make'], 58: ['reference'], 57: ['update'], 56: ['action'], 53: ['get'], 52: ['highlight'], 50: ['some'], 48: ['one', 'code', 'check', 'correct'], 46: ['comment'], 45: ['use'], 44: ['problem'], 43: ['move', 'module', 'error'], 42: ['first'], 41: ['fabrique'], 40: ['method', 'handle'], 39: ['set', 'refactoring'], 38: ['parse'], 37: ['value', 'match', 'revert'], 36: ['methods'], 35: ['license', 'inside', 'attribute', 'psi'], 34: ['tree', 'parameters'], 33: ['pattern'], 32: ['empty', 'project', 'incorrect'], 31: ['work', 'schema'], 30: ['roll', 'usages'], 29: ['editor'

## Classes from these top words

```
* fix, support, bug, problem, error, issue, scr
* refactoring, class, replace, comment, move, module
* psi, tree, parse
* npe, null
* completion
* css
* html
* xml
* highlight
* test
* jsp
```

In [611]:
fix = "fix"
refactoring = "refactoring"
psi = "psi"
test = "test"
npe = "npe"
completion = "completion"
css = "css"
html = "html"
xml = "xml"
highlight = "highlight"
jsp = "jsp"

def isFix(msg: List[str]) -> bool:
    return "fix" in msg or "bug" in msg or "error" in msg \
            or "issue" in msg or "scr" in msg

def isRefactoring(msg: List[str]) -> bool:
    return "refactoring" in msg or "class" in msg or "replace" in msg \
           or "comment" in msg or "move" in msg or "module" in msg

def isPsi(msg: List[str]) -> bool:
    return "psi" in msg or "tree" in msg or "parse" in msg 

def isTest(msg: List[str]) -> bool:
    return "test" in msg

def isNpe(msg: List[str]) -> bool:
    return "npe" in msg or "null" in msg

def isCompletion(msg: List[str]) -> bool:
    return "completion" in msg

def isCss(msg: List[str]) -> bool:
    return "css" in msg

def isHtml(msg: List[str]) -> bool:
    return "html" in msg

def isXml(msg: List[str]) -> bool:
    return "xml" in msg

def isHighlight(msg: List[str]) -> bool:
    return "highlight" in msg

def isJsp(msg: List[str]) -> bool:
    return "jsp" in msg

# Make dataset

In [41]:
def normalize_message(message: str) -> List[str]:
    without_punctuation_message = message.translate(str.maketrans('', '', string.punctuation))
    tokenized = word_tokenize(without_punctuation_message)
    without_stop_words = [word for word in tokenized if not word in stop_words]
    splitted = split_commit_message(" ".join(without_stop_words))
    return [lemmatizer.lemmatize(word, pos="v") for word in splitted]

In [108]:
def split_tokens(message: str) -> Set[str]:
    message = re.sub(r'[-+]?[0-9]*\.?[0-9]+', '<num>', message)
    message_tokenized = re.findall(r"[A-Z]*[a-z]*|<num>", message)
    return set([x.lower() for x in message_tokenized if x != ''])


def normalize_tokens(tokens: Set[str]) -> Set[str]:
    result = set()
    for t in tokens:
        pattern = re.compile(r'\s+|\n')
        t = re.sub(pattern, '', t) 
        if not t.startswith("/*") and t != '':
            splitted = split_tokens(t)
            if len(splitted) > 0:
                result |= splitted
            
    return result

In [109]:
normalize_message("This was my test3 and % experiments mice")

['this', 'test', '<num>', 'experiment', 'mice']

In [680]:
result = []
other_count = 0
fix_count = 0
number_tokens = []

for c, info in commits.items():
    msg = info[message]
    normalized = normalize_message(msg)
    
    classes = []
    if isFix(normalized):
        fix_count += 1
        if fix_count < 1000:
            classes.append(fix)
    if isRefactoring(normalized):
        classes.append(refactoring)
    if isPsi(normalized):
        classes.append(psi)
    if isTest(normalized):
        classes.append(test)
    if isNpe(normalized):
        classes.append(npe)
    if isCompletion(normalized):
        classes.append(completion)
    if isCss(normalized):
        classes.append(css)
    if isHtml(normalized):
        classes.append(html)
    if isXml(normalized):
        classes.append(xml)
    if isHighlight(normalized):
        classes.append(highlight)
    if isJsp(normalized):
        classes.append(jsp)
    
    if not classes:
        if len(normalize_tokens(info[addedTokens]) | normalize_tokens(info[deletedTokens])) < 20 and other_count < 200:
            other_count += 1
            result.append({"class": [other],
                           "tokens": " ".join(normalize_tokens(info[addedTokens]) | normalize_tokens(info[deletedTokens])), 
                           message: info[message],
                           commit: info[commit]})
            number_tokens.append(len(normalize_tokens(info[addedTokens]) | normalize_tokens(info[deletedTokens])))
    else:
#         for class_ in classes:
        result.append({"class": classes,
                       "tokens": " ".join(normalize_tokens(info[addedTokens]) | normalize_tokens(info[deletedTokens])), 
                       message: info[message],
                       commit: info[commit]})
        number_tokens.append(len(normalize_tokens(info[addedTokens]) | normalize_tokens(info[deletedTokens])))

In [681]:
data = DataFrame(result)

In [682]:
np.median(number_tokens)

14.0

In [683]:
mlb = MultiLabelBinarizer()
binary_classes = mlb.fit_transform(data["class"])
list(mlb.classes_)

['completion',
 'css',
 'fix',
 'highlight',
 'html',
 'jsp',
 'npe',
 'other',
 'psi',
 'refactoring',
 'test',
 'xml']

In [684]:
binary_classes.shape

(2018, 12)

In [685]:
np.sum(binary_classes, axis=0)

array([181,  59, 999,  52,  59,  83, 237, 200, 105, 270, 267, 113])

In [686]:
np.median(np.sum(binary_classes, axis=0))

147.0

In [687]:
tokens_train, tokens_test, y_train, y_test = train_test_split(data["tokens"], binary_classes,
                                                              test_size=0.3, random_state=242)

In [688]:
tokens_test[:10]

1420    after b java new inner show action file before...
1137    some null find n mythread a <num> findinnercla...
464               e exception true runtime occurred false
1626    error panel attr name template get my componen...
843     suffix tochangebodyofimplementedmethodsuse res...
146     thread debuggermanagerthread debugger ifcurren...
1522                                                     
790     tree null element get composite subtree parent...
1938    failonerror package plugins smartcvs project c...
19      jspx code library roots intellij analyzer path...
Name: tokens, dtype: object

In [689]:
y_test[:10]

array([[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

# NB

In [690]:
pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.95)),
    ('selection', VarianceThreshold(threshold=0.01)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.75)),
])

ovr = OneVsRestClassifier(pipeline)
ovr.fit(tokens_train, y_train)

OneVsRestClassifier(estimator=Pipeline(memory=None,
                                       steps=[('vect',
                                               CountVectorizer(analyzer='word',
                                                               binary=False,
                                                               decode_error='strict',
                                                               dtype=<class 'numpy.int64'>,
                                                               encoding='utf-8',
                                                               input='content',
                                                               lowercase=True,
                                                               max_df=0.95,
                                                               max_features=None,
                                                               min_df=1,
                                                               ngram_range=(1,
        

In [691]:
################### classififcation results ###################
predicted = ovr.predict(tokens_test)
np.mean(predicted == y_test)    
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.84      0.31      0.45        52
           1       0.00      0.00      0.00        21
           2       0.61      0.60      0.60       295
           3       0.00      0.00      0.00        15
           4       0.00      0.00      0.00        10
           5       0.00      0.00      0.00        23
           6       0.00      0.00      0.00        75
           7       0.00      0.00      0.00        55
           8       0.00      0.00      0.00        31
           9       0.50      0.04      0.08        93
          10       0.80      0.05      0.10        78
          11       0.00      0.00      0.00        33

   micro avg       0.60      0.26      0.36       781
   macro avg       0.23      0.08      0.10       781
weighted avg       0.42      0.26      0.28       781
 samples avg       0.33      0.28      0.29       781



  'precision', 'predicted', average, warn_for)


In [692]:
predicted

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Dump results

In [495]:
# get all dataset 

tmp = []

for c, info in commits.items():
    msg = info[message]
    normalized = normalize_message(msg)    
    
    tmp.append({"class": [other],
                "tokens": " ".join(normalize_tokens(info[addedTokens]) | normalize_tokens(info[deletedTokens])), 
                message: info[message],
                commit: info[commit]})

final_data = DataFrame(tmp)

In [693]:
results = ovr.predict(final_data["tokens"])
results

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 1, 1, 1]])

In [694]:
results_classes = mlb.inverse_transform(results)
results_classes

[(),
 ('fix', 'refactoring'),
 ('fix', 'refactoring'),
 (),
 ('refactoring',),
 ('refactoring',),
 (),
 (),
 (),
 (),
 ('fix',),
 ('fix',),
 (),
 ('fix',),
 (),
 ('fix',),
 ('fix',),
 ('fix',),
 ('fix',),
 (),
 ('fix',),
 ('fix',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('fix',),
 (),
 ('fix',),
 ('fix',),
 (),
 (),
 ('fix',),
 ('fix',),
 (),
 (),
 ('fix',),
 (),
 (),
 (),
 (),
 ('fix',),
 (),
 (),
 ('fix',),
 (),
 (),
 (),
 ('fix',),
 (),
 ('fix',),
 ('fix',),
 ('fix',),
 ('fix',),
 ('fix',),
 (),
 (),
 ('fix',),
 (),
 ('fix',),
 (),
 ('fix',),
 (),
 (),
 (),
 (),
 ('fix',),
 (),
 (),
 (),
 ('fix',),
 ('fix',),
 (),
 ('fix',),
 ('fix',),
 ('fix',),
 (),
 (),
 ('fix',),
 ('fix',),
 (),
 ('fix',),
 (),
 (),
 ('fix',),
 (),
 (),
 ('fix',),
 (),
 ('fix',),
 (),
 (),
 (),
 (),
 ('fix',),
 (),
 (),
 (),
 (),
 ('fix',),
 (),
 (),
 ('fix',),
 ('fix',),
 (),
 ('fix',),
 (),
 (),
 (),
 (),
 ('fix',),
 (),
 ('fix',),
 (),
 ('fix',),
 ('fix',),
 ('fix',),
 ('fix',),
 ('fix',),
 (),
 ('fix',),
 

In [695]:
probabilities = ovr.predict_proba(final_data["tokens"])
probabilities

array([[0.09135977, 0.02691218, 0.49858357, ..., 0.12535411, 0.13385269,
        0.05665722],
       [0.02145925, 0.05672555, 0.59978674, ..., 0.62585521, 0.23574459,
        0.12250717],
       [0.02145925, 0.05672555, 0.59978674, ..., 0.62585521, 0.23574459,
        0.12250717],
       ...,
       [0.06376178, 0.00491286, 0.53212174, ..., 0.04222665, 0.05175858,
        0.02046452],
       [0.07612995, 0.00821663, 0.40334406, ..., 0.15400407, 0.08985649,
        0.01781708],
       [0.67280912, 0.9858697 , 0.37048089, ..., 0.97568524, 0.97192144,
        0.9739926 ]])

In [696]:
class_vs_index = {cl: i for i, cl in enumerate(mlb.classes_)}
class_vs_index

{'completion': 0,
 'css': 1,
 'fix': 2,
 'highlight': 3,
 'html': 4,
 'jsp': 5,
 'npe': 6,
 'other': 7,
 'psi': 8,
 'refactoring': 9,
 'test': 10,
 'xml': 11}

In [697]:
probabilities.shape

(21984, 12)

In [698]:
index_vs_class = {i: c for c, i in class_vs_index.items()}
index_vs_class

{0: 'completion',
 1: 'css',
 2: 'fix',
 3: 'highlight',
 4: 'html',
 5: 'jsp',
 6: 'npe',
 7: 'other',
 8: 'psi',
 9: 'refactoring',
 10: 'test',
 11: 'xml'}

In [699]:
len(tmp)

21984

In [700]:
final = []
for i, c in enumerate(tmp):
    classes_ = results_classes[i]
    final.append({commit: tmp[i][commit],
                  "classes": [(c_, probabilities[i][class_vs_index[c_]]) for c_ in classes_]})    

In [701]:
import csv

csv_file = "/Users/natalia.murycheva/PycharmProjects/gitCommitMessageCollector/naive_bayes/results.csv"
csv_columns = [commit, 'classes']
try:
    with open(csv_file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in final:
            writer.writerow(data)
except IOError:
    print("I/O error") 