In [202]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import time

## Word2Vec

In [230]:
# Load the preprocessed commit messages, discard the saved index column,
# and tokenise every message on whitespace.
messages = (
    pd.read_csv('preprocessed.csv')
    .drop(columns=['Unnamed: 0'])
)
messages['clean message'] = messages['Commit message'].apply(lambda text: text.split())
messages.head()

Unnamed: 0,Commit message,Class,clean message
0,extract method,extract,"[extract, method]"
1,minor tweak follow review extraction method pl...,extract,"[minor, tweak, follow, review, extraction, met..."
2,extract stuff method git p depot path coremedi...,extract,"[extract, stuff, method, git, p, depot, path, ..."
3,extract method doiserviceimpl,extract,"[extract, method, doiserviceimpl]"
4,refactoring getmenuspace navigation extract is...,extract,"[refactoring, getmenuspace, navigation, extrac..."


In [231]:
from GensimWord2Vec import GensimWord2VecVectorizer as word2vec

In [232]:
# Inputs are the token lists; targets are the values of the 'Class' column.
X, y = messages['clean message'], messages['Class']

# Baseline pipeline: vectorizer built with no arguments, followed by an SVC.
baseline_steps = [
    ('w2v', word2vec()),
    ('svc', SVC(decision_function_shape='ovr')),
]
SVC_pipeline = Pipeline(steps=baseline_steps)

In [209]:
# Baseline evaluation: stratified 10-fold cross-validation of SVC_pipeline.
skf = StratifiedKFold(n_splits=10)
report_list = []

for train_index, test_index in skf.split(X, y):
    # split() yields integer positions, so select rows with .iloc; plain
    # X[train_index] is label-based and only works with a default RangeIndex.
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    SVC_pipeline.fit(x_train_fold, y_train_fold)
    y_pred = SVC_pipeline.predict(x_test_fold)
    report = classification_report(y_test_fold, y_pred, output_dict=True)
    # Keep each fold's report as a transposed DataFrame for later averaging.
    report_list.append(pd.DataFrame(report).transpose())
    

In [210]:
# Average the per-fold reports element-wise. The fold count comes from the
# list itself so this cell stays correct if n_splits is changed.
n_folds = len(report_list)
total = report_list[0].stack()
for fold_report in report_list[1:]:
    total = total.add(fold_report.stack())

avgs = total / n_folds
# Show only the first six rows of the averaged report.
avgs.unstack()[:6]

Unnamed: 0,precision,recall,f1-score,support
extract,0.355326,0.294865,0.3174,83.4
inline,0.225219,0.431583,0.294789,83.4
move,0.268724,0.371515,0.309941,83.4
pull up,0.211383,0.152194,0.176686,83.4
push down,0.27111,0.077912,0.117005,83.4
rename,0.605211,0.496787,0.542732,83.4


In [101]:
#Grid search to determine optimal parameters for Word2Vec vectorizer
from sklearn.model_selection import GridSearchCV
# Search space for the 'w2v' pipeline step (keys use the step__parameter form).
params = [dict(
    w2v__vector_size=[500, 750, 1000],
    w2v__window=[5, 10],
    w2v__sg=[0, 1],
    w2v__hs=[0, 1],
    w2v__epochs=[5, 10],
)]

In [102]:
# X_train / y_train are not defined in the visible cells of this notebook, so
# build the hold-out split here to keep the cell runnable on a fresh kernel.
# NOTE(review): the original split settings are not recorded; test_size and
# random_state below are assumptions - confirm before comparing with the
# saved best_params_ output.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Grid search over `params`, scored by accuracy with 10-fold cross-validation.
gs_svc = GridSearchCV(SVC_pipeline, param_grid=params, scoring='accuracy', cv=10)
gs_svc.fit(X_train, y_train)

In [103]:
# Display the parameter combination selected by the fitted grid search.
gs_svc.best_params_

{'w2v__epochs': 10,
 'w2v__hs': 1,
 'w2v__sg': 1,
 'w2v__vector_size': 750,
 'w2v__window': 10}

In [233]:
# Word2Vec vectorizer configured with the values reported by the grid search.
w2v_settings = {
    'vector_size': 750,
    'window': 10,
    'epochs': 10,
    'hs': 1,
    'sg': 1,
}
w2v = word2vec(**w2v_settings)

In [234]:
#function for averaging classification reports of k-fold cv
def reports(report_list, n_classes=6):
    """Average a list of per-fold classification reports element-wise.

    Parameters
    ----------
    report_list : list of pandas.DataFrame
        One report frame per fold; all frames are expected to share the same
        index and columns so that the element-wise addition aligns.
    n_classes : int, default 6
        Number of leading rows to keep in the result. The default matches
        the original hard-coded slice.

    Returns
    -------
    pandas.DataFrame
        The mean of the input frames, truncated to the first ``n_classes`` rows.

    Raises
    ------
    ValueError
        If ``report_list`` is empty.
    """
    if not report_list:
        raise ValueError("report_list must contain at least one report")

    total = report_list[0].stack()
    for fold_report in report_list[1:]:
        total = total.add(fold_report.stack())

    # Divide by the real number of folds instead of assuming exactly ten.
    avgs = total / len(report_list)
    return avgs.unstack()[:n_classes]

In [235]:
#function for training and testing models
def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    return data.iloc[positions] if hasattr(data, "iloc") else data[positions]


def model(pipeline, X, y, random_state=None):
    """Evaluate ``pipeline`` with shuffled, stratified 10-fold cross-validation.

    Parameters
    ----------
    pipeline : object with ``fit(X, y)`` and ``predict(X)``
        Refitted in place on the training part of every fold.
    X, y : indexable collections of samples and labels
        Rows are selected by integer position, so a pandas index need not be
        a default RangeIndex.
    random_state : int or None, default None
        Passed to ``StratifiedKFold`` to make the shuffled folds reproducible.
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    The fold-averaged classification report produced by ``reports``.
    """
    report_list = []
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    start = time.time()
    for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        x_train_fold, x_test_fold = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train_fold, y_test_fold = _take_rows(y, train_index), _take_rows(y, test_index)
        pipeline.fit(x_train_fold, y_train_fold)
        y_pred = pipeline.predict(x_test_fold)
        report = classification_report(y_test_fold, y_pred, output_dict=True)
        report_list.append(pd.DataFrame(report).transpose())
        end = time.time()
        # The printed time is cumulative since the first fold started.
        print("Fold ", fold, ": ", (end-start)/60, " mins" )
    return reports(report_list)

In [236]:
#Logistic Regression

# One-vs-rest logistic regression on top of the shared w2v vectorizer.
logReg_steps = [
    ('w2v', w2v),
    ('lr', LogisticRegression(multi_class='ovr')),
]
logReg_pipeline = Pipeline(steps=logReg_steps)

model(logReg_pipeline, X, y)

Fold  1 :  0.20880109866460164  mins
Fold  2 :  0.42183827559153236  mins
Fold  3 :  0.6391542514165243  mins
Fold  4 :  0.8484534382820129  mins
Fold  5 :  1.0605006059010824  mins
Fold  6 :  1.2772259672482809  mins
Fold  7 :  1.4870119969050088  mins
Fold  8 :  1.6893486897150676  mins
Fold  9 :  1.9064593394597371  mins
Fold  10 :  2.111933696269989  mins


Unnamed: 0,precision,recall,f1-score,support
extract,0.552819,0.631741,0.588839,83.4
inline,0.39817,0.434151,0.413793,83.4
move,0.529465,0.541968,0.53445,83.4
pull up,0.405832,0.344148,0.371424,83.4
push down,0.361107,0.32609,0.341651,83.4
rename,0.888114,0.863239,0.874951,83.4


In [237]:
#SVC

# Support vector classifier on top of the shared w2v vectorizer.
svc_steps = [
    ('w2v', w2v),
    ('svc', SVC(decision_function_shape='ovr')),
]
SVC_pipeline = Pipeline(steps=svc_steps)

model(SVC_pipeline, X, y)

Fold  1 :  0.25259145100911456  mins
Fold  2 :  0.5073038180669148  mins
Fold  3 :  0.7622583548227946  mins
Fold  4 :  1.0103224913279216  mins
Fold  5 :  1.2564892133076986  mins
Fold  6 :  1.5138962785402934  mins
Fold  7 :  1.7712507685025534  mins
Fold  8 :  2.030527341365814  mins
Fold  9 :  2.279437208175659  mins
Fold  10 :  2.5350239634513856  mins


Unnamed: 0,precision,recall,f1-score,support
extract,0.584184,0.591165,0.586606,83.4
inline,0.407429,0.424369,0.414213,83.4
move,0.486344,0.623465,0.545213,83.4
pull up,0.370414,0.393144,0.380978,83.4
push down,0.409577,0.301979,0.347096,83.4
rename,0.907932,0.769851,0.832059,83.4


In [238]:
#Random Forest
# Random forest with library defaults on top of the shared w2v vectorizer.
randomForest_steps = [
    ('w2v', w2v),
    ('rf', RandomForestClassifier()),
]
randomForest_pipeline = Pipeline(steps=randomForest_steps)
model(randomForest_pipeline, X, y)

Fold  1 :  0.38609453837076824  mins
Fold  2 :  0.7668399095535279  mins
Fold  3 :  1.148150885105133  mins
Fold  4 :  1.527122155825297  mins
Fold  5 :  1.914990484714508  mins
Fold  6 :  2.2977229674657185  mins
Fold  7 :  2.675361454486847  mins
Fold  8 :  3.063961621125539  mins
Fold  9 :  3.450395953655243  mins
Fold  10 :  3.8215964317321776  mins


Unnamed: 0,precision,recall,f1-score,support
extract,0.512256,0.537235,0.522735,83.4
inline,0.375392,0.374154,0.374085,83.4
move,0.430912,0.592154,0.498587,83.4
pull up,0.395171,0.351348,0.371184,83.4
push down,0.406817,0.329834,0.363789,83.4
rename,0.848845,0.744449,0.792281,83.4


In [239]:
#Gradient Boosting Machine
# Gradient boosting with library defaults on top of the shared w2v vectorizer.
gbm_steps = [
    ('w2v', w2v),
    ('gbm', GradientBoostingClassifier()),
]
gbm_pipeline = Pipeline(steps=gbm_steps)
model(gbm_pipeline, X, y)

Fold  1 :  10.849093886216481  mins
Fold  2 :  21.601022334893546  mins
Fold  3 :  32.29528861840566  mins
Fold  4 :  43.14466885328293  mins
Fold  5 :  54.08975896040599  mins
Fold  6 :  64.84622598091761  mins
Fold  7 :  75.46686004797617  mins
Fold  8 :  86.0982028166453  mins
Fold  9 :  96.29571729501089  mins
Fold  10 :  106.5666233698527  mins


Unnamed: 0,precision,recall,f1-score,support
extract,0.563126,0.54313,0.551832,83.4
inline,0.422275,0.393302,0.405869,83.4
move,0.457076,0.615103,0.523471,83.4
pull up,0.367399,0.365792,0.36627,83.4
push down,0.427146,0.339343,0.377482,83.4
rename,0.859293,0.81169,0.833752,83.4
