In [130]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
import xgboost

In [131]:
SAVE_PATH = '../data/our_data/'
filename = 'commits_violations.pkl'
df = pd.read_pickle(SAVE_PATH + filename)
df = df.drop(['projectID', 'commitHash', 'blockerViolations','class', 
          'criticalViolations', 'majorViolations', 'minorViolations', 
          'bin_criticalViolations', 'bin_blockerViolations', 'bin_mamiViolations',
          'class_blockerViolations', 'class_criticalViolations', 'class_mamiViolations'
         ], axis = 1)
df.head()

Unnamed: 0,commitMessage,complexity,lines,commentLines,duplicatedLines,violations,bugs,codeSmells,mamiViolations,category
0,ACCUMULO-1 initial checkin of source code\n\ng...,43137,263680,13509,46445,18314,464,17012,17199,21
1,"ACCUMULO-2 moved cloudtrace for TRUNK, fixed p...",43137,262753,13507,46301,18169,344,16987,17054,21
2,ACCUMULO-2 fixed thrift build script to point ...,43137,262753,13507,46301,18169,344,16987,17054,21
3,ACCUMULO-2 reverted change to config.html\n\ng...,43137,263643,13507,46445,18315,464,17013,17200,21
4,ACCUMULO-2 fixed cloudtrace references in cont...,43137,263639,13507,46445,18315,464,17013,17200,21


#### Commit message

In [132]:
def escape_special_chars(s):

    s = s.replace('(', '\(').replace(')', '\)')
    s = s.replace('+', '\+').replace('?', '\?').replace('*', '\*')
    s = s.replace('[', '\[').replace(']', '\]')
    s = s.replace('{', '\{').replace('}', '\}')
    
    return s

In [133]:
def clean_word(word):
    '''Returns the longest prefix of a word made of latin unicode
       characters.'''
    for i, c in enumerate(word):
        if not unicodedata.name(c).startswith("LATIN"):
            return word[:i].lower()
    return word.lower()


def clean_words(words):
    """Cleans all words in a string."""
    return " ".join(map(clean_word, words.split()))

In [134]:
def clean(s):
    s = s.lower()
    s = s.strip().replace('\n', '')
    s = s.strip().replace('->', 'to')
    s = re.sub(r'[^A-Za-z0-9\s:\-]+', ' ', s)
    s = re.sub(r'pr[0-9]+:', 'pr num', s)
    s = re.sub(r'[0-9]+', 'num', s)
    s = ' '.join(s.split())
    
    project_name_reg = re.compile('[^\s]*')
    project_name = re.match(project_name_reg, s)
    
    patterns = ['git-svn-id:','author:','reviewers:','http','obtained from:','submitted by:','reviewed by:']
    traces_reg = [re.compile(f'{i}.*') for i in patterns]
    
    if project_name != None: 
        reg = project_name.group()
        reg = escape_special_chars(reg)
        s = re.sub(f'{reg}', '', s)
        
    for r in traces_reg:
        traces = re.search(r, s)
        if traces != None: 
            reg = traces.group()
            reg = escape_special_chars(reg)
            s = re.sub(f'{reg}', '', s)
    
    s = s.strip().replace('-', '')
    s = ' '.join(s.split())
        
    return s

In [135]:
df['commitMessageClean'] = df['commitMessage'].apply(lambda x: clean(x))

In [136]:
print(df.shape)
df = df.drop_duplicates().copy()
print(df.shape)

(55629, 11)
(55622, 11)


In [137]:
df['commitMessageClean'].nunique() / df['commitMessageClean'].count() * 100

77.27697673582395

In [138]:
# min_df min abs freq 
tf_idf = TfidfVectorizer(min_df = 20, max_df = 0.5, ngram_range = (1, 3), max_features = 1000, stop_words = 'english')
print(tf_idf)

TfidfVectorizer(max_df=0.5, max_features=1000, min_df=20, ngram_range=(1, 3),
                stop_words='english')


In [139]:
tf_idf_features = tf_idf.fit_transform(df['commitMessageClean'])
print(tf_idf_features.toarray())
tf_idf_df = pd.DataFrame(tf_idf_features.toarray(), columns = tf_idf.get_feature_names())
tf_idf_df.head()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Unnamed: 0,ability,able,abstract,access,account,accumulo,accumulonum,action,active,actual,...,xalan,xerces,xml,xmlconfiguration,xpath,xsp,yang,year,yusaku,zookeeper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
df = df.drop(['commitMessage','commitMessageClean'], axis = 1)
df = pd.concat((df, tf_idf_df), axis = 1).dropna()
Y = df['category']
X = df.drop(['category'], axis = 1)
print(Y.shape, X.shape)
X.head()

(55615,) (55615, 1008)


Unnamed: 0,complexity,lines,commentLines,duplicatedLines,violations,bugs,codeSmells,mamiViolations,ability,able,...,xalan,xerces,xml,xmlconfiguration,xpath,xsp,yang,year,yusaku,zookeeper
0,43137.0,263680.0,13509.0,46445.0,18314.0,464.0,17012.0,17199.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,43137.0,262753.0,13507.0,46301.0,18169.0,344.0,16987.0,17054.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,43137.0,262753.0,13507.0,46301.0,18169.0,344.0,16987.0,17054.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,43137.0,263643.0,13507.0,46445.0,18315.0,464.0,17013.0,17200.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,43137.0,263639.0,13507.0,46445.0,18315.0,464.0,17013.0,17200.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
seed = 777
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state = seed)

In [160]:
X_train.shape, Y_train.shape

((44492, 1008), (44492,))

In [147]:
classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier()]
    #XGBClassifier(),
    #AdaBoostClassifier(),
    #GradientBoostingClassifier(),
    #GaussianNB(),
    #LinearDiscriminantAnalysis()]

In [176]:
Y_test.nunique(), pred.shape

(20, (44492, 22))

In [165]:
Y.nunique()

22

In [164]:
Y_test.unique()

array([ 0., 13.,  6.,  9., 16., 21., 17.,  7., 19., 14.,  4.,  1.,  3.,
       20., 11., 10., 12.,  2.,  8., 15.])

In [181]:
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

model = GaussianNB()

splits = 5

rkf = RepeatedKFold(n_splits = splits, n_repeats = 1, random_state = 123)
resultados_nk = []
    
for train_index, test_index in rkf.split(X):
    Xc_train_nk, Xc_test_nk = X.values[train_index], X.values[test_index]
    Yc_train_nk, Yc_test_nk = Y.values[train_index], Y.values[test_index]

    scores_train = -cross_val_score(model, Xc_train_nk, Yc_train_nk, cv = splits, scoring = 'neg_log_loss')
    resultados_nk.append(scores_train.mean())

resultados_train.append(resultados_nk)
print("Model cv: %f (%f)" % (np.mean(resultados_nk), np.std(resultados_nk)))




ValueError: y_true and y_pred contain different number of classes 19, 22. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [ 0.  1.  2.  3.  4.  6.  7.  8.  9. 10. 11. 12. 13. 14. 16. 17. 19. 20.
 21.]

In [163]:
model = KNeighborsClassifier(3)
model.fit(X_train, Y_train)
print(X_train.shape, Y_train.shape)
pred = model.predict(X_train)
print(pred.shape)
pred = model.predict_proba(X_train)
print(pred.shape)
eval = log_loss(Y_test, pred) 
print(eval)

(44492, 1008) (44492,)
(44492,)
(44492, 22)


ValueError: Found input variables with inconsistent numbers of samples: [44492, 11123]

In [151]:
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(Y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log_entry = pd.DataFrame([[name, acc*100]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)


KNeighborsClassifier
****Results****
Accuracy: 99.8382%


ValueError: y_true and y_pred contain different number of classes 20, 22. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [ 0.  1.  2.  3.  4.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 19.
 20. 21.]