In [27]:
import warnings
warnings.filterwarnings('ignore')

from dbn.tensorflow import SupervisedDBNClassification
from dbn.tensorflow.models import UnsupervisedDBN

import numpy as np
import pandas as pd
import csv, re
from sklearn.model_selection import train_test_split
from sklearn.metrics.classification import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import itertools

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline

from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import time
import re

import eli5

In [28]:
def processTokensOfOneFile(oneFileContent):
    """Normalise the raw text of one file into a space-separated string of stems.

    Every non-letter character is replaced by a space, the text is lower-cased
    and split on whitespace, English stop words and tokens shorter than three
    characters are dropped, and the remaining tokens are Snowball-stemmed.

    :param oneFileContent: the content of one file as a single string
    :return: the stemmed tokens joined by single spaces
    """
    english_stemmer = SnowballStemmer("english")

    # Keep letters only, then tokenise the lower-cased text.
    alpha_only = re.sub("[^a-zA-Z]", " ", oneFileContent)
    candidate_tokens = alpha_only.lower().split()

    # A set gives constant-time membership checks for the stop-word filter.
    english_stops = set(stopwords.words("english"))

    stems = []
    for token in candidate_tokens:
        # Skip stop words and tokens with fewer than three characters.
        if token in english_stops or len(token) < 3:
            continue
        stems.append(english_stemmer.stem(token))

    return " ".join(stems)

In [29]:
def giveFileContent(fileNameParam):
    """Read one source file and return its lines glued into a single string.

    The path that is actually opened is ``"..//"`` followed by whatever comes
    after the first ``'V5/'`` in ``fileNameParam``. Trailing whitespace
    (including the newline) is stripped from every line and the lines are
    concatenated with no separator.

    :param fileNameParam: a path string containing ``'V5/'``
    :return: the concatenated, right-stripped lines of the file
    :raises IndexError: if ``fileNameParam`` does not contain ``'V5/'``
    """
    relative_path = re.split('V5/', fileNameParam)[1]
    # Plain 'r' already uses universal newlines in Python 3; the old 'rU' flag
    # is rejected by Python 3.11+. The context manager closes the handle.
    with open("..//" + relative_path, 'r') as source_file:
        return "".join(line_.rstrip() for line_ in source_file)

In [30]:
def getTokensForTokenization(file_name):
    """Load and preprocess every file in ``file_name``.

    Each path is read with ``giveFileContent`` and the resulting text is
    normalised with ``processTokensOfOneFile``.

    :param file_name: iterable of file paths
    :return: list with one preprocessed string per path, in input order
    """
    return [
        processTokensOfOneFile(giveFileContent(path))
        for path in file_name
    ]

In [31]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for labels 0 and 1.

    Each score is requested with ``average=None`` and ``labels=[0, 1]``, so
    every returned value holds one entry per label, in that label order.

    :param true_label: ground-truth labels
    :param predicted_label: labels predicted by a classifier
    :return: tuple ``(recall, precision, f1)`` -- note that recall comes first
    """
    # The previous zero-initialisation and the unused classification_report()
    # call were dead code and have been removed.
    class_labels = [0, 1]
    precision = precision_score(true_label, predicted_label, average=None, labels=class_labels)
    recall = recall_score(true_label, predicted_label, average=None, labels=class_labels)
    f1 = f1_score(true_label, predicted_label, average=None, labels=class_labels)
    return recall, precision, f1

In [32]:
class MyLabelEncoder(object):
    """LabelEncoder wrapper that maps unseen values to an extra 'Unknown' class.

    ``fit`` appends the literal ``'Unknown'`` to the fitted values, and
    ``transform`` replaces every value that was not seen during ``fit`` with
    ``'Unknown'`` before delegating to the wrapped ``LabelEncoder``.
    """

    def __init__(self):
        """Create the wrapped encoder; ``classes_`` only exists after ``fit``."""
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the encoder on all values of ``data_list`` plus the 'Unknown' value.
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode ``data_list``; values not seen in ``fit`` are encoded as 'Unknown'.
        :param data_list: iterable of values to encode
        :return: whatever the wrapped encoder's ``transform`` returns for the
                 sanitised list
        """
        # One pass with a set lookup replaces the former "rebuild the whole
        # list once per unknown unique value" loop.
        known_classes = set(self.label_encoder.classes_)
        new_data_list = [x if x in known_classes else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)

In [33]:
def parse_source_code(file_name):
    """Return the preprocessed token strings for the given file paths.

    Thin wrapper that forwards to ``getTokensForTokenization``.

    :param file_name: iterable of file paths
    :return: the result of ``getTokensForTokenization(file_name)``
    """
    return getTokensForTokenization(file_name)

In [34]:
def encode_vector_of_nodes(docs):
    """Turn documents into equally long vectors of integer token ids.

    Each document is lower-cased and split on single spaces. All distinct
    tokens are used to fit a ``MyLabelEncoder``; every document is then encoded
    token by token and right-padded so that all vectors are as long as the
    longest document. The number of distinct tokens is printed.

    :param docs: iterable of strings, e.g. ['if foo for bar car', 'foo for if bar']
    :return: tuple ``(encoded_docs, unique_words)`` -- a list with one padded
             id array per document, and the array of distinct tokens
    """
    lowered_docs = [text.lower() for text in docs]
    token_lists = [text.split(" ") for text in lowered_docs]

    # Every output vector is as wide as the longest token list.
    vector_width = np.max([len(tokens) for tokens in token_lists])

    # Distinct tokens over the whole corpus.
    all_tokens = list(itertools.chain.from_iterable(token_lists))
    unique_words = np.unique(all_tokens)
    print("Feature Number: ", unique_words.size)

    label_encoder = MyLabelEncoder()
    label_encoder.fit(unique_words)

    def _encode_and_pad(tokens):
        token_ids = label_encoder.transform(tokens)
        # np.pad's default mode fills the tail with 0.
        # NOTE(review): 0 is presumably also a valid encoder id -- confirm the
        # collision is acceptable.
        return np.pad(token_ids, (0, vector_width - len(token_ids)))

    encoded_docs = [_encode_and_pad(tokens) for tokens in token_lists]
    return encoded_docs, unique_words

In [35]:
def apply_PCA(train, test):
    """Standardise both splits and project them with a PCA fitted on ``train``.

    The scaler and the PCA are fitted on the training split only and then
    applied to both splits. ``PCA(0.95)`` is used, i.e. the number of
    components is chosen so that 95% of the variance is retained. The number
    of components and the shapes before/after projection are printed.

    :param train: training feature matrix
    :param test: test feature matrix
    :return: tuple ``(train, test)`` of the projected matrices
    """
    # PCA is affected by scale, so features are standardised first.
    scaler = StandardScaler()
    scaler.fit(train)
    train_scaled = scaler.transform(train)
    test_scaled = scaler.transform(test)

    pca = PCA(0.95)
    pca.fit(train_scaled)
    print ("Number of selected components: ", pca.n_components_)

    print("Before applying PCA train set size: ", train_scaled.shape)
    print("Before applying PCA test set size: ", test_scaled.shape)
    train_reduced = pca.transform(train_scaled)
    test_reduced = pca.transform(test_scaled)
    print("After applying PCA train set size: ", train_reduced.shape)
    print("After applying PCA test set size: ", test_reduced.shape)

    return train_reduced, test_reduced

In [36]:
def semantic_classifier(clf, feature_list):
    """Build a two-step pipeline: an unsupervised DBN followed by ``clf``.

    The DBN has two hidden layers, each ``feature_list.size`` units wide.

    :param clf: the estimator used as the final 'clf' pipeline step
    :param feature_list: object exposing ``.size``; only the size is used here
    :return: an unfitted ``Pipeline`` with steps 'dbn' and 'clf'
    """
    layer_width = feature_list.size
    dbn_settings = dict(
        hidden_layers_structure=[layer_width, layer_width],
        batch_size=64,
        learning_rate_rbm=0.06,
        n_epochs_rbm=1,
        activation_function='sigmoid',
        verbose=0,
    )
    pipeline_steps = [('dbn', UnsupervisedDBN(**dbn_settings)), ('clf', clf)]
    return Pipeline(steps=pipeline_steps, verbose=True)

In [37]:
def _evaluate_one_classifier(banner, clf, train, train_label, test, test_label, feature_list):
    """Fit, time, score and explain one classifier on a single fold.

    :return: tuple ``(recall, precision, f1, fit_time, predict_time)``
    """
    classifier = semantic_classifier(clf, feature_list)

    start = time.perf_counter()
    classifier.fit(train, train_label)
    fit_time = time.perf_counter() - start

    start = time.perf_counter()
    predicted_label = classifier.predict(test)
    predict_time = time.perf_counter() - start

    recall, precision, f1 = measure_performance(test_label, predicted_label)

    print(banner)
    # NOTE(review): feature_list is passed as feature_names although the 'clf'
    # step is trained on the DBN output -- confirm the names line up.
    feature_weights = eli5.explain_weights_df(classifier.named_steps['clf'], top=50,
                                              feature_names=feature_list)
    print(feature_weights)
    print("*"*100)

    return recall, precision, f1, fit_time, predict_time


def kfold_cv(data, true_label, feature_list, n_splits=2):
    """Cross-validate five classifiers behind a DBN feature extractor.

    For every fold the data is standardised and PCA-reduced via ``apply_PCA``,
    then a decision tree, k-NN, RBF SVM, Gaussian naive Bayes and random forest
    are each wrapped by ``semantic_classifier``, fitted, timed and scored.

    :param data: sequence of equally long feature vectors (converted with np.array)
    :param true_label: array of labels, indexed with the fold index arrays
    :param feature_list: passed to ``semantic_classifier`` and to eli5 as feature names
    :param n_splits: number of shuffled folds (default 2, the value previously
                     hard-coded here)
    :return: flat 25-tuple; for DT, KNN, SVM, NB and RF in that order:
             ``(recall, precision, f1, fit_time, predict_time)``, each averaged
             over the folds (score arrays are averaged along axis 0)
    """
    data = np.array(data)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=7)

    # (result key, banner printed after scoring, factory for a fresh estimator)
    classifier_specs = [
        ("DT", "----------------CART------------------",
         lambda: tree.DecisionTreeClassifier(criterion='gini', splitter='best',
                                             min_samples_split=2, min_weight_fraction_leaf=0.0)),
        ("KNN", "----------------KNN------------------",
         lambda: KNeighborsClassifier(n_neighbors=5, weights='distance')),
        ("SVM", "----------------SVM------------------",
         lambda: svm.SVC(gamma='auto', C=20.0, kernel='rbf')),
        ("NB", "----------------NB------------------",
         lambda: GaussianNB()),
        ("RF", "----------------RF------------------",
         lambda: RandomForestClassifier(n_estimators=10, criterion='gini')),
    ]

    # Per classifier: five lists (recall, precision, f1, fit time, predict time).
    fold_results = {key: ([], [], [], [], []) for key, _, _ in classifier_specs}

    for train_index, test_index in kf.split(data):
        train, test = data[train_index], data[test_index]
        train_label, test_label = true_label[train_index], true_label[test_index]

        train, test = apply_PCA(train, test)

        for key, banner, make_clf in classifier_specs:
            scores = _evaluate_one_classifier(banner, make_clf(), train, train_label,
                                              test, test_label, feature_list)
            for bucket, value in zip(fold_results[key], scores):
                bucket.append(value)

    summary = []
    for key, _, _ in classifier_specs:
        recalls, precisions, f1s, fit_times, predict_times = fold_results[key]
        summary.extend([
            np.mean(recalls, axis=0),
            np.mean(precisions, axis=0),
            np.mean(f1s, axis=0),
            np.mean(fit_times),
            np.mean(predict_times),
        ])
    return tuple(summary)

In [38]:
def repeated_test(data, true_label, unique_words, n_repeats=1):
    """Run ``kfold_cv`` repeatedly and print the median of every metric.

    ``kfold_cv`` returns a flat 25-tuple: for DT, KNN, SVM, NB and RF in that
    order, ``(recall, precision, f1, fit_time, predict_time)``. The values of
    all repetitions are collected per classifier and their medians printed
    (score arrays along axis 0, timings as scalars).

    :param data: forwarded to ``kfold_cv``
    :param true_label: forwarded to ``kfold_cv``
    :param unique_words: forwarded to ``kfold_cv`` as its feature list
    :param n_repeats: how many times ``kfold_cv`` is run (default 1, the value
                      previously hard-coded here)
    :return: None; results are printed
    """
    model_labels = ("DT", "KNN", "SVM", "NB", "RF")
    metrics_per_model = 5

    # Per classifier: five lists (recall, precision, f1, fit time, predict time).
    history = {label: ([], [], [], [], []) for label in model_labels}

    for _ in range(n_repeats):
        flat_results = kfold_cv(data, true_label, unique_words)
        for position, label in enumerate(model_labels):
            first = position * metrics_per_model
            chunk = flat_results[first:first + metrics_per_model]
            for bucket, value in zip(history[label], chunk):
                bucket.append(value)

    for label in model_labels:
        recalls, precisions, f1_scores, fit_times, predict_times = history[label]
        print("-------" + label + "-------")
        print("Recall:", np.median(recalls, axis= 0))
        print("Precision:", np.median(precisions, axis= 0))
        print("f1 score:", np.median(f1_scores, axis= 0))
        print("Fit time:", np.median(fit_times))
        print("Predict time:", np.median(predict_times))

In [39]:
# Driver cell: load the process and code metric tables, keep the files that
# appear in both, tokenise/encode their source and run the evaluation.

# Alternative (full) input, currently disabled:
#process_data = pd.read_csv('..//FINAL_PROCESS_METRICS.csv')
process_data = pd.read_csv('..//test.csv')
print("Initial process data shape: ", process_data.shape)
code_data = pd.read_csv('..//FINAL_CODE_METRICS.csv')
print("Initial code data shape: ", code_data.shape)

actual_process_file_name = process_data['file_']
actual_code_file_name = code_data['FILE_PATH']

# Keep only the part of each path that follows 'V5/' so that both tables can
# be joined on the same relative name.
formatted_process_file_name = [re.split('V5/', item)[1] for item in actual_process_file_name]
formatted_code_file_name = [re.split('V5/', item)[1] for item in actual_code_file_name]

process_data['file_'] = formatted_process_file_name
code_data['FILE_PATH'] = formatted_code_file_name

formatted_process_file_name = set(line.strip() for line in formatted_process_file_name)
formatted_code_file_name = set(line.strip() for line in formatted_code_file_name)

common_file_name = []
true_label = []
# Iterate in sorted order: the iteration order of a set of strings can change
# between interpreter runs, which would change the sample order and therefore
# the cross-validation splits.
for common_entry in sorted(formatted_process_file_name & formatted_code_file_name):
    if common_entry:
        # First row of the process table that belongs to this file.
        matching_row = process_data[process_data['file_'] == common_entry].iloc[0]
        common_file_name.append(matching_row['FILE_PATH'])
        true_label.append(matching_row['defect_status'])

true_label = np.array(true_label)
file_name = np.array(common_file_name)

parsed_files = parse_source_code(file_name)
encoded_files, unique_words = encode_vector_of_nodes(parsed_files)

# Runs the k-fold evaluation and prints the median of every metric.
repeated_test(encoded_files, true_label, unique_words)

Initial process data shape:  (43, 12)
Initial code data shape:  (6396, 10)
Feature Number:  1201
Number of selected components:  6
Before applying PCA train set size:  (13, 1770)
Before applying PCA test set size:  (14, 1770)
After applying PCA train set size:  (13, 6)
After applying PCA test set size:  (14, 6)
[Pipeline] ............... (step 1 of 2) Processing dbn, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
----------------CART------------------
                       feature  weight
0                callablebyown     1.0
1                       getter     0.0
2                         flag     0.0
3                          fix     0.0
4                          fit     0.0
5                   firstblood     0.0
6                        first     0.0
7                         firm     0.0
8                         fire     0.0
9                        finir     0.0
10                        find     0.0
11                       final     0.0

[Pipeline] ............... (step 1 of 2) Processing dbn, total=   2.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
----------------KNN------------------
None
****************************************************************************************************
[Pipeline] ............... (step 1 of 2) Processing dbn, total=   2.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
----------------SVM------------------
None
****************************************************************************************************
[Pipeline] ............... (step 1 of 2) Processing dbn, total=   2.5s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
----------------NB------------------
None
****************************************************************************************************
[Pipeline] ............... (step 1 of 2) Processing dbn, total=   2.3s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s
-