In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import time
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file (filename):
    df = pd.read_csv(filename)  # read the csv file
    return df

In [3]:
def perfrom_CART(train, test, train_label, test_label):
    clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                      min_samples_split = 2, min_weight_fraction_leaf=0.0)
    start = time.perf_counter()
    clf.fit(train, train_label)
    end = time.perf_counter()
    fit_time = end - start
    start = time.perf_counter()
    predicted_label = clf.predict(test)
    end = time.perf_counter()
    predict_time = end - start
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1, fit_time, predict_time

In [4]:
def perfrom_KNN(train, test, train_label, test_label):
    clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
    start = time.perf_counter()
    clf.fit(train, train_label)
    end = time.perf_counter()
    fit_time = end - start
    start = time.perf_counter()
    predicted_label = clf.predict(test)
    end = time.perf_counter()
    predict_time = end - start
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1, fit_time, predict_time

In [5]:
def perfrom_SVM(train, test, train_label, test_label):
    clf = svm.SVC(gamma='auto', C = 20.0, kernel='rbf')
    start = time.perf_counter()
    clf.fit(train, train_label)
    end = time.perf_counter()
    fit_time = end - start
    start = time.perf_counter()
    predicted_label = clf.predict(test)
    end = time.perf_counter()
    predict_time = end - start
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1, fit_time, predict_time

In [6]:
def perfrom_NB(train, test, train_label, test_label):
    clf = GaussianNB()
    start = time.perf_counter()
    clf.fit(train, train_label)
    end = time.perf_counter()
    fit_time = end - start
    start = time.perf_counter()
    predicted_label = clf.predict(test)
    end = time.perf_counter()
    predict_time = end - start
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1, fit_time, predict_time

In [7]:
def perfrom_RF(train, test, train_label, test_label):
    clf = RandomForestClassifier(n_estimators=10, criterion='gini')
    start = time.perf_counter()
    clf.fit(train, train_label)
    end = time.perf_counter()
    fit_time = end - start
    start = time.perf_counter()
    predicted_label = clf.predict(test)
    end = time.perf_counter()
    predict_time = end - start
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1, fit_time, predict_time

In [8]:
def measure_performance(true_label, predicted_label):   
    precision = recall = f1 = np.zeros(2, dtype=np.float32)
    report = classification_report(true_label, predicted_label, digits=3)
    precision = precision_score(true_label, predicted_label, average=None, labels=[0,1])
    recall = recall_score(true_label, predicted_label, average=None, labels=[0,1])
    f1 = f1_score(true_label, predicted_label, average=None, labels=[0,1])
    return recall, precision, f1

In [9]:
def kfold_cv(data, true_label):
    # 10 fold cv
    kf = KFold(n_splits=10, shuffle = True, random_state = 7)

    cv_recall_DT = []
    cv_precision_DT = []
    cv_f1_DT = []
    cv_fit_time_DT = []
    cv_predict_time_DT = []
    
    cv_recall_KNN = []
    cv_precision_KNN = []
    cv_f1_KNN = []
    cv_fit_time_KNN = []
    cv_predict_time_KNN = []
    
    cv_recall_SVM = []
    cv_precision_SVM = []
    cv_f1_SVM = []
    cv_fit_time_SVM = []
    cv_predict_time_SVM = []
    
    cv_recall_NB = []
    cv_precision_NB = []
    cv_f1_NB = []
    cv_fit_time_NB = []
    cv_predict_time_NB = []
    
    cv_recall_RF = []
    cv_precision_RF = []
    cv_f1_RF = []
    cv_fit_time_RF =[]
    cv_predict_time_RF = []


    for train_index, test_index in kf.split(data):
        train, test = data[train_index], data[test_index]
        train_label, test_label = true_label[train_index], true_label[test_index]
    
        train = train.reshape(-1, 1)
        test = test.reshape(-1, 1)

        recall, precision, f1, fit_time, predict_time = perfrom_CART(train, test, train_label, test_label)
        cv_recall_DT.append(recall)
        cv_precision_DT.append(precision)
        cv_f1_DT.append(f1)
        cv_fit_time_DT.append(fit_time)
        cv_predict_time_DT.append(predict_time)

        recall, precision, f1, fit_time, predict_time = perfrom_KNN(train, test, train_label, test_label)
        cv_recall_KNN.append(recall)
        cv_precision_KNN.append(precision)
        cv_f1_KNN.append(f1)
        cv_fit_time_KNN.append(fit_time)
        cv_predict_time_KNN.append(predict_time)

        recall, precision, f1, fit_time, predict_time = perfrom_SVM(train, test, train_label, test_label)
        cv_recall_SVM.append(recall)
        cv_precision_SVM.append(precision)
        cv_f1_SVM.append(f1)
        cv_fit_time_SVM.append(fit_time)
        cv_predict_time_SVM.append(predict_time)

        recall, precision, f1, fit_time, predict_time = perfrom_NB(train, test, train_label, test_label)
        cv_recall_NB.append(recall)
        cv_precision_NB.append(precision)
        cv_f1_NB.append(f1)
        cv_fit_time_NB.append(fit_time)
        cv_predict_time_NB.append(predict_time)

        recall, precision, f1, fit_time, predict_time = perfrom_RF(train, test, train_label, test_label)
        cv_recall_RF.append(recall)
        cv_precision_RF.append(precision)
        cv_f1_RF.append(f1)
        cv_fit_time_RF.append(fit_time)
        cv_predict_time_RF.append(predict_time)

    recall_DT = np.mean(cv_recall_DT, axis= 0)
    precision_DT = np.mean(cv_precision_DT, axis= 0)
    f1_DT = np.mean(cv_f1_DT, axis= 0)
    fit_time_DT = np.mean(cv_fit_time_DT)
    predict_time_DT = np.mean(cv_predict_time_DT)

    recall_KNN = np.mean(cv_recall_KNN, axis= 0)
    precision_KNN = np.mean(cv_precision_KNN, axis= 0)
    f1_KNN = np.mean(cv_f1_KNN, axis= 0)
    fit_time_KNN = np.mean(cv_fit_time_KNN)
    predict_time_KNN = np.mean(cv_predict_time_KNN)
    
    recall_SVM = np.mean(cv_recall_SVM, axis= 0)
    precision_SVM = np.mean(cv_precision_SVM, axis= 0)
    f1_SVM =  np.mean(cv_f1_SVM, axis= 0)
    fit_time_SVM = np.mean(cv_fit_time_SVM)
    predict_time_SVM = np.mean(cv_predict_time_SVM)
    
    recall_NB = np.mean(cv_recall_NB, axis= 0)
    precision_NB = np.mean(cv_precision_NB, axis= 0)
    f1_NB = np.mean(cv_f1_NB, axis= 0)
    fit_time_NB = np.mean(cv_fit_time_NB)
    predict_time_NB = np.mean(cv_predict_time_NB)
    
    recall_RF = np.mean(cv_recall_RF, axis= 0)
    precision_RF = np.mean(cv_precision_RF, axis= 0)
    f1_RF = np.mean(cv_f1_RF, axis= 0)
    fit_time_RF = np.mean(cv_fit_time_RF)
    predict_time_RF = np.mean(cv_predict_time_RF)
    
    return recall_DT, precision_DT, f1_DT, fit_time_DT, predict_time_DT, recall_KNN, precision_KNN, f1_KNN,\
    fit_time_KNN, predict_time_KNN, recall_SVM, precision_SVM, f1_SVM, fit_time_SVM, predict_time_SVM, recall_NB,\
    precision_NB, f1_NB, fit_time_NB, predict_time_NB, recall_RF, precision_RF, f1_RF, fit_time_RF, predict_time_RF

In [10]:
def repeated_test(data, true_label):
    repeated_recall_DT = []
    repeated_precision_DT = []
    repeated_f1_DT = []
    repeated_fit_time_DT = []
    repeated_predict_time_DT = []
    
    repeated_recall_KNN = []
    repeated_precision_KNN = []
    repeated_f1_KNN = []
    repeated_fit_time_KNN = []
    repeated_predict_time_KNN = []
    
    repeated_recall_SVM = []
    repeated_precision_SVM = []
    repeated_f1_SVM = []
    repeated_fit_time_SVM = []
    repeated_predict_time_SVM = []
    
    repeated_recall_NB = []
    repeated_precision_NB = []
    repeated_f1_NB = []
    repeated_fit_time_NB = []
    repeated_predict_time_NB = []
    
    repeated_recall_RF = []
    repeated_precision_RF = []
    repeated_f1_RF = []
    repeated_fit_time_RF = []
    repeated_predict_time_RF = []
    
    recall_DT= precision_DT= f1_DT= fit_time_DT= predict_time_DT= recall_KNN= precision_KNN= f1_KNN=\
    fit_time_KNN= predict_time_KNN= recall_SVM = precision_SVM= f1_SVM= fit_time_SVM= predict_time_SVM\
    = recall_NB= precision_NB= f1_NB= fit_time_NB= predict_time_NB= recall_RF= precision_RF= f1_RF\
    = fit_time_RF= predict_time_RF = 0
    
    for i in range(10):
        recall_DT, precision_DT, f1_DT, fit_time_DT, predict_time_DT, recall_KNN, precision_KNN, f1_KNN,\
        fit_time_KNN, predict_time_KNN, recall_SVM, precision_SVM, f1_SVM, fit_time_SVM, predict_time_SVM,\
        recall_NB, precision_NB, f1_NB, fit_time_NB, predict_time_NB, recall_RF, precision_RF, f1_RF, fit_time_RF,\
        predict_time_RF = kfold_cv(data, true_label)
        
        repeated_recall_DT.append(recall_DT)
        repeated_precision_DT.append(precision_DT)
        repeated_f1_DT.append(f1_DT)
        repeated_fit_time_DT.append(fit_time_DT) 
        repeated_predict_time_DT.append(predict_time_DT)

        repeated_recall_KNN.append(recall_KNN)
        repeated_precision_KNN.append(precision_KNN)
        repeated_f1_KNN.append(f1_KNN)
        repeated_fit_time_KNN.append(fit_time_KNN) 
        repeated_predict_time_KNN.append(predict_time_KNN)

        repeated_recall_SVM.append(recall_SVM)
        repeated_precision_SVM.append(precision_SVM)
        repeated_f1_SVM.append(f1_SVM)
        repeated_fit_time_SVM.append(fit_time_SVM) 
        repeated_predict_time_SVM.append(predict_time_SVM)
        
        repeated_recall_NB.append(recall_NB)
        repeated_precision_NB.append(precision_NB)
        repeated_f1_NB.append(f1_NB)
        repeated_fit_time_NB.append(fit_time_NB) 
        repeated_predict_time_NB.append(predict_time_NB)
        
        repeated_recall_RF.append(recall_RF)
        repeated_precision_RF.append(precision_RF)
        repeated_f1_RF.append(f1_RF)
        repeated_fit_time_RF.append(fit_time_RF) 
        repeated_predict_time_RF.append(predict_time_RF)
        
    print("-------DT-------")
    print("Recall:", np.median(repeated_recall_DT, axis= 0))
    print("Precision:", np.median(repeated_precision_DT, axis= 0))
    print("f1 score:", np.median(repeated_f1_DT, axis= 0))
    print("Fit time:", np.median(repeated_fit_time_DT))
    print("Predict time:", np.median(repeated_predict_time_DT))

    print("-------KNN-------")
    print("Recall:", np.median(repeated_recall_KNN, axis= 0))
    print("Precision:", np.median(repeated_precision_KNN, axis= 0))
    print("f1 score:", np.median(repeated_f1_KNN, axis= 0))
    print("Fit time:", np.median(repeated_fit_time_KNN))
    print("Predict time:", np.median(repeated_predict_time_KNN))

    print("-------SVM-------")
    print("Recall:", np.median(repeated_recall_SVM, axis= 0))
    print("Precision:", np.median(repeated_precision_SVM, axis= 0))
    print("f1 score:", np.median(repeated_f1_SVM, axis= 0))
    print("Fit time:", np.median(repeated_fit_time_SVM))
    print("Predict time:", np.median(repeated_predict_time_SVM))
    
    print("-------NB-------")
    print("Recall:", np.median(repeated_recall_NB, axis= 0))
    print("Precision:", np.median(repeated_precision_NB, axis= 0))
    print("f1 score:", np.median(repeated_f1_NB, axis= 0))
    print("Fit time:", np.median(repeated_fit_time_NB))
    print("Predict time:", np.median(repeated_predict_time_NB))
    
    print("-------RF-------")
    print("Recall:", np.median(repeated_recall_RF, axis= 0))
    print("Precision:", np.median(repeated_precision_RF, axis= 0))
    print("f1 score:", np.median(repeated_f1_RF, axis= 0))
    print("Fit time:", np.median(repeated_fit_time_RF))
    print("Predict time:", np.median(repeated_predict_time_RF))

In [11]:
process_data = pd.read_csv('..//FINAL_PROCESS_METRICS.csv') 
print("Initial process data shape: ", process_data.shape)
code_data = pd.read_csv('..//FINAL_CODE_METRICS.csv') 
print("Initial code data shape: ", code_data.shape)

actual_process_file_name = process_data['file_']
actual_code_file_name = code_data['FILE_PATH']

formatted_process_file_name = []
formatted_code_file_name = []   

for item in actual_process_file_name:
    formatted_process_file_name.append(re.split('V5/', item)[1]) 
for item in actual_code_file_name:
    formatted_code_file_name.append(re.split('V5/', item)[1])
    
process_data['file_'] =  formatted_process_file_name   
code_data['FILE_PATH'] =  formatted_code_file_name   
    
formatted_process_file_name = set(line.strip() for line in formatted_process_file_name)
formatted_code_file_name = set(line.strip() for line in formatted_code_file_name)    
    

true_label = []
loc_data = []
for common_entry in formatted_process_file_name & formatted_code_file_name:
    if common_entry:
        process_index =  process_data[process_data['file_'] == common_entry].index[0]
        code_index =  code_data[code_data['FILE_PATH'] == common_entry].index[0]
        true_label.append(process_data.iloc[process_index]['defect_status'])
        loc_data.append(code_data.iloc[code_index]['SLOC'])

true_label = np.array(true_label)
loc_data = np.array(loc_data)

repeated_test(loc_data, true_label) #repeat kfold 10 times and report avarage performance

Initial process data shape:  (6477, 12)
Initial code data shape:  (6396, 10)
-------DT-------
Recall: [0.99335817 0.11259213]
Precision: [0.94915174 0.49352259]
f1 score: [0.97074569 0.18040838]
Fit time: 0.0031223281999928076
Predict time: 0.000171301500017762
-------KNN-------
Recall: [0.98508527 0.1684564 ]
Precision: [0.95182356 0.45538518]
f1 score: [0.9681246 0.22984  ]
Fit time: 0.0014468177000253489
Predict time: 0.001489203899984659
-------SVM-------
Recall: [0.99567612 0.10718673]
Precision: [0.94896127 0.66458874]
f1 score: [0.97175181 0.17832607]
Fit time: 0.11773631334996253
Predict time: 0.00898062830000299
-------NB-------
Recall: [0.04079454 0.95480803]
Precision: [0.93728648 0.05643696]
f1 score: [0.07810583 0.10648833]
Fit time: 0.000619145399991794
Predict time: 0.00015799855000295793
-------RF-------
Recall: [0.99178966 0.11203613]
Precision: [0.94903598 0.44859886]
f1 score: [0.96979103 0.17531689]
Fit time: 0.03495797920000996
Predict time: 0.0017374400999869977
