In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from dbn.tensorflow import SupervisedDBNClassification
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from sklearn.preprocessing import StandardScaler
from collections import Counter

import tensorflow as tf 
tf.compat.v1.disable_eager_execution()

In [16]:
def findCorrectBuggy(yTest, yPredict):
    CorrectbuggyCount = 0
    CorrectNonbuggyCount = 0
    for index in range(len(yTest)):
        if(yTest[index] == 1 and yPredict[index] == 1):
            CorrectbuggyCount += 1
        elif(yTest[index] == 0 and yPredict[index] == 0):
            CorrectNonbuggyCount += 1
    print("{}, {}, {}, {}, {}, {}".format(
          Counter(yTest)[1],  
          Counter(yTest)[0],                            
          Counter(yPredict)[1],
          Counter(yPredict)[0],
          CorrectbuggyCount,
          CorrectNonbuggyCount))
#     print("yTest:", Counter(yTest)[1], Counter(yTest)[0],
#           "yPredict:", Counter(yPredict)[1], Counter(yPredict)[0], 
#           "Correct Buggy:", CorrectbuggyCount, 
#           "Correct Non Buggy:", CorrectNonbuggyCount)

def runML(algo, xTrain, xTest, yTrain, yTest):
    if(algo=="lr"):
        #For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
        #Default  max_iter=100 created convergence problem for some cases!!!
        #In this experiment, 'lr' provided ConvergenceWarning: with all the datasets/optimizer. 
        clf = LogisticRegression(random_state=42, solver='liblinear')
    elif(algo=="rf"):
        clf = RandomForestClassifier(random_state=42)
    elif(algo=="knn"):
        clf = KNeighborsClassifier(n_neighbors=5)
    elif(algo=="gbc"):
        clf = GradientBoostingClassifier()
    elif(algo=="pct"):
        clf = Perceptron()
    elif(algo=="dbn"):
        #https://stats.stackexchange.com/questions/181/
        hLayers = int((xTrain.shape[1]+2) / 2) #Hidden Layer Nodes: Mean of Input Layer and Output Layer Nodes.
        if(hLayers < 100):
            hLayers = xTrain.shape[1]
            
        if(hLayers > 2500):
            hLayers = 2500
        
        print("Input Nodes:", xTrain.shape[1], "Hidden Nodes:", hLayers)
            
#         elif(xTrain.shape[0] > 500):
#             hLayers = int(xTrain.shape[0] *0.50)
#         else:
#             hLayers = int(xTrain.shape[0])
            
        clf = SupervisedDBNClassification(hidden_layers_structure =[hLayers], 
                                             learning_rate_rbm=0.0001, 
                                             learning_rate=0.0001, 
                                             n_epochs_rbm=15, 
                                             n_iter_backprop=10, 
                                             batch_size=32, 
                                             activation_function='relu', 
                                             dropout_p=0.20)
        
    clf.fit(xTrain, yTrain)
    
    yPredict = clf.predict(xTest)
    
    #print(Counter(yTest), Counter(yPredict))
    
    #print(confusion_matrix(yTest, yPredict))
    findCorrectBuggy(list(yTest), list(yPredict))
    
    return precision_recall_fscore_support(yTest, yPredict, average='binary')

def runRF(xTrain, xTest, yTrain, yTest, max_depth=100, n_estimators=100):
    clf = RandomForestClassifier(max_depth=100, random_state=42, n_estimators=100)
    clf.fit(xTrain, yTrain)
    yPredict = clf.predict(xTest)    
    return precision_recall_fscore_support(yTest, yPredict, average='binary')
    

In [3]:
file_path = "H:/BIC-Review-1/Baselines/JITLine-replication-package-master/JITLine/data/"

qt_gs = pd.read_csv(file_path+ "qt_gs.csv") 
qt_ts = pd.read_csv(file_path+ "qt_ts.csv") 
qt_tp = pd.read_csv(file_path+ "qt_tp.csv") 

openstack_gs = pd.read_csv(file_path+ "openstack_gs.csv") 
openstack_ts = pd.read_csv(file_path+ "openstack_ts.csv") 

for ds in [qt_gs, qt_ts, qt_tp, openstack_gs, openstack_ts]:
    training_size = int(ds.shape[0] * 0.70)
    
    print("Shape:", ds.shape, "Training Size:", training_size)

Shape: (2443, 32) Training Size: 1710
Shape: (2447, 6465) Training Size: 1712
Shape: (2443, 4632) Training Size: 1710
Shape: (1571, 32) Training Size: 1099
Shape: (1571, 217) Training Size: 1099


In [5]:
print("QT:", Counter(qt_gs['bugcount']))
print("OpenStack:", Counter(openstack_gs['bugcount']))

QT: Counter({0: 1396, 1: 1047})
OpenStack: Counter({1: 908, 0: 663})


In [4]:
qt_gs_ts = pd.merge(qt_gs,
                  qt_ts,
                  on=['commit_id', 'author_date', 'bugcount', 'fixcount'])

qt_gs_tp = pd.merge(qt_gs,
                  qt_tp,
                  on=['commit_id', 'author_date', 'bugcount', 'fixcount'])

qt_ts_tp = pd.merge(qt_ts,
                  qt_tp,
                  on=['commit_id', 'author_date', 'bugcount', 'fixcount'])

qt_gs_ts_tp = pd.merge(pd.merge(qt_gs,
                  qt_ts,
                  on=['commit_id', 'author_date', 'bugcount', 'fixcount']), 
                qt_tp, 
                on=['commit_id', 'author_date', 'bugcount', 'fixcount'])

openstack_gs_ts = pd.merge(openstack_gs,
                  openstack_ts,
                  on=['commit_id', 'author_date', 'bugcount', 'fixcount'])

In [17]:
for ds in [qt_gs, qt_ts, qt_tp, qt_gs_ts, qt_gs_tp, qt_ts_tp, qt_gs_ts_tp, openstack_gs, openstack_ts, openstack_gs_ts]:
    training_size = int(ds.shape[0] * 0.70)
    
    X = ds.drop(columns=['commit_id', 'author_date', 'bugcount', 'fixcount'])
    Y = ds['bugcount']
    
    trainX = X.iloc[0: training_size, :]
    trainY = Y[0: training_size]
    
    testX = X.iloc[training_size: , :]
    testY = Y[training_size: ]
    
    #print(trainX.shape, trainY.shape, testX.shape, testY.shape)
    
    for algo in ["rf", "knn", "gbc", "pct"]:   
        result=runML(algo, trainX, 
                         testX, 
                         trainY, testY)

        print(algo, [round(float(x), 2) for x in result[0:3]])
    print("--------------")

220, 513, 179, 554, 89, 423
rf [0.5, 0.4, 0.45]
220, 513, 267, 466, 103, 349
knn [0.39, 0.47, 0.42]
220, 513, 228, 505, 110, 395
gbc [0.48, 0.5, 0.49]
220, 513, 241, 492, 91, 363
pct [0.38, 0.41, 0.39]
--------------
219, 516, 329, 406, 145, 332
rf [0.44, 0.66, 0.53]
219, 516, 292, 443, 125, 349
knn [0.43, 0.57, 0.49]
219, 516, 280, 455, 127, 363
gbc [0.45, 0.58, 0.51]
219, 516, 508, 227, 159, 167
pct [0.31, 0.73, 0.44]
--------------
220, 513, 325, 408, 135, 323
rf [0.42, 0.61, 0.5]
220, 513, 213, 520, 81, 381
knn [0.38, 0.37, 0.37]
220, 513, 261, 472, 120, 372
gbc [0.46, 0.55, 0.5]
220, 513, 386, 347, 135, 262
pct [0.35, 0.61, 0.45]
--------------
219, 511, 290, 440, 138, 359
rf [0.48, 0.63, 0.54]
219, 511, 266, 464, 102, 347
knn [0.38, 0.47, 0.42]
219, 511, 228, 502, 117, 400
gbc [0.51, 0.53, 0.52]
219, 511, 425, 305, 104, 190
pct [0.24, 0.47, 0.32]
--------------
220, 513, 236, 497, 116, 393
rf [0.49, 0.53, 0.51]
220, 513, 267, 466, 103, 349
knn [0.39, 0.47, 0.42]
220, 513, 209, 52

In [104]:
for ds in [qt_gs, qt_ts, qt_tp, qt_gs_ts, qt_gs_tp, qt_ts_tp, qt_gs_ts_tp, openstack_gs, openstack_ts, openstack_gs_ts]:
    training_size = int(ds.shape[0] * 0.70)
    
    X = ds.drop(columns=['commit_id', 'author_date', 'bugcount', 'fixcount'])
    Y = ds['bugcount']
    
    ss=StandardScaler()
    X = ss.fit_transform(X)
    
    trainX = X[0: training_size, :]
    trainY = Y[0: training_size]
    
    testX = X[training_size: , :]
    testY = Y[training_size: ]
    
    for algo in ["dbn"]:   
        result=runML(algo, trainX, 
                         testX, 
                         trainY, testY)

        print(algo, [round(float(x), 2) for x in result[0:3]])
    print("--------------")

Input Nodes: 28 Hidden Nodes: 15
[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 24.930562
>> Epoch 2 finished 	RBM Reconstruction error 24.921023
>> Epoch 3 finished 	RBM Reconstruction error 24.911267
>> Epoch 4 finished 	RBM Reconstruction error 24.901730
>> Epoch 5 finished 	RBM Reconstruction error 24.891861
>> Epoch 6 finished 	RBM Reconstruction error 24.882382
>> Epoch 7 finished 	RBM Reconstruction error 24.873202
>> Epoch 8 finished 	RBM Reconstruction error 24.864701
>> Epoch 9 finished 	RBM Reconstruction error 24.856902
>> Epoch 10 finished 	RBM Reconstruction error 24.849675
>> Epoch 11 finished 	RBM Reconstruction error 24.843268
>> Epoch 12 finished 	RBM Reconstruction error 24.837389
>> Epoch 13 finished 	RBM Reconstruction error 24.832146
>> Epoch 14 finished 	RBM Reconstruction error 24.827505
>> Epoch 15 finished 	RBM Reconstruction error 24.823663
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.693

>> Epoch 8 finished 	RBM Reconstruction error 10692.399757
>> Epoch 9 finished 	RBM Reconstruction error 10464.393128
>> Epoch 10 finished 	RBM Reconstruction error 10311.058631
>> Epoch 11 finished 	RBM Reconstruction error 10102.186236
>> Epoch 12 finished 	RBM Reconstruction error 10048.649563
>> Epoch 13 finished 	RBM Reconstruction error 9934.496160
>> Epoch 14 finished 	RBM Reconstruction error 9946.059170
>> Epoch 15 finished 	RBM Reconstruction error 9927.421720
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.685374
>> Epoch 1 finished 	ANN training loss 0.684074
>> Epoch 2 finished 	ANN training loss 0.683038
>> Epoch 3 finished 	ANN training loss 0.683068
>> Epoch 4 finished 	ANN training loss 0.681369
>> Epoch 5 finished 	ANN training loss 0.680412
>> Epoch 6 finished 	ANN training loss 0.679676
>> Epoch 7 finished 	ANN training loss 0.678979
>> Epoch 8 finished 	ANN training loss 0.678338
>> Epoch 9 finished 	ANN training loss 0.67

In [8]:
for ds in [qt_gs, qt_ts, qt_tp, qt_gs_ts, qt_gs_tp, qt_ts_tp, qt_gs_ts_tp, openstack_gs, openstack_ts, openstack_gs_ts]:
    training_size = int(ds.shape[0] * 0.70)
    
    X = ds.drop(columns=['commit_id', 'author_date', 'bugcount', 'fixcount'])
    Y = ds['bugcount']
    
    ss=StandardScaler()
    X = ss.fit_transform(X)
    
    trainX = X[0: training_size, :]
    trainY = Y[0: training_size]
    
    testX = X[training_size: , :]
    testY = Y[training_size: ]
    
    for algo in ["dbn"]:   
        result=runML(algo, trainX, 
                         testX, 
                         trainY, testY)

        print(algo, [round(float(x), 2) for x in result[0:3]])
    print("--------------")

Input Nodes: 28 Hidden Nodes: 28
[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 24.830735
>> Epoch 2 finished 	RBM Reconstruction error 24.820828
>> Epoch 3 finished 	RBM Reconstruction error 24.810973
>> Epoch 4 finished 	RBM Reconstruction error 24.801591
>> Epoch 5 finished 	RBM Reconstruction error 24.792341
>> Epoch 6 finished 	RBM Reconstruction error 24.783740
>> Epoch 7 finished 	RBM Reconstruction error 24.775177
>> Epoch 8 finished 	RBM Reconstruction error 24.767218
>> Epoch 9 finished 	RBM Reconstruction error 24.760063
>> Epoch 10 finished 	RBM Reconstruction error 24.753389
>> Epoch 11 finished 	RBM Reconstruction error 24.747713
>> Epoch 12 finished 	RBM Reconstruction error 24.742929
>> Epoch 13 finished 	RBM Reconstruction error 24.738936
>> Epoch 14 finished 	RBM Reconstruction error 24.735686
>> Epoch 15 finished 	RBM Reconstruction error 24.732630
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.693

>> Epoch 8 finished 	RBM Reconstruction error 10682.339442
>> Epoch 9 finished 	RBM Reconstruction error 10529.018591
>> Epoch 10 finished 	RBM Reconstruction error 10331.831056
>> Epoch 11 finished 	RBM Reconstruction error 10200.482435
>> Epoch 12 finished 	RBM Reconstruction error 10052.518127
>> Epoch 13 finished 	RBM Reconstruction error 10011.869177
>> Epoch 14 finished 	RBM Reconstruction error 9930.736882
>> Epoch 15 finished 	RBM Reconstruction error 9930.964099
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.684557
>> Epoch 1 finished 	ANN training loss 0.683350
>> Epoch 2 finished 	ANN training loss 0.682475
>> Epoch 3 finished 	ANN training loss 0.681589
>> Epoch 4 finished 	ANN training loss 0.680912
>> Epoch 5 finished 	ANN training loss 0.680008
>> Epoch 6 finished 	ANN training loss 0.679316
>> Epoch 7 finished 	ANN training loss 0.678657
>> Epoch 8 finished 	ANN training loss 0.678057
>> Epoch 9 finished 	ANN training loss 0.6

In [87]:
Counter([1, 2, 1, 2, 1, 1])[1]

4

In [82]:
ExcludedAutoSpearman = open(file_path+"../ranked_feature_names/excludeAutoSpearman.txt").read().split('\n')[0:-1]
print(len(ExcludedAutoSpearman), len(qt_tp.columns))
importantFeatures = []
for c in qt_tp.columns:
    if c not in ExcludedAutoSpearman:
        importantFeatures.append(c)

print(len(importantFeatures))

2992 4632
1704


In [74]:
# for ds in [qt_gs, qt_ts, qt_tp, openstack_gs, openstack_ts]:
for ds in [qt_tp]:
    training_size = int(ds.shape[0] * 0.70)
    
    X = ds[importantFeatures].drop(columns=['commit_id', 'author_date', 'bugcount', 'fixcount'])
    Y = ds['bugcount']
    
    ss=StandardScaler()
    X = ss.fit_transform(X)
    
    trainX = X[0: training_size, :]
    trainY = Y[0: training_size]
    
    testX = X[training_size: , :]
    testY = Y[training_size: ]
    
    for algo in ["dbn"]:   
        result=runML(algo, trainX, 
                         testX, 
                         trainY, testY)

        print(algo, [round(float(x), 2) for x in result[0:3]])
    print("--------------")

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 1675.878165
>> Epoch 2 finished 	RBM Reconstruction error 1674.296602
>> Epoch 3 finished 	RBM Reconstruction error 1672.147958
>> Epoch 4 finished 	RBM Reconstruction error 1669.355983
>> Epoch 5 finished 	RBM Reconstruction error 1665.863487
>> Epoch 6 finished 	RBM Reconstruction error 1661.572489
>> Epoch 7 finished 	RBM Reconstruction error 1656.441172
>> Epoch 8 finished 	RBM Reconstruction error 1650.456078
>> Epoch 9 finished 	RBM Reconstruction error 1643.563776
>> Epoch 10 finished 	RBM Reconstruction error 1635.801804
>> Epoch 11 finished 	RBM Reconstruction error 1627.208685
>> Epoch 12 finished 	RBM Reconstruction error 1617.971643
>> Epoch 13 finished 	RBM Reconstruction error 1608.326682
>> Epoch 14 finished 	RBM Reconstruction error 1598.623935
>> Epoch 15 finished 	RBM Reconstruction error 1589.285727
[END] Pre-training step
[START] Fine tuning step:
>> Epoch 0 finished 	ANN training loss 0.691617