In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Map textual status labels to integer class codes.

    ``"INSECURE"`` becomes 1, ``"NEUTRAL"`` becomes 0 and every other value
    becomes 2.

    Parameters
    ----------
    label : array-like
        Sequence of labels (list, NumPy array or pandas Series).

    Returns
    -------
    numpy.ndarray
        Integer array with one code per input label, in input order.
    """
    # Go through a plain object array so elements are visited by position.
    # Indexing a pandas Series with label[i] is a label lookup and fails (or
    # picks the wrong row) when the index is not the default 0..n-1 range.
    values = np.asarray(label, dtype=object)
    codes = [
        1 if value == "INSECURE" else 0 if value == "NEUTRAL" else 2
        for value in values
    ]
    return np.array(codes, dtype='int')

In [4]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a CART decision tree on the training data and score it on the test data.

    Prints the elapsed fit and predict durations (differences of
    ``time.perf_counter`` readings) and returns ``(recall, precision, f1)``
    as computed by ``measure_performance``.
    """
    model = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_elapsed = time.perf_counter() - fit_started
    print("CART fit time", fit_elapsed)

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_elapsed = time.perf_counter() - predict_started
    print("CART predict time", predict_elapsed)

    # measure_performance already returns the (recall, precision, f1) triple
    return measure_performance(test_label, predictions)

In [5]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a distance-weighted 5-nearest-neighbour classifier and score it on the test data.

    Prints the elapsed fit and predict durations (differences of
    ``time.perf_counter`` readings) and returns ``(recall, precision, f1)``
    as computed by ``measure_performance``.
    """
    model = KNeighborsClassifier(n_neighbors=5, weights='distance')

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_elapsed = time.perf_counter() - fit_started
    print("KNN fit time", fit_elapsed)

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_elapsed = time.perf_counter() - predict_started
    print("KNN predict time", predict_elapsed)

    # measure_performance already returns the (recall, precision, f1) triple
    return measure_performance(test_label, predictions)

In [6]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel support vector classifier (C=20, gamma='auto') and score it on the test data.

    Prints the elapsed fit and predict durations (differences of
    ``time.perf_counter`` readings) and returns ``(recall, precision, f1)``
    as computed by ``measure_performance``.
    """
    model = svm.SVC(gamma='auto', C=20.0, kernel='rbf')

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_elapsed = time.perf_counter() - fit_started
    print("SVM fit time", fit_elapsed)

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_elapsed = time.perf_counter() - predict_started
    print("SVM predict time", predict_elapsed)

    # measure_performance already returns the (recall, precision, f1) triple
    return measure_performance(test_label, predictions)

In [7]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier and score it on the test data.

    Prints the elapsed fit and predict durations (differences of
    ``time.perf_counter`` readings) and returns ``(recall, precision, f1)``
    as computed by ``measure_performance``.
    """
    model = GaussianNB()

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_elapsed = time.perf_counter() - fit_started
    print("NB fit time", fit_elapsed)

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_elapsed = time.perf_counter() - predict_started
    print("NB predict time", predict_elapsed)

    # measure_performance already returns the (recall, precision, f1) triple
    return measure_performance(test_label, predictions)

In [8]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree random forest (Gini criterion) and score it on the test data.

    Prints the elapsed fit and predict durations (differences of
    ``time.perf_counter`` readings) and returns ``(recall, precision, f1)``
    as computed by ``measure_performance``.
    """
    model = RandomForestClassifier(n_estimators=10, criterion='gini')

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_elapsed = time.perf_counter() - fit_started
    print("RF fit time", fit_elapsed)

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_elapsed = time.perf_counter() - predict_started
    print("RF predict time", predict_elapsed)

    # measure_performance already returns the (recall, precision, f1) triple
    return measure_performance(test_label, predictions)

In [9]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for classes 0 and 1.

    Scores are requested with ``average=None`` and ``labels=[0, 1]``, so each
    returned array has two entries: index 0 for class 0 and index 1 for
    class 1. Any other class value (e.g. the catch-all code 2 produced by
    ``convert_label_to_numeric``) is not reported.

    Parameters
    ----------
    true_label : array-like
        Ground-truth class codes.
    predicted_label : array-like
        Predicted class codes, aligned with ``true_label``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` -- note the order: recall comes first.
    """
    # The previous version pre-filled the three results with a shared zeros(3)
    # array and built a classification_report on every call; both were thrown
    # away unused, so they are omitted here.
    reported_labels = [0, 1]
    precision = precision_score(true_label, predicted_label, average=None, labels=reported_labels)
    recall = recall_score(true_label, predicted_label, average=None, labels=reported_labels)
    f1 = f1_score(true_label, predicted_label, average=None, labels=reported_labels)
    return recall, precision, f1

In [10]:
def kfold_cv(data, true_label):
    """Evaluate the five classifiers with shuffled 10-fold cross-validation.

    The splitter uses a fixed ``random_state`` of 7, so every call produces
    the same fold assignment for a given number of samples.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Feature values, one row per sample. The values of each fold are
        reshaped to a single feature column before being handed to the
        classifiers.
    true_label : array-like
        Class code for every row of ``data``, in the same order.

    Returns
    -------
    tuple
        Fifteen fold-averaged scores ordered as (recall, precision, f1) for
        DT, KNN, SVM, NB and RF in turn. Each entry is the mean over folds
        (``axis=0``) of what the matching ``perfrom_*`` function returned.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    # KFold yields positional indices; make sure labels are indexed by
    # position too, even if a pandas Series was passed in.
    true_label = np.asarray(true_label)

    # Evaluation order matters: it fixes both the timing printout order and
    # the order of the returned tuple.
    classifiers = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)
    fold_scores = [([], [], []) for _ in classifiers]

    for train_index, test_index in kf.split(data):
        # .iloc (not .loc): the indices are row positions, not index labels,
        # so this also works when ``data`` does not carry a default RangeIndex.
        train = data.iloc[train_index].values.reshape(-1, 1)
        test = data.iloc[test_index].values.reshape(-1, 1)
        train_label, test_label = true_label[train_index], true_label[test_index]

        for run_classifier, (recalls, precisions, f1s) in zip(classifiers, fold_scores):
            recall, precision, f1 = run_classifier(train, test, train_label, test_label)
            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)

    averaged = []
    for recalls, precisions, f1s in fold_scores:
        averaged.append(np.mean(recalls, axis=0))
        averaged.append(np.mean(precisions, axis=0))
        averaged.append(np.mean(f1s, axis=0))
    return tuple(averaged)

In [11]:
def different_file_test(train, test, train_label, test_label):
    """Train every classifier on one dataset and score it on another.

    ``train`` and ``test`` must expose ``.values``; their values are reshaped
    to a single feature column before fitting and predicting.

    Returns a 15-tuple ordered as (recall, precision, f1) for DT, KNN, SVM,
    NB and RF in turn.
    """
    train_features = train.values.reshape(-1, 1)
    test_features = test.values.reshape(-1, 1)

    # Order fixes both the timing printouts and the layout of the result
    runners = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)

    scores = []
    for run_classifier in runners:
        recall, precision, f1 = run_classifier(train_features, test_features, train_label, test_label)
        scores.extend((recall, precision, f1))
    return tuple(scores)

In [12]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name):
    """Run the chosen evaluation ten times and print the median scores per classifier.

    When ``test_name`` equals ``"k_fold"`` every run calls
    ``kfold_cv(data, true_label)``; for any other value every run calls
    ``different_file_test(train_data, test_data, train_label, test_label)``.
    The arguments belonging to the mode that is not selected are ignored.

    For each classifier (DT, KNN, SVM, NB, RF) the recall, precision and F1
    collected over the ten runs are reduced with ``np.median(..., axis=0)``
    and printed. Nothing is returned.
    """
    classifier_names = ("DT", "KNN", "SVM", "NB", "RF")
    metric_titles = ("Recall:", "Precision:", "f1 score:")

    # One history list per returned score, in the order the evaluators return
    # them: (recall, precision, f1) for each classifier in turn.
    score_history = [[] for _ in range(len(classifier_names) * len(metric_titles))]

    for _ in range(10):
        if test_name == "k_fold":
            run_scores = kfold_cv(data, true_label)
        else:
            run_scores = different_file_test(train_data, test_data, train_label, test_label)

        for history, score in zip(score_history, run_scores):
            history.append(score)

    for position, name in enumerate(classifier_names):
        print("-------" + name + "-------")
        first = position * len(metric_titles)
        for offset, title in enumerate(metric_titles):
            print(title, np.median(score_history[first + offset], axis=0))

In [13]:
# Placeholders so every repeated_test call can pass all six arguments,
# including the ones the selected mode does not use.
data = []
true_label = []
train_data = []
test_data = []
train_label = []
test_label = []

In [14]:
# k-fold evaluation on COLOCATED_MOZILLA, with SLOC as the only feature.
mozilla_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", mozilla_raw.shape)
true_label = convert_label_to_numeric(mozilla_raw['ICP_STATUS'])
data = mozilla_raw['SLOC']

# Runs 10-fold CV ten times; the median over runs of the fold-averaged scores is printed.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name="k_fold")

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
CART fit time 0.003447921999850223
CART predict time 0.0012000800002169854
KNN fit time 0.0010890670000662794
KNN predict time 0.004619581000042672
SVM fit time 0.03609834500002762
SVM predict time 0.0037564520002888457
NB fit time 0.001598820999788586
NB predict time 0.00025634499979787506
RF fit time 0.03333687599979385
RF predict time 0.002964194000014686
CART fit time 0.0010530829999879643
CART predict time 0.0005915820001973771
KNN fit time 0.0006853489999230078
KNN predict time 0.0014455349996751465
SVM fit time 0.02520212499985064
SVM predict time 0.0022512120003739255
NB fit time 0.0006131589998403797
NB predict time 0.00023213499980556662
RF fit time 0.026573712999834243
RF predict time 0.0022792079998907866
CART fit time 0.0009037559998432698
CART predict time 0.0005317830000421964
KNN fit time 0.0004926120000163792
KNN predict time 0.0016172449995792704
SVM fit time 0.02081943399980446
SVM predict time 0.0020258

RF fit time 0.02894749100005356
RF predict time 0.002666690000296512
CART fit time 0.0009689979997347109
CART predict time 0.0003679689998534741
KNN fit time 0.0006534069998451741
KNN predict time 0.0014037579999239824
SVM fit time 0.02336926099997072
SVM predict time 0.0019374159996914386
NB fit time 0.0007427880000250298
NB predict time 0.0002441280003040447
RF fit time 0.024874424999779876
RF predict time 0.0029510170002140512
CART fit time 0.0010514870000406518
CART predict time 0.0002950900002360868
KNN fit time 0.0006503320000774693
KNN predict time 0.0017349119998470997
SVM fit time 0.02613356300025771
SVM predict time 0.002573041999767156
NB fit time 0.0007339210001191532
NB predict time 0.0002270449999741686
RF fit time 0.026363032000062958
RF predict time 0.002961632000278769
CART fit time 0.0009166479999294097
CART predict time 0.0002923850001934625
KNN fit time 0.0004674219999287743
KNN predict time 0.0014832140000180516
SVM fit time 0.028969490999770642
SVM predict time 0.

NB fit time 0.002710743000079674
NB predict time 0.0002809200000228884
RF fit time 0.026911678000033135
RF predict time 0.0029947400003038638
CART fit time 0.0010348839996368042
CART predict time 0.0006238410001060402
KNN fit time 0.0007291950000762881
KNN predict time 0.001681119000295439
SVM fit time 0.02283662500030914
SVM predict time 0.002044720000412781
NB fit time 0.0006307879998530552
NB predict time 0.000264106000031461
RF fit time 0.025755239000318397
RF predict time 0.00205133500003285
CART fit time 0.0011333560000821308
CART predict time 0.00019538600008672802
KNN fit time 0.0008822899999358924
KNN predict time 0.0013518690002456424
SVM fit time 0.02340463600012299
SVM predict time 0.002056107000043994
NB fit time 0.0006757050000487652
NB predict time 0.0002826479999384901
RF fit time 0.027622314999916853
RF predict time 0.0019525050001902855
CART fit time 0.0009754339998835349
CART predict time 0.00015961300005074008
KNN fit time 0.0006776710001759056
KNN predict time 0.00

RF fit time 0.02407439800026623
RF predict time 0.0026354369997534377
CART fit time 0.0009146320003310393
CART predict time 0.00021169000001464156
KNN fit time 0.0007096760000422364
KNN predict time 0.002674270999705186
SVM fit time 0.02317793600013829
SVM predict time 0.002009633999932703
NB fit time 0.0005862279999746534
NB predict time 0.00022928699991098256
RF fit time 0.024996148999889556
RF predict time 0.001957581000169739
CART fit time 0.0014280890000009094
CART predict time 0.0006404900000234193
KNN fit time 0.000715016999947693
KNN predict time 0.001394195000102627
SVM fit time 0.023154402999807644
SVM predict time 0.0018739629999799945
NB fit time 0.0006953340002837649
NB predict time 0.00022172099988893024
RF fit time 0.030024627999864606
RF predict time 0.0025966119997065107
CART fit time 0.001144410000051721
CART predict time 0.0005325940001057461
KNN fit time 0.0006578029997399426
KNN predict time 0.001497771000231296
SVM fit time 0.023815546999685466
SVM predict time 0.

NB fit time 0.0007850980000512209
NB predict time 0.0002524220003579103
RF fit time 0.022426501999689208
RF predict time 0.002837056999851484
CART fit time 0.000910412999928667
CART predict time 0.0007042289998935303
KNN fit time 0.0009275180000258842
KNN predict time 0.0013298329999997804
SVM fit time 0.022190943000168772
SVM predict time 0.0018974619997607078
NB fit time 0.0006723850001435494
NB predict time 0.00025682900013634935
RF fit time 0.031567430999984936
RF predict time 0.002283101999637438
CART fit time 0.0012121449999540346
CART predict time 0.0005936190000284114
KNN fit time 0.0008435869999630086
KNN predict time 0.0014284599997154146
SVM fit time 0.02078415000005407
SVM predict time 0.002000154000143084
NB fit time 0.00221510599976682
NB predict time 0.00030394800023714197
RF fit time 0.05636116799996671
RF predict time 0.0025194140002895438
CART fit time 0.0010582039999462722
CART predict time 0.00020141799996054033
KNN fit time 0.0007095190003383323
KNN predict time 0.

In [15]:
# k-fold evaluation on COLOCATED_OPENSTACK, with SLOC as the only feature.
openstack_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", openstack_raw.shape)
true_label = convert_label_to_numeric(openstack_raw['ICP_STATUS'])
data = openstack_raw['SLOC']

# Runs 10-fold CV ten times; the median over runs of the fold-averaged scores is printed.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name="k_fold")

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
CART fit time 0.0021232839999356656
CART predict time 0.00022846000001663924
KNN fit time 0.0009595699998499185
KNN predict time 0.0016461760001220682
SVM fit time 0.12239332299986927
SVM predict time 0.008395135999762715
NB fit time 0.0007265940002980642
NB predict time 0.00026438499980940833
RF fit time 0.04573755599994911
RF predict time 0.0046401009999499365
CART fit time 0.0020200899998599198
CART predict time 0.0005781529998785118
KNN fit time 0.0009704359999886947
KNN predict time 0.003363820000231499
SVM fit time 0.16916360400000485
SVM predict time 0.009275811999941652
NB fit time 0.0006768820003344445
NB predict time 0.0002559909999035881
RF fit time 0.032539109000026656
RF predict time 0.003291703000286361
CART fit time 0.0018222409998998046
CART predict time 0.0006352170003083302
KNN fit time 0.0009584150002410752
KNN predict time 0.002056388999790215
SVM fit time 0.12366031200008365
SVM predict time 0.008517

SVM fit time 0.12442456400003721
SVM predict time 0.009420621000117535
NB fit time 0.0006918529998074519
NB predict time 0.0008460439998998481
RF fit time 0.03478440300023067
RF predict time 0.0030699389999426785
CART fit time 0.0019436329998825386
CART predict time 0.0005048190000707109
KNN fit time 0.0009571300001880445
KNN predict time 0.001404540999828896
SVM fit time 0.13451347199998054
SVM predict time 0.008517866000147478
NB fit time 0.0007449800000358664
NB predict time 0.00022452399980465998
RF fit time 0.046362897000108205
RF predict time 0.0030059139999139006
CART fit time 0.0020805690000997856
CART predict time 0.0006395290001819376
KNN fit time 0.0010293779996572994
KNN predict time 0.00169618600011745
SVM fit time 0.13700387900007627
SVM predict time 0.010740670000359387
NB fit time 0.0007699500001763226
NB predict time 0.0004264190001777024
RF fit time 0.03531911699974444
RF predict time 0.002893997999763087
CART fit time 0.001884265000171581
CART predict time 0.00053532

SVM fit time 0.11678323800015278
SVM predict time 0.0083601239998643
NB fit time 0.0007515719998991699
NB predict time 0.00028543200005515246
RF fit time 0.03152338400013832
RF predict time 0.0022279660001913726
CART fit time 0.0017655390001891647
CART predict time 0.0005961089996162627
KNN fit time 0.0009467109998695378
KNN predict time 0.0027537109999684617
SVM fit time 0.13048610899977575
SVM predict time 0.008622216999810917
NB fit time 0.0006473849998656078
NB predict time 0.00044820999983130605
RF fit time 0.0416270139999142
RF predict time 0.001970510999854014
CART fit time 0.001827810000122554
CART predict time 0.00018352200004301267
KNN fit time 0.0009557170001244231
KNN predict time 0.0012139750001551874
SVM fit time 0.12976644900027168
SVM predict time 0.012188925000373274
NB fit time 0.0006931070001883199
NB predict time 0.00024022900015552295
RF fit time 0.04443398499961404
RF predict time 0.002383408000241616
CART fit time 0.0017703720000099565
CART predict time 0.0007176

SVM fit time 0.11664507300019977
SVM predict time 0.008907184999770834
NB fit time 0.0007510479999837116
NB predict time 0.0007156030001169711
RF fit time 0.03468090399974244
RF predict time 0.003066306999699009
CART fit time 0.002003940000122384
CART predict time 0.0002265049997731694
KNN fit time 0.0008645009997962916
KNN predict time 0.001497343000210094
SVM fit time 0.12470014000018637
SVM predict time 0.008812513000066247
NB fit time 0.0006858659999124939
NB predict time 0.00028369399979055743
RF fit time 0.03726759400024093
RF predict time 0.002093236999826331
CART fit time 0.00176439400001982
CART predict time 0.00020954100000381004
KNN fit time 0.001073150000138412
KNN predict time 0.001607197999874188
SVM fit time 0.12833970300016517
SVM predict time 0.008620065000286559
NB fit time 0.0006857859998490312
NB predict time 0.0005878409997421841
RF fit time 0.03669156900014059
RF predict time 0.0029118919997017656
CART fit time 0.0018637489997672674
CART predict time 0.00050726800

SVM fit time 0.12013225200007582
SVM predict time 0.008837180000227818
NB fit time 0.0011445800000728923
NB predict time 0.0002786559998639859
RF fit time 0.03720862999989549
RF predict time 0.002275452000048972
CART fit time 0.0018703089999689837
CART predict time 0.00020783299987670034
KNN fit time 0.0010445679999975255
KNN predict time 0.0015080139996825892
SVM fit time 0.12571058799994717
SVM predict time 0.00923919399974693
NB fit time 0.0008186950003619131
NB predict time 0.000647865999781061
RF fit time 0.039916938000260416
RF predict time 0.002193067999996856
CART fit time 0.002562819999639032
CART predict time 0.00037870599999223487
KNN fit time 0.0009613819997866813
KNN predict time 0.0014696029998049198
SVM fit time 0.13785177800036763
SVM predict time 0.011314701000173955
NB fit time 0.0007499369999095507
NB predict time 0.0002586929999779386
RF fit time 0.0379342189999079
RF predict time 0.00303322699983255
CART fit time 0.002171754000301007
CART predict time 0.00063391899

In [16]:
# k-fold evaluation on COLOCATED_WIKIMEDIA, with SLOC as the only feature.
wikimedia_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", wikimedia_raw.shape)
true_label = convert_label_to_numeric(wikimedia_raw['ICP_STATUS'])
data = wikimedia_raw['SLOC']

# Runs 10-fold CV ten times; the median over runs of the fold-averaged scores is printed.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name="k_fold")

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
CART fit time 0.0017740279999998165
CART predict time 0.00021222600025794236
KNN fit time 0.0014584269997612864
KNN predict time 0.0018784779999805323
SVM fit time 0.12663674199984598
SVM predict time 0.008746395999878587
NB fit time 0.0009687239999038866
NB predict time 0.0003092210004069784
RF fit time 0.03730013599988524
RF predict time 0.0037611549996654503
CART fit time 0.0010019430001193541
CART predict time 0.0016839909999362135
KNN fit time 0.0009987720000026457
KNN predict time 0.0014805290002186666
SVM fit time 0.1225521569999728
SVM predict time 0.008775909999712894
NB fit time 0.000687770999775239
NB predict time 0.0007155850003073283
RF fit time 0.03066394699999364
RF predict time 0.0029082290002406808
CART fit time 0.004151884000293649
CART predict time 0.0003521979997458402
KNN fit time 0.0009657809996497235
KNN predict time 0.0015557759998046095
SVM fit time 0.11128001599990966
SVM predict time 0.00819567

KNN fit time 0.0008774600000833743
KNN predict time 0.0015798899999026617
SVM fit time 0.10518249000006108
SVM predict time 0.008128379000027053
NB fit time 0.0006683229998998286
NB predict time 0.0002192880001530284
RF fit time 0.028279990000100952
RF predict time 0.0024330999999619962
CART fit time 0.0011172610002176953
CART predict time 0.00042139799961660174
KNN fit time 0.0009998699997595395
KNN predict time 0.001410231000136264
SVM fit time 0.10134998100011217
SVM predict time 0.007367559000158508
NB fit time 0.0008124590003717458
NB predict time 0.0005041670001446619
RF fit time 0.03537626499974067
RF predict time 0.0031616309997843928
CART fit time 0.0014206490000105987
CART predict time 0.0005305899999257235
KNN fit time 0.0009861739999905694
KNN predict time 0.0025110569999924337
SVM fit time 0.11453279300030772
SVM predict time 0.007843774000320991
NB fit time 0.000721491000149399
NB predict time 0.00031225800012180116
RF fit time 0.028214355000272917
RF predict time 0.00268

SVM fit time 0.09571432099983213
SVM predict time 0.006262868999783677
NB fit time 0.0004351580000729882
NB predict time 0.00039602600008947775
RF fit time 0.023195662000034645
RF predict time 0.0025785339998947165
CART fit time 0.001443016999928659
CART predict time 0.00033423900003981544
KNN fit time 0.000957531000040035
KNN predict time 0.0013131410000823962
SVM fit time 0.09605825500011633
SVM predict time 0.007374619000074745
NB fit time 0.0007028820000414271
NB predict time 0.0002259850002701569
RF fit time 0.03010603699976855
RF predict time 0.0026019779998023296
CART fit time 0.0009910630001286336
CART predict time 0.00040658300031282124
KNN fit time 0.0008686569999554195
KNN predict time 0.0014490509997813206
SVM fit time 0.09173832899978152
SVM predict time 0.007396266999876389
NB fit time 0.00064828899985514
NB predict time 0.00023685199994361028
RF fit time 0.02526925800020763
RF predict time 0.002116507999744499
CART fit time 0.0014775900003769493
CART predict time 0.00044

SVM fit time 0.1008027210000364
SVM predict time 0.007594690999667364
NB fit time 0.0004453550000107498
NB predict time 0.00039940999977261527
RF fit time 0.02165842199974577
RF predict time 0.0024702200003048347
CART fit time 0.001443745999949897
CART predict time 0.0003155139997943479
KNN fit time 0.000619360000200686
KNN predict time 0.0013479019999067532
SVM fit time 0.09574985200015362
SVM predict time 0.007920275000287802
NB fit time 0.000616421999893646
NB predict time 0.00038057900019339286
RF fit time 0.028852889000063442
RF predict time 0.0025570750003680587
CART fit time 0.002022421000219765
CART predict time 0.00034890799997810973
KNN fit time 0.000972782000189909
KNN predict time 0.001672432999839657
SVM fit time 0.10230165200027841
SVM predict time 0.007362270000157878
NB fit time 0.0006431630004044564
NB predict time 0.00044050999986211536
RF fit time 0.027621529000043665
RF predict time 0.0026740889998109196
CART fit time 0.00144123900008708
CART predict time 0.00036306

RF fit time 0.0416423639999266
RF predict time 0.0026027080002677394
CART fit time 0.001774147000105586
CART predict time 0.00037727500011897064
KNN fit time 0.0009979550000025483
KNN predict time 0.0016093320000436506
SVM fit time 0.12504783099984706
SVM predict time 0.010028167000200483
NB fit time 0.000733459999992192
NB predict time 0.00470058099972448
RF fit time 0.04660692600009497
RF predict time 0.002924276000157988
CART fit time 0.0034674140001698106
CART predict time 0.00022540899999512476
KNN fit time 0.0009116669998547877
KNN predict time 0.0024646609999763314
SVM fit time 0.12022131399999125
SVM predict time 0.011644586000329582
NB fit time 0.0007027810002000479
NB predict time 0.00023858300028223312
RF fit time 0.03231514400022206
RF predict time 0.0023786900001141476
CART fit time 0.0014694380001856189
CART predict time 0.0008801999997558596
KNN fit time 0.0010110520001944678
KNN predict time 0.002112048999606486
SVM fit time 0.1215187220000189
SVM predict time 0.0083124

In [17]:
# Cross-dataset evaluation: train on COLOCATED_MOZILLA, test on COLOCATED_OPENSTACK (SLOC only).
train_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_raw.shape)
train_label = convert_label_to_numeric(train_raw['ICP_STATUS'])
train_data = train_raw['SLOC']

test_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_raw.shape)
test_label = convert_label_to_numeric(test_raw['ICP_STATUS'])
test_data = test_raw['SLOC']

# Repeats the train/test run ten times and prints the median scores.
# `data` and `true_label` only fill the signature; this mode does not read them.
repeated_test(data, true_label, train_data, test_data, train_label, test_label,
              test_name="different_file_test")

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
CART fit time 0.001638220000131696
CART predict time 0.0018670019999262877
KNN fit time 0.0007016019999355194
KNN predict time 0.006479089000094973
SVM fit time 0.027253609000126744
SVM predict time 0.03359777900004701
NB fit time 0.0006131709997134749
NB predict time 0.0008115230002658791
RF fit time 0.02419558000019606
RF predict time 0.004668908999974519
CART fit time 0.0009786119999262155
CART predict time 0.0004837770002268371
KNN fit time 0.0006611180001527828
KNN predict time 0.0055101200000535755
SVM fit time 0.026902596000127232
SVM predict time 0.036785737999707635
NB fit time 0.0007651939999959723
NB predict time 0.0012657940001190582
RF fit time 0.10187806400017507
RF predict time 0.007504956000047969
CART fit time 0.0013619669998661266
CART predict time 0.0019691379998221237
KNN fit time 0.0006912859998919885
KNN predict time 0.00694484499990

In [18]:
# Cross-dataset evaluation: train on COLOCATED_MOZILLA, test on COLOCATED_WIKIMEDIA (SLOC only).
train_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Initial data shape: ", train_raw.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
train_label = convert_label_to_numeric(train_raw['ICP_STATUS'])
train_data = train_raw['SLOC']

test_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_raw.shape)
test_label = convert_label_to_numeric(test_raw['ICP_STATUS'])
test_data = test_raw['SLOC']

# Repeats the train/test run ten times and prints the median scores.
# `data` and `true_label` only fill the signature; this mode does not read them.
repeated_test(data, true_label, train_data, test_data, train_label, test_label,
              test_name="different_file_test")

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
CART fit time 0.0010446060000504076
CART predict time 0.0006028480001987191
KNN fit time 0.000669204000132595
KNN predict time 0.007170545000008133
SVM fit time 0.02709930999981225
SVM predict time 0.03364969199992629
NB fit time 0.0005949540000074194
NB predict time 0.0008511569999427593
RF fit time 0.02678307499991206
RF predict time 0.00475239999968835
CART fit time 0.0010094860003846406
CART predict time 0.0005871090002074197
KNN fit time 0.0006689609999739332
KNN predict time 0.005569958000251063
SVM fit time 0.02758724599971174
SVM predict time 0.03388819200017679
NB fit time 0.0004338389999247738
NB predict time 0.000861033000091993
RF fit time 0.01889669699994556
RF predict time 0.0036903770001117664
CART fit time 0.0009725159998197341
CART predict time 0.0007126089999474061
KNN fit time 0.0007105220001903945
KNN predict time 0.005530737999833946


In [19]:
# Cross-dataset evaluation: train on COLOCATED_OPENSTACK, test on COLOCATED_MOZILLA (SLOC only).
train_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_raw.shape)
train_label = convert_label_to_numeric(train_raw['ICP_STATUS'])
train_data = train_raw['SLOC']

test_raw = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_raw.shape)
test_label = convert_label_to_numeric(test_raw['ICP_STATUS'])
test_data = test_raw['SLOC']

# Repeats the train/test run ten times and prints the median scores.
# `data` and `true_label` only fill the signature; this mode does not read them.
repeated_test(data, true_label, train_data, test_data, train_label, test_label,
              test_name="different_file_test")

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
CART fit time 0.002009963999626052
CART predict time 0.000318221999805246
KNN fit time 0.0009895840003082412
KNN predict time 0.006259891999889078
SVM fit time 0.15228646100013066
SVM predict time 0.056723595000221394
NB fit time 0.0006963610003367648
NB predict time 0.002377775999775622
RF fit time 0.03588314199987508
RF predict time 0.005688367999937327
CART fit time 0.0018254930000694003
CART predict time 0.0005901399999856949
KNN fit time 0.0009206599997924059
KNN predict time 0.003338772000006429
SVM fit time 0.16500229700022828
SVM predict time 0.05693279600018286
NB fit time 0.00046969199956947705
NB predict time 0.0006162870004118304
RF fit time 0.0329721499997504
RF predict time 0.003910262999852421
CART fit time 0.0020953409998583084
CART predict time 0.0005993340000713943
KNN fit time 0.0009572809999554011
KNN predict time 0.0035039889999097795

In [20]:
# Cross-project evaluation with SLOC as the only feature:
# train on OpenStack, test on Wikimedia.
train_frame = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_frame.shape)
# ICP_STATUS holds the class label; convert_label_to_numeric maps it to integer codes.
train_label = convert_label_to_numeric(train_frame['ICP_STATUS'])
train_data = train_frame['SLOC']

test_frame = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_frame.shape)
test_label = convert_label_to_numeric(test_frame['ICP_STATUS'])
test_data = test_frame['SLOC']

# NOTE: `data` and `true_label` are not assigned in this cell; they carry over
# from an earlier cell of the notebook.
# Repeat the cross test 10 times and report average performance.
repeated_test(
    data, true_label,
    train_data, test_data, train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
CART fit time 0.002824144999976852
CART predict time 0.0005531020001399156
KNN fit time 0.0010461219999342575
KNN predict time 0.006679153999812115
SVM fit time 0.13676160099976187
SVM predict time 0.09080662199994549
NB fit time 0.0006626199997299409
NB predict time 0.000710006000190333
RF fit time 0.03230995200010511
RF predict time 0.004931950999889523
CART fit time 0.0020047019997946336
CART predict time 0.000560730999950465
KNN fit time 0.0006862590003038349
KNN predict time 0.004652269000416709
SVM fit time 0.1294865029999528
SVM predict time 0.08916507100002491
NB fit time 0.0008069880000221019
NB predict time 0.0008784829997239285
RF fit time 0.03152233199989496
RF predict time 0.005200829999921552
CART fit time 0.001785037999979977
CART predict time 0.0006395709997377708
KNN fit time 0.0009977670001717343
KNN predict time 0.004250904999935301
S

In [21]:
# Cross-project evaluation with SLOC as the only feature:
# train on Wikimedia, test on Mozilla.
train_frame = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_frame.shape)
# ICP_STATUS holds the class label; convert_label_to_numeric maps it to integer codes.
train_label = convert_label_to_numeric(train_frame['ICP_STATUS'])
train_data = train_frame['SLOC']

test_frame = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_frame.shape)
test_label = convert_label_to_numeric(test_frame['ICP_STATUS'])
test_data = test_frame['SLOC']

# NOTE: `data` and `true_label` are not assigned in this cell; they carry over
# from an earlier cell of the notebook.
# Repeat the cross test 10 times and report average performance.
repeated_test(
    data, true_label,
    train_data, test_data, train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
CART fit time 0.001640990000396414
CART predict time 0.0004526410002654302
KNN fit time 0.0009596909999345371
KNN predict time 0.004649668000183738
SVM fit time 0.1886608940003498
SVM predict time 0.0522668009998597
NB fit time 0.0007442470000569301
NB predict time 0.0004891319999842381
RF fit time 0.03078022699992289
RF predict time 0.0040469450000273355
CART fit time 0.0017390489997524128
CART predict time 0.0003679360002024623
KNN fit time 0.001093700000183162
KNN predict time 0.0037391119999483635
SVM fit time 0.13975823800001308
SVM predict time 0.05047716600029162
NB fit time 0.005444914000236167
NB predict time 0.0007120039999790606
RF fit time 0.036004485999910685
RF predict time 0.003866725000079896
CART fit time 0.0015311830002247007
CART predict time 0.0005028890000176034
KNN fit time 0.0010471099999449507
KNN predict time 0.005181109000204742


In [22]:
# Cross-project evaluation with SLOC as the only feature:
# train on Wikimedia, test on OpenStack.
train_frame = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_frame.shape)
# ICP_STATUS holds the class label; convert_label_to_numeric maps it to integer codes.
train_label = convert_label_to_numeric(train_frame['ICP_STATUS'])
train_data = train_frame['SLOC']

test_frame = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_frame.shape)
test_label = convert_label_to_numeric(test_frame['ICP_STATUS'])
test_data = test_frame['SLOC']

# NOTE: `data` and `true_label` are not assigned in this cell; they carry over
# from an earlier cell of the notebook.
# Repeat the cross test 10 times and report average performance.
repeated_test(
    data, true_label,
    train_data, test_data, train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
CART fit time 0.001646138000069186
CART predict time 0.0012290810000195052
KNN fit time 0.0009939179999491898
KNN predict time 0.0104150839997601
SVM fit time 0.16675820299997213
SVM predict time 0.094922946999759
NB fit time 0.0007658099998479884
NB predict time 0.0008667469996908039
RF fit time 0.038792196000031254
RF predict time 0.006341696999697888
CART fit time 0.002049295000233542
CART predict time 0.0005734699998356518
KNN fit time 0.0010184660000049917
KNN predict time 0.00800458499998058
SVM fit time 0.13563662600017778
SVM predict time 0.09228951099976257
NB fit time 0.0009125340002356097
NB predict time 0.0004652040001928981
RF fit time 0.03397492700014482
RF predict time 0.007671672000014951
CART fit time 0.0017376880000483652
CART predict time 0.0007657339997422241
KNN fit time 0.0011863949998769385
KNN predict time 0.005976288999590906
SV