In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file (filename):
    """Load a CSV into a pandas DataFrame.

    Args:
        filename: Anything ``pd.read_csv`` accepts as its first argument
            (for example a path string or an open file-like object).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Encode string class labels as integers.

    Mapping: ``"NEUTRAL"`` -> 0, ``"INSECURE"`` -> 1, any other value -> 2.

    Args:
        label: One-dimensional sequence of labels (list, NumPy array or
            pandas Series).

    Returns:
        A one-dimensional NumPy array of dtype ``int`` with one code per
        input element, in input order.
    """
    # Convert to a plain array first so elements are visited by position.
    # Indexing a pandas Series with label[i] is label-based and raises
    # KeyError as soon as the Series does not carry a default 0..n-1 index
    # (for example after filtering rows).
    values = np.asarray(label, dtype=object)
    codes = [
        1 if value == "INSECURE" else 0 if value == "NEUTRAL" else 2
        for value in values
    ]
    return np.array(codes, dtype='int')

In [4]:
def apply_PCA(train, test):
    """Standardise both splits and project them onto principal components.

    The scaler and the PCA model are both fitted on ``train`` only and then
    applied to ``train`` and ``test``, so no test-set statistics are used
    when fitting.

    Args:
        train: Training feature matrix.
        test: Test feature matrix with the same columns as ``train``.

    Returns:
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is sensitive to feature scale, so standardise before projecting.
    standardizer = StandardScaler()
    standardizer.fit(train)
    train_scaled = standardizer.transform(train)
    test_scaled = standardizer.transform(test)

    # A fractional n_components keeps the minimum number of components
    # such that 95% of the variance is retained.
    reducer = PCA(.95)
    reducer.fit(train_scaled)

    train_projected = reducer.transform(train_scaled)
    test_projected = reducer.transform(test_scaled)
    return train_projected, test_projected

In [5]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a decision tree on the training split and score it on the test split.

    Prints the elapsed ``time.perf_counter`` time of the fit and of the
    predict call.

    Returns:
        ``(recall, precision, f1)`` as returned by ``measure_performance``.
    """
    model = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_finished = time.perf_counter()
    print("CART fit time", (fit_finished - fit_started))

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_finished = time.perf_counter()
    print("CART predict time", (predict_finished - predict_started))

    rec, prec, f_score = measure_performance(test_label, predictions)
    return rec, prec, f_score

In [6]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a 5-neighbour, distance-weighted KNN classifier and score it.

    Prints the elapsed ``time.perf_counter`` time of the fit and of the
    predict call.

    Returns:
        ``(recall, precision, f1)`` as returned by ``measure_performance``.
    """
    model = KNeighborsClassifier(n_neighbors=5, weights='distance')

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_finished = time.perf_counter()
    print("KNN fit time", (fit_finished - fit_started))

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_finished = time.perf_counter()
    print("KNN predict time", (predict_finished - predict_started))

    rec, prec, f_score = measure_performance(test_label, predictions)
    return rec, prec, f_score

In [7]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel SVC (C=20.0, gamma='auto') and score it.

    Prints the elapsed ``time.perf_counter`` time of the fit and of the
    predict call.

    Returns:
        ``(recall, precision, f1)`` as returned by ``measure_performance``.
    """
    model = svm.SVC(gamma='auto', C=20.0, kernel='rbf')

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_finished = time.perf_counter()
    print("SVM fit time", (fit_finished - fit_started))

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_finished = time.perf_counter()
    print("SVM predict time", (predict_finished - predict_started))

    rec, prec, f_score = measure_performance(test_label, predictions)
    return rec, prec, f_score

In [8]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier with default settings and score it.

    Prints the elapsed ``time.perf_counter`` time of the fit and of the
    predict call.

    Returns:
        ``(recall, precision, f1)`` as returned by ``measure_performance``.
    """
    model = GaussianNB()

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_finished = time.perf_counter()
    print("NB fit time", (fit_finished - fit_started))

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_finished = time.perf_counter()
    print("NB predict time", (predict_finished - predict_started))

    rec, prec, f_score = measure_performance(test_label, predictions)
    return rec, prec, f_score

In [9]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree random forest (gini criterion) and score it.

    No ``random_state`` is passed, so the forest is not seeded here.
    Prints the elapsed ``time.perf_counter`` time of the fit and of the
    predict call.

    Returns:
        ``(recall, precision, f1)`` as returned by ``measure_performance``.
    """
    model = RandomForestClassifier(n_estimators=10, criterion='gini')

    fit_started = time.perf_counter()
    model.fit(train, train_label)
    fit_finished = time.perf_counter()
    print("RF fit time", (fit_finished - fit_started))

    predict_started = time.perf_counter()
    predictions = model.predict(test)
    predict_finished = time.perf_counter()
    print("RF predict time", (predict_finished - predict_started))

    rec, prec, f_score = measure_performance(test_label, predictions)
    return rec, prec, f_score

In [10]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for labels 0 and 1.

    ``average=None`` makes each scorer return one value per entry of
    ``labels``, so every returned array has two elements: index 0 is the
    score for label 0 and index 1 is the score for label 1. Any other label
    value present in the inputs (for example 2) is not reported.

    Args:
        true_label: Ground-truth labels.
        predicted_label: Labels predicted by a classifier, aligned with
            ``true_label``.

    Returns:
        ``(recall, precision, f1)`` - note that recall comes first.
    """
    # The previous version also built a classification_report string and a
    # shared zero array on every call; neither was ever used, so they were
    # removed to avoid wasted work in each fold.
    reported_labels = [0, 1]
    precision = precision_score(true_label, predicted_label, average=None, labels=reported_labels)
    recall = recall_score(true_label, predicted_label, average=None, labels=reported_labels)
    f1 = f1_score(true_label, predicted_label, average=None, labels=reported_labels)
    return recall, precision, f1

In [11]:
def kfold_cv(data, true_label):
    """Run one shuffled 10-fold cross-validation of all five classifiers.

    For every fold the features are scaled and PCA-projected with
    ``apply_PCA`` (fitted on the training part of the fold only), then the
    decision tree, KNN, SVM, naive Bayes and random forest evaluators are
    run on the same split. Per-fold scores are averaged over the folds.

    The splitter uses ``random_state=7``, so every call produces the same
    fold assignment for the same number of rows.

    Args:
        data: pandas DataFrame of features, one row per sample.
        true_label: Labels aligned row-by-row with ``data`` (NumPy array,
            list or pandas Series).

    Returns:
        A 15-tuple ordered as
        ``(recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN,
        recall_SVM, precision_SVM, f1_SVM, recall_NB, precision_NB, f1_NB,
        recall_RF, precision_RF, f1_RF)``; each entry is the mean over folds
        of the per-class score array returned by the evaluator.
    """
    # 10 fold cv
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    # Order matters: it defines the layout of the returned tuple.
    evaluators = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)
    # One (recall, precision, f1) triple of per-fold lists for each evaluator.
    fold_scores = [([], [], []) for _ in evaluators]

    # Positional label lookup regardless of whether a Series or array was passed.
    labels = np.asarray(true_label)

    for train_index, test_index in kf.split(data):
        # KFold.split yields positional row numbers, so select with .iloc;
        # .loc would look the numbers up as index labels and break (or pick
        # the wrong rows) when the frame does not have a default RangeIndex.
        train, test = data.iloc[train_index], data.iloc[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        train, test = apply_PCA(train, test)

        for metric_lists, evaluate in zip(fold_scores, evaluators):
            scores = evaluate(train, test, train_label, test_label)
            for per_fold, score in zip(metric_lists, scores):
                per_fold.append(score)

    return tuple(
        np.mean(per_fold, axis=0)
        for metric_lists in fold_scores
        for per_fold in metric_lists
    )

In [12]:
def different_file_test(train, test, train_label, test_label):
    """Train on one dataset and evaluate on another with all five classifiers.

    Both feature sets are first passed through ``apply_PCA`` (fitted on
    ``train``), then each evaluator is run once on the transformed data.

    Returns:
        A 15-tuple ordered as
        ``(recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN,
        recall_SVM, precision_SVM, f1_SVM, recall_NB, precision_NB, f1_NB,
        recall_RF, precision_RF, f1_RF)``.
    """
    train, test = apply_PCA(train, test)

    # Evaluator order defines the layout of the returned tuple.
    collected = ()
    for evaluate in (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF):
        rec, prec, f_score = evaluate(train, test, train_label, test_label)
        collected = collected + (rec, prec, f_score)

    return collected

In [13]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name, repeat=10):
    """Repeat an evaluation several times and print the median scores.

    When ``test_name == "k_fold"`` each repetition calls
    ``kfold_cv(data, true_label)``; for any other value it calls
    ``different_file_test(train_data, test_data, train_label, test_label)``.
    Arguments belonging to the branch that is not taken are ignored.

    For each classifier (DT, KNN, SVM, NB, RF) the element-wise median over
    the repetitions of recall, precision and F1 is printed.

    Args:
        data: Feature DataFrame for the k-fold branch.
        true_label: Labels for the k-fold branch.
        train_data: Training features for the cross-dataset branch.
        test_data: Test features for the cross-dataset branch.
        train_label: Training labels for the cross-dataset branch.
        test_label: Test labels for the cross-dataset branch.
        test_name: ``"k_fold"`` selects k-fold; anything else selects the
            cross-dataset test.
        repeat: Number of repetitions (default 10, the previously
            hard-coded value).

    Raises:
        ValueError: If ``repeat`` is smaller than 1.
    """
    if repeat < 1:
        raise ValueError("repeat must be at least 1")

    # Both evaluation functions return a flat 15-tuple laid out as
    # (recall, precision, f1) for each of these models, in this order.
    model_names = ("DT", "KNN", "SVM", "NB", "RF")

    # One 15-tuple per repetition. The old chained `a = b = ... = []`
    # initialisation bound every name to the same list object and was
    # overwritten before use, so it has been dropped.
    runs = []
    for _ in range(repeat):
        if test_name == "k_fold":
            runs.append(kfold_cv(data, true_label))
        else:
            runs.append(different_file_test(train_data, test_data, train_label, test_label))

    for position, model_name in enumerate(model_names):
        offset = 3 * position
        recall_runs = [run[offset] for run in runs]
        precision_runs = [run[offset + 1] for run in runs]
        f1_runs = [run[offset + 2] for run in runs]

        print("-------" + model_name + "-------")
        print("Recall:", np.median(recall_runs, axis=0))
        print("Precision:", np.median(precision_runs, axis=0))
        print("f1 score:", np.median(f1_runs, axis=0))

In [14]:
def calcFeatureImp(feature_vec, label_vec, feature_names_param, repeat=10):
    """Fit a random forest and print the features ranked by importance.

    A ``RandomForestClassifier`` with default settings is fitted once on the
    full data and its ``feature_importances_`` are printed in descending
    order together with the matching feature names.

    Args:
        feature_vec: Feature matrix used to fit the forest.
        label_vec: Labels aligned with ``feature_vec``.
        feature_names_param: Feature names, in the same order as the columns
            of ``feature_vec``.
        repeat: Kept for backward compatibility only. It previously
            controlled how many times the same importance row was appended
            to a string that was never printed or returned.
    """
    theRndForestModel = RandomForestClassifier()
    theRndForestModel.fit(feature_vec, label_vec)
    feat_imp_vector = list(theRndForestModel.feature_importances_)

    # Rank column positions rather than values. Looking names up with
    # list.index(value) returns the first match, so features with equal
    # importance (commonly several zeros) all printed the same name and the
    # others were lost. sorted() is stable, so ties keep column order.
    ranked_positions = sorted(
        range(len(feat_imp_vector)),
        key=lambda position: feat_imp_vector[position],
        reverse=True,
    )

    sorted_feat_imp_vector = [feat_imp_vector[position] for position in ranked_positions]
    sorted_feature_name = [feature_names_param[position] for position in ranked_positions]

    print ("sorted feature names: ", sorted_feature_name)
    print ("sorted feature importance: ", sorted_feat_imp_vector)

In [15]:
# Placeholder globals: repeated_test takes all six as positional arguments, so each
# name must exist even in cells that only populate some of them.
data, true_label, train_data, test_data, train_label, test_label = [], [], [], [], [], [] # empty lists until a later cell loads real data

In [16]:
# K-fold evaluation on the Mozilla dataset.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)
# ICP_STATUS is the target column; encode it as NEUTRAL -> 0, INSECURE -> 1, anything else -> 2.
true_label = data['ICP_STATUS']
true_label = convert_label_to_numeric(true_label)
# Remove the target and the other listed columns; every remaining column is used as a feature.
data = data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# train_data/test_data/train_label/test_label are ignored by the "k_fold" branch of repeated_test.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat 10-fold CV 10 times; prints the median (not the mean) of each metric across repetitions

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # print features ranked by random-forest importance

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
CART fit time 0.008294556999771885
CART predict time 0.0002465229999870644
KNN fit time 0.0013862470000276517
KNN predict time 0.0022844619998068083
SVM fit time 0.053126978000364034
SVM predict time 0.0027216579997002555
NB fit time 0.0010341449997213203
NB predict time 0.0005555839998123702
RF fit time 0.06076516700022694
RF predict time 0.003400543999759975
CART fit time 0.006672794000223803
CART predict time 0.0008923770001274534
KNN fit time 0.0010134750000361237
KNN predict time 0.0029722319995926227
SVM fit time 0.07595304799997393
SVM predict time 0.0028870369997093803
NB fit time 0.0010350029997425736
NB predict time 0.00031389799960379605
RF fit time 0.055653908000294905
RF predict time 0.003932926999823394
CART fit time 0.008239928999955737
CART predict time 0.00024930600011430215
KNN fit time 0.0010882370002036623
KNN predict time 0.0023630900000171096
SVM fit time 0.04828447199997754
SVM predict time 0.0019255

CART fit time 0.008949545999712427
CART predict time 0.0009346319998257968
KNN fit time 0.0007343080001191993
KNN predict time 0.0024512439999853086
SVM fit time 0.060610523000377725
SVM predict time 0.0025571679998392938
NB fit time 0.0010515660001146898
NB predict time 0.0019423439998718095
RF fit time 0.03684386600025391
RF predict time 0.0035904820001633198
CART fit time 0.009697629000129382
CART predict time 0.00024128299992298707
KNN fit time 0.0011733860001186258
KNN predict time 0.002297002999966935
SVM fit time 0.053492643000026874
SVM predict time 0.003039049000108207
NB fit time 0.0009751510001478891
NB predict time 0.0003803370000241557
RF fit time 0.04394530299987309
RF predict time 0.001943965999998909
CART fit time 0.008133385999826714
CART predict time 0.00023326300015469315
KNN fit time 0.0011759710000660561
KNN predict time 0.002511347000108799
SVM fit time 0.05431463800005076
SVM predict time 0.003559584999948129
NB fit time 0.0008704069996383623
NB predict time 0.00

RF fit time 0.0457648279998466
RF predict time 0.003395066999928531
CART fit time 0.007685857000069518
CART predict time 0.00020421799990799627
KNN fit time 0.0011262959997111466
KNN predict time 0.00249615199982145
SVM fit time 0.052081197000006796
SVM predict time 0.002333478999844374
NB fit time 0.001029507000112062
NB predict time 0.0005693040002370253
RF fit time 0.04071342600036587
RF predict time 0.0022705209999003273
CART fit time 0.009195530999932089
CART predict time 0.000908858999991935
KNN fit time 0.0011781480002355238
KNN predict time 0.003225596999982372
SVM fit time 0.05372169899965229
SVM predict time 0.0020005810001748614
NB fit time 0.0009901520002131292
NB predict time 0.0009589689998392714
RF fit time 0.03945511399979296
RF predict time 0.0025042020001819765
CART fit time 0.00765974699970684
CART predict time 0.00020855100001426763
KNN fit time 0.0009970790001716523
KNN predict time 0.002337768000415963
SVM fit time 0.056341601999974955
SVM predict time 0.002325031

RF fit time 0.042708512999979575
RF predict time 0.002042414999777975
CART fit time 0.00706387399986852
CART predict time 0.0002631269999255892
KNN fit time 0.001122782000038569
KNN predict time 0.0022902310001882142
SVM fit time 0.050523417000022164
SVM predict time 0.002548091999869939
NB fit time 0.0008530500003871566
NB predict time 0.0010013860000981367
RF fit time 0.03724177200001577
RF predict time 0.0019041340001422213
CART fit time 0.008293027999570768
CART predict time 0.00022213300007933867
KNN fit time 0.0011383840001144563
KNN predict time 0.0022162679997563828
SVM fit time 0.0537522770000578
SVM predict time 0.0023773170000822574
NB fit time 0.0010829789998751949
NB predict time 0.0010449619999235438
RF fit time 0.03799799799980974
RF predict time 0.00255475199992361
CART fit time 0.007837449999897217
CART predict time 0.0002065700000457582
KNN fit time 0.001115456999741582
KNN predict time 0.0024127329998009373
SVM fit time 0.05422274300008212
SVM predict time 0.00249474

KNN predict time 0.0026500260000830167
SVM fit time 0.049306816999887815
SVM predict time 0.0023408160000144562
NB fit time 0.0010357219998695655
NB predict time 0.00027671499992720783
RF fit time 0.037522013999932824
RF predict time 0.0025823579999268986
CART fit time 0.0072063470001921814
CART predict time 0.0005214560001149948
KNN fit time 0.0010471519999555312
KNN predict time 0.0027166929999111744
SVM fit time 0.05314838799995414
SVM predict time 0.0025334209999527957
NB fit time 0.0009663860000728164
NB predict time 0.0003127909999420808
RF fit time 0.03728323500035913
RF predict time 0.001850859000114724
CART fit time 0.00729223100006493
CART predict time 0.0002051999999821419
KNN fit time 0.001151443999788171
KNN predict time 0.0021136569998816412
SVM fit time 0.05619895100016947
SVM predict time 0.0023812000003999856
NB fit time 0.00101998599984654
NB predict time 0.0005562510000345355
RF fit time 0.04164053500016962
RF predict time 0.0022262800002863514
CART fit time 0.007209

In [17]:
# K-fold evaluation on the OpenStack dataset.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)
# ICP_STATUS is the target column; encode it as NEUTRAL -> 0, INSECURE -> 1, anything else -> 2.
true_label = data['ICP_STATUS']
true_label = convert_label_to_numeric(true_label)
# Remove the target and the other listed columns; every remaining column is used as a feature.
data = data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# train_data/test_data/train_label/test_label are ignored by the "k_fold" branch of repeated_test.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat 10-fold CV 10 times; prints the median (not the mean) of each metric across repetitions

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # print features ranked by random-forest importance

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
CART fit time 0.02397591699991608
CART predict time 0.0009596559998499288
KNN fit time 0.001990350000141916
KNN predict time 0.005609508999896207
SVM fit time 0.21018472700006896
SVM predict time 0.009973860000172863
NB fit time 0.0016566040003453963
NB predict time 0.000751046999994287
RF fit time 0.07635636900022291
RF predict time 0.00225427600025796
CART fit time 0.021678458999758732
CART predict time 0.0009030640003402368
KNN fit time 0.0018358130000706296
KNN predict time 0.004567782999856718
SVM fit time 0.2008233259998633
SVM predict time 0.010513446000004478
NB fit time 0.0011614270001700788
NB predict time 0.0003121429999737302
RF fit time 0.0715941980001844
RF predict time 0.003636326999640005
CART fit time 0.021907130999807123
CART predict time 0.0009002580000014859
KNN fit time 0.001885456999843882
KNN predict time 0.0052236749997973675
SVM fit time 0.2036196099998051
SVM predict time 0.01061623099985809
NB 

SVM fit time 0.17732868200027951
SVM predict time 0.006375792000198999
NB fit time 0.0007588989997202589
NB predict time 0.0007367039997916436
RF fit time 0.05799169400006576
RF predict time 0.002499554000223725
CART fit time 0.019769021999763936
CART predict time 0.0005276279998724931
KNN fit time 0.0015093500001057691
KNN predict time 0.003963128000123106
SVM fit time 0.18226781500015932
SVM predict time 0.0071017140003277746
NB fit time 0.001125471999785077
NB predict time 0.0006904849997226847
RF fit time 0.0668897770001422
RF predict time 0.0022655190000477887
CART fit time 0.022251521999805846
CART predict time 0.0004452359999049804
KNN fit time 0.0014954779999243328
KNN predict time 0.004556818000310159
SVM fit time 0.1906821980001041
SVM predict time 0.008854801000325097
NB fit time 0.0011964800000896503
NB predict time 0.0007577040000796842
RF fit time 0.06956959899980575
RF predict time 0.003390706000118371
CART fit time 0.022555622000254516
CART predict time 0.00033597099991

SVM fit time 0.1777451950001705
SVM predict time 0.008750062999752117
NB fit time 0.001307042000007641
NB predict time 0.0005135940000400296
RF fit time 0.06295511300004364
RF predict time 0.002638573000240285
CART fit time 0.020762672999808274
CART predict time 0.000530427999819949
KNN fit time 0.0015760390001560154
KNN predict time 0.003906902999915474
SVM fit time 0.17700839500002985
SVM predict time 0.0075415040000734734
NB fit time 0.0012177879998489516
NB predict time 0.0005401379999057099
RF fit time 0.06509630699974878
RF predict time 0.002241884999875765
CART fit time 0.02047117199981585
CART predict time 0.00042618799989213585
KNN fit time 0.0018441170000187412
KNN predict time 0.0044242439998924965
SVM fit time 0.1720641470001283
SVM predict time 0.008924696000121912
NB fit time 0.0007735750000392727
NB predict time 0.0004513300000326126
RF fit time 0.062158203999842954
RF predict time 0.002614656999867293
CART fit time 0.01916261599990321
CART predict time 0.000367662000371

SVM fit time 0.18414011400000163
SVM predict time 0.008848400000260881
NB fit time 0.0011796710000453459
NB predict time 0.0030727590001333738
RF fit time 0.06538393899973016
RF predict time 0.002219280000190338
CART fit time 0.020870066000043153
CART predict time 0.0010202330004176474
KNN fit time 0.0018459380003150727
KNN predict time 0.0048907450000115205
SVM fit time 0.22936499199977334
SVM predict time 0.006976804999794695
NB fit time 0.0007666050000807445
NB predict time 0.0007162550000430201
RF fit time 0.058823349000249436
RF predict time 0.002767046999906597
CART fit time 0.02280985299967142
CART predict time 0.0004884070003754459
KNN fit time 0.0016727009997339337
KNN predict time 0.005414002999714285
SVM fit time 0.20621800500020981
SVM predict time 0.01312970899971333
NB fit time 0.0014630730001954362
NB predict time 0.001029069000196614
RF fit time 0.1334556540000449
RF predict time 0.0029063239999231882
CART fit time 0.020564992999879905
CART predict time 0.00053015100002

SVM fit time 0.18758364800032723
SVM predict time 0.009071694999875035
NB fit time 0.0011918069999410363
NB predict time 0.001346843000192166
RF fit time 0.07027088599988929
RF predict time 0.0026418859997647814
CART fit time 0.02088271699994948
CART predict time 0.000383410999802436
KNN fit time 0.001509884999904898
KNN predict time 0.004620759999852453
SVM fit time 0.18085589000020263
SVM predict time 0.009130089000336739
NB fit time 0.0019403450000936573
NB predict time 0.0008827070000734238
RF fit time 0.07258636500000648
RF predict time 0.0023215109999910055
CART fit time 0.022705205000420392
CART predict time 0.0002540759996918496
KNN fit time 0.0016948640000009618
KNN predict time 0.004508067999722698
SVM fit time 0.28142599400007384
SVM predict time 0.017096504000164714
NB fit time 0.001265849999981583
NB predict time 0.00043386700008340995
RF fit time 0.07228133000035086
RF predict time 0.0026293050000276708
CART fit time 0.021974936999868078
CART predict time 0.00049782399992

In [18]:
# K-fold evaluation on the Wikimedia dataset.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)
# ICP_STATUS is the target column; encode it as NEUTRAL -> 0, INSECURE -> 1, anything else -> 2.
true_label = data['ICP_STATUS']
true_label = convert_label_to_numeric(true_label)
# Remove the target and the other listed columns; every remaining column is used as a feature.
data = data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# train_data/test_data/train_label/test_label are ignored by the "k_fold" branch of repeated_test.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat 10-fold CV 10 times; prints the median (not the mean) of each metric across repetitions

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # print features ranked by random-forest importance

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
CART fit time 0.02296424999985902
CART predict time 0.00023306800039790687
KNN fit time 0.0016615399999864167
KNN predict time 0.0074851959998341044
SVM fit time 0.3111719400003494
SVM predict time 0.009567437999976391
NB fit time 0.0014914710000084597
NB predict time 0.0004912970002806105
RF fit time 0.06502668799976163
RF predict time 0.0026118199998563796
CART fit time 0.019971407999946678
CART predict time 0.0005170059998818033
KNN fit time 0.002018987000155903
KNN predict time 0.005148667000412388
SVM fit time 0.318452662999789
SVM predict time 0.009281359999931738
NB fit time 0.001353294000182359
NB predict time 0.0008748509999350063
RF fit time 0.06212498700006108
RF predict time 0.0027479579998725967
CART fit time 0.022951954000291153
CART predict time 0.00023385800022879266
KNN fit time 0.0017525329999443784
KNN predict time 0.005619913999908022
SVM fit time 0.3892109410003286
SVM predict time 0.0112179620000461

SVM fit time 0.25524172799987355
SVM predict time 0.0077913879999869096
NB fit time 0.001611861000128556
NB predict time 0.0008426849999523256
RF fit time 0.05882134200010114
RF predict time 0.0024741979996178998
CART fit time 0.022814179999841144
CART predict time 0.0005655999998452899
KNN fit time 0.0018465799998921284
KNN predict time 0.006792337999740994
SVM fit time 0.24386050099974454
SVM predict time 0.008416563000082533
NB fit time 0.0010100710001097468
NB predict time 0.001016454999898997
RF fit time 0.07078195399981269
RF predict time 0.003002992000347149
CART fit time 0.021829869000157487
CART predict time 0.000947544000155176
KNN fit time 0.0012875450001956779
KNN predict time 0.005317902000115282
SVM fit time 0.2329754980000871
SVM predict time 0.0074141749996670114
NB fit time 0.001080498000192165
NB predict time 0.0006149890000415326
RF fit time 0.043622420000247075
RF predict time 0.0021337889997994353
CART fit time 0.019562340999982553
CART predict time 0.0003273170000

SVM fit time 0.2197606779996022
SVM predict time 0.006978120999974635
NB fit time 0.001029646999995748
NB predict time 0.0007982679999258835
RF fit time 0.04503564400010873
RF predict time 0.002792660000068281
CART fit time 0.018065700000079232
CART predict time 0.0003849290001198824
KNN fit time 0.0012482309998631536
KNN predict time 0.005007984999792825
SVM fit time 0.22086705899982917
SVM predict time 0.008116497999708372
NB fit time 0.0010675909998099087
NB predict time 0.0008667189999869152
RF fit time 0.047376768000049196
RF predict time 0.0022543649997714965
CART fit time 0.017243595000309142
CART predict time 0.0006155790001685091
KNN fit time 0.0012614279999070277
KNN predict time 0.005167762999917613
SVM fit time 0.21236068300004263
SVM predict time 0.006664492999789218
NB fit time 0.0010664050000741554
NB predict time 0.0013931060002505546
RF fit time 0.0486558839997997
RF predict time 0.003001198000220029
CART fit time 0.01722167800016905
CART predict time 0.000472191999961

SVM fit time 0.23966963300017596
SVM predict time 0.007091696000316006
NB fit time 0.0009004640000966901
NB predict time 0.0011162999999214662
RF fit time 0.04930387299964423
RF predict time 0.004751264999867999
CART fit time 0.017478427999776613
CART predict time 0.0004513459998634062
KNN fit time 0.0013610609998977452
KNN predict time 0.0045309970000744215
SVM fit time 0.22316436699975384
SVM predict time 0.0063311130002148275
NB fit time 0.0010758069997791608
NB predict time 0.0011959700000261364
RF fit time 0.04226546900008543
RF predict time 0.0028670480000982934
CART fit time 0.016623564999918017
CART predict time 0.000660098000025755
KNN fit time 0.0016350750001947745
KNN predict time 0.005612797000139835
SVM fit time 0.2312592019998192
SVM predict time 0.009754710999914096
NB fit time 0.0008669990002090344
NB predict time 0.0008156150001923379
RF fit time 0.06487895399959598
RF predict time 0.003062705000047572
CART fit time 0.01983999699996275
CART predict time 0.0004584759999

SVM fit time 0.2183165490000647
SVM predict time 0.006266503000006196
NB fit time 0.0010402359998806787
NB predict time 0.0008813229997031158
RF fit time 0.04130387300028815
RF predict time 0.0020994219999010966
CART fit time 0.018352503999722103
CART predict time 0.0004530150004029565
KNN fit time 0.0013042509999650065
KNN predict time 0.004845627000122477
SVM fit time 0.2092748439999923
SVM predict time 0.006033914999989065
NB fit time 0.0008752600001571409
NB predict time 0.0010014059998866287
RF fit time 0.04719103000024916
RF predict time 0.002671574000032706
CART fit time 0.01945218399987425
CART predict time 0.0004917280002700863
KNN fit time 0.0013743620002060197
KNN predict time 0.00531223999996655
SVM fit time 0.19554454199987958
SVM predict time 0.006675282000287552
NB fit time 0.0009264749996873434
NB predict time 0.0008920499999476306
RF fit time 0.05746561899968583
RF predict time 0.0027830590001940436
CART fit time 0.019150627999806602
CART predict time 0.000328806000197

In [19]:
# Cross-dataset evaluation: train on Mozilla, test on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
# ICP_STATUS is the target column; encode it as NEUTRAL -> 0, INSECURE -> 1, anything else -> 2.
train_label = train_data['ICP_STATUS']
train_label = convert_label_to_numeric(train_label)
# Remove the target and the other listed columns; every remaining column is used as a feature.
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# Same preparation for the test dataset.
test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['ICP_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# data/true_label are whatever an earlier cell left in those globals; repeated_test ignores
# them for any test_name other than "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times; prints the median (not the mean) of each metric across repetitions

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
CART fit time 0.0072493650000069465
CART predict time 0.0005871119997209462
KNN fit time 0.000862796000092203
KNN predict time 0.02696051599968996
SVM fit time 0.04790193200005888
SVM predict time 0.031852777999574755
NB fit time 0.0007923620000838127
NB predict time 0.0009632820001570508
RF fit time 0.02959396400001424
RF predict time 0.004066056999818102
CART fit time 0.006383525999808626
CART predict time 0.0006461490002038772
KNN fit time 0.0008540660001017386
KNN predict time 0.025489121999726194
SVM fit time 0.04872657399982927
SVM predict time 0.03269595899973865
NB fit time 0.0007950089998303156
NB predict time 0.0010837459999493149
RF fit time 0.028930047000358172
RF predict time 0.004099545999906695
CART fit time 0.006263750000016444
CART predict time 0.0006485560002147395
KNN fit time 0.000864032000208681
KNN predict time 0.02434411000012915
SV

In [20]:
# Cross-dataset evaluation: train on Mozilla, test on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
# ICP_STATUS is the target column; encode it as NEUTRAL -> 0, INSECURE -> 1, anything else -> 2.
train_label = train_data['ICP_STATUS']
train_label = convert_label_to_numeric(train_label)
# Remove the target and the other listed columns; every remaining column is used as a feature.
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# Same preparation for the test dataset.
test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['ICP_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

# data/true_label are whatever an earlier cell left in those globals; repeated_test ignores
# them for any test_name other than "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times; prints the median (not the mean) of each metric across repetitions

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
CART fit time 0.007391544999791222
CART predict time 0.0006583580002370581
KNN fit time 0.001019884000015736
KNN predict time 0.027427182999872457
SVM fit time 0.04962827000008474
SVM predict time 0.037496258999908605
NB fit time 0.001001616999928956
NB predict time 0.0009823710001910513
RF fit time 0.030258955000135757
RF predict time 0.0035988919998999336
CART fit time 0.006839145999947505
CART predict time 0.0003351789996486332
KNN fit time 0.0009685560003163118
KNN predict time 0.0313441570001487
SVM fit time 0.05507950199989864
SVM predict time 0.0364253060001829
NB fit time 0.0009416369998689333
NB predict time 0.0009499879997747485
RF fit time 0.03504297500012399
RF predict time 0.005051287000242155
CART fit time 0.010284539999702247
CART predict time 0.0005691960000149265
KNN fit time 0.0011508229999890318
KNN predict time 0.040474589000041306
SVM

In [21]:
# Train/test pair for this run: COLOCATED_OPENSTACK.csv -> COLOCATED_MOZILLA.csv.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# Encode the ICP_STATUS column as integer labels before it is dropped below.
train_label = convert_label_to_numeric(train_data['ICP_STATUS'])
# Strip the file path, the label and the two other status columns from the frame.
train_data = train_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Identical preparation for the test side.
test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['ICP_STATUS'])
test_data = test_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Run the cross-file evaluation (original note: repeated 10 times, average performance reported).
# NOTE(review): `data` and `true_label` are not assigned in this cell; presumably they
# come from an earlier cell -- confirm.
repeated_test(
    data, true_label,
    train_data, test_data,
    train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
CART fit time 0.02070973900026729
CART predict time 0.0008026199998312222
KNN fit time 0.001700888999948802
KNN predict time 0.030433869999797025
SVM fit time 0.17749950899997202
SVM predict time 0.037194205000105285
NB fit time 0.0009231630001522717
NB predict time 0.0010813760000019101
RF fit time 0.049123028999929375
RF predict time 0.0030068959999880462
CART fit time 0.019916537000426615
CART predict time 0.0007652760000382841
KNN fit time 0.0014598650000152702
KNN predict time 0.030281161999937467
SVM fit time 0.20775806600022406
SVM predict time 0.054305438000028516
NB fit time 0.0016441699999631965
NB predict time 0.001066350000201055
RF fit time 0.06631840099998954
RF predict time 0.004225793999921734
CART fit time 0.023390952000227117
CART predict time 0.0006928580000931106
KNN fit time 0.0018358359998273954
KNN predict time 0.035058242000104656


In [22]:
# Train/test pair for this run: COLOCATED_OPENSTACK.csv -> COLOCATED_WIKIMEDIA.csv.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# Encode the ICP_STATUS column as integer labels before it is dropped below.
train_label = convert_label_to_numeric(train_data['ICP_STATUS'])
# Strip the file path, the label and the two other status columns from the frame.
train_data = train_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Identical preparation for the test side.
test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['ICP_STATUS'])
test_data = test_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Run the cross-file evaluation (original note: repeated 10 times, average performance reported).
# NOTE(review): `data` and `true_label` are not assigned in this cell; presumably they
# come from an earlier cell -- confirm.
repeated_test(
    data, true_label,
    train_data, test_data,
    train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
CART fit time 0.02365699899974061
CART predict time 0.0008071370002653566
KNN fit time 0.0022512290001941437
KNN predict time 0.05465572399998564
SVM fit time 0.20711485200035895
SVM predict time 0.08778962799988221
NB fit time 0.0010126990000571823
NB predict time 0.0017470069997216342
RF fit time 0.06988865099992836
RF predict time 0.006664729000021907
CART fit time 0.02401180500010014
CART predict time 0.0015605430003233778
KNN fit time 0.0016143449997798598
KNN predict time 0.0577430620000996
SVM fit time 0.1989796020002359
SVM predict time 0.0703187720000642
NB fit time 0.0009732909998092509
NB predict time 0.0011697720001393463
RF fit time 0.057187750000139204
RF predict time 0.004681737999817415
CART fit time 0.020342446000086056
CART predict time 0.0006529429997499392
KNN fit time 0.001544265999655181
KNN predict time 0.048988118000124814
SVM fi

In [23]:
# Train/test pair for this run: COLOCATED_WIKIMEDIA.csv -> COLOCATED_MOZILLA.csv.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# Encode the ICP_STATUS column as integer labels before it is dropped below.
train_label = convert_label_to_numeric(train_data['ICP_STATUS'])
# Strip the file path, the label and the two other status columns from the frame.
train_data = train_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Identical preparation for the test side.
test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['ICP_STATUS'])
test_data = test_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Run the cross-file evaluation (original note: repeated 10 times, average performance reported).
# NOTE(review): `data` and `true_label` are not assigned in this cell; presumably they
# come from an earlier cell -- confirm.
repeated_test(
    data, true_label,
    train_data, test_data,
    train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
CART fit time 0.02407900200023505
CART predict time 0.0008078420000856568
KNN fit time 0.0013984170000185259
KNN predict time 0.037222979000034684
SVM fit time 0.3174151520001942
SVM predict time 0.05012308200002735
NB fit time 0.0007963029997881677
NB predict time 0.0012108310002076905
RF fit time 0.06610867900008088
RF predict time 0.004230629000176123
CART fit time 0.024312538000231143
CART predict time 0.0007543000001533073
KNN fit time 0.0019211869998798647
KNN predict time 0.03855086700013999
SVM fit time 0.30267335799999273
SVM predict time 0.0393137010000828
NB fit time 0.0010586540001895628
NB predict time 0.0010644499998306856
RF fit time 0.04548791400020491
RF predict time 0.004338270999596716
CART fit time 0.020204604999889852
CART predict time 0.0006263800000851916
KNN fit time 0.0013671440001417068
KNN predict time 0.031126272000165045
SVM f

In [24]:
# Train/test pair for this run: COLOCATED_WIKIMEDIA.csv -> COLOCATED_OPENSTACK.csv.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# Encode the ICP_STATUS column as integer labels before it is dropped below.
train_label = convert_label_to_numeric(train_data['ICP_STATUS'])
# Strip the file path, the label and the two other status columns from the frame.
train_data = train_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Identical preparation for the test side.
test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['ICP_STATUS'])
test_data = test_data.drop(
    ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'],
    axis=1,
)

# Run the cross-file evaluation (original note: repeated 10 times, average performance reported).
# NOTE(review): `data` and `true_label` are not assigned in this cell; presumably they
# come from an earlier cell -- confirm.
repeated_test(
    data, true_label,
    train_data, test_data,
    train_label, test_label,
    "different_file_test",
)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
CART fit time 0.0250558610000553
CART predict time 0.0010583849998511141
KNN fit time 0.001780265999968833
KNN predict time 0.04521611900008793
SVM fit time 0.3075026110000181
SVM predict time 0.06506790100002036
NB fit time 0.0011103510000793904
NB predict time 0.0012905109997518593
RF fit time 0.04495462299973951
RF predict time 0.004241639000156283
CART fit time 0.024432204000277125
CART predict time 0.0008961129997260286
KNN fit time 0.0016652629997224722
KNN predict time 0.048297442000148294
SVM fit time 0.2664228120002008
SVM predict time 0.062329862999831676
NB fit time 0.0009840160000749165
NB predict time 0.0010072290001517104
RF fit time 0.045957879000070534
RF predict time 0.004292964999876858
CART fit time 0.022542137000073126
CART predict time 0.0010156900002584734
KNN fit time 0.0017252439997719193
KNN predict time 0.04959794200021861
SVM 