In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file(filename):
    """Load the CSV at ``filename`` and hand back the resulting pandas DataFrame."""
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Encode status strings as integer class ids.

    Mapping: ``"NEUTRAL"`` -> 0, ``"ONLY_ONE"`` -> 1, any other value -> 2.

    Parameters
    ----------
    label : sequence of str
        A list, numpy array or pandas Series of status strings.

    Returns
    -------
    numpy.ndarray
        Integer array with one code per input element, in input order.
    """
    # Walk the values positionally. Indexing a pandas Series with ``label[i]``
    # is label-based, so it breaks (KeyError or misalignment) as soon as the
    # Series does not carry the default 0..n-1 index.
    values = np.asarray(label, dtype=object)
    codes = [
        1 if value == "ONLY_ONE" else 0 if value == "NEUTRAL" else 2
        for value in values
    ]
    return np.array(codes, dtype='int')

In [4]:
def apply_PCA(train, test):
    """Standardise both sets, then project them onto principal components.

    The scaler and the PCA are fitted on ``train`` only and afterwards applied
    to both ``train`` and ``test``. ``PCA(.95)`` keeps the minimum number of
    components such that 95% of the variance is retained.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is affected by feature scale, so the features are scaled first.
    standardizer = StandardScaler()
    standardizer.fit(train)
    scaled_train = standardizer.transform(train)
    scaled_test = standardizer.transform(test)

    # The projection is learned from the scaled training set only.
    reducer = PCA(.95)
    reducer.fit(scaled_train)

    reduced_train = reducer.transform(scaled_train)
    reduced_test = reducer.transform(scaled_test)
    return reduced_train, reduced_test

In [5]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a CART decision tree on the training split and score it on the test split.

    Returns the ``(recall, precision, f1)`` triple produced by
    ``measure_performance`` for the test predictions.
    """
    cart_model = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )
    cart_model.fit(train, train_label)
    rec, prec, f_score = measure_performance(test_label, cart_model.predict(test))
    return rec, prec, f_score

In [6]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a 5-nearest-neighbour classifier (distance-weighted votes) and score it.

    Returns the ``(recall, precision, f1)`` triple produced by
    ``measure_performance`` for the test predictions.
    """
    knn_model = KNeighborsClassifier(n_neighbors=5, weights='distance')
    knn_model.fit(train, train_label)
    rec, prec, f_score = measure_performance(test_label, knn_model.predict(test))
    return rec, prec, f_score

In [7]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel SVC (C=20, gamma='auto') and score it on the test split.

    Class 2 is weighted five times as heavily as classes 0 and 1.

    Returns the ``(recall, precision, f1)`` triple produced by
    ``measure_performance`` for the test predictions.
    """
    svm_model = svm.SVC(
        gamma='auto',
        C=20.0,
        kernel='rbf',
        class_weight={0: 1, 1: 1, 2: 5},
    )
    svm_model.fit(train, train_label)
    rec, prec, f_score = measure_performance(test_label, svm_model.predict(test))
    return rec, prec, f_score

In [8]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Returns the ``(recall, precision, f1)`` triple produced by
    ``measure_performance`` for the test predictions.
    """
    nb_model = GaussianNB()
    nb_model.fit(train, train_label)
    rec, prec, f_score = measure_performance(test_label, nb_model.predict(test))
    return rec, prec, f_score

In [9]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree random forest (gini criterion) and score it on the test split.

    Returns the ``(recall, precision, f1)`` triple produced by
    ``measure_performance`` for the test predictions.
    """
    forest_model = RandomForestClassifier(n_estimators=10, criterion='gini')
    forest_model.fit(train, train_label)
    rec, prec, f_score = measure_performance(test_label, forest_model.predict(test))
    return rec, prec, f_score

In [10]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for the classes 0, 1 and 2.

    Parameters
    ----------
    true_label : array-like
        Ground-truth class ids.
    predicted_label : array-like
        Predicted class ids, aligned with ``true_label``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` -- note the order. Each element is the
        un-averaged (``average=None``) score for ``labels=[0, 1, 2]``.
    """
    # Fixing the label list keeps the class order stable even when a class is
    # missing from a particular split.
    class_ids = [0, 1, 2]
    precision = precision_score(true_label, predicted_label, average=None, labels=class_ids)
    recall = recall_score(true_label, predicted_label, average=None, labels=class_ids)
    f1 = f1_score(true_label, predicted_label, average=None, labels=class_ids)
    return recall, precision, f1

In [11]:
def kfold_cv(data, true_label, n_splits=10, random_state=7):
    """Run shuffled k-fold cross-validation for the five classifiers.

    For every fold the features are passed through ``apply_PCA`` and each of
    ``perfrom_CART``, ``perfrom_KNN``, ``perfrom_SVM``, ``perfrom_NB`` and
    ``perfrom_RF`` is evaluated. The per-fold scores are then averaged.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, one row per sample.
    true_label : array-like
        Class ids aligned row-by-row with ``data``.
    n_splits : int, optional
        Number of folds (default 10).
    random_state : int, optional
        Seed for the fold shuffling (default 7).

    Returns
    -------
    tuple
        15 fold-averaged values in the order
        ``recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN,
        recall_SVM, precision_SVM, f1_SVM, recall_NB, precision_NB, f1_NB,
        recall_RF, precision_RF, f1_RF``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Plain array so labels are always selected by position.
    labels = np.asarray(true_label)

    # Order matters: it defines the layout of the returned tuple.
    evaluators = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)
    fold_scores = [[] for _ in evaluators]  # one list of (recall, precision, f1) per classifier

    for train_index, test_index in kf.split(data):
        # KFold yields positional indices, so rows must be taken with .iloc;
        # .loc would look the numbers up as index labels and fail or misalign
        # on any frame without the default 0..n-1 index.
        train, test = data.iloc[train_index], data.iloc[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        train, test = apply_PCA(train, test)

        for scores, evaluate in zip(fold_scores, evaluators):
            scores.append(evaluate(train, test, train_label, test_label))

    averaged = []
    for scores in fold_scores:
        recalls, precisions, f1s = zip(*scores)
        averaged.append(np.mean(recalls, axis=0))
        averaged.append(np.mean(precisions, axis=0))
        averaged.append(np.mean(f1s, axis=0))
    return tuple(averaged)

In [12]:
def different_file_test(train, test, train_label, test_label):
    """Train on one data set and evaluate on another with all five classifiers.

    Both sets first go through ``apply_PCA``; each classifier is then fitted on
    ``train`` and scored on ``test``.

    Returns
    -------
    tuple
        15 values in the order
        ``recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN,
        recall_SVM, precision_SVM, f1_SVM, recall_NB, precision_NB, f1_NB,
        recall_RF, precision_RF, f1_RF``.
    """
    train, test = apply_PCA(train, test)

    collected = []
    # The tuple order fixes the layout of the returned values.
    for evaluate in (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF):
        recall, precision, f1 = evaluate(train, test, train_label, test_label)
        collected += [recall, precision, f1]
    return tuple(collected)

In [13]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name, n_repeats=10):
    """Repeat an evaluation several times and print the median scores.

    Parameters
    ----------
    data, true_label
        Features and labels handed to ``kfold_cv``; only used when
        ``test_name == "k_fold"``.
    train_data, test_data, train_label, test_label
        Inputs handed to ``different_file_test``; only used for any other
        ``test_name``.
    test_name : str
        ``"k_fold"`` selects ``kfold_cv``; anything else selects
        ``different_file_test``.
    n_repeats : int, optional
        How many times the evaluation is run (default 10).

    Prints, for each of DT, KNN, SVM, NB and RF, the element-wise median over
    the repetitions of recall, precision and F1. Returns ``None``.
    """
    classifier_names = ("DT", "KNN", "SVM", "NB", "RF")
    metric_titles = ("Recall:", "Precision:", "f1 score:")
    n_metrics = len(metric_titles)

    # history[c][m] collects one score per repetition for classifier c / metric m.
    history = [[[] for _ in metric_titles] for _ in classifier_names]

    for _ in range(n_repeats):
        if test_name == "k_fold":
            scores = kfold_cv(data, true_label)
        else:
            scores = different_file_test(train_data, test_data, train_label, test_label)

        # Both evaluators return a flat 15-tuple laid out classifier-major:
        # (recall, precision, f1) for DT, then KNN, SVM, NB, RF.
        for c in range(len(classifier_names)):
            for m in range(n_metrics):
                history[c][m].append(scores[c * n_metrics + m])

    for name, per_metric in zip(classifier_names, history):
        print("-------" + name + "-------")
        for title, runs in zip(metric_titles, per_metric):
            print(title, np.median(runs, axis=0))

In [14]:
def calcFeatureImp(feature_vec, label_vec, feature_names_param, repeat=10):
    """Fit a random forest and print the features ranked by importance.

    Parameters
    ----------
    feature_vec : array-like
        Feature matrix used to fit the forest.
    label_vec : array-like
        Class labels aligned with ``feature_vec``.
    feature_names_param : sequence of str
        Name of each feature column, in column order.
    repeat : int, optional
        Kept for backward compatibility. The forest is fitted once, so this
        value does not influence the printed ranking.

    Prints the feature names and their importances in descending order of
    importance. Returns ``None``.
    """
    theRndForestModel = RandomForestClassifier()
    theRndForestModel.fit(feature_vec, label_vec)
    importances = np.asarray(theRndForestModel.feature_importances_)

    # Rank by index rather than looking each value up with list.index():
    # index() always returns the first match, so features with equal
    # importance (for example several zeros) would all print the same name.
    # A stable sort keeps tied features in their original column order.
    order = np.argsort(-importances, kind="stable")

    sorted_feature_name = [feature_names_param[i] for i in order]
    sorted_feat_imp_vector = [importances[i] for i in order]

    print ("sorted feature names: ", sorted_feature_name)
    print ("sorted feature importance: ", sorted_feat_imp_vector)

In [15]:
# Placeholders so that every argument passed to repeated_test is defined
# before the first evaluation cell runs.
data = []
true_label = []
train_data = []
test_data = []
train_label = []
test_label = []

In [16]:
# k-fold evaluation on the Mozilla data set.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)

# Encode the target column, then remove the path/status columns from the features.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
data = data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs kfold_cv repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold")

# Rank the remaining feature columns by random-forest importance.
columns_name = list(data.columns.values)
calcFeatureImp(data, true_label, columns_name)

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.96050196 0.86052198 0.85900174]
Precision: [0.97475894 0.75953755 0.82572647]
f1 score: [0.96745177 0.7855099  0.83691895]
-------KNN-------
Recall: [0.97714871 0.86699634 0.85482835]
Precision: [0.97130508 0.8698138  0.8825487 ]
f1 score: [0.9741529  0.86254302 0.86627848]
-------SVM-------
Recall: [0.93800623 0.36703297 0.7730844 ]
Precision: [0.93229627 0.7497619  0.6320801 ]
f1 score: [0.93488611 0.48641539 0.68892872]
-------NB-------
Recall: [0.92018843 0.13440476 0.2707983 ]
Precision: [0.85496542 0.24       0.35075538]
f1 score: [0.88584689 0.15109082 0.30274131]
-------RF-------
Recall: [0.98157159 0.72940934 0.77514864]
Precision: [0.95261326 0.86052503 0.89526931]
f1 score: [0.96664785 0.78057835 0.83821221]
sorted feature names:  ['SLOC', 'HARD_CODE', 'ATTR', 'URL', 'INCL', 'COMMENT', 'ENS', 'REQ', 'CMD', 'FILE', 'FILE_MODE', 'SSH']
sorted feature importance:  [0.21977242079810236, 0

In [17]:
# k-fold evaluation on the OpenStack data set.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)

# Encode the target column, then remove the path/status columns from the features.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
data = data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs kfold_cv repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold")

# Rank the remaining feature columns by random-forest importance.
columns_name = list(data.columns.values)
calcFeatureImp(data, true_label, columns_name)

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.83419317 0.17592692 0.65614025]
Precision: [0.84772966 0.17021454 0.64007608]
f1 score: [0.84025935 0.16630119 0.64742559]
-------KNN-------
Recall: [0.8947942  0.05582796 0.68889839]
Precision: [0.84439477 0.19277778 0.71628359]
f1 score: [0.86863789 0.08256557 0.70063758]
-------SVM-------
Recall: [0.7086892  0.02789433 0.84278456]
Precision: [0.89674136 0.06666667 0.51792268]
f1 score: [0.79135969 0.03759804 0.64118245]
-------NB-------
Recall: [0.93956549 0.06507039 0.32406956]
Precision: [0.74122986 0.26428571 0.67129139]
f1 score: [0.82843356 0.09645667 0.43564587]
-------RF-------
Recall: [0.90093732 0.07974839 0.66135016]
Precision: [0.83523306 0.24684524 0.71197049]
f1 score: [0.86711376 0.11662414 0.6838848 ]
sorted feature names:  ['HARD_CODE', 'SLOC', 'ATTR', 'COMMENT', 'URL', 'ENS', 'INCL', 'REQ', 'CMD', 'FILE', 'FILE_MODE', 'SSH']
sorted feature importance:  [0.22273903204074955,

In [18]:
# k-fold evaluation on the Wikimedia data set.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)

# Encode the target column, then remove the path/status columns from the features.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
data = data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs kfold_cv repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold")

# Rank the remaining feature columns by random-forest importance.
columns_name = list(data.columns.values)
calcFeatureImp(data, true_label, columns_name)

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.8170178  0.17228026 0.4349699 ]
Precision: [0.81751151 0.15290754 0.45022698]
f1 score: [0.81678518 0.15797012 0.43998046]
-------KNN-------
Recall: [0.89905329 0.11156079 0.43120454]
Precision: [0.80591502 0.28174603 0.54830925]
f1 score: [0.84968552 0.15563023 0.47977294]
-------SVM-------
Recall: [0.72317665 0.01130952 0.65357168]
Precision: [0.83499016 0.045      0.35194502]
f1 score: [0.77474481 0.01766917 0.45556341]
-------NB-------
Recall: [0.93734606 0.01309524 0.25734362]
Precision: [0.77028988 0.05333333 0.50922704]
f1 score: [0.84555845 0.02102564 0.34032561]
-------RF-------
Recall: [0.9156549  0.07574769 0.38036871]
Precision: [0.79745821 0.31404762 0.53160471]
f1 score: [0.8511604  0.114326   0.43907036]
sorted feature names:  ['SLOC', 'ATTR', 'HARD_CODE', 'COMMENT', 'ENS', 'INCL', 'CMD', 'REQ', 'URL', 'FILE', 'FILE_MODE', 'SSH']
sorted feature importance:  [0.2102332740039162, 

In [19]:
# Cross-data-set evaluation: fit on Mozilla, score on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs different_file_test repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.68500539 0.08527132 0.3681178 ]
Precision: [0.74136698 0.04421132 0.35134121]
f1 score: [0.70933833 0.05915047 0.35411103]
-------KNN-------
Recall: [0.81121899 0.02325581 0.33674776]
Precision: [0.7283293  0.04347826 0.41746032]
f1 score: [0.76754274 0.03030303 0.37278526]
-------SVM-------
Recall: [0.6386192  0.00775194 0.48015365]
Precision: [0.72593501 0.02222222 0.34466912]
f1 score: [0.6794835  0.01149425 0.40128411]
-------NB-------
Recall: [0.79665588 0.         0.4084507 ]
Precision: [0.73665835 0.         0.43818681]
f1 score: [0.76548329 0.         0.42279655]
-------RF-------
Recall: [0.85544768 0.03100775 0.25800256]
Precision: [0.70473562 0.08319039 0.39523677]
f1 score: [0.77127269 0.04158587 0.3142232 ]


In [20]:
# Cross-data-set evaluation: fit on Mozilla, score on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs different_file_test repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.82797505 0.09624413 0.30474453]
Precision: [0.79311311 0.10680227 0.35359725]
f1 score: [0.80967461 0.09987058 0.33094044]
-------KNN-------
Recall: [0.91074856 0.03286385 0.24452555]
Precision: [0.77755018 0.07368421 0.43365696]
f1 score: [0.83889503 0.04545455 0.31271879]
-------SVM-------
Recall: [0.86900192 0.03755869 0.27919708]
Precision: [0.78094006 0.11764706 0.33406114]
f1 score: [0.82262094 0.0569395  0.30417495]
-------NB-------
Recall: [0.89299424 0.02347418 0.34854015]
Precision: [0.78324916 0.11904762 0.44730679]
f1 score: [0.83452915 0.03921569 0.39179487]
-------RF-------
Recall: [0.9328215  0.01877934 0.19890511]
Precision: [0.76299419 0.09307359 0.44374782]
f1 score: [0.83911459 0.03125048 0.27533753]


In [21]:
# Cross-data-set evaluation: fit on OpenStack, score on Mozilla.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs different_file_test repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.68102797 0.08252427 0.5026738 ]
Precision: [0.89465906 0.0472651  0.22117687]
f1 score: [0.77424893 0.05994015 0.30736232]
-------KNN-------
Recall: [0.79516251 0.16504854 0.45989305]
Precision: [0.89531915 0.12056738 0.28956229]
f1 score: [0.84227382 0.13934426 0.3553719 ]
-------SVM-------
Recall: [0.83219955 0.00970874 0.61497326]
Precision: [0.9017199  0.07692308 0.30343008]
f1 score: [0.86556604 0.01724138 0.40636042]
-------NB-------
Recall: [0.87528345 0.05825243 0.34759358]
Precision: [0.86806597 0.21428571 0.25896414]
f1 score: [0.87165977 0.09160305 0.29680365]
-------RF-------
Recall: [0.77588813 0.08737864 0.54545455]
Precision: [0.89956906 0.07138957 0.29285329]
f1 score: [0.83351331 0.07203664 0.38319365]


In [22]:
# Cross-data-set evaluation: fit on OpenStack, score on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs different_file_test repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.75263916 0.1056338  0.44525547]
Precision: [0.80410279 0.11627707 0.34913689]
f1 score: [0.77820435 0.11030031 0.39163971]
-------KNN-------
Recall: [0.82773512 0.03286385 0.47262774]
Precision: [0.79676674 0.1147541  0.4184168 ]
f1 score: [0.81195575 0.05109489 0.44387318]
-------SVM-------
Recall: [0.70489443 0.00469484 0.63868613]
Precision: [0.82667417 0.04166667 0.33524904]
f1 score: [0.76094276 0.00843882 0.43969849]
-------NB-------
Recall: [0.88771593 0.03755869 0.35948905]
Precision: [0.7879046  0.20512821 0.430131  ]
f1 score: [0.83483755 0.06349206 0.3916501 ]
-------RF-------
Recall: [0.83301344 0.0258216  0.42883212]
Precision: [0.79405105 0.14236111 0.39090605]
f1 score: [0.81365296 0.04240754 0.40941543]


In [23]:
# Cross-data-set evaluation: fit on Wikimedia, score on Mozilla.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs different_file_test repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.6462585  0.22330097 0.38770053]
Precision: [0.86709762 0.07430527 0.23177083]
f1 score: [0.74070466 0.1140037  0.293569  ]
-------KNN-------
Recall: [0.83068783 0.14563107 0.29411765]
Precision: [0.85525292 0.07978723 0.39285714]
f1 score: [0.84279141 0.10309278 0.33639144]
-------SVM-------
Recall: [0.81708239 0.09708738 0.64705882]
Precision: [0.90764064 0.27027027 0.31428571]
f1 score: [0.85998409 0.14285714 0.42307692]
-------NB-------
Recall: [0.95464853 0.         0.22459893]
Precision: [0.84368737 0.         0.36206897]
f1 score: [0.89574468 0.         0.27722772]
-------RF-------
Recall: [0.8733938  0.0776699  0.33957219]
Precision: [0.85820986 0.07012527 0.40061728]
f1 score: [0.86586504 0.0755716  0.36490237]


In [24]:
# Cross-data-set evaluation: fit on Wikimedia, score on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis=1)

# Runs different_file_test repeatedly and prints the median scores per classifier.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.7877562  0.17054264 0.31241997]
Precision: [0.73717215 0.08682588 0.49190071]
f1 score: [0.76186736 0.11505647 0.38094193]
-------KNN-------
Recall: [0.87702265 0.0620155  0.29065301]
Precision: [0.7239537  0.0776699  0.54698795]
f1 score: [0.79317073 0.06896552 0.37959866]
-------SVM-------
Recall: [0.77292341 0.00775194 0.47759283]
Precision: [0.75699947 0.04545455 0.4393404 ]
f1 score: [0.76487857 0.01324503 0.45766871]
-------NB-------
Recall: [0.8781014  0.00775194 0.35723431]
Precision: [0.73300315 0.25       0.51762523]
f1 score: [0.7990184  0.01503759 0.42272727]
-------RF-------
Recall: [0.90884574 0.05426357 0.24327785]
Precision: [0.71875461 0.09584699 0.53806135]
f1 score: [0.80200105 0.06589267 0.34238769]
