In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file (filename):
    df = pd.read_csv(filename)  # read the csv file
    return df

In [3]:
def convert_label_to_numeric(label):
    converted_label = np.empty(len(label), dtype=object) 
    for i in range(len(label)):
        if label[i] == "ONLY_ONE":
            converted_label[i] = 1
        elif label[i] == "NEUTRAL":
            converted_label[i] = 0
        else: 
            converted_label[i] = 2
    converted_label = converted_label.astype('int')
    return converted_label

In [4]:
def apply_PCA(train, test):
    # Since PCA is effected by scale, we need to scale the features in the data before applying PCA
    scaler = StandardScaler()
    # Fit on training set only.
    scaler.fit(train)
    # Apply transform to both the training set and the test set.
    train = scaler.transform(train)
    test = scaler.transform(test)

    # Make an instance of the Model
    pca = PCA(.95) #  choose the minimum number of principal components such that 95% of the variance is retained.
    # We are fitting PCA on the training set only.
    pca.fit(train)
    #print ("Number of selected components: ", pca.n_components_)
    #print (pd.DataFrame(pca.components_))
    
    # Apply the mapping (transform) to both the training set and the test set
    #print("Before applying PCA train set size: ", train.shape)
    #print("Before applying PCA test set size: ", test.shape)
    train = pca.transform(train)
    test = pca.transform(test)
    #print("After applying PCA train set size: ", train.shape)
    #print("After applying PCA test set size: ", test.shape)
    return train,test

In [5]:
def perfrom_CART(train, test, train_label, test_label):
    clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                      min_samples_split = 2, min_weight_fraction_leaf=0.0)
    clf.fit(train, train_label)
    predicted_label = clf.predict(test)
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1

In [6]:
def perfrom_KNN(train, test, train_label, test_label):
    clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
    clf.fit(train, train_label)
    predicted_label = clf.predict(test)
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1

In [7]:
def perfrom_SVM(train, test, train_label, test_label):
    clf = svm.SVC(gamma='auto', C = 20.0, kernel='rbf', class_weight = {0:1, 1:1, 2:5})
    clf.fit(train, train_label)
    predicted_label = clf.predict(test)
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1

In [8]:
def perfrom_NB(train, test, train_label, test_label):
    clf = GaussianNB()
    clf.fit(train, train_label)
    predicted_label = clf.predict(test)
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1

In [9]:
def perfrom_RF(train, test, train_label, test_label):
    clf = RandomForestClassifier(n_estimators=10, criterion='gini')
    clf.fit(train, train_label)
    predicted_label = clf.predict(test)
    recall, precision, f1 = measure_performance(test_label, predicted_label)
    return recall, precision, f1

In [10]:
def measure_performance(true_label, predicted_label):   
    precision = recall = f1 = np.zeros(3, dtype=np.float32)
    report = classification_report(true_label, predicted_label, digits=3)
    precision[:] = precision_score(true_label, predicted_label, average=None, labels=[0,1,2])
    recall[:] = recall_score(true_label, predicted_label, average=None, labels=[0,1,2])
    f1[:] = f1_score(true_label, predicted_label, average=None, labels=[0,1,2])
    return recall, precision, f1

In [11]:
def kfold_cv(data, true_label):
    # 10 fold cv
    kf = KFold(n_splits=10, shuffle = True, random_state = 7)

    cv_recall_DT = []
    cv_precision_DT = []
    cv_f1_DT = []

    cv_recall_KNN = []
    cv_precision_KNN = []
    cv_f1_KNN = []

    cv_recall_SVM = []
    cv_precision_SVM = []
    cv_f1_SVM = []

    cv_recall_NB = []
    cv_precision_NB = []
    cv_f1_NB = []

    cv_recall_RF = []
    cv_precision_RF = []
    cv_f1_RF = []


    for train_index, test_index in kf.split(data):
        train, test = data.loc[train_index], data.loc[test_index]
        train_label, test_label = true_label[train_index], true_label[test_index]

        train, test = apply_PCA(train, test)

        recall, precision, f1 = perfrom_CART(train, test, train_label, test_label)
        cv_recall_DT.append(recall)
        cv_precision_DT.append(precision)
        cv_f1_DT.append(f1)

        recall, precision, f1 = perfrom_KNN(train, test, train_label, test_label)
        cv_recall_KNN.append(recall)
        cv_precision_KNN.append(precision)
        cv_f1_KNN.append(f1)

        recall, precision, f1 = perfrom_SVM(train, test, train_label, test_label)
        cv_recall_SVM.append(recall)
        cv_precision_SVM.append(precision)
        cv_f1_SVM.append(f1)

        recall, precision, f1 = perfrom_NB(train, test, train_label, test_label)
        cv_recall_NB.append(recall)
        cv_precision_NB.append(precision)
        cv_f1_NB.append(f1)

        recall, precision, f1 = perfrom_RF(train, test, train_label, test_label)
        cv_recall_RF.append(recall)
        cv_precision_RF.append(precision)
        cv_f1_RF.append(f1)

        
    recall_DT = np.mean(cv_recall_DT, axis= 0)
    precision_DT = np.mean(cv_precision_DT, axis= 0)
    f1_DT = np.mean(cv_f1_DT, axis= 0)

    recall_KNN = np.mean(cv_recall_KNN, axis= 0)
    precision_KNN = np.mean(cv_precision_KNN, axis= 0)
    f1_KNN = np.mean(cv_f1_KNN, axis= 0)

    recall_SVM = np.mean(cv_recall_SVM, axis= 0)
    precision_SVM = np.mean(cv_precision_SVM, axis= 0)
    f1_SVM =  np.mean(cv_f1_SVM, axis= 0)

    recall_NB = np.mean(cv_recall_NB, axis= 0)
    precision_NB = np.mean(cv_precision_NB, axis= 0)
    f1_NB = np.mean(cv_f1_NB, axis= 0)

    recall_RF = np.mean(cv_recall_RF, axis= 0)
    precision_RF = np.mean(cv_precision_RF, axis= 0)
    f1_RF = np.mean(cv_f1_RF, axis= 0)
    
    return recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN, recall_SVM, precision_SVM,\
    f1_SVM, recall_NB, precision_NB, f1_NB, recall_RF, precision_RF, f1_RF

In [12]:
def different_file_test(train, test, train_label, test_label):
    
    train, test = apply_PCA(train, test)
        
    recall_DT, precision_DT, f1_DT = perfrom_CART(train, test, train_label, test_label)
    recall_KNN, precision_KNN, f1_KNN = perfrom_KNN(train, test, train_label, test_label)
    recall_SVM, precision_SVM, f1_SVM = perfrom_SVM(train, test, train_label, test_label)
    recall_NB, precision_NB, f1_NB = perfrom_NB(train, test, train_label, test_label)
    recall_RF, precision_RF, f1_RF = perfrom_RF(train, test, train_label, test_label)

    return recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN, recall_SVM, precision_SVM,\
    f1_SVM, recall_NB, precision_NB, f1_NB, recall_RF, precision_RF, f1_RF

In [13]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name):
    repeated_recall_DT = []
    repeated_precision_DT = []
    repeated_f1_DT = []

    repeated_recall_KNN = []
    repeated_precision_KNN = []
    repeated_f1_KNN = []

    repeated_recall_SVM = []
    repeated_precision_SVM = []
    repeated_f1_SVM = []

    repeated_recall_NB = []
    repeated_precision_NB = []
    repeated_f1_NB = []

    repeated_recall_RF = []
    repeated_precision_RF = []
    repeated_f1_RF = []
    
    recall_DT= precision_DT= f1_DT= recall_KNN= precision_KNN= f1_KNN= recall_SVM= precision_SVM= f1_SVM\
    = recall_NB= precision_NB= f1_NB= recall_RF= precision_RF= f1_RF = []
    
    for i in range(10):
        if test_name == "k_fold":
            recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN, recall_SVM, precision_SVM, f1_SVM,\
            recall_NB, precision_NB, f1_NB, recall_RF, precision_RF, f1_RF = kfold_cv(data, true_label)
        else: 
            recall_DT, precision_DT, f1_DT, recall_KNN, precision_KNN, f1_KNN, recall_SVM, precision_SVM, f1_SVM,\
            recall_NB, precision_NB, f1_NB, recall_RF, precision_RF, f1_RF = different_file_test(train_data, test_data, train_label, test_label)
        
        repeated_recall_DT.append(recall_DT)
        repeated_precision_DT.append(precision_DT)
        repeated_f1_DT.append(f1_DT)

        repeated_recall_KNN.append(recall_KNN)
        repeated_precision_KNN.append(precision_KNN)
        repeated_f1_KNN.append(f1_KNN)

        repeated_recall_SVM.append(recall_SVM)
        repeated_precision_SVM.append(precision_SVM)
        repeated_f1_SVM.append(f1_SVM)

        repeated_recall_NB.append(recall_NB)
        repeated_precision_NB.append(precision_NB)
        repeated_f1_NB.append(f1_NB)

        repeated_recall_RF.append(recall_RF)
        repeated_precision_RF.append(precision_RF)
        repeated_f1_RF.append(f1_RF)
        
    print("-------DT-------")
    print("Recall:", np.median(repeated_recall_DT, axis= 0))
    print("Precision:", np.median(repeated_precision_DT, axis= 0))
    print("f1 score:", np.median(repeated_f1_DT, axis= 0))

    print("-------KNN-------")
    print("Recall:", np.median(repeated_recall_KNN, axis= 0))
    print("Precision:", np.median(repeated_precision_KNN, axis= 0))
    print("f1 score:", np.median(repeated_f1_KNN, axis= 0))

    print("-------SVM-------")
    print("Recall:", np.median(repeated_recall_SVM, axis= 0))
    print("Precision:", np.median(repeated_precision_SVM, axis= 0))
    print("f1 score:", np.median(repeated_f1_SVM, axis= 0))

    print("-------NB-------")
    print("Recall:", np.median(repeated_recall_NB, axis= 0))
    print("Precision:", np.median(repeated_precision_NB, axis= 0))
    print("f1 score:", np.median(repeated_f1_NB, axis= 0))

    print("-------RF-------")
    print("Recall:", np.median(repeated_recall_RF, axis= 0))
    print("Precision:", np.median(repeated_precision_RF, axis= 0))
    print("f1 score:", np.median(repeated_f1_RF, axis= 0))

In [14]:
def calcFeatureImp(feature_vec, label_vec, feature_names_param, repeat=10):
    header_str, output= '', ''
    for name_ in feature_names_param:
        header_str = header_str + name_ + ','
    theRndForestModel = RandomForestClassifier()
    theRndForestModel.fit(feature_vec, label_vec)
    feat_imp_vector=theRndForestModel.feature_importances_

    for ind_ in range(repeat):
        for imp_vec_index in range(len(feat_imp_vector)):
            feat_imp_val = round(feat_imp_vector[imp_vec_index], 5)
            output = output +  str(feat_imp_val) + ','
        output = output + '\n'
    output_status = header_str + '\n' + output
    #print ("Feature importance: ", output_status)
    
    feat_imp_vector=list(feat_imp_vector)
    sorted_feat_imp_vector= [x_ for x_ in feat_imp_vector]
    sorted_feat_imp_vector.sort(reverse=True)
    
    sorted_feature_name = []
    for feat_imp_val in sorted_feat_imp_vector:
        feat_index = feat_imp_vector.index(feat_imp_val) 
        sorted_feature_name.append(feature_names_param[feat_index])
        
    print ("sorted feature names: ", sorted_feature_name)
    print ("sorted feature importance: ", sorted_feat_imp_vector)

In [15]:
data, true_label, train_data, test_data, train_label, test_label = [], [], [], [], [], [] # necessary variables

In [16]:
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)
data = data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") #repeat kfold 10 times and report avarage performance

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # find feature importance

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.9678998 0.7902211 0.8400565]
Precision: [0.9678998 0.7902211 0.8400565]
f1 score: [0.9678998 0.7902211 0.8400565]
-------KNN-------
Recall: [0.9741529  0.862543   0.86627847]
Precision: [0.9741529  0.862543   0.86627847]
f1 score: [0.9741529  0.862543   0.86627847]
-------SVM-------
Recall: [0.9348861 0.4864154 0.6889287]
Precision: [0.9348861 0.4864154 0.6889287]
f1 score: [0.9348861 0.4864154 0.6889287]
-------NB-------
Recall: [0.8858469  0.15109083 0.30274132]
Precision: [0.8858469  0.15109083 0.30274132]
f1 score: [0.8858469  0.15109083 0.30274132]
-------RF-------
Recall: [0.9674127  0.76225644 0.8386735 ]
Precision: [0.9674127  0.76225644 0.8386735 ]
f1 score: [0.9674127  0.76225644 0.8386735 ]
sorted feature names:  ['SLOC', 'HARD_CODE', 'ATTR', 'URL', 'INCL', 'ENS', 'COMMENT', 'REQ', 'FILE', 'CMD', 'FILE_MODE', 'SSH']
sorted feature importance:  [0.22213144284417968, 0.16445270750604168

In [17]:
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)
data = data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") #repeat kfold 10 times and report avarage performance

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # find feature importance

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.8406242 0.1596147 0.6469656]
Precision: [0.8406242 0.1596147 0.6469656]
f1 score: [0.8406242 0.1596147 0.6469656]
-------KNN-------
Recall: [0.86863786 0.08256557 0.70063764]
Precision: [0.86863786 0.08256557 0.70063764]
f1 score: [0.86863786 0.08256557 0.70063764]
-------SVM-------
Recall: [0.7913597  0.03759804 0.6411825 ]
Precision: [0.7913597  0.03759804 0.6411825 ]
f1 score: [0.7913597  0.03759804 0.6411825 ]
-------NB-------
Recall: [0.82843363 0.09645667 0.43564588]
Precision: [0.82843363 0.09645667 0.43564588]
f1 score: [0.82843363 0.09645667 0.43564588]
-------RF-------
Recall: [0.87092936 0.11049214 0.6879132 ]
Precision: [0.87092936 0.11049214 0.6879132 ]
f1 score: [0.87092936 0.11049214 0.6879132 ]
sorted feature names:  ['HARD_CODE', 'SLOC', 'ATTR', 'COMMENT', 'URL', 'ENS', 'INCL', 'REQ', 'CMD', 'FILE', 'FILE_MODE', 'SSH']
sorted feature importance:  [0.22742768440480246, 0.165736

In [18]:
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)
data = data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat kfold 10 times and report avarage performance

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # find feature importance

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.81663704 0.15337972 0.44466323]
Precision: [0.81663704 0.15337972 0.44466323]
f1 score: [0.81663704 0.15337972 0.44466323]
-------KNN-------
Recall: [0.8496855  0.15563023 0.4797729 ]
Precision: [0.8496855  0.15563023 0.4797729 ]
f1 score: [0.8496855  0.15563023 0.4797729 ]
-------SVM-------
Recall: [0.7747448  0.01766917 0.4555634 ]
Precision: [0.7747448  0.01766917 0.4555634 ]
f1 score: [0.7747448  0.01766917 0.4555634 ]
-------NB-------
Recall: [0.84555846 0.02102564 0.34032562]
Precision: [0.84555846 0.02102564 0.34032562]
f1 score: [0.84555846 0.02102564 0.34032562]
-------RF-------
Recall: [0.85109967 0.12539047 0.43760198]
Precision: [0.85109967 0.12539047 0.43760198]
f1 score: [0.85109967 0.12539047 0.43760198]
sorted feature names:  ['SLOC', 'HARD_CODE', 'ATTR', 'COMMENT', 'ENS', 'INCL', 'CMD', 'REQ', 'FILE', 'URL', 'FILE_MODE', 'SSH']
sorted feature importance:  [0.21035711321460082,

In [19]:
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report avarage performance

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.7008945  0.06162513 0.36012408]
Precision: [0.7008945  0.06162513 0.36012408]
f1 score: [0.7008945  0.06162513 0.36012408]
-------KNN-------
Recall: [0.7675427  0.03030303 0.37278527]
Precision: [0.7675427  0.03030303 0.37278527]
f1 score: [0.7675427  0.03030303 0.37278527]
-------SVM-------
Recall: [0.6794835  0.01149425 0.4012841 ]
Precision: [0.6794835  0.01149425 0.4012841 ]
f1 score: [0.6794835  0.01149425 0.4012841 ]
-------NB-------
Recall: [0.76548326 0.         0.42279655]
Precision: [0.76548326 0.         0.42279655]
f1 score: [0.76548326 0.         0.42279655]
-------RF-------
Recall: [0.7701546  0.04253656 0.32992882]
Precision: [0.7701546  0.04253656 0.32992882]
f1 score: [0.7701546  0.04253656 0.32992882]


In [20]:
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report avarage performance

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.8106023  0.1027635  0.31791216]
Precision: [0.8106023  0.1027635  0.31791216]
f1 score: [0.8106023  0.1027635  0.31791216]
-------KNN-------
Recall: [0.838895   0.04545455 0.31271878]
Precision: [0.838895   0.04545455 0.31271878]
f1 score: [0.838895   0.04545455 0.31271878]
-------SVM-------
Recall: [0.8226209  0.0569395  0.30417496]
Precision: [0.8226209  0.0569395  0.30417496]
f1 score: [0.8226209  0.0569395  0.30417496]
-------NB-------
Recall: [0.83452916 0.03921569 0.39179486]
Precision: [0.83452916 0.03921569 0.39179486]
f1 score: [0.83452916 0.03921569 0.39179486]
-------RF-------
Recall: [0.8404225  0.031625   0.27238744]
Precision: [0.8404225  0.031625   0.27238744]
f1 score: [0.8404225  0.031625   0.27238744]


In [21]:
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report avarage performance

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.7727953  0.07536456 0.31451613]
Precision: [0.7727953  0.07536456 0.31451613]
f1 score: [0.7727953  0.07536456 0.31451613]
-------KNN-------
Recall: [0.84227383 0.13934426 0.3553719 ]
Precision: [0.84227383 0.13934426 0.3553719 ]
f1 score: [0.84227383 0.13934426 0.3553719 ]
-------SVM-------
Recall: [0.865566   0.01724138 0.40636042]
Precision: [0.865566   0.01724138 0.40636042]
f1 score: [0.865566   0.01724138 0.40636042]
-------NB-------
Recall: [0.87165976 0.09160306 0.29680365]
Precision: [0.87165976 0.09160306 0.29680365]
f1 score: [0.87165976 0.09160306 0.29680365]
-------RF-------
Recall: [0.8436268  0.11232323 0.3647645 ]
Precision: [0.8436268  0.11232323 0.3647645 ]
f1 score: [0.8436268  0.11232323 0.3647645 ]


In [22]:
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report avarage performance

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.7809032  0.10734347 0.39433157]
Precision: [0.7809032  0.10734347 0.39433157]
f1 score: [0.7809032  0.10734347 0.39433157]
-------KNN-------
Recall: [0.81195575 0.05109489 0.44387317]
Precision: [0.81195575 0.05109489 0.44387317]
f1 score: [0.81195575 0.05109489 0.44387317]
-------SVM-------
Recall: [0.76094276 0.00843882 0.4396985 ]
Precision: [0.76094276 0.00843882 0.4396985 ]
f1 score: [0.76094276 0.00843882 0.4396985 ]
-------NB-------
Recall: [0.83483756 0.06349207 0.3916501 ]
Precision: [0.83483756 0.06349207 0.3916501 ]
f1 score: [0.83483756 0.06349207 0.3916501 ]
-------RF-------
Recall: [0.813097   0.04181818 0.41307005]
Precision: [0.813097   0.04181818 0.41307005]
f1 score: [0.813097   0.04181818 0.41307005]


In [23]:
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report avarage performance

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.7547518  0.12381584 0.2982396 ]
Precision: [0.7547518  0.12381584 0.2982396 ]
f1 score: [0.7547518  0.12381584 0.2982396 ]
-------KNN-------
Recall: [0.84279144 0.10309278 0.33639145]
Precision: [0.84279144 0.10309278 0.33639145]
f1 score: [0.84279144 0.10309278 0.33639145]
-------SVM-------
Recall: [0.8599841  0.14285715 0.42307693]
Precision: [0.8599841  0.14285715 0.42307693]
f1 score: [0.8599841  0.14285715 0.42307693]
-------NB-------
Recall: [0.8957447  0.         0.27722773]
Precision: [0.8957447  0.         0.27722773]
f1 score: [0.8957447  0.         0.27722773]
-------RF-------
Recall: [0.8757363  0.06649289 0.3824259 ]
Precision: [0.8757363  0.06649289 0.3824259 ]
f1 score: [0.8757363  0.06649289 0.3824259 ]


In [24]:
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report avarage performance

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.75794923 0.11698671 0.3776664 ]
Precision: [0.75794923 0.11698671 0.3776664 ]
f1 score: [0.75794923 0.11698671 0.3776664 ]
-------KNN-------
Recall: [0.79317075 0.06896552 0.37959865]
Precision: [0.79317075 0.06896552 0.37959865]
f1 score: [0.79317075 0.06896552 0.37959865]
-------SVM-------
Recall: [0.7648786  0.01324503 0.45766872]
Precision: [0.7648786  0.01324503 0.45766872]
f1 score: [0.7648786  0.01324503 0.45766872]
-------NB-------
Recall: [0.7990184  0.01503759 0.4227273 ]
Precision: [0.7990184  0.01503759 0.4227273 ]
f1 score: [0.7990184  0.01503759 0.4227273 ]
-------RF-------
Recall: [0.799718   0.06351645 0.35897547]
Precision: [0.799718   0.06351645 0.35897547]
f1 score: [0.799718   0.06351645 0.35897547]
