In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    ``filename`` is handed to ``pandas.read_csv`` unchanged, and the resulting
    DataFrame is returned.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Map textual status labels to integer class codes.

    Encoding: ``"ONLY_ONE"`` -> 1, ``"NEUTRAL"`` -> 0, any other value -> 2.

    Parameters
    ----------
    label : sequence or pandas.Series
        One-dimensional collection of labels. Elements are read by position,
        so a Series with a non-default index (for example after filtering
        rows) is handled correctly.

    Returns
    -------
    numpy.ndarray
        Integer array (``dtype='int'``) with one code per input element, in
        input order.
    """
    codes = {"ONLY_ONE": 1, "NEUTRAL": 0}
    # Going through a NumPy object array gives positional iteration. The
    # previous ``label[i]`` was a *label* lookup on a pandas Series, which
    # raises KeyError or reads the wrong row when the index is not 0..n-1.
    values = np.asarray(label, dtype=object)
    return np.array([codes.get(value, 2) for value in values], dtype='int')

In [4]:
def apply_PCA(train, test):
    """Standardize the features and project both sets onto principal components.

    The scaler and the PCA are each fitted on ``train`` only and then applied
    to both ``train`` and ``test``, so nothing is learned from the test set.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is affected by feature scale, so the features are standardized first.
    standardizer = StandardScaler().fit(train)
    scaled_train, scaled_test = (standardizer.transform(part) for part in (train, test))

    # PCA(.95): choose the minimum number of principal components such that
    # 95% of the variance is retained.
    reducer = PCA(.95).fit(scaled_train)
    return reducer.transform(scaled_train), reducer.transform(scaled_test)

In [5]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a decision tree on the training split and score it on the test split.

    Returns the ``(recall, precision, f1)`` tuple produced by
    ``measure_performance`` for the predictions on ``test``.
    """
    cart = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )
    predictions = cart.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [6]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a 5-nearest-neighbours classifier and score it on the test split.

    Neighbours are weighted by distance (``weights='distance'``). Returns the
    ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
    predictions = knn.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [7]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel SVM on the training split and score it on the test split.

    Class 2 is given five times the weight of classes 0 and 1 through
    ``class_weight``. Returns the ``(recall, precision, f1)`` tuple produced
    by ``measure_performance``.
    """
    per_class_weight = {0: 1, 1: 1, 2: 5}
    svc = svm.SVC(gamma='auto', C=20.0, kernel='rbf', class_weight=per_class_weight)
    predictions = svc.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [8]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    The classifier is created with its default settings. Returns the
    ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    predictions = GaussianNB().fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [9]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree random forest and score it on the test split.

    No ``random_state`` is passed to the forest. Returns the
    ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    forest = RandomForestClassifier(n_estimators=10, criterion='gini')
    predictions = forest.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [10]:
def measure_performance(true_label, predicted_label):
    """Compute macro-averaged recall, precision and F1 for one set of predictions.

    Parameters
    ----------
    true_label : array-like
        Ground-truth class labels.
    predicted_label : array-like
        Predicted class labels, in the same order as ``true_label``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)``, each computed with ``average="macro"``.
        Note the order: recall comes first.
    """
    # A textual classification_report used to be built here but was never
    # printed or returned, so that wasted computation has been removed.
    recall = recall_score(true_label, predicted_label, average="macro")
    precision = precision_score(true_label, predicted_label, average="macro")
    f1 = f1_score(true_label, predicted_label, average="macro")
    return recall, precision, f1

In [11]:
def kfold_cv(data, true_label):
    """Run one shuffled 10-fold cross-validation over all five classifiers.

    For every fold the data is split, passed through ``apply_PCA`` (fitted on
    the training part only) and evaluated with the decision tree, KNN, SVM,
    naive Bayes and random forest helpers.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    true_label : array-like
        Class label per row of ``data``, in the same row order.

    Returns
    -------
    tuple
        Fifteen fold-averaged scores in this fixed order:
        ``recall, precision, f1`` for DT, then KNN, SVM, NB and RF.
    """
    # 10 fold cv; the fixed random_state makes the fold assignment repeatable.
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    # The order here defines the order of the returned tuple (DT, KNN, SVM, NB, RF).
    evaluators = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)
    fold_scores = [[] for _ in evaluators]

    # KFold.split yields *positional* indices, so rows and labels must be
    # selected by position. The previous ``data.loc[...]`` / ``true_label[...]``
    # were label-based and broke for any index other than 0..n-1.
    rows = data.iloc if hasattr(data, "iloc") else data
    labels = np.asarray(true_label)

    for train_index, test_index in kf.split(data):
        train, test = rows[train_index], rows[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        train, test = apply_PCA(train, test)

        for scores, evaluate in zip(fold_scores, evaluators):
            # Each evaluator returns (recall, precision, f1) for this fold.
            scores.append(evaluate(train, test, train_label, test_label))

    # Average each metric over the folds and flatten to the 15-value tuple.
    return tuple(
        np.mean([fold[metric] for fold in scores])
        for scores in fold_scores
        for metric in range(3)
    )

In [12]:
def different_file_test(train, test, train_label, test_label):
    """Train on one data set and evaluate on another with all five classifiers.

    Both sets go through ``apply_PCA`` (fitted on ``train`` only) before the
    classifiers are run.

    Returns a 15-value tuple: ``recall, precision, f1`` for DT, then KNN,
    SVM, NB and RF, in that order.
    """
    reduced_train, reduced_test = apply_PCA(train, test)

    collected = []
    # The evaluator order fixes the layout of the returned tuple.
    for evaluate in (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF):
        collected.extend(evaluate(reduced_train, reduced_test, train_label, test_label))
    return tuple(collected)

In [13]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name, n_repeats=10):
    """Repeat an evaluation several times and print the averaged scores.

    Parameters
    ----------
    data, true_label
        Features and labels of a single data set. Only used when
        ``test_name == "k_fold"``, where they are passed to ``kfold_cv``.
    train_data, test_data, train_label, test_label
        Separate training and test sets. Only used for any other
        ``test_name``, where they are passed to ``different_file_test``.
    test_name : str
        ``"k_fold"`` selects cross-validation; every other value selects the
        train-on-one / test-on-another evaluation.
    n_repeats : int, optional
        Number of repetitions to average over. Defaults to 10, the previously
        hard-coded value.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1 (there would be nothing to average).

    Returns
    -------
    None
        Mean recall, precision and f1 are printed for DT, KNN, SVM, NB and RF.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    # Both evaluation functions return 15 values: (recall, precision, f1) for
    # each classifier, in exactly this classifier order.
    classifier_names = ("DT", "KNN", "SVM", "NB", "RF")

    runs = []
    for _ in range(n_repeats):
        if test_name == "k_fold":
            scores = kfold_cv(data, true_label)
        else:
            scores = different_file_test(train_data, test_data, train_label, test_label)
        runs.append(scores)

    for position, name in enumerate(classifier_names):
        first = 3 * position  # offset of this classifier's recall in each run
        print("-------" + name + "-------")
        print("Recall:", np.mean([run[first] for run in runs]))
        print("Precision:", np.mean([run[first + 1] for run in runs]))
        print("f1 score:", np.mean([run[first + 2] for run in runs]))

In [14]:
# Placeholders for the six inputs of repeated_test. Every call passes all of
# them, although each evaluation mode only reads some.
data = []
true_label = []
train_data = []
test_data = []
train_label = []
test_label = []

In [15]:
# Repeated 10-fold cross-validation on the Mozilla data set.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)

# Encode the target column before it is dropped from the feature frame.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
# Drop the path and status columns; the remaining columns are the features.
data = data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat k-fold 10 times and report the average performance.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold")

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.8936278567090504
Precision: 0.854439258644591
f1 score: 0.8662168775739522
-------KNN-------
Recall: 0.8996578000728326
Precision: 0.9078891938789528
f1 score: 0.9009914677819351
-------SVM-------
Recall: 0.6927078668314843
Precision: 0.7713794250479379
f1 score: 0.7034100740450092
-------NB-------
Recall: 0.4417971611795387
Precision: 0.4819069337609035
f1 score: 0.4465596739043945
-------RF-------
Recall: 0.832945573186948
Precision: 0.901820565109585
f1 score: 0.8574742269186058


In [16]:
# Repeated 10-fold cross-validation on the OpenStack data set.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)

# Encode the target column before it is dropped from the feature frame.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
# Drop the path and status columns; the remaining columns are the features.
data = data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat k-fold 10 times and report the average performance.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold")

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.5489996320703077
Precision: 0.5481927483612656
f1 score: 0.5451432184647201
-------KNN-------
Recall: 0.5465068504006748
Precision: 0.584485381016399
f1 score: 0.5506136828988317
-------SVM-------
Recall: 0.526456027178722
Precision: 0.49377690310870676
f1 score: 0.49004672608296546
-------NB-------
Recall: 0.44290181209010504
Precision: 0.5589356550283747
f1 score: 0.45351203595714445
-------RF-------
Recall: 0.5461687459175607
Precision: 0.602767601416278
f1 score: 0.5549420331549549


In [17]:
# Repeated 10-fold cross-validation on the Wikimedia data set.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)

# Encode the target column before it is dropped from the feature frame.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
# Drop the path and status columns; the remaining columns are the features.
data = data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat k-fold 10 times and report the average performance.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold")

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.47451198452482213
Precision: 0.4755073419906199
f1 score: 0.472963151033012
-------KNN-------
Recall: 0.4806062104426288
Precision: 0.5453234342467559
f1 score: 0.49502956109292817
-------SVM-------
Recall: 0.46268595417217817
Precision: 0.41064505883862407
f1 score: 0.4159924656348165
-------NB-------
Recall: 0.40259497439713093
Precision: 0.44428341807287347
f1 score: 0.4023032320496731
-------RF-------
Recall: 0.4642459456293507
Precision: 0.555923339757609
f1 score: 0.4784391380690634


In [18]:
# Cross-project evaluation: train on Mozilla, test on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
# Encode the target column before it is dropped from the feature frame.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat the cross test 10 times and report the average performance.
# `data` and `true_label` are passed along but not read in this mode.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.3770914298506131
Precision: 0.38010284963372376
f1 score: 0.37410694813214806
-------KNN-------
Recall: 0.3904075197375755
Precision: 0.39642262538356865
f1 score: 0.39021034315994685
-------SVM-------
Recall: 0.37550826295940926
Precision: 0.3642754496886976
f1 score: 0.364087287580067
-------NB-------
Recall: 0.4017021944685011
Precision: 0.3916150557671755
f1 score: 0.39609327994487364
-------RF-------
Recall: 0.3735516274367126
Precision: 0.38847079405949725
f1 score: 0.3675040308301512


In [19]:
# Cross-project evaluation: train on Mozilla, test on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
# In this cell the shape is printed before the "Train Data" banner.
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
# Encode the target column before it is dropped from the feature frame.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat the cross test 10 times and report the average performance.
# `data` and `true_label` are passed along but not read in this mode.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.4089370476188033
Precision: 0.41597237675804866
f1 score: 0.4113931756330487
-------KNN-------
Recall: 0.3960459858903888
Precision: 0.42829711760193145
f1 score: 0.3990227865144219
-------SVM-------
Recall: 0.3952525617079256
Precision: 0.4108827515218526
f1 score: 0.3945784641151898
-------NB-------
Recall: 0.4216695220772559
Precision: 0.4498678562886213
f1 score: 0.4218465686838148
-------RF-------
Recall: 0.38381108648707274
Precision: 0.4401328401572259
f1 score: 0.3816224610231428


In [20]:
# Cross-project evaluation: train on OpenStack, test on Mozilla.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# Encode the target column before it is dropped from the feature frame.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat the cross test 10 times and report the average performance.
# `data` and `true_label` are passed along but not read in this mode.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.43341358903078736
Precision: 0.3942033568230024
f1 score: 0.388409574494584
-------KNN-------
Recall: 0.4733680337552954
Precision: 0.43514960479499487
f1 score: 0.44566332739225756
-------SVM-------
Recall: 0.4856271821271413
Precision: 0.42735768593288376
f1 score: 0.4297226136914875
-------NB-------
Recall: 0.4270431522613949
Precision: 0.4471052749095003
f1 score: 0.4200221576857574
-------RF-------
Recall: 0.461962187378956
Precision: 0.41962836206140164
f1 score: 0.42756314074740376


In [21]:
# Cross-project evaluation: train on OpenStack, test on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# Encode the target column before it is dropped from the feature frame.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat the cross test 10 times and report the average performance.
# `data` and `true_label` are passed along but not read in this mode.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.4347967534594406
Precision: 0.4242558790040537
f1 score: 0.4271261816799591
-------KNN-------
Recall: 0.44440890391720417
Precision: 0.44331254776734125
f1 score: 0.43564127463258845
-------SVM-------
Recall: 0.44942513361626746
Precision: 0.4011966262538711
f1 score: 0.40302669065682445
-------NB-------
Recall: 0.428254555814337
Precision: 0.47438793638476734
f1 score: 0.42999323600733186
-------RF-------
Recall: 0.43343287695656796
Precision: 0.4354113603935271
f1 score: 0.4248255219063


In [22]:
# Cross-project evaluation: train on Wikimedia, test on Mozilla.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# Encode the target column before it is dropped from the feature frame.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat the cross test 10 times and report the average performance.
# `data` and `true_label` are passed along but not read in this mode.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.43163033969103975
Precision: 0.39639212684799613
f1 score: 0.3879572476023646
-------KNN-------
Recall: 0.42347884856927304
Precision: 0.4426324317292113
f1 score: 0.4274252106189893
-------SVM-------
Recall: 0.5204095302270495
Precision: 0.49739887422507073
f1 score: 0.4753060516783668
-------NB-------
Recall: 0.39308248551946035
Precision: 0.4019187800889135
f1 score: 0.3909908012077804
-------RF-------
Recall: 0.43705641257516376
Precision: 0.45552435402193325
f1 score: 0.4439018317147346


In [23]:
# Cross-project evaluation: train on Wikimedia, test on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# Encode the target column before it is dropped from the feature frame.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'], axis='columns')

# Repeat the cross test 10 times and report the average performance.
# `data` and `true_label` are passed along but not read in this mode.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "different_file_test")

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.4327821499385996
Precision: 0.43572341994230934
f1 score: 0.42103644685611546
-------KNN-------
Recall: 0.4098970555201733
Precision: 0.4495371833928144
f1 score: 0.4139116370520181
-------SVM-------
Recall: 0.4194227255119136
Precision: 0.41393147255455665
f1 score: 0.4119307714304363
-------NB-------
Recall: 0.4143625517795123
Precision: 0.5002094612147998
f1 score: 0.41226109054007026
-------RF-------
Recall: 0.4036220889999921
Precision: 0.45658845924896696
f1 score: 0.40527814045221533
