In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file (filename):
    """Load a CSV into a pandas DataFrame.

    Parameters
    ----------
    filename : anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, read with pandas' default options.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Map string status labels to integer class ids.

    The mapping is ``"NEUTRAL" -> 0``, ``"ONLY_ONE" -> 1`` and every other
    value ``-> 2``.

    Parameters
    ----------
    label : sequence of labels (list, numpy array, pandas Series, ...).
        Elements are read positionally, so a pandas Series with a
        non-default index (for example after filtering rows) is handled
        correctly instead of being looked up by index label.

    Returns
    -------
    numpy.ndarray
        Integer array with one class id per input element, in input order.
    """
    # Object dtype keeps each element as-is so the equality tests below are
    # plain element-wise Python comparisons.
    values = np.asarray(label, dtype=object)

    # Start from the catch-all class and overwrite the two named ones.
    converted_label = np.full(values.shape, 2, dtype='int')
    converted_label[values == "ONLY_ONE"] = 1
    converted_label[values == "NEUTRAL"] = 0
    return converted_label

In [4]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a Gini-criterion decision tree on the training split and score it.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` as produced by ``measure_performance``
        for the predictions on ``test``.
    """
    cart_options = dict(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )
    model = tree.DecisionTreeClassifier(**cart_options)
    model.fit(train, train_label)
    return measure_performance(test_label, model.predict(test))

In [5]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a distance-weighted 5-nearest-neighbours classifier and score it.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` as produced by ``measure_performance``
        for the predictions on ``test``.
    """
    model = KNeighborsClassifier(weights='distance', n_neighbors=5)
    model.fit(train, train_label)
    predictions = model.predict(test)
    return measure_performance(test_label, predictions)

In [6]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel SVM (C=20, gamma='auto') and score it.

    Class 2 is given a weight of 5 while classes 0 and 1 keep weight 1.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` as produced by ``measure_performance``
        for the predictions on ``test``.
    """
    per_class_weight = {0: 1, 1: 1, 2: 5}
    model = svm.SVC(
        kernel='rbf',
        C=20.0,
        gamma='auto',
        class_weight=per_class_weight,
    )
    model.fit(train, train_label)
    predictions = model.predict(test)
    return measure_performance(test_label, predictions)

In [7]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier with default settings and score it.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` as produced by ``measure_performance``
        for the predictions on ``test``.
    """
    model = GaussianNB()
    model.fit(train, train_label)
    return measure_performance(test_label, model.predict(test))

In [8]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree Gini random forest and score it.

    No ``random_state`` is passed to the forest, so it is not seeded here.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` as produced by ``measure_performance``
        for the predictions on ``test``.
    """
    forest = RandomForestClassifier(criterion='gini', n_estimators=10)
    forest.fit(train, train_label)
    predictions = forest.predict(test)
    return measure_performance(test_label, predictions)

In [9]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for classes 0, 1 and 2.

    Each metric is requested with ``average=None`` and ``labels=[0, 1, 2]``,
    so every returned array has one entry per class in that fixed order.

    Parameters
    ----------
    true_label : ground-truth class ids.
    predicted_label : predicted class ids, aligned with ``true_label``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` -- note that recall comes first.
    """
    # Fixed label list so the output always has three entries in the same order.
    class_ids = [0, 1, 2]
    precision = precision_score(true_label, predicted_label, average=None, labels=class_ids)
    recall = recall_score(true_label, predicted_label, average=None, labels=class_ids)
    f1 = f1_score(true_label, predicted_label, average=None, labels=class_ids)
    return recall, precision, f1

In [10]:
def kfold_cv(data, true_label):
    """Run 10-fold cross-validation for the five classifiers on one dataset.

    For every fold the CART, KNN, SVM, NB and RF evaluators are run (in that
    order) on the same train/test split, and the per-class scores are then
    averaged over the folds.

    Parameters
    ----------
    data : pandas object exposing ``.iloc`` and ``.values``.
        Its values are reshaped to a single feature column, so this is
        presumably a one-column Series -- confirm before passing a
        multi-column DataFrame.
    true_label : array-like of class ids, aligned positionally with ``data``.

    Returns
    -------
    tuple
        Fifteen entries ordered as
        ``recall, precision, f1`` for DT, then KNN, SVM, NB and RF.
        Each entry is the mean over folds of the evaluator's score.
    """
    # 10 fold cv; the fixed seed gives the same shuffled split on every call
    kf = KFold(n_splits=10, shuffle = True, random_state = 7)

    evaluators = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)
    # One (recalls, precisions, f1s) accumulator per evaluator, in evaluator order.
    fold_scores = [([], [], []) for _ in evaluators]

    # KFold yields positional indices, so labels are indexed positionally too.
    labels = np.asarray(true_label)

    for train_index, test_index in kf.split(data):
        # .iloc (not .loc): the split indices are positions, not index labels.
        train = data.iloc[train_index].values.reshape(-1, 1)
        test = data.iloc[test_index].values.reshape(-1, 1)
        train_label, test_label = labels[train_index], labels[test_index]

        for evaluate, (recalls, precisions, f1s) in zip(evaluators, fold_scores):
            recall, precision, f1 = evaluate(train, test, train_label, test_label)
            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)

    # Average each metric across folds (axis 0 is the fold axis).
    averaged = []
    for recalls, precisions, f1s in fold_scores:
        averaged.append(np.mean(recalls, axis=0))
        averaged.append(np.mean(precisions, axis=0))
        averaged.append(np.mean(f1s, axis=0))

    return tuple(averaged)

In [11]:
def different_file_test(train, test, train_label, test_label):
    """Train the five classifiers on one dataset and score them on another.

    Parameters
    ----------
    train, test : objects exposing ``.values``; the values are reshaped to a
        single feature column before being handed to the classifiers.
    train_label, test_label : class labels for the two datasets.

    Returns
    -------
    tuple
        Fifteen entries ordered as ``recall, precision, f1`` for DT, then
        KNN, SVM, NB and RF.
    """
    train_features = train.values.reshape(-1, 1)
    test_features = test.values.reshape(-1, 1)

    collected = []
    # Evaluators run in the order their scores appear in the returned tuple.
    for evaluate in (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF):
        recall, precision, f1 = evaluate(train_features, test_features, train_label, test_label)
        collected += [recall, precision, f1]

    return tuple(collected)

In [12]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name, n_repeats=10):
    """Repeat an evaluation several times and print the median scores.

    Parameters
    ----------
    data, true_label : inputs forwarded to ``kfold_cv``; only used when
        ``test_name == "k_fold"``.
    train_data, test_data, train_label, test_label : inputs forwarded to
        ``different_file_test``; used for every other ``test_name``.
    test_name : ``"k_fold"`` selects cross-validation on a single dataset;
        any other value selects the train-on-one / test-on-another run.
    n_repeats : number of repetitions (default 10).

    Returns
    -------
    None
        For each of DT, KNN, SVM, NB and RF the element-wise median over the
        repetitions of recall, precision and F1 is printed.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1 (a median over zero runs is undefined).
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    # Order matches the flat 15-tuple returned by kfold_cv / different_file_test.
    model_names = ("DT", "KNN", "SVM", "NB", "RF")
    metric_titles = ("Recall:", "Precision:", "f1 score:")
    n_metrics = len(metric_titles)

    # history[model][metric] collects one score array per repetition.
    history = [[[] for _ in metric_titles] for _ in model_names]

    for _ in range(n_repeats):
        if test_name == "k_fold":
            flat_scores = kfold_cv(data, true_label)
        else:
            flat_scores = different_file_test(train_data, test_data, train_label, test_label)

        for model_pos, per_metric in enumerate(history):
            for metric_pos, runs in enumerate(per_metric):
                runs.append(flat_scores[model_pos * n_metrics + metric_pos])

    for name, per_metric in zip(model_names, history):
        print("-------" + name + "-------")
        for title, runs in zip(metric_titles, per_metric):
            print(title, np.median(runs, axis= 0))

In [13]:
# Empty placeholders so that every repeated_test call can pass all six inputs:
# the "k_fold" mode reads data/true_label, any other mode reads the train/test ones.
data, true_label, train_data, test_data, train_label, test_label = [], [], [], [], [], [] # necessary variables

In [14]:
# 10-fold cross-validation on COLOCATED_MOZILLA.csv with SLOC as the only feature.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
data = data['SLOC']  # from here on `data` is just the SLOC column

repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat k-fold CV 10 times and print the per-class median scores

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.96210218 0.06019231 0.22488691]
Precision: [0.8534994  0.19166667 0.48790043]
f1 score: [0.90435788 0.08929107 0.30651157]
-------KNN-------
Recall: [0.95297456 0.08281136 0.2531657 ]
Precision: [0.85403148 0.27928571 0.48514985]
f1 score: [0.90055903 0.11625566 0.33049221]
-------SVM-------
Recall: [0.80370029 0.01435897 0.54844305]
Precision: [0.8874475  0.13333333 0.25395369]
f1 score: [0.84228078 0.02539683 0.34135721]
-------NB-------
Recall: [0.97659539 0.         0.23462136]
Precision: [0.84505128 0.         0.54757937]
f1 score: [0.90567925 0.         0.31992986]
-------RF-------
Recall: [0.95608478 0.11223901 0.27751573]
Precision: [0.86200103 0.26115079 0.4937235 ]
f1 score: [0.90638727 0.14444249 0.35091662]


In [15]:
# 10-fold cross-validation on COLOCATED_OPENSTACK.csv with SLOC as the only feature.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
data = data['SLOC']  # from here on `data` is just the SLOC column

repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat k-fold CV 10 times and print the per-class median scores

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.89699112 0.         0.2958275 ]
Precision: [0.71935324 0.         0.53377049]
f1 score: [0.79802995 0.         0.37873898]
-------KNN-------
Recall: [0.86004416 0.         0.33515689]
Precision: [0.72342269 0.         0.49110371]
f1 score: [0.78494013 0.         0.39428036]
-------SVM-------
Recall: [0.45588231 0.         0.80645252]
Precision: [0.822132   0.         0.36365602]
f1 score: [0.58549225 0.         0.50003528]
-------NB-------
Recall: [0.95102357 0.         0.20834582]
Precision: [0.70762499 0.         0.60517278]
f1 score: [0.81099248 0.         0.30663692]
-------RF-------
Recall: [0.87659064 0.         0.33164945]
Precision: [0.72583103 0.         0.51475748]
f1 score: [0.79352737 0.         0.40058016]


In [16]:
# 10-fold cross-validation on COLOCATED_WIKIMEDIA.csv with SLOC as the only feature.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
data = data['SLOC']  # from here on `data` is just the SLOC column

repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat k-fold CV 10 times and print the per-class median scores

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.94863784 0.         0.1771144 ]
Precision: [0.76486975 0.         0.43036979]
f1 score: [0.84670644 0.         0.24716962]
-------KNN-------
Recall: [0.91146763 0.00357143 0.20982627]
Precision: [0.76588354 0.00909091 0.36974359]
f1 score: [0.83201668 0.00512821 0.26280522]
-------SVM-------
Recall: [0.65710768 0.         0.67799173]
Precision: [0.84220501 0.         0.30687959]
f1 score: [0.73785139 0.         0.42109328]
-------NB-------
Recall: [0.96443803 0.         0.21728116]
Precision: [0.76386633 0.         0.56691324]
f1 score: [0.85244551 0.         0.31224473]
-------RF-------
Recall: [0.92991702 0.         0.22604941]
Precision: [0.77034632 0.         0.42550372]
f1 score: [0.84226177 0.         0.29096101]


In [17]:
# Cross-dataset run: train on COLOCATED_MOZILLA.csv, test on COLOCATED_OPENSTACK.csv (SLOC only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
train_data = train_data['SLOC']  # keep only the SLOC column as the feature

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# data / true_label are passed only to fill the signature; this mode does not read them.
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times and print the per-class median scores

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.87432578 0.07751938 0.2765685 ]
Precision: [0.72269282 0.0862069  0.53333333]
f1 score: [0.79131072 0.08163265 0.36424958]
-------KNN-------
Recall: [0.84088457 0.07751938 0.30857875]
Precision: [0.72376973 0.08333333 0.49183673]
f1 score: [0.77794411 0.08032129 0.37922895]
-------SVM-------
Recall: [0.70711974 0.02325581 0.38412292]
Precision: [0.70903191 0.07317073 0.34324943]
f1 score: [0.70807453 0.03529412 0.36253776]
-------NB-------
Recall: [0.87270766 0.         0.33546735]
Precision: [0.72686433 0.         0.48698885]
f1 score: [0.79313725 0.         0.39727066]
-------RF-------
Recall: [0.83522114 0.08914729 0.31049936]
Precision: [0.7276797  0.07980703 0.49045066]
f1 score: [0.77838929 0.08492952 0.37832393]


In [18]:
# Cross-dataset run: train on COLOCATED_MOZILLA.csv, test on COLOCATED_WIKIMEDIA.csv (SLOC only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
train_data = train_data['SLOC']  # keep only the SLOC column as the feature

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# data / true_label are passed only to fill the signature; this mode does not read them.
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times and print the per-class median scores

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.95825336 0.0657277  0.16970803]
Precision: [0.76454824 0.21875    0.55029586]
f1 score: [0.85051107 0.10108303 0.25941423]
-------KNN-------
Recall: [0.94001919 0.07981221 0.20072993]
Precision: [0.77034998 0.18478261 0.52380952]
f1 score: [0.84676896 0.11147541 0.29023747]
-------SVM-------
Recall: [0.79606526 0.04694836 0.37226277]
Precision: [0.77997179 0.27777778 0.29912023]
f1 score: [0.78793636 0.08032129 0.33170732]
-------NB-------
Recall: [0.95921305 0.         0.25182482]
Precision: [0.77003082 0.         0.55421687]
f1 score: [0.8542735  0.         0.34629862]
-------RF-------
Recall: [0.93809981 0.08215962 0.19343066]
Precision: [0.77083596 0.18997494 0.50237957]
f1 score: [0.84686966 0.11279622 0.27582631]


In [19]:
# Cross-dataset run: train on COLOCATED_OPENSTACK.csv, test on COLOCATED_MOZILLA.csv (SLOC only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
train_data = train_data['SLOC']  # keep only the SLOC column as the feature

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# data / true_label are passed only to fill the signature; this mode does not read them.
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times and print the per-class median scores

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.96371882 0.         0.19786096]
Precision: [0.84102902 0.         0.38541667]
f1 score: [0.89820359 0.         0.2614841 ]
-------KNN-------
Recall: [0.92970522 0.         0.28342246]
Precision: [0.84769125 0.         0.32919255]
f1 score: [0.88680606 0.         0.3045977 ]
-------SVM-------
Recall: [0.62433862 0.         0.83957219]
Precision: [0.94077449 0.         0.21360544]
f1 score: [0.75056792 0.         0.34056399]
-------NB-------
Recall: [0.99244142 0.         0.09625668]
Precision: [0.82996207 0.         0.58064516]
f1 score: [0.90395869 0.         0.16513761]
-------RF-------
Recall: [0.95464853 0.         0.2540107 ]
Precision: [0.84708706 0.         0.40254237]
f1 score: [0.89811817 0.         0.31024454]


In [20]:
# Cross-dataset run: train on COLOCATED_OPENSTACK.csv, test on COLOCATED_WIKIMEDIA.csv (SLOC only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
train_data = train_data['SLOC']  # keep only the SLOC column as the feature

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# data / true_label are passed only to fill the signature; this mode does not read them.
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times and print the per-class median scores

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: [0.95633397 0.         0.14416058]
Precision: [0.75122503 0.         0.42021277]
f1 score: [0.84146084 0.         0.21467391]
-------KNN-------
Recall: [0.91362764 0.         0.20985401]
Precision: [0.75796178 0.         0.34954407]
f1 score: [0.82854656 0.         0.2622577 ]
-------SVM-------
Recall: [0.59980806 0.         0.72262774]
Precision: [0.84516565 0.         0.29010989]
f1 score: [0.70165591 0.         0.41400941]
-------NB-------
Recall: [0.98752399 0.         0.11131387]
Precision: [0.7489083  0.         0.62886598]
f1 score: [0.85182119 0.         0.18914729]
-------RF-------
Recall: [0.94289827 0.00469484 0.18339416]
Precision: [0.75638066 0.12142857 0.41592582]
f1 score: [0.83924779 0.00902976 0.25394847]


In [21]:
# Cross-dataset run: train on COLOCATED_WIKIMEDIA.csv, test on COLOCATED_MOZILLA.csv (SLOC only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
train_data = train_data['SLOC']  # keep only the SLOC column as the feature

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# data / true_label are passed only to fill the signature; this mode does not read them.
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times and print the per-class median scores

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: [0.97052154 0.02912621 0.21390374]
Precision: [0.84808454 0.375      0.43956044]
f1 score: [0.90518153 0.05405405 0.28776978]
-------KNN-------
Recall: [0.94633409 0.03883495 0.27272727]
Precision: [0.84938942 0.5        0.38931298]
f1 score: [0.89524491 0.07207207 0.32075472]
-------SVM-------
Recall: [0.65986395 0.         0.72192513]
Precision: [0.91032325 0.         0.20673813]
f1 score: [0.76511832 0.         0.32142857]
-------NB-------
Recall: [0.98412698 0.         0.17647059]
Precision: [0.83783784 0.         0.55932203]
f1 score: [0.90510949 0.         0.26829268]
-------RF-------
Recall: [0.95880574 0.00970874 0.24064171]
Precision: [0.85150783 0.07738095 0.40561661]
f1 score: [0.90236601 0.01724266 0.30444801]


In [22]:
# Cross-dataset run: train on COLOCATED_WIKIMEDIA.csv, test on COLOCATED_OPENSTACK.csv (SLOC only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)  # NEUTRAL -> 0, ONLY_ONE -> 1, anything else -> 2
train_data = train_data['SLOC']  # keep only the SLOC column as the feature

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# data / true_label are passed only to fill the signature; this mode does not read them.
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat the train/test run 10 times and print the per-class median scores

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: [0.8781014  0.04651163 0.25224072]
Precision: [0.71060672 0.15789474 0.45287356]
f1 score: [0.78552473 0.07185629 0.32401316]
-------KNN-------
Recall: [0.86569579 0.03100775 0.3021767 ]
Precision: [0.72297297 0.08       0.47773279]
f1 score: [0.78792342 0.04469274 0.37019608]
-------SVM-------
Recall: [0.42718447 0.01550388 0.79769526]
Precision: [0.80981595 0.18181818 0.35098592]
f1 score: [0.55932203 0.02857143 0.48748044]
-------NB-------
Recall: [0.89050701 0.         0.3021767 ]
Precision: [0.7209607 0.        0.4978903]
f1 score: [0.79681467 0.         0.37609562]
-------RF-------
Recall: [0.84304207 0.03100775 0.30025608]
Precision: [0.71673448 0.09980237 0.43628369]
f1 score: [0.77719696 0.05281966 0.35436303]
