In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str, path or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with pandas' default options.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Map string status labels to integer class ids.

    Mapping: ``"NEUTRAL"`` -> 0, ``"ONLY_ONE"`` -> 1, every other value
    (including missing values) -> 2.

    Parameters
    ----------
    label : sequence of str
        A list, NumPy array or pandas Series of labels. Elements are read
        positionally, so a Series with a non-default index (e.g. after
        filtering rows) is handled correctly.

    Returns
    -------
    numpy.ndarray
        Integer array with one class id per input element.
    """
    # Object dtype guarantees an element-wise comparison against the strings.
    raw = np.asarray(label, dtype=object)

    # Start from the catch-all class and overwrite the two named classes.
    codes = np.full(raw.shape, 2, dtype=int)
    codes[raw == "NEUTRAL"] = 0
    codes[raw == "ONLY_ONE"] = 1
    return codes

In [4]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a decision tree on the training split and score it on the test split.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    The ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    model = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )
    predictions = model.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [5]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a 5-nearest-neighbour classifier and score it on the test split.

    Neighbours are weighted by distance (``weights='distance'``).

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    The ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    knn_options = {'n_neighbors': 5, 'weights': 'distance'}
    model = KNeighborsClassifier(**knn_options)
    model.fit(train, train_label)
    return measure_performance(test_label, model.predict(test))

In [6]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel SVM and score it on the test split.

    The classifier uses ``C=20.0``, ``gamma='auto'`` and weights class 2
    five times as heavily as classes 0 and 1.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    The ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    per_class_weight = {0: 1, 1: 1, 2: 5}
    model = svm.SVC(
        gamma='auto',
        C=20.0,
        kernel='rbf',
        class_weight=per_class_weight,
    )
    predictions = model.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [7]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    The ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    model = GaussianNB()
    model.fit(train, train_label)
    predictions = model.predict(test)
    return measure_performance(test_label, predictions)

In [8]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree random forest (Gini criterion) and score it on the test split.

    No ``random_state`` is set, so results can differ between calls.

    Parameters
    ----------
    train, test : feature matrices passed to ``fit`` / ``predict``.
    train_label, test_label : class labels for the two splits.

    Returns
    -------
    The ``(recall, precision, f1)`` tuple produced by ``measure_performance``.
    """
    forest = RandomForestClassifier(n_estimators=10, criterion='gini')
    predictions = forest.fit(train, train_label).predict(test)
    return measure_performance(test_label, predictions)

In [9]:
def measure_performance(true_label, predicted_label):
    """Compute macro-averaged recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    true_label : ground-truth class labels.
    predicted_label : labels predicted by a classifier, in the same order.

    Returns
    -------
    tuple
        ``(recall, precision, f1)``, each macro-averaged over the classes.
        Note the order: recall comes first.
    """
    # The textual classification_report was previously built on every call and
    # then discarded; only the three scalar scores are needed by the callers.
    recall = recall_score(true_label, predicted_label, average="macro")
    precision = precision_score(true_label, predicted_label, average="macro")
    f1 = f1_score(true_label, predicted_label, average="macro")
    return recall, precision, f1

In [10]:
def kfold_cv(data, true_label, n_splits=10, random_state=7):
    """Run shuffled k-fold cross-validation for all five classifiers.

    Each fold trains and scores, in this order, the decision tree (DT), KNN,
    SVM, naive Bayes (NB) and random forest (RF) via the ``perfrom_*`` helpers.

    Parameters
    ----------
    data : pandas Series/DataFrame or array-like
        Feature values. A 1-D input is treated as a single feature column;
        a 2-D input is used as-is (rows are samples).
    true_label : array-like
        One class label per sample, aligned positionally with ``data``.
    n_splits : int, default 10
        Number of folds.
    random_state : int, default 7
        Seed for the fold shuffling. With the default, every call produces
        the same splits.

    Returns
    -------
    tuple of 15 floats
        Fold-averaged ``recall, precision, f1`` for DT, then KNN, SVM, NB, RF.
    """
    evaluators = (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # KFold.split yields *positional* indices, so work on plain arrays rather
    # than label-based .loc lookups, which break for any non-default index.
    features = np.asarray(data)
    if features.ndim == 1:
        features = features.reshape(-1, 1)  # single feature -> column vector
    labels = np.asarray(true_label)

    # One list of (recall, precision, f1) tuples per classifier.
    fold_scores = [[] for _ in evaluators]

    for train_index, test_index in kf.split(features):
        train, test = features[train_index], features[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        for scores, evaluate in zip(fold_scores, evaluators):
            scores.append(evaluate(train, test, train_label, test_label))

    # Average each metric over the folds, keeping the original output order.
    averaged = []
    for scores in fold_scores:
        recalls, precisions, f1s = zip(*scores)
        averaged.extend((np.mean(recalls), np.mean(precisions), np.mean(f1s)))
    return tuple(averaged)

In [11]:
def different_file_test(train, test, train_label, test_label):
    """Train on one dataset and evaluate on another with all five classifiers.

    Parameters
    ----------
    train, test : pandas objects exposing ``.values``
        Feature values; each is reshaped into a single-column matrix.
    train_label, test_label : class labels for the two datasets.

    Returns
    -------
    tuple of 15 values
        ``recall, precision, f1`` for DT, then KNN, SVM, NB and RF.
    """
    train_matrix = train.values.reshape(-1, 1)
    test_matrix = test.values.reshape(-1, 1)

    # Same classifier order as the returned tuple: DT, KNN, SVM, NB, RF.
    collected = []
    for evaluate in (perfrom_CART, perfrom_KNN, perfrom_SVM, perfrom_NB, perfrom_RF):
        recall, precision, f1 = evaluate(train_matrix, test_matrix, train_label, test_label)
        collected += [recall, precision, f1]

    return tuple(collected)

In [12]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name):
    """Run an experiment 10 times and print the mean metrics per classifier.

    Parameters
    ----------
    data, true_label :
        Features and labels forwarded to ``kfold_cv``; only read when
        ``test_name == "k_fold"``.
    train_data, test_data, train_label, test_label :
        Forwarded to ``different_file_test``; only read for any other
        ``test_name``.
    test_name : str
        ``"k_fold"`` selects cross-validation; every other value selects the
        train-on-one-file / test-on-another experiment.

    Returns
    -------
    None. Results are printed as recall, precision and f1 for DT, KNN, SVM,
    NB and RF.
    """
    classifier_names = ("DT", "KNN", "SVM", "NB", "RF")
    metric_titles = ("Recall:", "Precision:", "f1 score:")

    # history[name] holds three lists: recall, precision and f1 per round.
    history = {name: ([], [], []) for name in classifier_names}

    # NOTE(review): every round calls the experiment with identical arguments,
    # so round-to-round variation can only come from randomness inside the
    # called functions - confirm this is the intended form of repetition.
    for _ in range(10):
        if test_name == "k_fold":
            flat_scores = kfold_cv(data, true_label)
        else:
            flat_scores = different_file_test(train_data, test_data, train_label, test_label)

        # flat_scores is laid out as 3 metrics per classifier, in name order.
        for position, name in enumerate(classifier_names):
            for offset, bucket in enumerate(history[name]):
                bucket.append(flat_scores[3 * position + offset])

    for name in classifier_names:
        print("-------{}-------".format(name))
        for title, values in zip(metric_titles, history[name]):
            print(title, np.mean(values))

In [13]:
# Placeholders for the six inputs of repeated_test. Each experiment cell
# overwrites the ones it needs; the rest are passed through as-is.
data, true_label, train_data, test_data, train_label, test_label = [], [], [], [], [], [] # necessary variables

In [14]:
# Within-project experiment on the Mozilla dataset:
# SLOC is the only feature, COLOCATED_STATUS the target.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)
data = data['SLOC']  # `data` now holds only the feature column, not the full frame

# In "k_fold" mode the train_*/test_* arguments are not read by repeated_test.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat k-fold 10 times and report average performance

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.41572713442997034
Precision: 0.5110221674627831
f1 score: 0.43338684199058025
-------KNN-------
Recall: 0.42965053873235864
Precision: 0.539489015177805
f1 score: 0.44910230032026843
-------SVM-------
Recall: 0.45550077382879567
Precision: 0.424911509349737
f1 score: 0.4030116073734521
-------NB-------
Recall: 0.4037389152735341
Precision: 0.4642102141034051
f1 score: 0.40853637290801226
-------RF-------
Recall: 0.44013527353876103
Precision: 0.52916788496674
f1 score: 0.4581714113308257


In [15]:
# Within-project experiment on the OpenStack dataset:
# SLOC is the only feature, COLOCATED_STATUS the target.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)
data = data['SLOC']  # `data` now holds only the feature column, not the full frame

# In "k_fold" mode the train_*/test_* arguments are not read by repeated_test.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat k-fold 10 times and report average performance

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.39760620796636553
Precision: 0.41770791145529496
f1 score: 0.3922563068381565
-------KNN-------
Recall: 0.39840034915619343
Precision: 0.40484213251059975
f1 score: 0.3930734967931312
-------SVM-------
Recall: 0.4207782793525867
Precision: 0.3952626733087973
f1 score: 0.3618425087115513
-------NB-------
Recall: 0.38645646155537217
Precision: 0.4375992558755752
f1 score: 0.37254313240702286
-------RF-------
Recall: 0.40348283596968393
Precision: 0.41466271225846346
f1 score: 0.39911595352601575


In [16]:
# Within-project experiment on the Wikimedia dataset:
# SLOC is the only feature, COLOCATED_STATUS the target.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)
true_label = data['COLOCATED_STATUS']
true_label = convert_label_to_numeric(true_label)
data = data['SLOC']  # `data` now holds only the feature column, not the full frame

# In "k_fold" mode the train_*/test_* arguments are not read by repeated_test.
repeated_test(data, true_label, train_data, test_data, train_label, test_label, "k_fold") # repeat k-fold 10 times and report average performance

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.37525074793207136
Precision: 0.3984131804524734
f1 score: 0.3646253518663737
-------KNN-------
Recall: 0.3749551100623027
Precision: 0.3815726794762887
f1 score: 0.36665003332844576
-------SVM-------
Recall: 0.4450331346356779
Precision: 0.3830282001503932
f1 score: 0.3863148917761151
-------NB-------
Recall: 0.3939063966980324
Precision: 0.44359319063661956
f1 score: 0.38823007836213674
-------RF-------
Recall: 0.3859995758341783
Precision: 0.4040791116654341
f1 score: 0.3789451274050627


In [17]:
# Cross-project experiment: train on Mozilla, test on OpenStack.
# SLOC is the only feature, COLOCATED_STATUS the target.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# `data` and `true_label` are left over from an earlier cell; repeated_test
# does not read them unless test_name is "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report average performance

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.4094712212861161
Precision: 0.4474110173327663
f1 score: 0.41239764887783215
-------KNN-------
Recall: 0.40899423297923576
Precision: 0.4329799329202433
f1 score: 0.41249811683228926
-------SVM-------
Recall: 0.37149949146266625
Precision: 0.37515068958833764
f1 score: 0.3686354720530009
-------NB-------
Recall: 0.40272500288909424
Precision: 0.4046177260738529
f1 score: 0.39680263816418654
-------RF-------
Recall: 0.4066282299621145
Precision: 0.4300793422021827
f1 score: 0.4081915006682495


In [18]:
# Cross-project experiment: train on Mozilla, test on Wikimedia.
# SLOC is the only feature, COLOCATED_STATUS the target.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
# NOTE: this cell prints the shape before the train-file name, the reverse of
# the order used by the other cross-project cells.
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# `data` and `true_label` are left over from an earlier cell; repeated_test
# does not read them unless test_name is "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report average performance

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.39789636255091354
Precision: 0.5111980322951875
f1 score: 0.40366944389541015
-------KNN-------
Recall: 0.4068537758126782
Precision: 0.4929807042811197
f1 score: 0.41616061387550857
-------SVM-------
Recall: 0.4050921298824073
Precision: 0.45228993454572414
f1 score: 0.39998831979118415
-------NB-------
Recall: 0.4036792897805549
Precision: 0.44141589470362186
f1 score: 0.4001907080326152
-------RF-------
Recall: 0.4054762997413084
Precision: 0.47560688011473407
f1 score: 0.41319036622495


In [19]:
# Cross-project experiment: train on OpenStack, test on Mozilla.
# SLOC is the only feature, COLOCATED_STATUS the target.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# `data` and `true_label` are left over from an earlier cell; repeated_test
# does not read them unless test_name is "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report average performance

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.387193261142841
Precision: 0.40881523013778953
f1 score: 0.38656256391810023
-------KNN-------
Recall: 0.4043758917708498
Precision: 0.3922945979998088
f1 score: 0.3971345857953023
-------SVM-------
Recall: 0.4879702722839978
Precision: 0.38479330988279903
f1 score: 0.36371063833142303
-------NB-------
Recall: 0.36289936850160937
Precision: 0.4702024115384093
f1 score: 0.3563654355297993
-------RF-------
Recall: 0.40388638996509335
Precision: 0.4263576329978732
f1 score: 0.40443123736484987


In [20]:
# Cross-project experiment: train on OpenStack, test on Wikimedia.
# SLOC is the only feature, COLOCATED_STATUS the target.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# `data` and `true_label` are left over from an earlier cell; repeated_test
# does not read them unless test_name is "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report average performance

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.36683151902340155
Precision: 0.39047926474244327
f1 score: 0.352044917745898
-------KNN-------
Recall: 0.37449388458467014
Precision: 0.3691686187959396
f1 score: 0.3636014196404323
-------SVM-------
Recall: 0.44081193288220766
Precision: 0.3784251808592579
f1 score: 0.3718884390825007
-------NB-------
Recall: 0.3662792869785319
Precision: 0.45925809210822494
f1 score: 0.34698949295822856
-------RF-------
Recall: 0.37890148220661646
Precision: 0.46104134385769957
f1 score: 0.3700565128873339


In [21]:
# Cross-project experiment: train on Wikimedia, test on Mozilla.
# SLOC is the only feature, COLOCATED_STATUS the target.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv') 
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# `data` and `true_label` are left over from an earlier cell; repeated_test
# does not read them unless test_name is "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report average performance

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.40451716628595147
Precision: 0.5542149946046908
f1 score: 0.4156684560039
-------KNN-------
Recall: 0.4192987711249385
Precision: 0.5795674645509441
f1 score: 0.4293572314362784
-------SVM-------
Recall: 0.4605963597560236
Precision: 0.3723537950295979
f1 score: 0.362182296231376
-------NB-------
Recall: 0.38686585745409274
Precision: 0.4657199572453809
f1 score: 0.39113405732597467
-------RF-------
Recall: 0.4087451724176317
Precision: 0.4893996513203204
f1 score: 0.41588467378162697


In [None]:
# Cross-project experiment: train on Wikimedia, test on OpenStack.
# SLOC is the only feature, COLOCATED_STATUS the target.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv') 
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv') 
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data['SLOC']

# `data` and `true_label` are left over from an earlier cell; repeated_test
# does not read them unless test_name is "k_fold".
repeated_test(data, true_label,train_data, test_data, train_label, test_label, "different_file_test") # repeat cross test 10 times and report average performance

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
