In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [2]:
def read_csv_file (filename):
    """Load a CSV file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like
        Passed straight to ``pd.read_csv`` with default options.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Encode string class labels as an integer numpy array.

    Mapping: ``"ONLY_ONE"`` -> 1, ``"NEUTRAL"`` -> 0, anything else -> 2.

    Parameters
    ----------
    label : sequence of str (list, numpy array or pandas Series)
        Labels to encode. Elements are read by position, so a Series whose
        index is not 0..n-1 (e.g. after filtering rows) is handled correctly.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the same length and order as ``label``.
    """
    # np.asarray drops any pandas index so the result follows row order, not index labels
    values = np.asarray(label, dtype=object)

    # Start from the catch-all code and overwrite the two named classes
    converted_label = np.full(len(values), 2, dtype='int')
    converted_label[values == "ONLY_ONE"] = 1
    converted_label[values == "NEUTRAL"] = 0
    return converted_label

In [4]:
def apply_PCA(train, test):
    """Standardize the features and reduce them with PCA.

    Both the StandardScaler and the PCA are fitted on ``train`` only and then
    applied to ``train`` and ``test``, so no statistics from the test set are
    used. The number of components chosen by PCA is printed.

    Parameters
    ----------
    train, test : array-like
        Feature matrices with the same columns.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is affected by feature scale, so standardize first (statistics from train only)
    standardizer = StandardScaler()
    standardizer.fit(train)
    train_scaled = standardizer.transform(train)
    test_scaled = standardizer.transform(test)

    # A fractional n_components keeps the minimum number of principal components
    # such that 95% of the variance is retained
    reducer = PCA(.95)
    reducer.fit(train_scaled)
    print ("Number of selected components: ", reducer.n_components_)

    # Project both sets with the mapping learned from the training set
    train_reduced = reducer.transform(train_scaled)
    test_reduced = reducer.transform(test_scaled)
    return train_reduced, test_reduced

In [5]:
def measure_performance(true_label, predicted_label):
    """Compute macro-averaged recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    true_label : array-like
        Ground-truth class labels.
    predicted_label : array-like
        Labels predicted by a classifier, aligned with ``true_label``.

    Returns
    -------
    tuple of float
        ``(recall, precision, f1)`` in that order; callers unpack by position.
    """
    # The classification_report that used to be built here was never used,
    # so it is dropped; it only repeated the metric computation below.
    recall = recall_score(true_label, predicted_label, average="macro")
    precision = precision_score(true_label, predicted_label, average="macro")
    f1 = f1_score(true_label, predicted_label, average="macro")
    return recall, precision, f1

In [6]:
def kfold_cv(data, true_label):
    """Run shuffled 10-fold cross-validation for five classifiers and print mean scores.

    In every fold the features are scaled and PCA-reduced by ``apply_PCA``
    (fitted on the training part only). A decision tree, k-NN, SVM, Gaussian
    naive Bayes and random forest are then trained and scored with
    ``measure_performance``. After the last fold the mean macro recall,
    precision and F1 of each classifier are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, one row per sample. Rows are selected by position, so
        the index of the frame does not need to be 0..n-1.
    true_label : array-like
        Class labels aligned row-by-row with ``data``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    def build_classifiers():
        # Fresh, unfitted estimators in reporting order, keyed by their printed banner
        return [
            ("-------DT-------",
             tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=40,
                                         min_samples_split = 2, min_weight_fraction_leaf=0.0)),
            ("-------KNN-------",
             KNeighborsClassifier(n_neighbors=5, weights='distance')),
            ("-------SVM-------",
             svm.SVC(gamma='auto', C = 20.0, kernel='rbf', class_weight = {0:1, 1:1, 2:5})),
            ("-------NB-------",
             GaussianNB()),
            ("-------RF-------",
             RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=20)),
        ]

    # 10 fold cv; the fixed random_state keeps the fold assignment repeatable
    kf = KFold(n_splits=10, shuffle = True, random_state = 7)

    # KFold yields row positions; a plain array guarantees positional lookup
    # even when the labels arrive as a pandas Series with a custom index
    labels = np.asarray(true_label)

    # banner -> (recalls, precisions, f1s), one entry appended per fold
    banners = [banner for banner, _ in build_classifiers()]
    cv_scores = {banner: ([], [], []) for banner in banners}

    for train_index, test_index in kf.split(data):
        # .iloc, not .loc: the indices are positions, not index labels
        train, test = data.iloc[train_index], data.iloc[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        train, test = apply_PCA(train, test)

        for banner, clf in build_classifiers():
            clf.fit(train, train_label)
            predicted_label = clf.predict(test)
            recall, precision, f1 = measure_performance(test_label, predicted_label)
            recalls, precisions, f1s = cv_scores[banner]
            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)

    for banner in banners:
        recalls, precisions, f1s = cv_scores[banner]
        print(banner)
        print("Recall:", np.mean(recalls))
        print("Precision:", np.mean(precisions))
        print("f1 score:", np.mean(f1s))

In [7]:
def different_file_test(train, test, train_label, test_label):
    """Train on one data set, evaluate on another, and print the scores.

    The features are first scaled and PCA-reduced by ``apply_PCA`` (fitted on
    ``train`` only). A decision tree, k-NN, SVM, Gaussian naive Bayes and
    random forest are then fitted on the training data in that order, and the
    macro recall, precision and F1 from ``measure_performance`` on the test
    data are printed for each.

    Parameters
    ----------
    train, test : array-like
        Feature matrices with the same columns.
    train_label, test_label : array-like
        Class labels aligned with ``train`` and ``test`` respectively.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    train, test = apply_PCA(train, test)

    # (banner printed before the scores, unfitted estimator), in reporting order
    models = (
        ("-------DT-------",
         tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=40,
                                     min_samples_split = 2, min_weight_fraction_leaf=0.0)),
        ("-------KNN-------",
         KNeighborsClassifier(n_neighbors=5, weights='distance')),
        ("-------SVM-------",
         svm.SVC(gamma='auto', C = 20.0, kernel='rbf', class_weight = {0:1, 1:1, 2:5})),
        ("-------NB-------",
         GaussianNB()),
        ("-------RF-------",
         RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=20)),
    )

    for banner, model in models:
        model.fit(train, train_label)
        guessed = model.predict(test)
        recall, precision, f1 = measure_performance(test_label, guessed)
        print(banner)
        print("Recall:", recall)
        print("Precision:", precision)
        print("f1 score:", f1)

In [8]:
# Within-dataset evaluation: 10-fold cross-validation on the Mozilla data.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)

# COLOCATED_STATUS is the prediction target, encoded as integers.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])

# Everything except the path and the three status columns is used as a feature.
data = data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)
kfold_cv(data, true_label)

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
-------DT-------
Recall: 0.885815588807516
Precision: 0.8611621842662475
f1 score: 0.8658295261244596
-------KNN-------
Recall: 0.8996578000728326
Precision: 0.9078891938789528
f1 score: 0.9009914677819351
-------SVM-------
Recall: 0.6927078668314843
Precision: 0.7713794250479379
f1 score: 0.7034100740450092
-------NB-------
Recall: 0.4417971611795387
Precision: 0.4819069337609035
f1 score: 0.4465596739043945
-------RF-------
Recall: 0.8196272197367701
Precision: 0.9015998664618993
f1 score: 0.8490566939460515


In [9]:
# Within-dataset evaluation: 10-fold cross-validation on the OpenStack data.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)

# COLOCATED_STATUS is the prediction target, encoded as integers.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])

# Everything except the path and the three status columns is used as a feature.
data = data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)
kfold_cv(data, true_label)

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Number of selected components:  9
Number of selected components:  9
Number of selected components:  9
Number of selected components:  9
Number of selected components:  9
Number of selected components:  9
Number of selected components:  9


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  9
Number of selected components:  9
Number of selected components:  9
-------DT-------
Recall: 0.5514317815884134
Precision: 0.5478040065014351
f1 score: 0.5482885636771906
-------KNN-------
Recall: 0.5465068504006747
Precision: 0.5844853810163991
f1 score: 0.5506136828988317
-------SVM-------
Recall: 0.526456027178722
Precision: 0.49377690310870676
f1 score: 0.49004672608296546
-------NB-------
Recall: 0.44290181209010504
Precision: 0.5589356550283747
f1 score: 0.4535120359571444
-------RF-------
Recall: 0.547558396475895
Precision: 0.5912681418251824
f1 score: 0.5564448537870722


In [10]:
# Within-dataset evaluation: 10-fold cross-validation on the Wikimedia data.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)

# COLOCATED_STATUS is the prediction target, encoded as integers.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])

# Everything except the path and the three status columns is used as a feature.
data = data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)
kfold_cv(data, true_label)

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Number of selected components:  8
Number of selected components:  8
Number of selected components:  8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Number of selected components:  9
Number of selected components:  8
Number of selected components:  8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Number of selected components:  8
Number of selected components:  8
-------DT-------
Recall: 0.4613036523005342
Precision: 0.46217030141490933
f1 score: 0.4597780141976416
-------KNN-------
Recall: 0.48060621044262886
Precision: 0.5453234342467559
f1 score: 0.49502956109292817
-------SVM-------
Recall: 0.46268595417217817
Precision: 0.41064505883862407
f1 score: 0.4159924656348165
-------NB-------
Recall: 0.40259497439713093
Precision: 0.44428341807287347
f1 score: 0.4023032320496731
-------RF-------
Recall: 0.46000110204912203
Precision: 0.5244065130397907
f1 score: 0.469050483091916


In [11]:
# Cross-dataset evaluation: fit on Mozilla, score on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
# COLOCATED_STATUS is the target; the path and status columns are not features.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Number of selected components:  8
-------DT-------
Recall: 0.35460651607271676
Precision: 0.3543389456726503
f1 score: 0.3533488933753229
-------KNN-------
Recall: 0.39040751973757554
Precision: 0.3964226253835686
f1 score: 0.39021034315994685
-------SVM-------
Recall: 0.37550826295940926
Precision: 0.3642754496886976
f1 score: 0.364087287580067
-------NB-------
Recall: 0.4017021944685011
Precision: 0.3916150557671754
f1 score: 0.3960932799448737
-------RF-------
Recall: 0.36143862834146784
Precision: 0.37188309067691955
f1 score: 0.35376387541612003


In [12]:
# Cross-dataset evaluation: fit on Mozilla, score on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
# Name the data set before printing its shape, as the other evaluation cells do,
# so the shape line is not ambiguous in the output.
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
# COLOCATED_STATUS is the target; the path and status columns are not features.
train_label = train_data['COLOCATED_STATUS']
train_label = convert_label_to_numeric(train_label)
train_data = train_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = test_data['COLOCATED_STATUS']
test_label = convert_label_to_numeric(test_label)
test_data = test_data.drop(columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS'])

different_file_test(train_data, test_data, train_label, test_label)

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Number of selected components:  8
-------DT-------
Recall: 0.429206180289399
Precision: 0.44866890050349567
f1 score: 0.4348522944398943
-------KNN-------
Recall: 0.3960459858903888
Precision: 0.42829711760193145
f1 score: 0.39902278651442186
-------SVM-------
Recall: 0.3952525617079256
Precision: 0.41088275152185266
f1 score: 0.39457846411518976
-------NB-------
Recall: 0.4216695220772559
Precision: 0.4498678562886213
f1 score: 0.4218465686838148
-------RF-------
Recall: 0.3764263925314641
Precision: 0.43426258007994306
f1 score: 0.36953153424036894


In [13]:
# Cross-dataset evaluation: fit on OpenStack, score on Mozilla.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# COLOCATED_STATUS is the target; the path and status columns are not features.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Number of selected components:  9
-------DT-------
Recall: 0.46216155057361447
Precision: 0.42009296919711625
f1 score: 0.4129262248177483
-------KNN-------
Recall: 0.47336803375529546
Precision: 0.4351496047949949
f1 score: 0.4456633273922575
-------SVM-------
Recall: 0.48562718212714134
Precision: 0.4273576859328838
f1 score: 0.4297226136914875
-------NB-------
Recall: 0.4270431522613949
Precision: 0.4471052749095003
f1 score: 0.42002215768575746
-------RF-------
Recall: 0.4672015843047363
Precision: 0.4243355475998318
f1 score: 0.43060443839530255


In [14]:
# Cross-dataset evaluation: fit on OpenStack, score on Wikimedia.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# COLOCATED_STATUS is the target; the path and status columns are not features.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Number of selected components:  9
-------DT-------
Recall: 0.4330987833935977
Precision: 0.4225874085046651
f1 score: 0.4260703807482442
-------KNN-------
Recall: 0.44440890391720417
Precision: 0.4433125477673412
f1 score: 0.43564127463258845
-------SVM-------
Recall: 0.4494251336162675
Precision: 0.4011966262538711
f1 score: 0.40302669065682445
-------NB-------
Recall: 0.4282545558143371
Precision: 0.47438793638476734
f1 score: 0.4299932360073319
-------RF-------
Recall: 0.4353307548143657
Precision: 0.4242473196845158
f1 score: 0.4216850289853229


In [15]:
# Cross-dataset evaluation: fit on Wikimedia, score on Mozilla.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# COLOCATED_STATUS is the target; the path and status columns are not features.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Number of selected components:  8
-------DT-------
Recall: 0.3837415453906711
Precision: 0.3830063648213116
f1 score: 0.3618845681148682
-------KNN-------
Recall: 0.4234788485692731
Precision: 0.44263243172921124
f1 score: 0.4274252106189893
-------SVM-------
Recall: 0.5204095302270494
Precision: 0.49739887422507073
f1 score: 0.47530605167836676
-------NB-------
Recall: 0.39308248551946035
Precision: 0.40191878008891346
f1 score: 0.39099080120778035
-------RF-------
Recall: 0.4207119348671115
Precision: 0.445083317476233
f1 score: 0.4295779885805295


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Cross-dataset evaluation: fit on Wikimedia, score on OpenStack.
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# COLOCATED_STATUS is the target; the path and status columns are not features.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data.drop(
    columns=['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
)

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Number of selected components:  8
-------DT-------
Recall: 0.40656464205003123
Precision: 0.4188268758241061
f1 score: 0.3997746821656414
-------KNN-------
Recall: 0.4098970555201733
Precision: 0.4495371833928144
f1 score: 0.4139116370520181
-------SVM-------
Recall: 0.4194227255119136
Precision: 0.41393147255455665
f1 score: 0.4119307714304363
-------NB-------
Recall: 0.41436255177951237
Precision: 0.5002094612147999
f1 score: 0.41226109054007026
-------RF-------
Recall: 0.39819712061085344
Precision: 0.44850812789253575
f1 score: 0.3986342475741769
