In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str, path object or file-like object
        Passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(filename)

In [3]:
def convert_label_to_numeric(label):
    """Encode colocation status strings as integer class ids.

    The mapping is ``"NEUTRAL" -> 0``, ``"ONLY_ONE" -> 1`` and every other
    value ``-> 2``.

    Parameters
    ----------
    label : array-like
        One-dimensional sequence of labels (list, NumPy array or pandas
        Series).

    Returns
    -------
    numpy.ndarray
        Integer array holding one encoded entry per input label, in input
        order.
    """
    # Work on a plain object array so that elements are taken by position.
    # Indexing a pandas Series with ``label[i]`` is label-based and fails (or
    # reads the wrong row) as soon as the Series does not carry the default
    # 0..n-1 index, e.g. after filtering rows.
    values = np.asarray(label, dtype=object)
    encoded = np.where(values == "ONLY_ONE", 1,
                       np.where(values == "NEUTRAL", 0, 2))
    return encoded.astype('int')

In [4]:
def apply_PCA(train, test):
    """Standardise both sets and project them onto principal components.

    The scaler and the PCA are fitted on ``train`` only; the fitted
    transforms are then applied to both ``train`` and ``test``. The number of
    selected components and the shapes before and after the projection are
    printed.

    Parameters
    ----------
    train, test : array-like
        Feature matrices accepted by ``StandardScaler``.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is affected by feature scale, so standardise first. The scaling
    # statistics come from the training set only.
    scaler = StandardScaler().fit(train)
    train, test = scaler.transform(train), scaler.transform(test)

    # Keep the minimum number of principal components such that 95% of the
    # variance is retained; again fitted on the training set only.
    pca = PCA(.95).fit(train)
    print ("Number of selected components: ", pca.n_components_)

    # Project both sets with the mapping learned from the training set.
    print("Before applying PCA train set size: ", train.shape)
    print("Before applying PCA test set size: ", test.shape)
    train, test = pca.transform(train), pca.transform(test)
    print("After applying PCA train set size: ", train.shape)
    print("After applying PCA test set size: ", test.shape)
    return train, test

In [5]:
def measure_performance(true_label, predicted_label):
    """Compute macro-averaged recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    true_label : array-like
        Ground-truth class labels.
    predicted_label : array-like
        Labels predicted by a classifier, aligned with ``true_label``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` -- note that recall comes first.
    """
    # A textual classification_report used to be built here on every call but
    # was neither printed nor returned, so that dead work has been removed.
    recall = recall_score(true_label, predicted_label, average="macro")
    precision = precision_score(true_label, predicted_label, average="macro")
    f1 = f1_score(true_label, predicted_label, average="macro")
    return recall, precision, f1

In [11]:
def kfold_cv(data, true_label, n_splits=10, random_state=7):
    """Evaluate five classifiers with shuffled k-fold cross-validation.

    A decision tree (DT), k-nearest neighbours (KNN), an RBF-kernel SVM,
    Gaussian naive Bayes (NB) and a random forest (RF) are trained on every
    fold. The recall, precision and F1 returned by ``measure_performance``
    are averaged over the folds and printed per model.

    Parameters
    ----------
    data : array-like
        Feature values. One-dimensional input (e.g. a single DataFrame
        column) is treated as a single feature; two-dimensional input is used
        as ``(n_samples, n_features)``.
    true_label : array-like
        Class label per sample. The SVM class weights are keyed on the labels
        0, 1 and 2.
    n_splits : int, default 10
        Number of folds.
    random_state : int, default 7
        Seed used for shuffling the folds and for the decision tree and
        random forest, so repeated runs print the same numbers.

    Returns
    -------
    None
        Results are printed only.
    """
    # Plain arrays make every lookup below positional. KFold yields positions,
    # so indexing a pandas object with ``.loc`` (label-based) is only correct
    # when it happens to carry the default 0..n-1 index.
    features = np.asarray(data)
    if features.ndim == 1:
        # Only a single column needs reshaping; reshaping a 2-D matrix to
        # (-1, 1) would scramble its features into extra samples.
        features = features.reshape(-1, 1)
    labels = np.asarray(true_label)

    model_order = ("DT", "KNN", "SVM", "NB", "RF")
    metric_order = ("Recall", "Precision", "f1 score")
    fold_scores = {}
    for name in model_order:
        fold_scores[name] = {}
        for metric in metric_order:
            fold_scores[name][metric] = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in kf.split(features):
        train, test = features[train_index], features[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        # Fresh, unfitted estimators for every fold.
        classifiers = {
            "DT": tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=40,
                                              min_samples_split=2, min_weight_fraction_leaf=0.0,
                                              random_state=random_state),
            "KNN": KNeighborsClassifier(n_neighbors=5, weights='distance'),
            "SVM": svm.SVC(gamma='auto', C=20.0, kernel='rbf', class_weight={0: 1, 1: 1, 2: 5}),
            "NB": GaussianNB(),
            "RF": RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=20,
                                         random_state=random_state),
        }

        for name in model_order:
            clf = classifiers[name]
            clf.fit(train, train_label)
            predicted_label = clf.predict(test)
            # measure_performance returns (recall, precision, f1), which is
            # the same order as metric_order.
            scores = measure_performance(test_label, predicted_label)
            for metric, value in zip(metric_order, scores):
                fold_scores[name][metric].append(value)

    for name in model_order:
        print("-------" + name + "-------")
        for metric in metric_order:
            print(metric + ":", np.mean(fold_scores[name][metric]))

In [18]:
def different_file_test(train, test, train_label, test_label, random_state=7):
    """Train five classifiers on one data set and score them on another.

    A decision tree (DT), k-nearest neighbours (KNN), an RBF-kernel SVM,
    Gaussian naive Bayes (NB) and a random forest (RF) are fitted on
    ``train`` and evaluated on ``test``. For each model the recall, precision
    and F1 returned by ``measure_performance`` are printed.

    Parameters
    ----------
    train, test : array-like
        Feature values. One-dimensional input (e.g. a single DataFrame
        column) is treated as a single feature; two-dimensional input is used
        as ``(n_samples, n_features)``.
    train_label, test_label : array-like
        Class labels aligned with ``train`` and ``test``. The SVM class
        weights are keyed on the labels 0, 1 and 2.
    random_state : int, default 7
        Seed for the decision tree and random forest, so repeated runs print
        the same numbers.

    Returns
    -------
    None
        Results are printed only.
    """
    def as_feature_matrix(values):
        # Accept pandas objects, lists and arrays alike. Only a single column
        # is reshaped; reshaping a 2-D matrix to (-1, 1) would scramble its
        # features into extra samples.
        values = np.asarray(values)
        return values.reshape(-1, 1) if values.ndim == 1 else values

    train = as_feature_matrix(train)
    test = as_feature_matrix(test)

    classifiers = [
        ("DT", tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=40,
                                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                                           random_state=random_state)),
        ("KNN", KNeighborsClassifier(n_neighbors=5, weights='distance')),
        ("SVM", svm.SVC(gamma='auto', C=20.0, kernel='rbf', class_weight={0: 1, 1: 1, 2: 5})),
        ("NB", GaussianNB()),
        ("RF", RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=20,
                                      random_state=random_state)),
    ]

    for name, clf in classifiers:
        clf.fit(train, train_label)
        predicted_label = clf.predict(test)
        recall, precision, f1 = measure_performance(test_label, predicted_label)
        print("-------" + name + "-------")
        print("Recall:", recall)
        print("Precision:", precision)
        print("f1 score:", f1)

In [19]:
# Within-project evaluation on Mozilla: cross-validation with SLOC as the only feature.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Data set: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", data.shape)
# Encode the status column before `data` is narrowed down to the SLOC column.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
data = data['SLOC']
kfold_cv(data, true_label)

Data set: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.41572713442997034
Precision: 0.5110221674627832
f1 score: 0.43338684199058025
-------KNN-------
Recall: 0.42965053873235864
Precision: 0.5394890151778049
f1 score: 0.44910230032026854
-------SVM-------
Recall: 0.45550077382879567
Precision: 0.424911509349737
f1 score: 0.4030116073734521
-------NB-------
Recall: 0.4037389152735341
Precision: 0.46421021410340524
f1 score: 0.40853637290801226
-------RF-------
Recall: 0.43211579666742406
Precision: 0.5003179669868516
f1 score: 0.44562509190935595


In [20]:
# Within-project evaluation on OpenStack: cross-validation with SLOC as the only feature.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Data set: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", data.shape)
# Encode the status column before `data` is narrowed down to the SLOC column.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
data = data['SLOC']
kfold_cv(data, true_label)

Data set: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.39760620796636553
Precision: 0.41770791145529496
f1 score: 0.3922563068381565
-------KNN-------
Recall: 0.39840034915619343
Precision: 0.40484213251059975
f1 score: 0.3930734967931312
-------SVM-------
Recall: 0.4207782793525867
Precision: 0.3952626733087973
f1 score: 0.3618425087115513
-------NB-------
Recall: 0.3864564615553722
Precision: 0.4375992558755752
f1 score: 0.37254313240702286
-------RF-------
Recall: 0.3989383570762617
Precision: 0.4091508734857177
f1 score: 0.3935637883184772


In [21]:
# Within-project evaluation on Wikimedia: cross-validation with SLOC as the only feature.
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Data set: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", data.shape)
# Encode the status column before `data` is narrowed down to the SLOC column.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])
data = data['SLOC']
kfold_cv(data, true_label)

Data set: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.3752507479320714
Precision: 0.3984131804524734
f1 score: 0.3646253518663737
-------KNN-------
Recall: 0.3749551100623027
Precision: 0.3815726794762887
f1 score: 0.36665003332844576
-------SVM-------
Recall: 0.4450331346356779
Precision: 0.3830282001503932
f1 score: 0.38631489177611505
-------NB-------
Recall: 0.39390639669803235
Precision: 0.44359319063661956
f1 score: 0.38823007836213674
-------RF-------
Recall: 0.38841390434297657
Precision: 0.39914064365137286
f1 score: 0.3814606053702077


In [22]:
# Cross-project evaluation: fit on Mozilla, score on OpenStack (SLOC feature only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Train Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", train_data.shape)
# Labels are encoded before the frame is narrowed down to the SLOC column.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data['SLOC']

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.40947122128611607
Precision: 0.4474110173327663
f1 score: 0.4123976488778322
-------KNN-------
Recall: 0.40899423297923576
Precision: 0.4329799329202433
f1 score: 0.41249811683228926
-------SVM-------
Recall: 0.3714994914626663
Precision: 0.37515068958833764
f1 score: 0.3686354720530009
-------NB-------
Recall: 0.40272500288909424
Precision: 0.4046177260738529
f1 score: 0.39680263816418654
-------RF-------
Recall: 0.41292616897067713
Precision: 0.4317588715650347
f1 score: 0.4125519294470277


In [23]:
# Cross-project evaluation: fit on Mozilla, score on Wikimedia (SLOC feature only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
# NOTE: this cell prints the shape before the data set name for the training file.
print("Initial data shape: ", train_data.shape)
print("Train Data: COLOCATED_MOZILLA.csv")
# Labels are encoded before the frame is narrowed down to the SLOC column.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data['SLOC']

different_file_test(train_data, test_data, train_label, test_label)

Initial data shape:  (1613, 16)
Train Data: COLOCATED_MOZILLA.csv
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.39789636255091354
Precision: 0.5111980322951875
f1 score: 0.4036694438954101
-------KNN-------
Recall: 0.40685377581267823
Precision: 0.4929807042811197
f1 score: 0.41616061387550857
-------SVM-------
Recall: 0.4050921298824073
Precision: 0.45228993454572414
f1 score: 0.39998831979118415
-------NB-------
Recall: 0.40367928978055495
Precision: 0.4414158947036219
f1 score: 0.4001907080326152
-------RF-------
Recall: 0.39845801141037285
Precision: 0.4873150897119185
f1 score: 0.4045556157453289


In [24]:
# Cross-project evaluation: fit on OpenStack, score on Mozilla (SLOC feature only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# Labels are encoded before the frame is narrowed down to the SLOC column.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data['SLOC']

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.387193261142841
Precision: 0.4088152301377895
f1 score: 0.38656256391810023
-------KNN-------
Recall: 0.40437589177084976
Precision: 0.39229459799980876
f1 score: 0.3971345857953023
-------SVM-------
Recall: 0.4879702722839978
Precision: 0.384793309882799
f1 score: 0.36371063833142303
-------NB-------
Recall: 0.36289936850160937
Precision: 0.4702024115384093
f1 score: 0.3563654355297994
-------RF-------
Recall: 0.398178126469443
Precision: 0.409964449482199
f1 score: 0.39745314992025516


In [25]:
# Cross-project evaluation: fit on OpenStack, score on Wikimedia (SLOC feature only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Train Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", train_data.shape)
# Labels are encoded before the frame is narrowed down to the SLOC column.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Test Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data['SLOC']

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
Test Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
-------DT-------
Recall: 0.36683151902340155
Precision: 0.3904792647424433
f1 score: 0.35204491774589797
-------KNN-------
Recall: 0.37449388458467014
Precision: 0.3691686187959396
f1 score: 0.3636014196404323
-------SVM-------
Recall: 0.4408119328822076
Precision: 0.37842518085925797
f1 score: 0.37188843908250063
-------NB-------
Recall: 0.36627928697853185
Precision: 0.45925809210822494
f1 score: 0.3469894929582285
-------RF-------
Recall: 0.3815239422894628
Precision: 0.5699745420056906
f1 score: 0.3732486151886611


In [26]:
# Cross-project evaluation: fit on Wikimedia, score on Mozilla (SLOC feature only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# Labels are encoded before the frame is narrowed down to the SLOC column.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Test Data: COLOCATED_MOZILLA.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data['SLOC']

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_MOZILLA.csv
Initial data shape:  (1613, 16)
-------DT-------
Recall: 0.40451716628595147
Precision: 0.5542149946046908
f1 score: 0.41566845600390007
-------KNN-------
Recall: 0.4192987711249385
Precision: 0.579567464550944
f1 score: 0.4293572314362784
-------SVM-------
Recall: 0.4605963597560236
Precision: 0.37235379502959787
f1 score: 0.36218229623137593
-------NB-------
Recall: 0.3868658574540927
Precision: 0.46571995724538096
f1 score: 0.39113405732597467
-------RF-------
Recall: 0.394536200279857
Precision: 0.4232987259320818
f1 score: 0.39584730492058623


In [27]:
# Cross-project evaluation: fit on Wikimedia, score on OpenStack (SLOC feature only).
train_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv')
print("Train Data: COLOCATED_WIKIMEDIA.csv")
print("Initial data shape: ", train_data.shape)
# Labels are encoded before the frame is narrowed down to the SLOC column.
train_label = convert_label_to_numeric(train_data['COLOCATED_STATUS'])
train_data = train_data['SLOC']

test_data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv')
print("Test Data: COLOCATED_OPENSTACK.csv")
print("Initial data shape: ", test_data.shape)
test_label = convert_label_to_numeric(test_data['COLOCATED_STATUS'])
test_data = test_data['SLOC']

different_file_test(train_data, test_data, train_label, test_label)

Train Data: COLOCATED_WIKIMEDIA.csv
Initial data shape:  (2845, 16)
Test Data: COLOCATED_OPENSTACK.csv
Initial data shape:  (2764, 16)
-------DT-------
Recall: 0.3922845824365577
Precision: 0.44045834067199136
f1 score: 0.3937980579695159
-------KNN-------
Recall: 0.3996267471203791
Precision: 0.42690192216508005
f1 score: 0.40093741088364543
-------SVM-------
Recall: 0.4134612014908015
Precision: 0.44754001607712834
f1 score: 0.3584579668847991
-------NB-------
Recall: 0.3975612361363763
Precision: 0.40628366468286875
f1 score: 0.39097009644818415
-------RF-------
Recall: 0.3962776088649414
Precision: 0.42302809910426187
f1 score: 0.39827591997653505
