In [None]:
from dbn.tensorflow import SupervisedDBNClassification
from dbn.tensorflow.models import UnsupervisedDBN

import numpy as np
import pandas as pd
import csv, re
from sklearn.model_selection import train_test_split
from sklearn.metrics.classification import accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import itertools

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline

from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import time

import warnings
warnings.filterwarnings('ignore')

In [None]:
def processTokensOfOneFile( oneFileContent ):
    """Normalise the text of one source file into a string of stemmed tokens.

    Steps, in order: replace every non-letter character with a space,
    lower-case and split on whitespace, drop English stop words and tokens
    shorter than three characters, stem each remaining token with the
    English Snowball stemmer, and join the result with single spaces.

    :param oneFileContent: raw file content as a single string
    :return: space-separated string of stemmed tokens ('' if nothing survives)
    """
    english_stemmer = SnowballStemmer("english")

    # A set gives constant-time membership tests while filtering.
    stop_word_set = set(stopwords.words("english"))

    # Keep letters only, then lower-case and tokenise on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", oneFileContent).lower().split()

    # Stop-word removal and the minimum-length rule are applied in one pass.
    kept_tokens = [tok for tok in tokens if tok not in stop_word_set and len(tok) >= 3]

    return " ".join(english_stemmer.stem(tok) for tok in kept_tokens)

In [None]:
def giveFileContent(fileNameParam):
    """Return the content of one data file as a single string.

    The file is looked up at "..//updated_data" + fileNameParam. Trailing
    whitespace (including the newline) is stripped from every line and the
    lines are concatenated with no separator.

    :param fileNameParam: path suffix appended to the "..//updated_data" prefix
    :return: concatenated, right-stripped lines of the file
    """
    # Plain text mode already applies universal-newline handling; the legacy
    # 'U' flag is rejected by Python 3.11+. The context manager guarantees the
    # handle is closed, which the bare open() in a for-loop did not.
    with open("..//updated_data" + fileNameParam, 'r') as source:
        return "".join(line_.rstrip() for line_ in source)

In [None]:
def getTokensForTokenization(datasetParam):
    """Build the preprocessed corpus for every file listed in a dataset CSV.

    The first CSV row is treated as a header and skipped. For each remaining
    row, the first column is passed to giveFileContent and the returned text
    is run through processTokensOfOneFile.

    :param datasetParam: path of the dataset CSV
    :return: list of preprocessed token strings, one per non-blank data row
    """
    completeCorpus = []  # one preprocessed string per listed file
    # newline='' is the mode the csv module documents for reader input; the
    # legacy 'rU' mode is rejected by Python 3.11+.
    with open(datasetParam, newline='') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # skip the header row
        for row in reader_:
            if not row:
                # Blank lines yield an empty row; skip them instead of
                # failing on row[0] (pandas.read_csv also skips blank lines
                # by default, so row counts stay aligned with the labels).
                continue
            fileContentAsStr = giveFileContent(row[0])
            completeCorpus.append(processTokensOfOneFile(fileContentAsStr))
    return completeCorpus

In [None]:
def convert_label_to_numeric(label):
    """Map textual status labels to integer class codes.

    "INSECURE" -> 1, "NEUTRAL" -> 0, anything else -> 2.

    The values are iterated directly rather than looked up as label[i], so a
    pandas Series whose index is not 0..n-1 (for example after filtering) is
    converted correctly instead of raising or picking the wrong rows.

    :param label: iterable of labels (list, numpy array or pandas Series)
    :return: 1-D numpy integer array with one code per input label
    """
    codes = []
    for value in label:
        if value == "INSECURE":
            codes.append(1)
        elif value == "NEUTRAL":
            codes.append(0)
        else:
            codes.append(2)
    return np.array(codes, dtype='int')

In [None]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for class codes 0 and 1.

    Each score function is called with average=None and labels=[0, 1], so
    only those two class codes are scored; any other code is not reported.

    :param true_label: ground-truth class codes
    :param predicted_label: predicted class codes
    :return: tuple (recall, precision, f1) -- note the order
    """
    # The previous aliased zero-initialisation and the unused
    # classification_report call were dead work and have been removed.
    scored_labels = [0, 1]
    precision = precision_score(true_label, predicted_label, average=None, labels=scored_labels)
    recall = recall_score(true_label, predicted_label, average=None, labels=scored_labels)
    f1 = f1_score(true_label, predicted_label, average=None, labels=scored_labels)
    return recall, precision, f1

In [None]:
class MyLabelEncoder(object):
    """LabelEncoder wrapper that tolerates values not seen during fit.

    An extra 'Unknown' class is added when fitting; at transform time every
    value that is not one of the fitted classes is encoded as 'Unknown'.
    """

    def __init__(self):
        """Create the wrapped, not yet fitted, LabelEncoder."""
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the encoder on all values of data_list plus the 'Unknown' class.
        :param data_list: iterable of string values
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode data_list as class ids; unseen values get the 'Unknown' id.
        :param data_list: iterable of string values
        :return: whatever the wrapped encoder's transform returns for the
                 substituted list
        """
        # One pass with a set lookup; the previous version rebuilt the whole
        # list once per unseen unique value.
        known_classes = set(self.label_encoder.classes_)
        new_data_list = [x if x in known_classes else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)

In [None]:
def parse_source_code(data_file):
    """Return the preprocessed token strings for every file listed in data_file.

    Thin alias for getTokensForTokenization.

    :param data_file: path of the dataset CSV
    :return: the list returned by getTokensForTokenization(data_file)
    """
    return getTokensForTokenization(data_file)

In [None]:
def encode_vector_of_nodes(docs):
    """Integer-encode token strings into equal-length vectors.

    Every document is lower-cased and split on single spaces. A
    MyLabelEncoder is fitted on the unique tokens of all documents, each
    document is transformed to integer ids, and the ids are right-padded
    with 0 to the length of the longest document.

    Example input: ['if foo for bar car', 'foo for if bar']

    NOTE(review): 0 is used as the padding value, but the encoder presumably
    also assigns id 0 to a real class, so padding cannot be told apart from
    that token -- confirm whether this is intended.

    :param docs: list of space-separated token strings
    :return: list of 1-D integer arrays, all of the same length
             ([] when docs is empty)
    """
    # Split once and reuse the token lists for both fitting and encoding.
    token_lists = [doc.lower().split(" ") for doc in docs]

    # Length of the longest document; 0 for an empty corpus (np.max would
    # raise on an empty sequence).
    max_len = max((len(tokens) for tokens in token_lists), default=0)
    print("Vector Length: ", max_len)
    if not token_lists:
        return []

    # Fit the encoder on every distinct token in the corpus.
    unique_words = np.unique(list(itertools.chain.from_iterable(token_lists)))
    label_encoder = MyLabelEncoder()
    label_encoder.fit(unique_words)

    encoded_docs = []
    for tokens in token_lists:
        integer_encoded = label_encoder.transform(tokens)
        # Right-pad with 0 so every vector has max_len entries.
        integer_encoded = np.pad(integer_encoded, (0, max_len - len(integer_encoded)))
        encoded_docs.append(integer_encoded)

    return encoded_docs

In [None]:
def semantic_classifier(clf):
    """Wrap a classifier in a pipeline that first applies an UnsupervisedDBN.

    The pipeline has two steps: 'dbn' (an UnsupervisedDBN with two hidden
    layers of 256 units) followed by 'clf' (the estimator passed in).

    :param clf: final estimator of the pipeline
    :return: the assembled, unfitted Pipeline
    """
    dbn_settings = {
        'hidden_layers_structure': [256, 256],
        'batch_size': 64,
        'learning_rate_rbm': 0.06,
        'n_epochs_rbm': 1,
        'activation_function': 'sigmoid',
        'verbose': 0,
    }
    feature_extractor = UnsupervisedDBN(**dbn_settings)

    return Pipeline(steps=[('dbn', feature_extractor), ('clf', clf)])

In [None]:
def kfold_cv(data, true_label):
    """Run shuffled 10-fold cross-validation for six classifiers.

    In every fold the features are standardised with a scaler fitted on the
    training part only. Each estimator (decision tree, k-NN, SVM, Gaussian
    naive Bayes, random forest, supervised DBN) is wrapped with
    semantic_classifier, fitted, timed, and scored with measure_performance.
    Fit and predict times are printed per fold and model.

    NOTE(review): the DBN here trains for 1 RBM epoch and 1 backprop
    iteration, whereas different_file_test uses 10 of each -- confirm which
    setting is intended.

    :param data: sequence of equal-length feature vectors
    :param true_label: class codes, one per row of data
    :return: tuple of 18 values -- (recall, precision, f1) averaged over the
             folds, for DT, KNN, SVM, NB, RF and DBN in that order
    """
    data = np.array(data)
    # Positional fancy indexing below needs an array, not e.g. a pandas Series.
    true_label = np.asarray(true_label)

    # 10 fold cv. KFold rejects n_splits < 2, so the previous value of 1
    # made this function raise before evaluating anything.
    kf = KFold(n_splits=10, shuffle=True, random_state=7)

    # (result key, name used in the timing printout, estimator factory).
    # Factories keep construction right before use, once per fold.
    model_specs = [
        ("DT", "CART", lambda: tree.DecisionTreeClassifier(
            criterion='gini', splitter='best',
            min_samples_split=2, min_weight_fraction_leaf=0.0)),
        ("KNN", "KNN", lambda: KNeighborsClassifier(n_neighbors=5, weights='distance')),
        ("SVM", "SVM", lambda: svm.SVC(gamma='auto', C=20.0, kernel='rbf')),
        ("NB", "NB", lambda: GaussianNB()),
        ("RF", "RF", lambda: RandomForestClassifier(n_estimators=10, criterion='gini')),
        ("DBN", "DBN", lambda: SupervisedDBNClassification(
            hidden_layers_structure=[256, 256],
            learning_rate_rbm=0.05,
            learning_rate=0.1,
            n_epochs_rbm=1,
            n_iter_backprop=1,
            batch_size=64,
            activation_function='relu',
            dropout_p=0.2,
            verbose=0)),
    ]

    # Per model: (recalls, precisions, f1s), one entry appended per fold.
    fold_scores = {key: ([], [], []) for key, _, _ in model_specs}

    for train_index, test_index in kf.split(data):
        train, test = data[train_index], data[test_index]
        train_label, test_label = true_label[train_index], true_label[test_index]

        # The DBN is affected by feature scale, so standardise the features.
        # Fit on the training part only, then apply to both parts.
        scaler = StandardScaler()
        scaler.fit(train)
        train = scaler.transform(train)
        test = scaler.transform(test)

        for key, timing_name, make_estimator in model_specs:
            classifier = semantic_classifier(make_estimator())

            start = time.perf_counter()
            classifier.fit(train, train_label)
            end = time.perf_counter()
            print(timing_name + " fit time", (end - start))

            start = time.perf_counter()
            predicted_label = classifier.predict(test)
            end = time.perf_counter()
            print(timing_name + " predict time", (end - start))

            recall, precision, f1 = measure_performance(test_label, predicted_label)
            recalls, precisions, f1s = fold_scores[key]
            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)

    # Average every metric over the folds, keeping the documented order.
    summary = []
    for key, _, _ in model_specs:
        for per_fold_values in fold_scores[key]:
            summary.append(np.mean(per_fold_values, axis=0))
    return tuple(summary)

In [None]:
def different_file_test(train, test, train_label, test_label):
    """Train six classifiers on one dataset and score them on another.

    Features are standardised with a scaler fitted on the training data
    only. Each estimator (decision tree, k-NN, SVM, Gaussian naive Bayes,
    random forest, supervised DBN) is wrapped with semantic_classifier,
    fitted on the training data and scored on the test data with
    measure_performance.

    :param train: training feature vectors
    :param test: test feature vectors
    :param train_label: class codes for train
    :param test_label: class codes for test
    :return: tuple of 18 values -- (recall, precision, f1) for DT, KNN, SVM,
             NB, RF and DBN in that order
    """
    train = np.array(train)
    test = np.array(test)

    # The DBN is affected by feature scale, so standardise the features:
    # fit on the training set only, then transform both sets.
    scaler = StandardScaler()
    scaler.fit(train)
    train, test = scaler.transform(train), scaler.transform(test)

    # Factories, in reporting order; each estimator is built right before use.
    estimator_factories = (
        lambda: tree.DecisionTreeClassifier(criterion='gini', splitter='best',
                                            min_samples_split=2, min_weight_fraction_leaf=0.0),
        lambda: KNeighborsClassifier(n_neighbors=5, weights='distance'),
        lambda: svm.SVC(gamma='auto', C=20.0, kernel='rbf'),
        lambda: GaussianNB(),
        lambda: RandomForestClassifier(n_estimators=10, criterion='gini'),
        lambda: SupervisedDBNClassification(hidden_layers_structure=[256, 256],
                                            learning_rate_rbm=0.05,
                                            learning_rate=0.1,
                                            n_epochs_rbm=10,
                                            n_iter_backprop=10,
                                            batch_size=64,
                                            activation_function='relu',
                                            dropout_p=0.2),
    )

    collected = []
    for build_estimator in estimator_factories:
        model = semantic_classifier(build_estimator())
        model.fit(train, train_label)
        predictions = model.predict(test)
        # measure_performance yields (recall, precision, f1) for this model.
        collected.extend(measure_performance(test_label, predictions))

    return tuple(collected)

In [None]:
def repeated_test(data, true_label, train_data, test_data, train_label, test_label, test_name, n_repeats=10):
    """Repeat an evaluation several times and print the median scores.

    When test_name is "k_fold", kfold_cv(data, true_label) is run on every
    repetition and the train/test arguments are not used. For any other
    test_name, different_file_test(train_data, test_data, train_label,
    test_label) is run and data / true_label are not used.

    For each model (DT, KNN, SVM, NB, RF, DBN) the median over the
    repetitions of recall, precision and f1 is printed.

    :param data: feature vectors for the k-fold mode
    :param true_label: class codes for the k-fold mode
    :param train_data: training feature vectors for the cross-dataset mode
    :param test_data: test feature vectors for the cross-dataset mode
    :param train_label: class codes for train_data
    :param test_label: class codes for test_data
    :param test_name: "k_fold" selects k-fold mode; anything else selects
                      the cross-dataset mode
    :param n_repeats: number of repetitions (default 10, the former
                      hard-coded value)
    :raises ValueError: if n_repeats is smaller than 1
    :return: None; results are printed
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    model_names = ("DT", "KNN", "SVM", "NB", "RF", "DBN")
    metric_titles = ("Recall:", "Precision:", "f1 score:")

    # One 18-value result tuple per repetition, laid out as
    # (recall, precision, f1) for each model in model_names order.
    runs = []
    for _ in range(n_repeats):
        if test_name == "k_fold":
            scores = kfold_cv(data, true_label)
        else:
            scores = different_file_test(train_data, test_data, train_label, test_label)
        runs.append(scores)

    for model_pos, model_name in enumerate(model_names):
        print("-------" + model_name + "-------")
        for metric_pos, title in enumerate(metric_titles):
            slot = 3 * model_pos + metric_pos
            print(title, np.median([run[slot] for run in runs], axis=0))

In [None]:
# Placeholders for the arguments of repeated_test; each experiment cell below
# overwrites the ones it actually needs. Every name gets its own empty list.
encoded_files = []
true_label = []
train_data = []
test_data = []
train_label = []
test_label = []

In [None]:
# Repeated k-fold evaluation on the MOZILLA dataset.
data = '..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv'
print("Data set: COLOCATED_MOZILLA.csv")

parsed_files = parse_source_code(data)
file_df = pd.read_csv(data)
print("Total File: ", len(file_df))

# Turn the ICP_STATUS column into integer class codes and the token strings
# into fixed-length integer vectors.
true_label = convert_label_to_numeric(file_df['ICP_STATUS'])
encoded_files = encode_vector_of_nodes(parsed_files)

# In "k_fold" mode repeated_test only consumes the first two arguments; the
# train/test arguments are passed through unused. It repeats the k-fold run
# and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_data, test_data, train_label, test_label,
    "k_fold",
)

In [None]:
# Repeated k-fold evaluation on the OPENSTACK dataset.
data = '..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv'
print("Data set: COLOCATED_OPENSTACK.csv")

parsed_files = parse_source_code(data)
file_df = pd.read_csv(data)
print("Total File: ", len(file_df))

# Turn the ICP_STATUS column into integer class codes and the token strings
# into fixed-length integer vectors.
true_label = convert_label_to_numeric(file_df['ICP_STATUS'])
encoded_files = encode_vector_of_nodes(parsed_files)

# In "k_fold" mode repeated_test only consumes the first two arguments; the
# train/test arguments are passed through unused. It repeats the k-fold run
# and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_data, test_data, train_label, test_label,
    "k_fold",
)

In [None]:
# Repeated k-fold evaluation on the WIKIMEDIA dataset.
data = '..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv'
print("Data set: COLOCATED_WIKIMEDIA.csv")

parsed_files = parse_source_code(data)
file_df = pd.read_csv(data)
print("Total File: ", len(file_df))

# Turn the ICP_STATUS column into integer class codes and the token strings
# into fixed-length integer vectors.
true_label = convert_label_to_numeric(file_df['ICP_STATUS'])
encoded_files = encode_vector_of_nodes(parsed_files)

# In "k_fold" mode repeated_test only consumes the first two arguments; the
# train/test arguments are passed through unused. It repeats the k-fold run
# and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_data, test_data, train_label, test_label,
    "k_fold",
)

In [None]:
# Cross-dataset evaluation: train on MOZILLA, test on OPENSTACK.
train_data = '..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv'
print("Train Data: COLOCATED_MOZILLA.csv")
parsed_train = parse_source_code(train_data)
train_file_df = pd.read_csv(train_data)
print("Total File: ", len(train_file_df))
train_label = convert_label_to_numeric(train_file_df['ICP_STATUS'])

test_data = '..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv'
print("Test Data: COLOCATED_OPENSTACK.csv")
parsed_test = parse_source_code(test_data)
test_file_df = pd.read_csv(test_data)
print("Total File: ", len(test_file_df))
test_label = convert_label_to_numeric(test_file_df['ICP_STATUS'])

# Encode both corpora in one call so they share a single vocabulary and a
# single padded length, then cut the result back at the train/test boundary.
parsed_file = parsed_train + parsed_test
encoded_file = encode_vector_of_nodes(parsed_file)
train_encoded = encoded_file[:len(train_file_df)]
test_encoded = encoded_file[len(train_file_df):]

# In this mode repeated_test ignores its first two arguments; it repeats the
# cross-dataset test and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_encoded, test_encoded, train_label, test_label,
    "different_file_test",
)

In [None]:
# Cross-dataset evaluation: train on MOZILLA, test on WIKIMEDIA.
train_data = '..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv'
print("Train Data: COLOCATED_MOZILLA.csv")
parsed_train = parse_source_code(train_data)
train_file_df = pd.read_csv(train_data)
print("Total File: ", len(train_file_df))
train_label = convert_label_to_numeric(train_file_df['ICP_STATUS'])

test_data = '..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv'
print("Test Data: COLOCATED_WIKIMEDIA.csv")
parsed_test = parse_source_code(test_data)
test_file_df = pd.read_csv(test_data)
print("Total File: ", len(test_file_df))
test_label = convert_label_to_numeric(test_file_df['ICP_STATUS'])

# Encode both corpora in one call so they share a single vocabulary and a
# single padded length, then cut the result back at the train/test boundary.
parsed_file = parsed_train + parsed_test
encoded_file = encode_vector_of_nodes(parsed_file)
train_encoded = encoded_file[:len(train_file_df)]
test_encoded = encoded_file[len(train_file_df):]

# In this mode repeated_test ignores its first two arguments; it repeats the
# cross-dataset test and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_encoded, test_encoded, train_label, test_label,
    "different_file_test",
)

In [None]:
# Cross-dataset evaluation: train on OPENSTACK, test on MOZILLA.
train_data = '..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv'
print("Train Data: COLOCATED_OPENSTACK.csv")
parsed_train = parse_source_code(train_data)
train_file_df = pd.read_csv(train_data)
print("Total File: ", len(train_file_df))
train_label = convert_label_to_numeric(train_file_df['ICP_STATUS'])

test_data = '..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv'
print("Test Data: COLOCATED_MOZILLA.csv")
parsed_test = parse_source_code(test_data)
test_file_df = pd.read_csv(test_data)
print("Total File: ", len(test_file_df))
test_label = convert_label_to_numeric(test_file_df['ICP_STATUS'])

# Encode both corpora in one call so they share a single vocabulary and a
# single padded length, then cut the result back at the train/test boundary.
parsed_file = parsed_train + parsed_test
encoded_file = encode_vector_of_nodes(parsed_file)
train_encoded = encoded_file[:len(train_file_df)]
test_encoded = encoded_file[len(train_file_df):]

# In this mode repeated_test ignores its first two arguments; it repeats the
# cross-dataset test and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_encoded, test_encoded, train_label, test_label,
    "different_file_test",
)

In [None]:
# Cross-dataset evaluation: train on OPENSTACK, test on WIKIMEDIA.
train_data = '..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv'
print("Train Data: COLOCATED_OPENSTACK.csv")
parsed_train = parse_source_code(train_data)
train_file_df = pd.read_csv(train_data)
print("Total File: ", len(train_file_df))
train_label = convert_label_to_numeric(train_file_df['ICP_STATUS'])

test_data = '..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv'
print("Test Data: COLOCATED_WIKIMEDIA.csv")
parsed_test = parse_source_code(test_data)
test_file_df = pd.read_csv(test_data)
print("Total File: ", len(test_file_df))
test_label = convert_label_to_numeric(test_file_df['ICP_STATUS'])

# Encode both corpora in one call so they share a single vocabulary and a
# single padded length, then cut the result back at the train/test boundary.
parsed_file = parsed_train + parsed_test
encoded_file = encode_vector_of_nodes(parsed_file)
train_encoded = encoded_file[:len(train_file_df)]
test_encoded = encoded_file[len(train_file_df):]

# In this mode repeated_test ignores its first two arguments; it repeats the
# cross-dataset test and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_encoded, test_encoded, train_label, test_label,
    "different_file_test",
)

In [None]:
# Cross-dataset evaluation: train on WIKIMEDIA, test on MOZILLA.
train_data = '..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv'
print("Train Data: COLOCATED_WIKIMEDIA.csv")
parsed_train = parse_source_code(train_data)
train_file_df = pd.read_csv(train_data)
print("Total File: ", len(train_file_df))
train_label = convert_label_to_numeric(train_file_df['ICP_STATUS'])

test_data = '..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv'
print("Test Data: COLOCATED_MOZILLA.csv")
parsed_test = parse_source_code(test_data)
test_file_df = pd.read_csv(test_data)
print("Total File: ", len(test_file_df))
test_label = convert_label_to_numeric(test_file_df['ICP_STATUS'])

# Encode both corpora in one call so they share a single vocabulary and a
# single padded length, then cut the result back at the train/test boundary.
parsed_file = parsed_train + parsed_test
encoded_file = encode_vector_of_nodes(parsed_file)
train_encoded = encoded_file[:len(train_file_df)]
test_encoded = encoded_file[len(train_file_df):]

# In this mode repeated_test ignores its first two arguments; it repeats the
# cross-dataset test and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_encoded, test_encoded, train_label, test_label,
    "different_file_test",
)

In [None]:
# Cross-dataset evaluation: train on WIKIMEDIA, test on OPENSTACK.
train_data = '..//updated_data//RAW_DATASETS//COLOCATED_WIKIMEDIA.csv'
print("Train Data: COLOCATED_WIKIMEDIA.csv")
parsed_train = parse_source_code(train_data)
train_file_df = pd.read_csv(train_data)
print("Total File: ", len(train_file_df))
train_label = convert_label_to_numeric(train_file_df['ICP_STATUS'])

test_data = '..//updated_data//RAW_DATASETS//COLOCATED_OPENSTACK.csv'
print("Test Data: COLOCATED_OPENSTACK.csv")
parsed_test = parse_source_code(test_data)
test_file_df = pd.read_csv(test_data)
print("Total File: ", len(test_file_df))
test_label = convert_label_to_numeric(test_file_df['ICP_STATUS'])

# Encode both corpora in one call so they share a single vocabulary and a
# single padded length, then cut the result back at the train/test boundary.
parsed_file = parsed_train + parsed_test
encoded_file = encode_vector_of_nodes(parsed_file)
train_encoded = encoded_file[:len(train_file_df)]
test_encoded = encoded_file[len(train_file_df):]

# In this mode repeated_test ignores its first two arguments; it repeats the
# cross-dataset test and prints the median scores.
repeated_test(
    encoded_files, true_label,
    train_encoded, test_encoded, train_label, test_label,
    "different_file_test",
)