In [None]:
#todo: hyperparameter optimization, testing on different files

In [None]:
from dbn import SupervisedDBNClassification
import numpy as np
import pandas as pd
import csv, re
from sklearn.model_selection import train_test_split
from sklearn.metrics.classification import accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import itertools

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [None]:
def processTokensOfOneFile( oneFileContent ):
    """Turn the raw text of one file into a space-separated string of stemmed tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case and split on whitespace, drop English stop words, drop tokens
    shorter than three characters, then stem what is left.

    :param oneFileContent: the raw file content as a single string
    :return: the surviving stems joined by single spaces ("" when nothing survives)
    """
    english_stemmer = SnowballStemmer("english")

    # Digits, punctuation and other non-letters all become token separators
    candidate_tokens = re.sub("[^a-zA-Z]", " ", oneFileContent).lower().split()

    # A set gives constant-time membership checks for the stop-word filter
    stop_word_set = set(stopwords.words("english"))

    kept_stems = []
    for token in candidate_tokens:
        if token in stop_word_set:
            continue
        if len(token) < 3:
            # Only tokens of at least three characters are kept
            continue
        kept_stems.append(english_stemmer.stem(token))

    return " ".join(kept_stems)

In [None]:
def giveFileContent(fileNameParam, base_dir="..//updated_data"):
    """Read one text file and return its content flattened to a single string.

    Trailing whitespace (including the newline) is stripped from every line
    and the lines are joined with one space, so the last token of a line
    cannot fuse with the first token of the next line.

    :param fileNameParam: file path relative to ``base_dir``; it is appended
        to ``base_dir`` verbatim, so it is expected to start with a path separator
    :param base_dir: directory prefix that ``fileNameParam`` is appended to
    :return: the file content as one string ("" for an empty file)
    """
    # Mode 'rU' is rejected by Python 3.11+; plain text mode already uses
    # universal newlines. The with-block closes the handle deterministically.
    with open(base_dir + fileNameParam, 'r') as source_file:
        return " ".join(line_.rstrip() for line_ in source_file)

In [None]:
def getTokensForTokenization(datasetParam):
    """Build the preprocessed corpus for every file listed in a dataset CSV.

    The first CSV row is treated as a header and skipped. For every other
    non-blank row, the first column is passed to ``giveFileContent`` and the
    returned text is passed through ``processTokensOfOneFile``.

    :param datasetParam: path of the dataset CSV file
    :return: list with one preprocessed string per non-blank data row, in file order
    """
    completeCorpus = []
    # 'rU' is rejected by Python 3.11+; newline='' is the mode the csv module
    # asks for so that it can handle line endings itself.
    with open(datasetParam, 'r', newline='') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # skip the header row (no error if the file is empty)
        for row in reader_:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise IndexError
                continue
            fileContentAsStr = giveFileContent(row[0])
            completeCorpus.append(processTokensOfOneFile(fileContentAsStr))
    return completeCorpus

In [None]:
def convert_label_to_numeric(label):
    """Map textual labels to integer class ids.

    "ONLY_ONE" becomes 1, "NEUTRAL" becomes 0 and every other value becomes 2.

    :param label: iterable of labels (list, numpy array or pandas Series)
    :return: 1-D numpy integer array with one class id per input label
    """
    # Iterate over the values instead of using label[i]: on a pandas Series
    # label[i] is an index-label lookup, which fails (or picks the wrong row)
    # whenever the index is not the default 0..n-1 range.
    codes = [
        1 if value == "ONLY_ONE" else 0 if value == "NEUTRAL" else 2
        for value in label
    ]
    return np.array(codes, dtype='int')

In [None]:
def measure_performance(true_label, predicted_label):
    """Compute macro-averaged recall, precision and F1 for one set of predictions.

    :param true_label: ground-truth labels
    :param predicted_label: labels predicted by the model
    :return: tuple ``(recall, precision, f1)`` -- note that recall comes first
    """
    # The textual classification_report that used to be built here was never
    # used or returned, so it is no longer computed.
    recall = recall_score(true_label, predicted_label, average="macro")
    precision = precision_score(true_label, predicted_label, average="macro")
    f1 = f1_score(true_label, predicted_label, average="macro")
    return recall, precision, f1

In [None]:
class MyLabelEncoder(object):
    """LabelEncoder wrapper that maps values unseen during fit to an 'Unknown' class."""

    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        # Only available after fit(); mirrors the wrapped encoder's classes
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list: iterable of values to encode
        :return: the ids produced by the wrapped encoder, one per input value
        """
        # Build the set of known classes once and substitute in a single pass;
        # the previous version rebuilt the whole list for every unseen value.
        known_classes = set(np.asarray(self.label_encoder.classes_).tolist())
        new_data_list = [x if x in known_classes else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)

In [None]:
def parse_source_code(data_file):
    """Return the preprocessed token strings for the files listed in ``data_file``.

    Thin wrapper: the work is delegated to ``getTokensForTokenization``.

    :param data_file: path of the dataset CSV
    :return: whatever ``getTokensForTokenization(data_file)`` returns
    """
    return getTokensForTokenization(data_file)

In [None]:
def encode_vector_of_nodes(docs):
    """Integer-encode token strings into equal-length vectors.

    Every document is lower-cased and split on single spaces. Each distinct
    token gets an integer id from ``MyLabelEncoder``, and every encoded
    document is right-padded with zeros to the token count of the longest
    document.

    Example input: ['if foo for bar car', 'foo for if bar']

    :param docs: iterable of strings whose tokens are separated by single spaces
    :return: list of 1-D integer numpy arrays of equal length; ``[]`` when
        ``docs`` is empty
    """
    # Tokenize each document exactly once and reuse the result below
    tokenized_docs = [doc.lower().split(" ") for doc in docs]
    if not tokenized_docs:
        # Nothing to encode; taking the max of an empty list would raise ValueError
        return []

    # The longest document determines the length of every vector
    max_len = max(len(tokens) for tokens in tokenized_docs)
    print("Vector Length: ", max_len)

    # Vocabulary: all distinct tokens across all documents
    unique_words = np.unique(list(itertools.chain.from_iterable(tokenized_docs)))

    label_encoder = MyLabelEncoder()
    label_encoder.fit(unique_words)

    encoded_docs = []
    for tokens in tokenized_docs:
        integer_encoded = label_encoder.transform(tokens)
        # Right-pad with 0 to get fixed-size vectors.
        # NOTE(review): 0 is also an id the encoder can assign, so padding is
        # not distinguishable from that class -- confirm this is acceptable.
        integer_encoded = np.pad(integer_encoded, (0, max_len - len(integer_encoded)))
        encoded_docs.append(integer_encoded)

    return encoded_docs

In [None]:
def prediction_using_dbn(data, true_label):
    """Cross-validate a SupervisedDBNClassification model and print mean scores.

    Runs a shuffled 10-fold split (random_state 7). In every fold the features
    are standardized with statistics from the training part only, a new DBN is
    trained, and macro recall, precision and F1 are measured on the held-out
    part. The means over all folds are printed; nothing is returned.

    :param data: 2-D array-like with one row of features per sample
    :param true_label: 1-D array-like with one label per row of ``data``
    :raises ValueError: if ``data`` and ``true_label`` differ in length
    """
    data = np.array(data)
    # The folds index the labels by position, which needs an ndarray; a list
    # would fail and a pandas Series would be indexed by label instead.
    true_label = np.asarray(true_label)
    if len(data) != len(true_label):
        raise ValueError(
            "data has %d rows but true_label has %d entries" % (len(data), len(true_label))
        )

    # NOTE(review): this scaler is fitted on all rows before the split, so it
    # sees rows that are later held out; the per-fold StandardScaler below is
    # fitted on the training part only.
    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)

    # 10 fold cv
    kf = KFold(n_splits=10, shuffle = True, random_state = 7)

    cv_recall = []
    cv_precision = []
    cv_f1 = []

    for train_index, test_index in kf.split(data):
        train, test = data[train_index], data[test_index]
        train_label, test_label = true_label[train_index], true_label[test_index]

        # Standardize the features before training the DBN
        scaler = StandardScaler()
        # Fit on training set only.
        scaler.fit(train)
        # Apply transform to both the training set and the test set.
        train = scaler.transform(train)
        test = scaler.transform(test)

        dbn = SupervisedDBNClassification(hidden_layers_structure = [256, 256],
                    learning_rate_rbm=0.05,
                    learning_rate=0.001,
                    n_epochs_rbm=1,
                    n_iter_backprop=1,
                    batch_size=32,
                    activation_function='relu',
                    dropout_p=0.2)
        dbn.fit(train, train_label)
        predicted_label = dbn.predict(test)
        recall, precision, f1 = measure_performance(test_label, predicted_label)
        cv_recall.append(recall)
        cv_precision.append(precision)
        cv_f1.append(f1)

    print("Recall:", np.mean(cv_recall))
    print("Precision:", np.mean(cv_precision))
    print("f1 score:", np.mean(cv_f1))
    return

In [None]:
# Driver cell: build the corpus from the dataset CSV, derive the labels,
# encode the documents and run the cross-validated DBN evaluation.
data_file = '..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv'
# Preprocessed token strings for the files listed in the CSV
parsed_files = parse_source_code(data_file)

# The same CSV is read a second time with pandas to get the label column
file_df = pd.read_csv(data_file) 
print("Total File: ", file_df.shape[0])
true_label = file_df['COLOCATED_STATUS']
# Replace the textual status with integer class ids
true_label = convert_label_to_numeric(true_label)
# NOTE(review): file_names is not used by any cell visible here -- confirm it is still needed
file_names = np.unique(file_df['FILE_PATH'].tolist())

# Fixed-length integer vectors for every document, then 10-fold evaluation
encoded_files = encode_vector_of_nodes(parsed_files)
prediction_using_dbn(encoded_files, true_label)
    