In [63]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
from keras.preprocessing.text import Tokenizer
from keras.layers import Activation, Dense, Reshape 
import numpy as np
from keras import backend as K
import matplotlib.pyplot as plt
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Conv1D, GlobalMaxPooling1D
from keras.models import load_model
from numpy import asarray
from numpy import save
from numpy import load
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import esprima as esp
import re
from keras.initializers import Constant
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy
from keras.utils import plot_model
from keras import Model
from keras.utils import Sequence
import pydot
import gensim 
from nltk.tokenize import word_tokenize
import pickle
from keras import backend as K
import os
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Seed every random number generator this notebook draws from so that runs are
# reproducible: NumPy, and the stdlib `random` module used by feature_selection.
# Keras weight initialisation is not seeded here.
np.random.seed(123)
random.seed(123)
plt.style.use('ggplot')

In [2]:
def save_object(obj, filename):
    """Pickle ``obj`` to ``<filename>.pkl`` with the highest pickle protocol.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filename : str
        Target path without extension; ``".pkl"`` is appended.

    Returns
    -------
    None
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename + ".pkl", 'wb') as outp:
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

In [3]:
def load_object(filename):
    """Load and return the object stored in ``<filename>.pkl``.

    Parameters
    ----------
    filename : str
        Source path without extension; ``".pkl"`` is appended.

    Returns
    -------
    object
        Whatever was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that this
    # notebook (or another trusted source) produced.
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename + ".pkl", 'rb') as inp:
        return pickle.load(inp)

In [4]:
def plot_history(history):
    """Draw accuracy and loss curves for training vs. validation, side by side.

    ``history`` must expose a ``history`` mapping containing the keys
    'accuracy', 'val_accuracy', 'loss' and 'val_loss'. The left panel shows
    the two accuracy curves, the right panel the two loss curves; the x axis
    counts epochs starting at 1.
    """
    metrics = history.history
    train_acc, valid_acc = metrics['accuracy'], metrics['val_accuracy']
    train_loss, valid_loss = metrics['loss'], metrics['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (subplot position, training curve, its label, validation curve, its label, title)
    panels = (
        (1, train_acc, 'Training acc', valid_acc, 'Validation acc',
         'Training and validation accuracy'),
        (2, train_loss, 'Training loss', valid_loss, 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_curve, train_label, valid_curve, valid_label, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_curve, 'b', label=train_label)
        plt.plot(epochs, valid_curve, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

In [5]:
def preprocessing_dataset_(scripts, fit=True):
    """Convert text sequences into a padded integer matrix.

    Relies on the module-level settings ``num_words`` (tokenizer vocabulary
    cap) and ``maxlen`` (padded sequence length).

    Parameters
    ----------
    scripts : iterable of str
        Texts to encode.
    fit : bool
        When True, a fresh ``Tokenizer`` is fitted on ``scripts`` and pickled
        as "our_tokenizer", together with the resulting vocabulary size as
        "vocab_size". Every call with ``fit=True`` therefore replaces any
        previously saved tokenizer. When False, the pickled tokenizer is
        loaded and reused.

    Returns
    -------
    The output of ``pad_sequences``: sequences post-padded to ``maxlen``.
    """
    if fit:
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(scripts)
        save_object(tokenizer, "our_tokenizer")
        # Adding 1 because of the reserved 0 index.
        save_object(len(tokenizer.word_index) + 1, "vocab_size")
    else:
        tokenizer = load_object("our_tokenizer")

    sequences = tokenizer.texts_to_sequences(scripts)
    return pad_sequences(sequences, padding='post', maxlen=maxlen)

In [6]:
# Convert JavaScript code to its abstract syntax tree (AST), then flatten the
# textual form of that tree into a single-line sequence of syntax units.
def sequence_of_syntax_units(scripts):
    """
    Flatten the AST of one or several JavaScript sources into plain text.

    scripts : a single JavaScript source (str) or a list of sources
    return  : one flattened string for a str input, a list of flattened
              strings for a list input
    raises  : ValueError if ``scripts`` is neither a list nor a str; any
              exception raised by ``esp.parseScript`` propagates unchanged

    Each syntax unit corresponds to a line of the AST's string form.
    """
    def _flatten(script):
        # str(AST) spans several lines: drop the newlines, then collapse
        # every run of whitespace into a single space.
        ast_text = str(esp.parseScript(script))
        return re.sub(r'\s+', ' ', ''.join(ast_text.split('\n')))

    if isinstance(scripts, list):
        return [_flatten(script) for script in scripts]
    if isinstance(scripts, str):
        return _flatten(scripts)
    raise ValueError('The type of scripts parameter must be {list or string}')

In [7]:
# Split each flattened syntax-unit string into word tokens
def sequence_of_syntax_units_to_tokens(seq_of_syn_units):
    """
    Tokenize every syntax-unit sequence so it can be used to train a word2vec model
    -----------------------------------------
    seq_of_syn_units: iterable of syntax-unit strings, one per abstract syntax tree
    returns: list holding the ``word_tokenize`` result for each input string, in order
    """
    token_lists = []
    for unit_sequence in seq_of_syn_units:
        token_lists.append(word_tokenize(unit_sequence))
    return token_lists

In [8]:
class CustomDataGen(Sequence):
    """Batch generator that reads JavaScript files listed in a DataFrame.

    For every batch, the files named in the path column are read, converted
    to flattened AST text with ``sequence_of_syntax_units`` and encoded with
    ``preprocessing_dataset_``. Files that cannot be read or parsed are
    dropped from the batch together with their labels, so a batch may contain
    fewer than ``batch_size`` rows.

    Parameters
    ----------
    df : DataFrame
        One row per script; copied on construction.
    X_col : dict
        ``X_col['path']`` names the column holding the script paths.
    y_col : dict
        ``y_col['name']`` names the column holding the labels.
    batch_size : int
        Number of DataFrame rows per batch.
    fit : bool
        Forwarded to ``preprocessing_dataset_``. With ``fit=True`` the
        tokenizer is re-fitted (and re-pickled) on every batch fetched.
    shuffle : bool
        Reshuffle the rows in ``on_epoch_end``.
    parent_path : str
        Directory prepended to every script path.
    """

    def __init__(self, df, X_col, y_col,batch_size,fit=False,shuffle=True,parent_path=""):
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.fit = fit
        self.shuffle = shuffle
        self.parent_path = parent_path
        self.n = df.shape[0]

    def on_epoch_end(self):
        """Shuffle the rows (when enabled) so the next epoch sees a new order."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __get_input(self, path):
        """Return the flattened AST text of one script, or None on failure."""
        try:
            with open(os.path.join(self.parent_path, path), 'r', encoding='utf-8') as js_file:
                script = js_file.read()
            return sequence_of_syntax_units(script)
        except Exception:
            # Best effort: unreadable or unparseable scripts are skipped by
            # the caller. KeyboardInterrupt/SystemExit are no longer swallowed.
            return None

    def __get_output(self, label):
        """Return the label unchanged."""
        return label

    def __get_data(self, batches):
        """Build the (X, y) arrays for one slice of the DataFrame."""
        path_batch = batches[self.X_col['path']]
        name_batch = batches[self.y_col['name']]

        parsed = [self.__get_input(path) for path in path_batch]
        labels = [self.__get_output(label) for label in name_batch]

        # Drop failed scripts AND their labels so X and y stay aligned.
        # (Previously the np.delete results were discarded, leaving failed
        # scripts in the batch as the literal text 'None'.)
        keep = [script is not None for script in parsed]
        X_batch = np.asarray(
            [script for script, ok in zip(parsed, keep) if ok]).astype('object')
        y_batch = np.asarray(
            [label for label, ok in zip(labels, keep) if ok]).astype('object')

        X_batch = preprocessing_dataset_(X_batch, fit=self.fit)

        X_batch = K.cast_to_floatx(X_batch)
        y_batch = K.cast_to_floatx(y_batch)

        return X_batch, y_batch

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        batches = self.df[index * self.batch_size:(index + 1) * self.batch_size]
        X, y = self.__get_data(batches)
        return X, y

    def __len__(self):
        """Number of full batches; a trailing partial batch is not counted."""
        return self.n // self.batch_size

In [39]:
def feature_selection(x_chunk,y_chunk):
    """Return the same random subset of rows from ``x_chunk`` and ``y_chunk``.

    Row positions are sampled without replacement from ``3 .. n-1`` where
    ``n = x_chunk.shape[0]``; the subset size is drawn uniformly from
    ``1 .. n-3``. Both arrays are indexed with the same positions.
    """
    row_count = x_chunk.shape[0]
    candidate_rows = range(3, row_count)
    how_many = random.randint(1, row_count - 3)
    picked = random.sample(candidate_rows, how_many)
    return x_chunk[picked], y_chunk[picked]

In [91]:
def evaluation(model,history,X_train,y_train,X_test,y_test,model_comparison_table,chunk_number=None):
    """Score ``model`` on a train and a test split and log one result row.

    Parameters
    ----------
    model : object
        Must provide ``evaluate(X, y, verbose=...)`` returning
        ``(loss, accuracy)`` and ``predict(X)``.
    history : object
        Unused; kept so existing call sites keep working.
    X_train, y_train, X_test, y_test : array-like
        Data the model is evaluated on.
    model_comparison_table : dict
        Column-oriented results table that ``inputer`` appends to.
    chunk_number : optional
        Label stored with the row. When omitted, the module-level loop
        counter ``i`` set by the calling cell is used, as before.
    """
    if chunk_number is None:
        # Backward-compatible fallback to the loop counter of the calling cell.
        chunk_number = i
    train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=False)
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=False)
    # Predict once and threshold by rounding; flatten so the predictions have
    # the same 1-D layout as the labels.
    predicted_labels = np.round(model.predict(X_test)).ravel()
    # classification_report expects (y_true, y_pred): the ground truth comes
    # first, otherwise precision/recall and support are reported swapped.
    report = classification_report(y_test, predicted_labels)
    inputer(model_comparison_table, chunk_number, train_loss, train_accuracy,
            test_loss, test_accuracy, str(report))

In [92]:
def create_model(shape=100,without_lstm=False):
    """Build and compile the BiLSTM + stacked-Conv1D binary classifier.

    With ``without_lstm=False`` the network starts with an Embedding layer
    (sized by the module-level ``vocab_size``, ``embedding_dim`` and
    ``maxlen``) followed by a bidirectional LSTM. With ``without_lstm=True``
    those two layers are replaced by an InputLayer of the given ``shape``.
    In both cases the remaining stack is the same: one bidirectional LSTM,
    four Conv1D layers, global max pooling, a 64-unit dense layer, dropout
    and a single sigmoid output. The model is compiled with the module-level
    ``myoptimizer``, ``myloss`` and ``mymetrics``.
    """
    network = Sequential()

    # Front end: either a raw input placeholder or Embedding + BiLSTM.
    if without_lstm:
        network.add(layers.InputLayer(input_shape=shape))
    else:
        network.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
        network.add(Bidirectional(LSTM(units=50, input_shape=(None, 50), return_sequences=True)))

    # Shared trunk starts with a bidirectional LSTM.
    network.add(Bidirectional(LSTM(units=50, input_shape=(None, 50), return_sequences=True)))

    # TextCNN with 4 conv layers of growing kernel width.
    network.add(Conv1D(128, 7, activation='tanh', input_shape=(None, 32)))
    for kernel_width in (15, 25, 35):
        network.add(Conv1D(128, kernel_width, activation='tanh'))

    # Classification head.
    network.add(GlobalMaxPooling1D())
    network.add(Dense(64, activation='relu'))
    network.add(Dropout(0.2))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(optimizer=myoptimizer,
                    loss=myloss,
                    metrics=[mymetrics])
    return network

In [34]:
def fit_model(model,X_train,y_train,X_validation,y_validation,epochs=10,verbose=False):
    """Fit ``model`` and return it together with the training history.

    Parameters
    ----------
    model : object
        Must provide a Keras-style ``fit`` method.
    X_train, y_train : array-like
        Training data.
    X_validation, y_validation : array-like
        Passed to ``fit`` as ``validation_data``.
    epochs : int
        Number of training epochs (default 10, the previously fixed value).
    verbose : bool or int
        Verbosity forwarded to ``fit`` (default False, as before).

    Returns
    -------
    tuple
        ``(model, history)`` where ``history`` is whatever ``fit`` returned.
    """
    history = model.fit(X_train, y_train,
                        epochs=epochs,
                        verbose=verbose,
                        validation_data=(X_validation, y_validation))
    return model, history

In [28]:
# ---- Data location --------------------------------------------------------
# Single place to change when the dataset lives somewhere else.
DATA_DIR = 'G:/JSContanaDataSet/'

train_df = pd.read_csv(DATA_DIR + 'script_path_data.csv')
test_df = pd.read_csv(DATA_DIR + 'script_path_test_data.csv')

# ---- Row subsets used in this experiment ----------------------------------
train_rows = train_df[:1000]
validate_rows = train_df[1000:1200]
test_rows = test_df[:500]

# ---- Batch sizes ----------------------------------------------------------
# Test/validation batch sizes are chosen so that all three generators yield
# the same number of batches as the training generator.
train_batch_size = 20
n_train_batches = train_rows.shape[0] / train_batch_size
test_batch_size = int(test_rows.shape[0] / n_train_batches)
validate_batch_size = int(validate_rows.shape[0] / n_train_batches)

# ---- Tokenizer / model settings (read as globals by other cells) ----------
# num_words = 400000
num_words = 400
# maxlen = 1024
maxlen = 100
embedding_dim = 50
myoptimizer = 'adam'
myloss= 'binary_crossentropy'
mymetrics = 'accuracy'

# ---- Generators -----------------------------------------------------------
# NOTE(review): with fit=True the tokenizer is re-fitted and re-pickled on
# every training batch, so each training chunk is encoded with its own
# vocabulary and the validation/test generators use the tokenizer of the last
# training batch fetched. Confirm this is intended.
traingen = CustomDataGen(train_rows,
                         X_col={'path':'ScriptPath'},
                         y_col={'name': 'Label'},
                         batch_size=train_batch_size,
                         parent_path=DATA_DIR,
                         fit=True)

validategen = CustomDataGen(validate_rows,
                         X_col={'path':'ScriptPath'},
                         y_col={'name': 'Label'},
                         batch_size=validate_batch_size,
                         parent_path=DATA_DIR,
                         fit=False)

testgen = CustomDataGen(test_rows,
                         X_col={'path':'ScriptPath'},
                         y_col={'name': 'Label'},
                         batch_size=test_batch_size,
                         parent_path=DATA_DIR,
                         fit=False)


def collect_chunks(generator, n_chunks):
    """Fetch the first ``n_chunks`` batches of ``generator``.

    Each batch is requested exactly once (the files behind it are read and
    parsed on every request) and split into parallel lists of X and y chunks.
    """
    X_chunks, y_chunks = [], []
    for chunk_index in range(n_chunks):
        X_chunk, y_chunk = generator[chunk_index]
        X_chunks.append(X_chunk)
        y_chunks.append(y_chunk)
    return X_chunks, y_chunks


# ---- Materialise all chunks (order matters: train first, it fits the tokenizer)
X_train_chunks, y_train_chunks = collect_chunks(
    traingen, int(train_rows.shape[0] / train_batch_size))
X_train, y_train = X_train_chunks[0], y_train_chunks[0]

X_test_chunks, y_test_chunks = collect_chunks(
    testgen, int(test_rows.shape[0] / test_batch_size))
X_test, y_test = X_test_chunks[0], y_test_chunks[0]

X_validate_chunks, y_validate_chunks = collect_chunks(
    validategen, int(validate_rows.shape[0] / validate_batch_size))
X_validate, y_validate = X_validate_chunks[0], y_validate_chunks[0]

# Vocabulary size pickled by the last tokenizer fit; read by create_model().
vocab_size = load_object('vocab_size')

(20, 100)


In [78]:
# Column-oriented results table for the train/test run: one empty list per
# metric, filled row by row through inputer().
model_comparison_table = {
    column: []
    for column in (
        'chunk_nuber',
        'train_loss',
        'train_accuracy',
        'test_loss',
        'test_accuracy',
        'classification_report',
    )
}

In [82]:
def inputer(model_comparison_table,chunk_nuber,train_loss, train_accuracy,test_loss,test_accuracy,classification_report):
    """Append one result row to the column-oriented comparison table.

    Every value is appended to the list stored under its column key; the
    chunk number is stored as a string, all other values are stored as given.
    """
    row = (
        ('chunk_nuber', str(chunk_nuber)),
        ('train_loss', train_loss),
        ('train_accuracy', train_accuracy),
        ('test_loss', test_loss),
        ('test_accuracy', test_accuracy),
        ('classification_report', classification_report),
    )
    for column, value in row:
        model_comparison_table[column].append(value)

In [84]:
# Train one model incrementally on the first half of the training chunks,
# checkpointing it to disk after every chunk. Each training chunk is paired
# with the test chunk of the same index, which serves both as validation data
# during fitting and as the evaluation split.
train_test_Model = create_model()
train_test_Model.save("train_test_Model.h5")
# Integer division: range() rejects the float that `/` yields in Python 3.
# The loop variable must stay named `i` -- evaluation() reads it as a global
# to label the logged row.
for i in range(len(X_train_chunks) // 2):
    print("================ "+str(i)+" ================")
    # Resume from the checkpoint written at the end of the previous iteration.
    train_test_Model = load_model("train_test_Model.h5")
    X_train,y_train = X_train_chunks[i],y_train_chunks[i]
    X_validation,y_validation = X_test_chunks[i],y_test_chunks[i]
    train_test_Model , history_train_test_Model = fit_model(train_test_Model,X_train,y_train,X_validation,y_validation)
    evaluation(train_test_Model,history_train_test_Model,X_train,y_train,X_validation,y_validation,model_comparison_table)
    train_test_Model.save("train_test_Model.h5")



In [85]:
# Tabulate the per-chunk metrics, write them to disk without the index
# column, and preview the first five rows.
df = pd.DataFrame(data=model_comparison_table)
df.to_csv(path_or_buf='G:/JSContanaDataSet/result_df.csv', index=False)
df.head()

Unnamed: 0,chunk_nuber,train_loss,train_accuracy,test_loss,test_accuracy,classification_report
0,0,0.20441,0.9,0.74545,0.6,precision recall f1-score ...
1,1,0.009919,1.0,2.62609,0.3,precision recall f1-score ...
2,2,0.606332,0.65,0.784802,0.5,precision recall f1-score ...
3,3,0.591738,0.75,0.653741,0.6,precision recall f1-score ...
4,4,0.656104,0.5,0.718224,0.5,precision recall f1-score ...


In [96]:
# Raw string of the first chunk's classification report.
df.loc[0, "classification_report"]

'              precision    recall  f1-score   support\n\n         0.0       0.50      0.50      0.50         4\n         1.0       0.67      0.67      0.67         6\n\n    accuracy                           0.60        10\n   macro avg       0.58      0.58      0.58        10\nweighted avg       0.60      0.60      0.60        10\n'

In [87]:
# Pretty-print the first chunk's classification report.
print(df.loc[0, "classification_report"])

              precision    recall  f1-score   support

         0.0       0.50      0.50      0.50         4
         1.0       0.67      0.67      0.67         6

    accuracy                           0.60        10
   macro avg       0.58      0.58      0.58        10
weighted avg       0.60      0.60      0.60        10



In [None]:
# Build a backend function that maps the model input to the output of
# train_test_Model.layers[1], then run every training chunk through it.
# NOTE(review): presumably layers[1] is the first Bidirectional LSTM added by
# create_model() (layers[0] being the Embedding) -- confirm against
# train_test_Model.summary().
keras_function = K.function([train_test_Model.input], [train_test_Model.layers[1].output])
for_traingen = []
for i in range(len(X_train_chunks)):
    # NOTE(review): the trailing 1 looks like a legacy learning-phase flag,
    # but only one input tensor was declared above -- confirm the installed
    # Keras version accepts this call.
    first_layer_output = keras_function([X_train_chunks[i], 1])
    # The function returns a list with one entry per requested output.
    for_traingen.append(first_layer_output[0])
# Persist the extracted activations, then read them straight back from disk.
save_object(for_traingen, "for_traingen")
for_traingen = load_object("for_traingen")

In [None]:
# Column-oriented results table for the validate/test run: one empty list per
# metric, filled row by row through inputer().
model_comparison_table_1 = {
    column: []
    for column in (
        'chunk_nuber',
        'train_loss',
        'train_accuracy',
        'test_loss',
        'test_accuracy',
        'classification_report',
    )
}

In [None]:
# Train a second model incrementally, this time on the validation chunks,
# checkpointing it to disk after every chunk. Each chunk is paired with the
# test chunk of the same index, which serves both as validation data during
# fitting and as the evaluation split.
validate_test_Model = create_model()
validate_test_Model.save("validate_test_Model.h5")
# The loop variable must stay named `i`: evaluation() reads it as a global to
# label the row it logs.
for i in range(len(X_validate_chunks)):
    print("================ "+str(i)+" ================")
    # Resume from the checkpoint written at the end of the previous iteration.
    validate_test_Model = load_model("validate_test_Model.h5")
    X_train,y_train = X_validate_chunks[i],y_validate_chunks[i]
    X_validation,y_validation = X_test_chunks[i],y_test_chunks[i]
    validate_test_Model , history_validate_test_Model = fit_model(validate_test_Model,X_train,y_train,X_validation,y_validation)
    # Results are appended to model_comparison_table_1.
    evaluation(validate_test_Model,history_validate_test_Model,X_train,y_train,X_validation,y_validation,model_comparison_table_1)
    validate_test_Model.save("validate_test_Model.h5")

In [None]:
# Tabulate the validate/test metrics, write them to disk without the index
# column, and preview the first five rows.
df_1 = pd.DataFrame(data=model_comparison_table_1)
df_1.to_csv(path_or_buf='G:/JSContanaDataSet/result_df_1.csv', index=False)
df_1.head()

In [None]:
# Build a backend function that maps the model input to the output of
# validate_test_Model.layers[1], then run every validation chunk through it.
# NOTE(review): presumably layers[1] is the first Bidirectional LSTM added by
# create_model() (layers[0] being the Embedding) -- confirm against
# validate_test_Model.summary().
keras_function = K.function([validate_test_Model.input], [validate_test_Model.layers[1].output])
for_validate = []
for i in range(len(X_validate_chunks)):
    # NOTE(review): the trailing 1 looks like a legacy learning-phase flag,
    # but only one input tensor was declared above -- confirm the installed
    # Keras version accepts this call.
    first_layer_output = keras_function([X_validate_chunks[i], 1])
    # The function returns a list with one entry per requested output.
    for_validate.append(first_layer_output[0])
# Persist the extracted activations, then read them straight back from disk.
save_object(for_validate, "for_validate")
for_validate = load_object("for_validate")

In [None]:
# Column-oriented results table for the LSTM-feature run: one empty list per
# metric, filled row by row through inputer().
model_comparison_table_2 = {
    column: []
    for column in (
        'chunk_nuber',
        'train_loss',
        'train_accuracy',
        'test_loss',
        'test_accuracy',
        'classification_report',
    )
}

In [25]:
# Train the model variant that starts from pre-computed layer activations
# (for_traingen / for_validate) instead of token ids. The input shape is taken
# from a single sample of the first training chunk.
# NOTE(review): for_traingen and for_validate were extracted from two
# different models (train_test_Model and validate_test_Model) -- confirm that
# mixing them here is intended.
lstm_Model = create_model(shape=for_traingen[0][0].shape,without_lstm=True)
lstm_Model.save("lstm_Model.h5")
# The loop variable must stay named `i`: evaluation() reads it as a global to
# label the row it logs.
for i in range(len(for_traingen)):
    print("================ "+str(i)+" ================")
    # Resume from the checkpoint written at the end of the previous iteration.
    lstm_Model = load_model("lstm_Model.h5")
    # Random row subsets of the activations together with their labels.
    X_train,y_train = feature_selection(for_traingen[i],y_train_chunks[i])
    # Fix: pair the validation activations with the validation LABELS
    # (y_validate_chunks); the token matrix X_validate_chunks was passed here.
    X_validation,y_validation = feature_selection(for_validate[i],y_validate_chunks[i])
    lstm_Model , history_lstm_Model = fit_model(lstm_Model,X_train,y_train,X_validation,y_validation)
    evaluation(lstm_Model,history_lstm_Model,X_train,y_train,X_validation,y_validation,model_comparison_table_2)
    lstm_Model.save("lstm_Model.h5")

In [None]:
# Tabulate the LSTM-feature metrics, write them to disk without the index
# column, and preview the first five rows.
df_2 = pd.DataFrame(data=model_comparison_table_2)
df_2.to_csv(path_or_buf='G:/JSContanaDataSet/result_df_2.csv', index=False)
df_2.head()