In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
from keras.preprocessing.text import Tokenizer
from keras.layers import Activation, Dense, Reshape 
import numpy as np
from keras import backend as K
import matplotlib.pyplot as plt
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, Conv1D, GlobalMaxPooling1D
from keras.models import load_model
from numpy import asarray
from numpy import save
from numpy import load
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import esprima as esp
import re
from keras.initializers import Constant
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy
from keras.utils import plot_model
from keras import Model
import pydot

plt.style.use('ggplot')

In [2]:
def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [3]:
# ***********************
def Read_data_and_split(path_csv_file='G:/JSContanaDataSet/script_path_df.csv'):
#     data = pd.read_csv("socialmedia_disaster.csv") #  9346 Samples
    data = pd.read_csv(path_csv_file)
    columns = data.columns
    sentences = data[columns[0]].values
    y = data[columns[1]].values
    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
    return sentences_train, sentences_test, y_train, y_test

In [4]:
# ***********************
def Read_data_and_split(path_csv_file='G:/JSContanaDataSet/script_path_df.csv'):
    data = pd.read_csv(path_csv_file)
    columns = data.columns
    x = data[columns[0]].values
    y = data[columns[1]].values
    sentences_train, sentences_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1000)
    return sentences_train, sentences_test, y_train, y_test

In [5]:
def chunking_data(Data,chunkNum):
    chunkSize = Data.shape[0]/chunkNum
    temp = []
    for i in range(chunkNum):
        satrt = int(i * chunkSize)
        end = int((i+1) * chunkSize)
        if Data.shape[0]-end>0:
            temp.append(Data[satrt:end])
        else:
            temp.append(Data[satrt:Data.shape[0]])
    return temp

In [6]:
def preprocessing_dataset(num_words,sentences_train,sentences_test,maxlen):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)
    vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
    return X_train,X_test,vocab_size

In [7]:
def evaluation(model,history,X_train,y_train,X_test,y_test):
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    print(classification_report(np.round(model.predict(X_test)), y_test))
    plot_history(history)

In [8]:
# ££££££££££££££££££££££££££££££££
def Model(vocab_size,embedding_dim,maxlen,myoptimizer,myloss,mymetrics,shape=100,Type=0):
    model = Sequential()
    if Type==0:
        model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
        model.add(Bidirectional(LSTM(units=50, input_shape=(None, 50), return_sequences=True)))
    else :
         model.add(layers.InputLayer(input_shape=shape.shape))
        
    
    model.add(Bidirectional(LSTM(units=50, input_shape=(None, 50), return_sequences=True)))

    # TextCNN with 4 conv layers
    model.add(Conv1D(128, 7, activation='tanh', input_shape=(None, 32)))
    model.add(Conv1D(128, 15, activation='tanh'))
    model.add(Conv1D(128, 25, activation='tanh'))
    model.add(Conv1D(128, 35, activation='tanh'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=myoptimizer,
                  loss=myloss,
                  metrics=[mymetrics])
    model.summary()
    return model

In [9]:
# *******************************
def fit_Model(model,model_name,X_train,y_train,X_test=0,y_test=0,chunk_size=10):
    model_name = str(model_name)+".h5"
    X_train_List = chunking_data(X_train,chunk_size)
    y_train_List = chunking_data(y_train,chunk_size)
   
    if X_test!=0:
        X_test_List = chunking_data(X_test,chunk_size)
        y_test_List = chunking_data(y_test,chunk_size)
        history = model.fit(X_train_List[0], y_train_List[0],epochs=10,verbose=False,validation_data=(X_test_List[0], y_test_List[0]))
    else:
        history = model.fit(X_train_List[0], y_train_List[0],epochs=10,verbose=False)
            
    model.save(model_name)
    for i in range(1,len(X_train_List)-1):
        model = load_model(model_name)
        if X_test!=0:
            model.fit(X_train_List[i], y_train_List[i],epochs=10,verbose=False,validation_data=(X_test_List[i], y_test_List[i]))
        else:
            model.fit(X_train_List[i], y_train_List[i],epochs=10,verbose=False)
        loaded_train_test_model.save(model_name)    
    return model , history , model_name

In [10]:
# *******************************
def fit_model(model,model_name,X_train,y_train,X_test=0,y_test=0):
    model_name = str(model_name)+".h5"
    if X_test!=0:
        history = model.fit(X_train, y_train,epochs=10,verbose=False,validation_data=(X_test, y_test))
    else:
        history = model.fit(X_train, y_train,epochs=10,verbose=False)
    model.save(model_name)
    return model , history , model_name

In [11]:
path_train_data = 'G:/JSContanaDataSet/script_path_df.csv'
num_words = 400000
maxlen = 1024
embedding_dim = 50
myoptimizer = 'adam'
myloss= 'binary_crossentropy'
mymetrics = 'accuracy'

sentences_train, sentences_test, y_train, y_test = Read_data_and_split(path_train_data)
# sentences_test, y_test = Read_data_and_split('G:/JSContanaDataSet/test_script_path_df.csv')
X_train,X_test,vocab_size = preprocessing_dataset(num_words,sentences_train,sentences_test,maxlen)
x_Data = np.concatenate((X_train, X_test), axis=0)
y_Data = np.concatenate((y_train, y_test), axis=0)

# Model for All Data

In [12]:
Model_for_all_data = Model(vocab_size,embedding_dim,maxlen,myoptimizer,myloss,mymetrics)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 50)          80640950  
_________________________________________________________________
bidirectional (Bidirectional (None, 1024, 100)         40400     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024, 100)         60400     
_________________________________________________________________
conv1d (Conv1D)              (None, 1018, 128)         89728     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1004, 128)         245888    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 980, 128)          409728    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 946, 128)          5

In [None]:
Model_for_all_data , history , model_name = fit_model(Model_for_all_data,"Model_for_all_data",x_Data[:1000],y_Data[:1000])

In [2]:
# *******************
keras_function = K.function([Model_for_all_data.input], [Model_for_all_data.layers[1].output])
first_layer_output = keras_function([x_Data[:1000], 1])
save('first_layer_output.npy', first_layer_output[0])
# first_layer_output = load('first_layer_output.npy')  

# Model for train&test data

In [None]:
train_test_Model = Model(vocab_size,embedding_dim,maxlen,myoptimizer,myloss,mymetrics)

In [None]:
train_test_Model , train_test_Model_history , train_test_Model_model_name = fit_model(train_test_Model,"train_test_Model",X_train[1000],y_train[1000],X_test[1000],y_test[1000])

In [None]:
evaluation(train_test_Model,train_test_Model_history,X_train[:750],y_train[:750],X_test[:250],y_test[:250])

# Model Without first LSTM layer

In [None]:
Model_without_first_layer = Model(vocab_size,embedding_dim,maxlen,myoptimizer,myloss,mymetrics,shape=first_layer_output[0][0].shape,Type=1)

In [None]:
# lstm_X_train, lstm_X_test, lstm_y_train, lstm_y_test = train_test_split(first_layer_output[0], y_Data, test_size=0.25, random_state=1000)

In [None]:
Model_without_first_layer , Model_without_first_layer_history , Model_without_first_layer_name = fit_model(Model_without_first_layer,"Model_without_first_layer",first_layer_output[0][:750],y_train[:750],first_layer_output[0][750:1000],y_test[750:1000])

In [None]:
evaluation(Model_without_first_layer,Model_without_first_layer_history,first_layer_output[0][:750],y_train[:750],first_layer_output[0][750:1000],y_test[750:1000])

In [9]:
# # ***********************
# def fit_Model(X_train,):
#     x_Data_List = chunking_data(x_Data,9)
#     y_Data_List = chunking_data(y_Data,9)
#     model_All_Data = Model_for_all_data()
#     history = model_All_Data.fit(x_Data_List[0], y_Data_List[0],
#                         epochs=10,
#                         verbose=False,
#                         batch_size=10)
#     model_All_Data.save("model_All_Data.h5")
#     for i in range(1,len(x_Data_List)-1):
#         loaded_model_All_Data = load_model("model_All_Data.h5")
#         loaded_model_All_Data.fit(x_Data_List[i], y_Data_List[i],
#                         epochs=10,
#                         verbose=False,
#                         batch_size=10)
#         loaded_model_All_Data.save("model_All_Data.h5") 
#     keras_function = K.function([loaded_model_All_Data.input], [loaded_model_All_Data.layers[1].output])
#     output = keras_function([x_Data, 1])
#     save('data.npy', output[0])
#     first_layer_output = load('data.npy')    
#     return loaded_model_All_Data , first_layer_output , "model_All_Data.h5" , 'data.npy'

In [18]:
# loaded_model_All_Data , first_layer_output , Apath , Path_first_layer_output =  fit_Model_for_all_data()

In [11]:
# clear_session()

In [19]:
# # *****************************
# def train_test_Model():
#     train_test_model = Sequential()
#     train_test_model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
#     train_test_model.add(Bidirectional(LSTM(units=64 , batch_input_shape=(10,X_train.shape[0],X_train.shape[1]), return_sequences=True)))
#     train_test_model.add(Bidirectional(LSTM(units=32 , return_sequences=True)))

#     # TextCNN with 4 conv layers
#     train_test_model.add(Conv1D(128, 7, activation='tanh', input_shape=(None, 32)))
#     train_test_model.add(Conv1D(128, 15, activation='tanh'))
#     train_test_model.add(Conv1D(128, 25, activation='tanh'))
#     train_test_model.add(Conv1D(128, 35, activation='tanh'))
#     train_test_model.add(GlobalMaxPooling1D())
#     train_test_model.add(Dense(64, activation='relu'))
#     train_test_model.add(Dropout(0.2))
#     train_test_model.add(Dense(1, activation='sigmoid'))
#     train_test_model.compile(optimizer=myoptimizer,
#                   loss=myloss,
#                   metrics=[mymetrics])
#     train_test_model.summary()
#     return train_test_model

In [27]:
# loaded_train_test_model, history_1 , Bpath = fit_train_test_Model()

In [28]:
# evaluation(loaded_train_test_model,history_1,X_train,y_train,X_test,y_test)

In [29]:
# for delete
# evaluation(loaded_train_test_model,history_1,X_train,y_train,X_test,y_test)

In [16]:
# clear_session()

In [17]:
# # *************************
# def Model_without_first_lstm_layer():
#     model_wihout_lstm = Sequential()

#     model_wihout_lstm.add(layers.InputLayer(input_shape=first_layer_output[0].shape))
#     model_wihout_lstm.add(Bidirectional(LSTM(units=32 , return_sequences=True)))

#     # TextCNN with 4 conv layers
#     model_wihout_lstm.add(Conv1D(128, 7, activation='tanh', input_shape=(None, 32)))
#     model_wihout_lstm.add(Conv1D(128, 15, activation='tanh'))
#     model_wihout_lstm.add(Conv1D(128, 25, activation='tanh'))
#     model_wihout_lstm.add(Conv1D(128, 35, activation='tanh'))
#     model_wihout_lstm.add(GlobalMaxPooling1D())
#     model_wihout_lstm.add(Dense(64, activation='relu'))
#     model_wihout_lstm.add(Dropout(0.2))
#     model_wihout_lstm.add(Dense(1, activation='sigmoid'))
#     model_wihout_lstm.compile(optimizer=myoptimizer,
#                   loss=myloss,
#                   metrics=[mymetrics])
#     model_wihout_lstm.summary()
#     return model_wihout_lstm

In [18]:
# # ********************************
# def fit_Model_without_first_lstm_layer():
#     out_lstm_X_train, out_lstm_X_test, out_lstm_y_train, out_lstm_y_test = train_test_split(first_layer_output, y_Data, test_size=0.25, random_state=1000)
#     lX_train_List = chunking_data(out_lstm_X_train,9)
#     ly_train_List = chunking_data(out_lstm_y_train,9)
#     model_wihout_lstm = Model_without_first_lstm_layer()
#     lX_test_List = chunking_data(out_lstm_X_test,9)
#     ly_test_List = chunking_data(out_lstm_y_test,9)
#     history = model_wihout_lstm.fit(lX_train_List[0], ly_train_List[0],
#                         epochs=10,
#                         verbose=False,
#                         validation_data=(lX_test_List[0], ly_test_List[0]))
#     model_wihout_lstm.save("model_wihout_lstm.h5")
#     for i in range(1,len(lX_train_List)-1):
#         loaded_model_wihout_lstm = load_model("model_wihout_lstm.h5")
#         loaded_model_wihout_lstm.fit(lX_train_List[i], ly_train_List[i],
#                         epochs=10,
#                         verbose=False,
#                         validation_data=(lX_test_List[i], ly_test_List[i]))
#         loaded_model_wihout_lstm.save("model_wihout_lstm.h5")
#     return loaded_model_wihout_lstm,history , out_lstm_X_train, out_lstm_X_test, out_lstm_y_train, out_lstm_y_test , "model_wihout_lstm.h5" ,

In [30]:
# loaded_model_wihout_lstm,history_2 , out_lstm_X_train, out_lstm_X_test, out_lstm_y_train, out_lstm_y_test , Cpath = fit_Model_without_first_lstm_layer()

In [31]:
# evaluation(loaded_model_wihout_lstm,history_2,out_lstm_X_train,out_lstm_y_train,out_lstm_X_test,out_lstm_y_test)

In [21]:
# clear_session()

In [32]:
# for delete
# evaluation(loaded_model_wihout_lstm,history_2,out_lstm_X_train,out_lstm_y_train,out_lstm_X_test,out_lstm_y_test)

In [73]:
# convert java script code to the abstract syntax tree
# then extract the sequences of syntax units with detailed information
def sequence_of_syntax_units(scripts):
    """
    sctipts : list or single javascript code 
    return sequence of syntax units
    each syntax unit correspond to line in an abstract syntax tree
    """
    if type(scripts) is list:
        # esp.parseScript(script) returns abstract syntax tree of each js scripts
        return [re.sub('\s+', ' ', ''.join(str(esp.parseScript(script)).split('\n'))) for script in scripts]
    elif type(scripts) is str:
        return re.sub('\s+', ' ', ''.join(str(esp.parseScript(scripts)).split('\n')))
    else:
        raise ValueError('The type of scripts parameter must be {list or string}')

In [74]:
# test sequence_of_syntax_units 
test_script = """
var fs = Npm.require("fs");

Package.describe({
  name: 'arch:ace-editor',
  summary: 'Integrating Ace editor with Meteor since 2015',
  version: '1.1.1',
  git: 'https://github.com/0a-/meteor-ace-editor'
});

Package.onUse(function(api) {
  api.versionsFrom('1.0.2.1');
  api.use('tracker', 'client');
  var files = fs.readdirSync('ace-builds/src-noconflict');
  files.forEach(function(file){
    if(file.substr(-3)===".js"){
        api.add_files("ace-builds/src-noconflict/"+file, "client", {isAsset: true});
    }
  });
  api.addFiles('core.js','client');
  api.export('AceEditor','client');
});

Package.onTest(function(api) {
  api.use('tinytest');
  api.use('arch:ace-editor');
  api.addFiles('tests.js','client');
});

"""
sequence_of_syntax_units(test_script)

'{ type: "Program", sourceType: "script", body: [ { type: "VariableDeclaration", declarations: [ { type: "VariableDeclarator", id: { type: "Identifier", name: "fs" }, init: { type: "CallExpression", callee: { type: "MemberExpression", computed: False, object: { type: "Identifier", name: "Npm" }, property: { type: "Identifier", name: "require" } }, arguments: [ { type: "Literal", value: "fs", raw: "\\"fs\\"" } ] } } ], kind: "var" }, { type: "ExpressionStatement", expression: { type: "CallExpression", callee: { type: "MemberExpression", computed: False, object: { type: "Identifier", name: "Package" }, property: { type: "Identifier", name: "describe" } }, arguments: [ { type: "ObjectExpression", properties: [ { type: "Property", key: { type: "Identifier", name: "name" }, computed: False, value: { type: "Literal", value: "arch:ace-editor", raw: "\'arch:ace-editor\'" }, kind: "init", method: False, shorthand: False }, { type: "Property", key: { type: "Identifier", name: "summary" }, comput

In [40]:
import gensim 
from nltk.tokenize import word_tokenize
import pickle 

############################
MODEL_FILENAME= 'static_word2vec.txt'
TRAIN_DATA_FILENAME= 'train_data.pkl'
MAX_LENGTH= 1024
EMBEDDING_DIM=50
############################

# create word tokens
def sequence_of_syntax_units_to_tokens(seq_of_syn_units):
    """
    Convert Sequence of syntax units to tokens to train  word2vec model 
    -----------------------------------------
    seq_of_syn_units: list of sequence syntax units correspond to abstract syntax trees
    """
    return list(map(word_tokenize, seq_of_syn_units))

In [41]:
def preprocessing_dataset(X_train, X_test, max_len=MAX_LENGTH):
    """
    Map tokens to integers to feed them into Embedding layer
    """
    tokenizer = Tokenizer(400000)
    tokenizer.fit_on_texts(X_train)
    X_train_sequence = tokenizer.texts_to_sequences(X_train)
    X_test_sequence = tokenizer.texts_to_sequences(X_test)
    return pad_sequences(X_train_sequence, maxlen=max_len, padding='post'), pad_sequences(X_test_sequence, maxlen=max_len, padding='post')

In [42]:
def extract_js_code_with_no_tag(code):
    return  list(map(lambda x: re.sub(r';', ' ; ', x), re.findall(r'javascript:(.*?\(.*?\))[\\/<>]?', code, re.IGNORECASE)))
 
def extract_js_code(script):
    #list all the scripts tags
    internal_js = re.findall(r'<script>?(.*?\(.*?\))[\\/<>]?',script , re.IGNORECASE)
    # remove unwanted words like javascript keyword and quotes 
    # add space befor and after semicolon
    clean_js_codes = []
    for code in internal_js:
        if re.search(r'javascript', code):
            clean_js_codes += extract_js_code_with_no_tag(code)  
        else:
            clean_js_codes.append(re.sub(r';', ' ; ', code))
    r = list(set(clean_js_codes + extract_js_code_with_no_tag(script)))
    if len(r) == 0:
        print(script)
    return r

In [72]:
with open("G:/JSContanaDataSet/programs_training.txt", 'r', encoding='utf-8') as file:
    path_scripts = file.readlines()
script_df = pd.DataFrame(columns=["ScriptName","Label"])
scripts = []
Label = []
for path_script in range(len(path_scripts)-1):
    try:
        s_p = "G:/JSContanaDataSet/"+path_scripts[path_script][:-1]
        with open(s_p) as js_file:
            script = js_file.readlines()
        scripts.append(script)
        if path_script%2==0:
            Label.append(0)
        else:
            Label.append(1)
    except:
        print(i + "_ ")
script_df["ScriptName"] = scripts
script_df["Label"] = Label

NameError: name 'i' is not defined

In [None]:
script_df.head()

In [63]:
len(path_scripts)

100000

In [56]:
path_scripts

['/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */\n',
 '/* This Source Code Form is subject to the terms of the Mozilla Public\n',
 ' * License, v. 2.0. If a copy of the MPL was not distributed with this\n',
 ' * file, You can obtain one at http://mozilla.org/MPL/2.0/. */\n',
 '\n',
 "var gTestfile = 'regress-472450-04.js';\n",
 '//-----------------------------------------------------------------------------\n',
 'var BUGNUMBER = 472450;\n',
 "var summary = 'TM: Do not assert: StackBase(fp) + blockDepth == regs.sp';\n",
 "var actual = '';\n",
 "var expect = '';\n",
 '\n',
 '\n',
 '//-----------------------------------------------------------------------------\n',
 'test();\n',
 '//-----------------------------------------------------------------------------\n',
 '\n',
 'function test()\n',
 '{\n',
 "  enterFunc ('test');\n",
 '  printBugNumber(BUGNUMBER);\n',
 '  printStatus (summary);\n',
 ' \n',
 '  jit(true);\n',
 '\n',
 '  ({__proto__: #1=[#1#]});\n'

In [None]:
print('number of scripts {}'.format(len(scripts)))
clean_scripts = []
for sc in scripts:
    try:
        #parse the response HTML page
        clean_scripts.append(extract_js_code(sc.strip()))
    except:
        print(sc)
clean_scripts = list(filter(lambda x: len(x)>0, clean_scripts))
print('number of clean scripts: {}'.format(len(clean_scripts)))

In [None]:
dataset = []
for sc in clean_scripts:
    for s in sc:
        try:
            sequence_of_syntax_units(s)
            dataset.append(s)
        except:
            print(sc)

In [None]:
target = [1] * 181

In [None]:
len(dataset)

In [None]:
outputs = [layer.output for layer in model.layers]
outputs

In [None]:
plot_model(model, show_shapes=True)