In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, Dense, LeakyReLU, Bidirectional, LSTM, Dropout, Embedding
from tensorflow.keras.optimizers import Adam

In [40]:
with open("configs.json", "r") as config_file:
    configs = json.load(config_file)

RANDOM_STATE = configs["RANDOM_STATE"]
model_configs = configs['MODELS']['DEEP_LSTM']

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
filecontent_df = pd.read_csv(f"{configs['OUTPUTS_BASE_PATH']}processed_filecontent.csv")
filecontent_df.sample(10)

Unnamed: 0,sample_path,content,extension,content_len,Ratio space,Ratio NL,Ratio tab,Ratio _,Ratio #,Ratio :,...,Ratio |,Ratio &&,Ratio ||,Ratio ^,Ratio (,Ratio ),Ratio {,Ratio },Ratio [,Ratio ]
276,UnityAssets/Textures/AA_on.png.meta,fileFormatVersion: 2\nguid: 8359ffa9c0b0417459...,meta,1209,0.162945,0.046319,0.0,0.0,0.0,0.051282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002481,0.002481,0.001654,0.001654
297335,repository/Topez-Server-Core.package/TDAbstrac...,as yet unclassified\nlookupMethodInCompact: me...,st,898,0.219376,0.026726,0.0,0.0,0.0,0.03118,...,0.002227,0.0,0.0,0.003341,0.014477,0.012249,0.0,0.0,0.007795,0.007795
168724,server/static/blockly-games/js-read-only/goog/...,// Copyright 2008 The Closure Library Authors....,svn-base,2429,0.147386,0.035406,0.0,0.00494,0.0,0.000823,...,0.001235,0.0,0.000412,0.0,0.005352,0.005352,0.00494,0.00494,0.0,0.0
371604,CoreAudio/AudioUnits/AUPublic/AUCarbonViewBase...,/*\tCopyright © 2007 Apple Inc. All Rights Res...,cpp,4096,0.103271,0.022461,0.039307,0.000977,0.001465,0.002686,...,0.000977,0.0,0.000488,0.0,0.007324,0.007324,0.003174,0.002686,0.000732,0.000732
96274,src/main/resources/db/migration/sqlserver/V1.0...,ALTER TABLE [${ohdsiSchema}].[feasibility_stud...,sql,1528,0.147906,0.020942,0.0,0.056937,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003272,0.003272,0.006545,0.006545,0.030105,0.030105
370286,src/DoubleBuffer.cpp,/* Copyright (C) 2013 Webyog Inc\r\n\r\n Thi...,cpp,4096,0.128418,0.040283,0.014648,0.015625,0.000244,0.006348,...,0.0,0.0,0.0,0.0,0.010986,0.010742,0.004883,0.004395,0.000244,0.000244
308541,mysql-test/suite/sys_vars/t/gap_lock_write_log...,--source include/have_innodb.inc\n--source inc...,test,2292,0.055846,0.034031,0.0,0.086387,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000436,0.000436,0.0,0.0,0.0,0.0
56938,tools/ZoeeyDoc/docs/books/zoeeydoc/xsl/wiki/wi...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n\n<!--...",xsl,1115,0.13991,0.026009,0.0,0.0,0.0,0.007175,...,0.0,0.0,0.0,0.0,0.001794,0.001794,0.0,0.0,0.0,0.0
42554,target/linux/ramips/dts/W306R_V20.dts,"/dts-v1/;\n\n/include/ ""rt3050.dtsi""\n\n/ {\n\...",dts,1380,0.077536,0.062319,0.104348,0.001449,0.002899,0.004348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.012319,0.012319,0.0,0.0
295208,node_modules/zombie/node_modules/ws/build/Rele...,cmd_Release/obj.target/validation/src/validati...,d,2090,0.030144,0.011005,0.0,0.013876,0.0,0.00622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
train_df, val_df = train_test_split(
    filecontent_df.copy(), 
    test_size = configs["VAL_SIZE"], 
    stratify = filecontent_df["extension"], 
    random_state = RANDOM_STATE
)
train_df.shape, val_df.shape

((349208, 61), (38801, 61))

In [5]:
label_encoder = LabelEncoder()
train_df["ext_encoded"] = label_encoder.fit_transform(train_df["extension"])
val_df["ext_encoded"] = label_encoder.transform(val_df["extension"])
len(label_encoder.classes_)

147

In [6]:
train_dataset = Dataset.from_tensor_slices((train_df["content"], train_df["ext_encoded"]))
train_dataset = train_dataset.shuffle(model_configs['BUFFER_SIZE']).batch(model_configs['BATCH_SIZE']).prefetch(AUTOTUNE)
train_dataset

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [7]:
val_dataset = Dataset.from_tensor_slices((val_df["content"], val_df["ext_encoded"]))
val_dataset = val_dataset.shuffle(model_configs['BUFFER_SIZE']).batch(model_configs['BATCH_SIZE']).prefetch(AUTOTUNE)
val_dataset

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [8]:
for examples in train_dataset.take(1):
    texts = examples[0][:3]
    labels = examples[1][:3]

    for text, label in zip(texts, labels):
        print('Text:\n', text.numpy())
        print(f'\nLabel: {label.numpy()} - {label_encoder.classes_[label.numpy()]}')
        print("-" * 128)

Text:

Label: 1 - H
--------------------------------------------------------------------------------------------------------------------------------
Text:
 b'diff --git a/spec/multi_json_spec.rb b/spec/multi_json_spec.rb\nindex 2df3350..9d2a376 100644\n--- a/spec/multi_json_spec.rb\n+++ b/spec/multi_json_spec.rb\n@@ -13,13 +13,17 @@ end\n describe "MultiJson" do\n   context \'engines\' do\n     it \'should default to the best available gem\' do\n+      begin\n       require \'yajl\'\n       MultiJson.engine.name.should == \'MultiJson::Engines::Yajl\'\n+      rescue LoadError\n+        pending "YAJL (or a dependency) not available."\n+      end\n     end\n \n     it \'should be settable via a symbol\' do\n-      MultiJson.engine = :yajl\n-      MultiJson.engine.name.should == \'MultiJson::Engines::Yajl\'\n+      MultiJson.engine = :json_pure\n+      MultiJson.engine.name.should == \'MultiJson::Engines::JsonPure\'\n     end\n     \n     it \'should be settable via a class\' do\n'

Label:

In [9]:
%%time

@tf.keras.utils.register_keras_serializable()
def unicode_split(text):
    return tf.strings.unicode_split(text, 'UTF-8')

encoder = TextVectorization(
    standardize = None,
    max_tokens = model_configs['VOCAB_SIZE'],
    ngrams = model_configs['N_GRAMS'],
    split = unicode_split,
    output_sequence_length = model_configs['SEQ_LEN'],
    pad_to_max_tokens = True
)
encoder.adapt(train_dataset.map(lambda text, label: text))

len(encoder.get_vocabulary())

Cause: could not parse the source code of <function <lambda> at 0x0000029B3ED3C4C0>: no matching AST found among candidates:

Cause: could not parse the source code of <function <lambda> at 0x0000029B3ED3C4C0>: no matching AST found among candidates:

Wall time: 4min 57s


5000

In [10]:
encoder.get_vocabulary()[:25]

['',
 '[UNK]',
 ' ',
 '   ',
 'e',
 't',
 'i',
 'o',
 'r',
 'a',
 'n',
 's',
 '\n',
 'l',
 'c',
 'd',
 'p',
 'u',
 'm',
 '"',
 '.',
 '-',
 'h',
 '\n  ',
 'f']

In [11]:
texts = ["if else", "SELECT *"]
encoder(texts).numpy()[:, :20]

array([[   6,   24,    2,    4,   13,   11,    4,  226,  183,  217,  146,
         325,   71,    0,    0,    0,    0,    0,    0,    0],
       [  45,   48,   76,   48,   46,   44,    2,   55,  473, 1217,  617,
         729,  800,  491,  172,    0,    0,    0,    0,    0]],
      dtype=int64)

In [12]:
np.array(encoder.get_vocabulary())[encoder(texts).numpy()[:, :20]]

array([['i', 'f', ' ', 'e', 'l', 's', 'e', 'i f', 'f  ', '  e', 'e l',
        'l s', 's e', '', '', '', '', '', '', ''],
       ['S', 'E', 'L', 'E', 'C', 'T', ' ', '*', 'S E', 'E L', 'L E',
        'E C', 'C T', 'T  ', '  *', '', '', '', '', '']], dtype='<U5')

In [13]:
model = tf.keras.Sequential([
    encoder,
    Embedding(
        input_dim = len(encoder.get_vocabulary()),
        output_dim = 32
        ),
    Bidirectional(LSTM(128,  return_sequences = True)),
    Bidirectional(LSTM(128,  return_sequences = True)),
    Bidirectional(LSTM(128)),
    Dropout(0.1),
    Dense(512, activation = LeakyReLU()),
    Dropout(0.1),
    Dense(256, activation = LeakyReLU()),
    Dropout(0.1),
    Dense(256, activation = LeakyReLU()),
    Dense(len(label_encoder.classes_), activation = "softmax")
])

model.compile(
    optimizer = Adam(model_configs["LEARNING_RATE"]),
    loss = 'sparse_categorical_crossentropy',
    metrics = ['sparse_categorical_accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 4096)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 4096, 32)          160000    
                                                                 
 bidirectional (Bidirectiona  (None, 4096, 256)        164864    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 4096, 256)        394240    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              394240    
 nal)                                                   

In [14]:
del filecontent_df

In [15]:
%%time

history = model.fit(
    train_dataset, 
    validation_data = val_dataset, 
    epochs = model_configs["EPOCHS"], 
    steps_per_epoch = model_configs["TRAINING_STEPS"], 
    validation_steps = model_configs["VALIDATION_STEPS"]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 3h 50min 1s


In [16]:
loss, accuracy = model.evaluate(val_dataset)
print(f"Validation Loss: {round(loss, 3)}")
print(f"Validation Accuracy: {round(accuracy, 3)}")

Validation Loss: 1.57
Validation Accuracy: 0.584


In [19]:
model.save(f"{configs['MODELS_BASE_PATH']}{model_configs['MODEL_NAME']}")



INFO:tensorflow:Assets written to: ../models/Deep_LSTM\assets


INFO:tensorflow:Assets written to: ../models/Deep_LSTM\assets


In [70]:
metadata_file_name = "embedding_metadata.tsv"
metadata_file_path = f"{configs['MODELS_BASE_PATH']}{model_configs['EMBEDDINGS_FOLDER']}/{metadata_file_name}"

token_names = {
    '': "Empty String",
    '[UNK]': "Unknown"
}

with open(metadata_file_path, 'w', encoding = 'UTF-8') as metadata_file:
    metadata_file.write("Unicode\tNGram\tID\n")
    for i, v in enumerate(encoder.get_vocabulary()):
        token = repr(v) if v not in token_names.keys() else token_names[v]
        n_gram = (len(v) // 2) + 1 if i > 1 else 0
        metadata_file.write(f"{token}\t{n_gram}\t{i}\n")

In [68]:
from tensorboard.plugins import projector

embedding_weights = tf.Variable(model.layers[1].get_weights()[0])
checkpoint = tf.train.Checkpoint(embedding = embedding_weights)
checkpoint.save(f"{configs['MODELS_BASE_PATH']}{model_configs['EMBEDDINGS_FOLDER']}/embedding.ckpt")

projector_config = projector.ProjectorConfig()
embedding_conf = projector_config.embeddings.add()
embedding_conf.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_conf.metadata_path = metadata_file_name
projector.visualize_embeddings(f"{configs['MODELS_BASE_PATH']}{model_configs['EMBEDDINGS_FOLDER']}", projector_config)
