In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import TextVectorization, Dense, LeakyReLU, Bidirectional, LSTM, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorboard.plugins import projector

In [2]:
with open("configs.json", "r") as config_file:
    configs = json.load(config_file)

RANDOM_STATE = configs["RANDOM_STATE"]
model_configs = configs['MODELS']['DEEP_LSTM']

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
filecontent_df = pd.read_csv(f"{configs['OUTPUTS_BASE_PATH']}processed_filecontent.csv")
filecontent_df.sample(10)

Unnamed: 0,sample_path,content,extension,content_len,Ratio space,Ratio NL,Ratio tab,Ratio _,Ratio #,Ratio :,...,Ratio |,Ratio &&,Ratio ||,Ratio ^,Ratio (,Ratio ),Ratio {,Ratio },Ratio [,Ratio ]
374212,db/seeds/company/layouts/news.layout.html,"<html>\r\n<head>\r\n <meta name=""viewport"" co...",html,920,0.153261,0.03587,0.0,0.001087,0.001087,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01413,0.01413,0.002174,0.002174
177476,src/main/scala/L/mapbox/mapbox.scala,package io.github.bbarker.L.mapbox\n\nimport s...,scala,1389,0.202304,0.044636,0.0,0.00144,0.0,0.018719,...,0.00072,0.0,0.0,0.0,0.009359,0.009359,0.00216,0.00216,0.00072,0.00072
348951,src/Network/Receive/kRO/RagexeRE_2010_03_09a.pm,##############################################...,pm,4096,0.102295,0.036377,0.034424,0.012695,0.040771,0.008545,...,0.0,0.0,0.0,0.0,0.010254,0.010254,0.012451,0.012451,0.002197,0.002197
24451,valentia.Tests/Functions/Helper/ScheduledTask/...,﻿$here = Split-Path -Parent $MyInvocation.MyCo...,ps1,4096,0.215576,0.023193,0.0,0.0,0.0,0.000488,...,0.002686,0.0,0.0,0.0,0.000732,0.000732,0.004395,0.003662,0.000244,0.000244
331046,config/endercore/endercore.cfg,# Configuration file\n\nenchants {\n # Allo...,cfg,2698,0.237954,0.04003,0.0,0.0,0.021127,0.017791,...,0.0,0.0,0.0,0.0,0.001112,0.001112,0.001112,0.001112,0.006672,0.006672
317212,csharp/ICT/Testing/lib/MFinance/SampleData/Bud...,"""Cost Centre"",""Account"",""Budget Type"",""Year"",""...",csv,778,0.003856,0.007712,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025707,0.025707
283039,plugins/syntaxhighlighter/localization/_syntax...,# Copyright (C) 2014 SyntaxHighlighter Evolved...,po,4096,0.076172,0.04248,0.0,0.0,0.009766,0.02417,...,0.0,0.0,0.0,0.0,0.000977,0.000977,0.0,0.0,0.000244,0.000244
143100,hl7x/gen/vendor/2.4/SQR_S25.xsd,"<?xml version =""1.0"" encoding=""UTF-8""?>\n<!--\...",xsd,4096,0.166992,0.020264,0.000977,0.010742,0.0,0.020264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272801,public/admin/_tpl/posts-loop.tpl,"{if $jobs}\n\n<div class=""list"">\n\n{foreach i...",tpl,2705,0.042144,0.014787,0.027357,0.018484,0.0,0.001848,...,0.0,0.0,0.0,0.0,0.003697,0.003697,0.02403,0.02403,0.002218,0.002218
174398,source/blog/2013-07-17-migrating-to-ios-7.mark...,"---\ntitle: ""Migrating to iOS 7""\ndate: 2013-0...",markdown,632,0.134494,0.018987,0.0,0.0,0.0,0.007911,...,0.0,0.0,0.0,0.0,0.003165,0.003165,0.0,0.0,0.003165,0.003165


In [4]:
train_df, val_df = train_test_split(
    filecontent_df.copy(), 
    test_size = configs["VAL_SIZE"], 
    stratify = filecontent_df["extension"], 
    random_state = RANDOM_STATE
)
train_df.shape, val_df.shape

((349208, 61), (38801, 61))

In [5]:
label_encoder = LabelEncoder()
train_df["ext_encoded"] = label_encoder.fit_transform(train_df["extension"])
val_df["ext_encoded"] = label_encoder.transform(val_df["extension"])
len(label_encoder.classes_)

147

In [6]:
train_dataset = Dataset.from_tensor_slices((train_df["content"], train_df["ext_encoded"]))
train_dataset = train_dataset.shuffle(model_configs['BUFFER_SIZE']).batch(model_configs['BATCH_SIZE']).prefetch(AUTOTUNE)
train_dataset

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [7]:
val_dataset = Dataset.from_tensor_slices((val_df["content"], val_df["ext_encoded"]))
val_dataset = val_dataset.shuffle(model_configs['BUFFER_SIZE']).batch(model_configs['BATCH_SIZE']).prefetch(AUTOTUNE)
val_dataset

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [8]:
for examples in train_dataset.take(1):
    texts = examples[0][:3]
    labels = examples[1][:3]

    for text, label in zip(texts, labels):
        print('Text:\n', text.numpy())
        print(f'\nLabel: {label.numpy()} - {label_encoder.classes_[label.numpy()]}')
        print("-" * 128)

Text:

Label: 6 - as
--------------------------------------------------------------------------------------------------------------------------------
Text:
 b'/******************************************************************************\n *\n * Freescale Semiconductor Inc.\n * (c) Copyright 2004-2010 Freescale Semiconductor, Inc.\n * ALL RIGHTS RESERVED.\n *\n **************************************************************************//*!\n *\n * @file dfu_mouse.c\n *\n * @author\n *\n * @version\n *\n * @date\n *\n * @brief  The file emulates a mouse with buttons\n *         4 buttons are used on the demo board for the emulation\n *         PTG0--------- for left click(For JS16 PTG0 is inactive)\n *         PTG1--------- for right click\n *         PTG2--------- if macro "UP_LEFT" is defined mouse moves "left"\n *                       else mouse moves "right"\n *         PTG3--------- if macro "UP_LEFT" is defined mouse moves "up"\n *                       else mouse moves "down"\n 

In [9]:
%%time

@tf.keras.utils.register_keras_serializable()
def unicode_split(text):
    return tf.strings.unicode_split(text, 'UTF-8')

encoder = TextVectorization(
    standardize = None,
    max_tokens = model_configs['VOCAB_SIZE'],
    ngrams = model_configs['N_GRAMS'],
    split = unicode_split,
    output_sequence_length = model_configs['SEQ_LEN'],
    pad_to_max_tokens = True
)
encoder.adapt(train_dataset.map(lambda text, label: text))

len(encoder.get_vocabulary())

Cause: could not parse the source code of <function <lambda> at 0x0000027B83E148B0>: no matching AST found among candidates:

Cause: could not parse the source code of <function <lambda> at 0x0000027B83E148B0>: no matching AST found among candidates:

Wall time: 5min 7s


5000

In [10]:
encoder.get_vocabulary()[:25]

['',
 '[UNK]',
 ' ',
 '   ',
 'e',
 't',
 'i',
 'o',
 'r',
 'a',
 'n',
 's',
 '\n',
 'l',
 'c',
 'd',
 'p',
 'u',
 'm',
 '"',
 '.',
 '-',
 'h',
 '\n  ',
 'f']

In [11]:
texts = ["if else", "SELECT *"]
encoder(texts).numpy()[:, :20]

array([[   6,   24,    2,    4,   13,   11,    4,  226,  183,  217,  146,
         325,   71,    0,    0,    0,    0,    0,    0,    0],
       [  45,   48,   76,   48,   46,   44,    2,   55,  473, 1217,  617,
         729,  800,  491,  172,    0,    0,    0,    0,    0]],
      dtype=int64)

In [12]:
np.array(encoder.get_vocabulary())[encoder(texts).numpy()[:, :20]]

array([['i', 'f', ' ', 'e', 'l', 's', 'e', 'i f', 'f  ', '  e', 'e l',
        'l s', 's e', '', '', '', '', '', '', ''],
       ['S', 'E', 'L', 'E', 'C', 'T', ' ', '*', 'S E', 'E L', 'L E',
        'E C', 'C T', 'T  ', '  *', '', '', '', '', '']], dtype='<U5')

In [13]:
model = tf.keras.Sequential([
    encoder,
    Embedding(
        input_dim = len(encoder.get_vocabulary()),
        output_dim = 32
        ),
    Bidirectional(LSTM(128,  return_sequences = True)),
    Bidirectional(LSTM(128,  return_sequences = True)),
    Bidirectional(LSTM(128)),
    Dropout(0.1),
    Dense(512, activation = LeakyReLU()),
    Dropout(0.1),
    Dense(256, activation = LeakyReLU()),
    Dropout(0.1),
    Dense(256, activation = LeakyReLU()),
    Dense(len(label_encoder.classes_), activation = "softmax")
])

model.compile(
    optimizer = Adam(model_configs["LEARNING_RATE"]),
    loss = 'sparse_categorical_crossentropy',
    metrics = ['sparse_categorical_accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 4096)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 4096, 32)          160000    
                                                                 
 bidirectional (Bidirectiona  (None, 4096, 256)        164864    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 4096, 256)        394240    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              394240    
 nal)                                                   

In [14]:
del filecontent_df

In [15]:
%%time

history = model.fit(
    train_dataset, 
    validation_data = val_dataset, 
    epochs = model_configs["EPOCHS"], 
    steps_per_epoch = model_configs["TRAINING_STEPS"], 
    validation_steps = model_configs["VALIDATION_STEPS"]
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Wall time: 8h 33min 24s


In [16]:
loss, accuracy = model.evaluate(val_dataset)
print(f"Validation Loss: {round(loss, 3)}")
print(f"Validation Accuracy: {round(accuracy, 3)}")

Validation Loss: 0.989
Validation Accuracy: 0.725


In [17]:
model.save(f"{configs['MODELS_BASE_PATH']}{model_configs['MODEL_NAME']}")



INFO:tensorflow:Assets written to: ../models/Deep_LSTM\assets


INFO:tensorflow:Assets written to: ../models/Deep_LSTM\assets


In [18]:
metadata_file_name = "embedding_metadata.tsv"
metadata_file_path = f"{configs['MODELS_BASE_PATH']}{model_configs['EMBEDDINGS_FOLDER']}/{metadata_file_name}"

token_names = {
    '': "Empty String",
    '[UNK]': "Unknown"
}

with open(metadata_file_path, 'w', encoding = 'UTF-8') as metadata_file:
    metadata_file.write("Unicode\tNGram\tID\n")
    for i, v in enumerate(encoder.get_vocabulary()):
        token = repr(v) if v not in token_names.keys() else token_names[v]
        n_gram = (len(v) // 2) + 1 if i > 1 else 0
        metadata_file.write(f"{token}\t{n_gram}\t{i}\n")

embedding_weights = tf.Variable(model.layers[1].get_weights()[0])
checkpoint = tf.train.Checkpoint(embedding = embedding_weights)
checkpoint.save(f"{configs['MODELS_BASE_PATH']}{model_configs['EMBEDDINGS_FOLDER']}/embedding.ckpt")

projector_config = projector.ProjectorConfig()
embedding_conf = projector_config.embeddings.add()
embedding_conf.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_conf.metadata_path = metadata_file_name
projector.visualize_embeddings(f"{configs['MODELS_BASE_PATH']}{model_configs['EMBEDDINGS_FOLDER']}", projector_config)