# Sequence-to-sequence Transformer

**Author:** ABC<br>
**Date created:** 2022/01/20<br>
**Last modified:** 2022/04/20<br>
**Description:** Implementing a sequence-to-sequence Transformer and training it on an inline comment generation task.

## Introduction
Automatic code comment generation plays an important role in the software development life cycle, particularly in software maintenance. In practice, however, most programmers pay little attention to source code comments and focus only on the code itself, which reduces a program's readability and maintainability. A good mechanism for automatic comment generation is therefore needed to address this problem and improve efficiency.



## Setup

In [None]:
import pathlib
import random
import string
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from time import time

## Getting the data



In [None]:
# Mount Google Drive at /content/drive; the dataset is read from there below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the code/comment spreadsheet from Drive, drop the "Unnamed: 0" column,
# shuffle the rows, and report how long the load took.
t = time()

pre = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Data_No_Outliers.xlsx")
pre = pre.drop(columns=["Unnamed: 0"])
pre = pre.sample(frac=1)

print(f"There are {pre.shape[0]} data records in the dataset")
print(f"Time to Load the dataset: {round((time() - t) / 60, 2)} mins")

There are 258610 data records in the dataset
Time to Load the dataset: 0.48 mins


In [None]:
# Show the first rows of the shuffled frame as a sanity check.
pre.head()

Unnamed: 0,code,cmt
181342,"public static final <k,v> mapset<k,v> unmodifi...",returns unmodifiable mapset.
256755,publicid = publicid.normalize(publicid);,// always normalize the public identifier befo...
21116,public void notifylisteners(gerritconnectionev...,notifies all listeners of a gerrit connection ...
99575,"public static <t, r1, r> completablefuture<r> ...",perform a for comprehension over a completable...
96772,"private void inject(object target, field field...",performs the actual instance injection.


## Parsing the data


In [None]:
# Build (code, comment) pairs, wrapping every comment in the "[start]" /
# "[end]" markers that delimit the decoder's target sequence.
text_pairs = []
num_skipped = 0
for mthd, cmt in zip(pre["code"], pre["cmt"]):
    # A cell that is not a string (e.g. NaN for an empty spreadsheet cell)
    # cannot be vectorized; skip such rows explicitly and count them rather
    # than relying on a silently swallowed TypeError.
    if not (isinstance(mthd, str) and isinstance(cmt, str)):
        num_skipped += 1
        continue
    text_pairs.append((mthd, "[start] " + cmt + " [end]"))

if num_skipped:
    print(f"Skipped {num_skipped} rows with a non-string code or comment")

Here's what our code and comment pairs look like:

In [None]:
# Print five randomly chosen (code, comment) pairs.
for _ in range(5):
    sample_pair = random.choice(text_pairs)
    print(sample_pair)

('public static int getpowerof2(long value) { preconditions.checkargument(ispowerof2(value)); return long.size-(long.numberofleadingzeros(value)+1); }', '[start] returns an integer x such that 2^x=value. throws an exception if value is not a power of 2. [end]')
('list<isubmission> filteredsubmissions = new arraylist<isubmission>();', '[start] // // create the output array // [end]')
('public static action status(final httpstatus status) { return new action(input -> { input.setstatus(status); return input; }); }', '[start] sets http status to response [end]')
('public static jpaentry convertedbobjectentrytojpaentry(edbobjectentry entry, jpaobject owner) { for (edbconverterstep step : steps) { if (step.doesstepfit(entry.gettype())) { logger.debug("edbconverterstep {} fit for type {}", step.getclass().getname(), entry.gettype()); return step.converttojpaentry(entry, owner); } } logger.error("no edbconverterstep fit for edbobjectentry {}", entry); return null; }', '[start] converts a jpaen

Now, let's split the code–comment pairs into a training set, a validation set,
and a test set.

In [None]:
# Shuffle, then split: ~70% training, 15% validation, 15% test.
random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
val_end = num_train_samples + num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:val_end]
test_pairs = text_pairs[val_end:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

258610 total pairs
181028 training pairs
38791 validation pairs
38791 test pairs


## Vectorizing the text data



In [None]:
# Characters removed by custom_standardization: all ASCII punctuation plus
# "¿", except the square brackets (kept so "[start]"/"[end]" survive).
strip_chars = (string.punctuation + "¿").replace("[", "").replace("]", "")

In [None]:
vocab_size = 150000  # max_tokens for both vectorizers; TODO(review): value was marked as uncertain
sequence_length = 20  # tokens kept per code sequence (comments use sequence_length + 1); TODO(review): value was marked as uncertain
batch_size = 64  # number of pairs per batch in make_dataset


In [None]:
def custom_standardization(input_string):
    """Lower-case the input and delete every character listed in ``strip_chars``.

    Args:
        input_string: String tensor to standardize.

    Returns:
        The lower-cased string tensor with the ``strip_chars`` characters removed.
    """
    strip_pattern = "[%s]" % re.escape(strip_chars)
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")


# Vectorizer for source code: integer token ids, padded/truncated to
# sequence_length, using TextVectorization's default standardization.
code_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
)
# Vectorizer for comments: one token longer than the code sequences so that
# format_dataset can slice it into decoder inputs ([:, :-1]) and targets ([:, 1:]).
cmt_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize='lower_and_strip_punctuation'
)
# NOTE(review): custom_standardization (which keeps "[" and "]") is not used here.
# The built-in 'lower_and_strip_punctuation' presumably also strips the brackets
# from the "[start]"/"[end]" markers — confirm against the adapted vocabulary.

In [None]:
# Fit both vocabularies on the training split only.
train_code_texts = [code for code, _ in train_pairs]
train_cmt_texts = [cmt for _, cmt in train_pairs]

code_vectorization.adapt(train_code_texts)
cmt_vectorization.adapt(train_cmt_texts)

In [None]:
# Wrap the adapted comment vectorizer in a one-layer model so it can be
# exported in SavedModel format, then load it back and use the reloaded layer.
vectorize_layer_model_1 = tf.keras.models.Sequential()
vectorize_layer_model_1.add(tf.keras.Input(shape=(1,), dtype=tf.string))
vectorize_layer_model_1.add(cmt_vectorization)

filepath = "vectorizer_cmt"
vectorize_layer_model_1.save(filepath, save_format="tf")

# Reload from the path that was just written instead of a hard-coded absolute
# path, so the round trip works regardless of the current working directory.
loaded_vectorize_layer_model_1 = tf.keras.models.load_model(filepath)
cmt_vectorization = loaded_vectorize_layer_model_1.layers[0]


INFO:tensorflow:Assets written to: vectorizer_cmt/assets


In [None]:
# Wrap the adapted code vectorizer in a one-layer model so it can be
# exported in SavedModel format, then load it back and use the reloaded layer.
vectorize_layer_model_2 = tf.keras.models.Sequential()
vectorize_layer_model_2.add(tf.keras.Input(shape=(1,), dtype=tf.string))
vectorize_layer_model_2.add(code_vectorization)

filepath = "vectorizer_code"
vectorize_layer_model_2.save(filepath, save_format="tf")

# Reload from the path that was just written instead of a hard-coded absolute
# path, and keep the result under its own name rather than overwriting the
# reloaded comment-vectorizer model.
loaded_vectorize_layer_model_2 = tf.keras.models.load_model(filepath)
code_vectorization = loaded_vectorize_layer_model_2.layers[0]

INFO:tensorflow:Assets written to: vectorizer_code/assets


In [None]:
# Zip the exported comment-vectorizer directory (`files` is imported but not used in this cell).
from google.colab import files
!zip -r /content/V_cmt.zip /content/vectorizer_cmt

updating: content/vectorizer_cmt/ (stored 0%)
updating: content/vectorizer_cmt/saved_model.pb (deflated 69%)
updating: content/vectorizer_cmt/variables/ (stored 0%)
updating: content/vectorizer_cmt/variables/variables.data-00000-of-00001 (deflated 54%)
updating: content/vectorizer_cmt/variables/variables.index (deflated 20%)
updating: content/vectorizer_cmt/keras_metadata.pb (deflated 79%)
updating: content/vectorizer_cmt/assets/ (stored 0%)


In [None]:
# Zip the exported code-vectorizer directory (`files` is imported but not used in this cell).
from google.colab import files
!zip -r /content/V_code.zip /content/vectorizer_code

updating: content/vectorizer_code/ (stored 0%)
updating: content/vectorizer_code/saved_model.pb (deflated 70%)
updating: content/vectorizer_code/variables/ (stored 0%)
updating: content/vectorizer_code/variables/variables.data-00000-of-00001 (deflated 54%)
updating: content/vectorizer_code/variables/variables.index (deflated 20%)
updating: content/vectorizer_code/keras_metadata.pb (deflated 79%)
updating: content/vectorizer_code/assets/ (stored 0%)


In [None]:
# Vocabulary sizes of the (code, comment) vectorizers after adapt().
code_vectorization.vocabulary_size(),cmt_vectorization.vocabulary_size()

(150000, 70130)

In [None]:
# First 25 entries of the code vocabulary.
code_vectorization.get_vocabulary()[0:25]

['',
 '[UNK]',
 'return',
 'public',
 'if',
 'new',
 'null',
 'string',
 'void',
 'static',
 'final',
 'int',
 'throws',
 'for',
 '0',
 'private',
 'boolean',
 'i',
 'throw',
 't',
 'override',
 'else',
 'e',
 'value',
 'object']

In [None]:
def format_dataset(cde, cmts):
    """Vectorize a batch of code/comment strings into model inputs and targets.

    Args:
        cde: Batch of source-code strings.
        cmts: Batch of comment strings.

    Returns:
        A ``(inputs, targets)`` tuple. ``inputs`` is a dict with the vectorized
        code under "encoder_inputs" and the vectorized comment without its last
        token under "decoder_inputs"; ``targets`` is the vectorized comment
        without its first token (i.e. shifted one step ahead of the decoder input).
    """
    encoded_code = code_vectorization(cde)
    encoded_cmt = cmt_vectorization(cmts)
    model_inputs = {
        "encoder_inputs": encoded_code,
        "decoder_inputs": encoded_cmt[:, :-1],
    }
    targets = encoded_cmt[:, 1:]
    return (model_inputs, targets)


def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from (code, comment) pairs.

    Args:
        pairs: Non-empty sequence of ``(code_text, comment_text)`` string tuples.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches as
        produced by ``format_dataset``.
    """
    code_texts, cmt_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(code_texts), list(cmt_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # Cache the vectorized batches first and shuffle afterwards: caching a
    # shuffled stream would replay the first epoch's order in every later
    # epoch. Prefetch goes last so it overlaps input preparation with training.
    return dataset.cache().shuffle(2048).prefetch(16)


# Build the training and validation pipelines (the test split is not vectorized here).
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [None]:
# Print the dataset objects to inspect their element specs.
print(train_ds)
print(val_ds)

<CacheDataset element_spec=({'encoder_inputs': TensorSpec(shape=(None, 20), dtype=tf.int64, name=None), 'decoder_inputs': TensorSpec(shape=(None, 20), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))>
<CacheDataset element_spec=({'encoder_inputs': TensorSpec(shape=(None, 20), dtype=tf.int64, name=None), 'decoder_inputs': TensorSpec(shape=(None, 20), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))>


In [None]:
#One batch of data
# Print the contents of a single training batch.
for raw_record in train_ds.take(1):
  print(raw_record)


({'encoder_inputs': <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[  4806,   1144,  46693, ...,      0,      0,      0],
       [     3,      9,      8, ...,   1059,     25,   1059],
       [     3,      9,    434, ...,      6,     13,     10],
       ...,
       [     3,  25328,      1, ...,      0,      0,      0],
       [     3,     16,      1, ...,      7,    446, 140584],
       [     3,      9,    142, ...,      5,      1,      4]])>, 'decoder_inputs': <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[    3,  2061,    45, ...,     0,     0,     0],
       [    3,   330,   235, ...,     0,     0,     0],
       [    3,  3317,   702, ...,     0,     0,     0],
       ...,
       [    3,    48,     2, ...,    41,    22,    41],
       [    3,    42,   110, ...,     0,     0,     0],
       [    3, 50931,     4, ...,     0,     0,     0]])>}, <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[ 2061,    45,     4, ...,     0,     0,     0],
       [  330,   235,  

Let's take a quick look at the sequence shapes
(we have batches of 64 pairs, and all sequences are 20 steps long):

In [None]:
# Report the tensor shapes of the first training batch.
for inputs, targets in train_ds.take(1):
    encoder_shape = inputs["encoder_inputs"].shape
    decoder_shape = inputs["decoder_inputs"].shape
    print(f'inputs["encoder_inputs"].shape: {encoder_shape}')
    print(f'inputs["decoder_inputs"].shape: {decoder_shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 20)
inputs["decoder_inputs"].shape: (64, 20)
targets.shape: (64, 20)


## Building the model



Our sequence-to-sequence Transformer consists of a TransformerEncoder and a TransformerDecoder chained together. To make the model aware of word order, we also use a PositionalEmbedding layer.

The source sequence will be passed to the TransformerEncoder, which will produce a new representation of it. This new representation will then be passed to the TransformerDecoder, together with the target sequence so far (target words 0 to N). The TransformerDecoder will then seek to predict the next words in the target sequence (N+1 and beyond).

In [None]:
class TransformerEncoder(layers.Layer):
    """Encoder block: multi-head self-attention followed by a two-layer dense projection.

    Each sub-block is wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: Output width of the dense projection; also used as the
            attention layer's ``key_dim``.
        dense_dim: Width of the hidden (ReLU) layer of the dense projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply self-attention and the dense projection to ``inputs``.

        Args:
            inputs: Embedded input sequence.
            mask: Optional rank-2 mask; when given it is used as the
                attention mask so masked positions are not attended to.

        Returns:
            The encoded sequence after the second layer normalization.
        """
        # Default to "no mask" so the layer also works when no mask is
        # supplied (previously padding_mask was unbound in that case).
        padding_mask = None
        if mask is not None:
            # Insert two singleton axes so the rank-2 mask broadcasts against
            # the attention scores.
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(TransformerEncoder, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: Number of distinct positions in the position
            embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        """Embed token ids and add the embedding of each position index."""
        # Positions run 0..length-1 along the last axis of the input.
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Mark every non-zero token id as valid (id 0 is treated as padding)."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(PositionalEmbedding, self).get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [None]:
class TransformerDecoder(layers.Layer):
    """Decoder block: causal self-attention, attention over the encoder output, dense projection.

    Each of the three sub-blocks is wrapped in a residual connection and
    layer normalization.

    Args:
        embed_dim: Output width of the dense projection; also used as the
            ``key_dim`` of both attention layers.
        latent_dim: Width of the hidden (ReLU) layer of the dense projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        """Decode ``inputs`` while attending to ``encoder_outputs``.

        Args:
            inputs: Embedded target sequence.
            encoder_outputs: Sequence produced by the encoder.
            mask: Optional rank-2 mask; when given it is combined with the
                causal mask and used for the second attention layer.

        Returns:
            The decoded sequence after the third layer normalization.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        # Default to "no mask" so the layer also works when no mask is
        # supplied (previously padding_mask was unbound in that case).
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # NOTE(review): the combined padding+causal mask is sized by the
        # target length (it is built from `inputs`) yet is applied to the
        # attention over `encoder_outputs`, while self-attention above only
        # gets the causal mask. This fits only when source and target lengths
        # match — confirm that this routing is intended.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        """Build a lower-triangular int32 mask of shape (batch, seq, seq).

        Entry ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise, where batch
        and seq are the first two dimensions of ``inputs``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(TransformerDecoder, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "latent_dim": self.latent_dim,
                "num_heads": self.num_heads,
            }
        )
        return config



Next, we assemble the end-to-end model.

In [None]:
# Model hyperparameters shared by the encoder and the decoder.
embed_dim = 256
latent_dim = 2048
num_heads = 8

# Encoder: token ids -> positional embedding -> one TransformerEncoder block.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# Decoder: target token ids plus the encoder output -> one TransformerDecoder
# block -> dropout -> softmax over vocab_size classes.
# NOTE(review): the decoder embedding and output layer are sized with
# vocab_size (150000) although the comment vocabulary was reported as 70130
# entries — confirm whether the larger size is intended.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# End-to-end model: feed the encoder output into the decoder sub-model.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [None]:
# Print the layer-by-layer summary of the assembled model.
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   38405120    ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   3155456     ['positional_embedding[

## Training our model



In [None]:
# Show the GPU made available to this Colab session.
! nvidia-smi

Sat Apr 30 15:27:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0    39W / 250W |   1405MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Model checkpointing: write weights-only .h5 files, keeping only epochs that
# improve validation accuracy.

# Create the directory that `filepath` points into. exist_ok makes the cell
# safe to re-run, and the absolute path matches where checkpoints are written.
pathlib.Path("/content/check").mkdir(parents=True, exist_ok=True)
filepath = "/content/check/weights-{epoch:02d}-{val_accuracy:.2f}.h5"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

mkdir: cannot create directory ‘check’: File exists


In [None]:
# Additional training callbacks.

csv_logger=tf.keras.callbacks.CSVLogger("/content/Data.csv")  # log training metrics to CSV for future use
# NOTE(review): Prog_bar is created here but is not in the callbacks list passed to fit().
Prog_bar=tf.keras.callbacks.ProgbarLogger(count_mode='samples', stateful_metrics=None)
# TensorBoard logs go to a path relative to the current working directory.
tensorboard = tf.keras.callbacks.TensorBoard(
  log_dir='./content/Model/Logs',
  histogram_freq=1,
)

In [None]:
# Compile the end-to-end model and train it with checkpointing, CSV logging
# and TensorBoard logging enabled.
epochs = 60

transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
transformer.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[model_checkpoint_callback, csv_logger, tensorboard],
)

Layer PositionalEmbedding has arguments ['self', 'sequence_length', 'vocab_size', 'embed_dim']
in `__init__` and therefore must override `get_config()`.

Example:

class CustomLayer(keras.layers.Layer):
    def __init__(self, arg1, arg2):
        super().__init__()
        self.arg1 = arg1
        self.arg2 = arg2

    def get_config(self):
        config = super().get_config()
        config.update({
            "arg1": self.arg1,
            "arg2": self.arg2,
        })
        return config
Epoch 1/60


  layer_config = serialize_layer_fn(layer)


Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
 388/2829 [===>..........................] - ETA: 13:53 - loss: 1.8999 - accuracy: 0.6134

In [None]:
# Accuracy curves: training vs. validation, one point per recorded epoch.
from matplotlib import pyplot as plt

accuracy = transformer.history.history["accuracy"]
val_accuracy = transformer.history.history["val_accuracy"]

plt.figure()
plt.plot(val_accuracy, label="Accuracy Plot - Validation Data")
plt.plot(accuracy, label="Accuracy Plot - Training Data")
plt.title("Accuracy Variation")
plt.xlabel("Iterations")
plt.ylabel("Accuracy")
plt.plot()
plt.legend()


In [None]:
# Loss curves: training vs. validation, one point per recorded epoch.
plt.figure()

loss = transformer.history.history["loss"]
val_loss = transformer.history.history["val_loss"]

plt.plot(val_loss, label="Loss Plot - Validation Data")
plt.plot(loss, label="Loss Plot - Training Data")
plt.title("Loss Variation")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.plot()
plt.legend()

In [None]:
# Enable the TensorBoard notebook extension.
%load_ext tensorboard


In [None]:
# NOTE(review): the PID below is hard-coded; it presumably refers to a
# TensorBoard process from one particular session — look up the current PID
# before running this cell.
!kill '1358'
%tensorboard --logdir /content/content/Model/

## Decoding test sentences



In [None]:
# Map token ids back to comment-vocabulary strings, and cap the number of
# decoding steps.
cmt_vocab = cmt_vectorization.get_vocabulary()
cmt_index_lookup = dict(enumerate(cmt_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    """Greedily decode a comment for one code snippet.

    Starting from "[start]", the model is called repeatedly; at step ``i`` the
    highest-scoring token at position ``i`` is appended to the sentence.
    Decoding stops at the end marker or after ``max_decoded_sentence_length``
    steps.

    Args:
        input_sentence: Source-code text (str) to generate a comment for.

    Returns:
        The decoded string: "[start]" followed by the predicted tokens,
        including the end marker if one was produced.
    """
    tokenized_input_sentence = code_vectorization([input_sentence.lower()])

    # The comment vectorizer may standardize the "[end]" marker (for example
    # by stripping its brackets), in which case the raw literal never appears
    # in the vocabulary and decoding would never stop early. Resolve the
    # marker through the vectorizer to get the token as it is actually stored.
    end_token_index = int(cmt_vectorization(["[end]"])[0, 0])
    end_token = cmt_index_lookup[end_token_index]

    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Drop the last position so the decoder input has the length used in training.
        tokenized_target_sentence = cmt_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = int(np.argmax(predictions[0, i, :]))
        sampled_token = cmt_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == end_token:
            break
    return decoded_sentence


# Code snippets of the held-out test split.
test_code_texts = [code for code, _ in test_pairs]


In [None]:
# Decode two randomly chosen test snippets and print the results.
for _ in range(2):
    input_code = random.choice(test_code_texts)
    Comment = decode_sequence(input_code)
    print(f"Input Sequence is :{input_code}")
    print(f"Predicted Comment Output : {Comment}\n")

In [None]:
# Try the decoder on a hand-written input.
My_code="15+13"

Comment = decode_sequence(My_code)
print(Comment)

In [None]:
# Save the final weights to a path relative to the current working directory.
transformer.save_weights("Final_Model_Weights.h5")

In [None]:
# Reload the saved weights.
# NOTE(review): this absolute path matches the relative save path used above
# only when the working directory is /content — confirm.
transformer.load_weights("/content/Final_Model_Weights.h5")

In [None]:
# Zip the checkpoint directory (presumably for download — the download itself is not done here).
!zip -r /content/Check.zip /content/check

In [None]:
# Zip the /content/content directory, which the TensorBoard cell reads logs from.
!zip -r /content/content.zip /content/content