**Objective:** Design, implement and evaluate deep learning solution to fix syntactic errors in C programs. 

##What to do

* Prepare data for deep learning
* Implement a deep learning model for sequence to sequence translation (from buggy to fix)
* Training and evaluating the deep learning model.

## Problem Statement

* <b>Line to line fixing </b> - map sourceLineTokens to targetLineTokens
* <b>Program to line fixing </b> - map sourceTokens to targetLineTokens

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import ast
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow import keras

In [4]:
df = pd.read_csv("")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,sourceText,targetText,sourceLineText,targetLineText,lineNums_Text,sourceTokens,targetTokens,sourceLineTokens,targetLineTokens
0,0,#include<stdio.h>\n\nint check_prime(int num)\...,#include<stdio.h>\n\nint check_prime(int num)\...,if ( num % i != = 0 ) \n,if ( num % i == 0 ) \n,10,"[['#include', '<stdio.h>'], [], ['int', 'check...","[['#include', '<stdio.h>'], [], ['int', 'check...","['if', '(', 'num', '%', 'i', '!=', '=', '0', ')']","['if', '(', 'num', '%', 'i', '==', '0', ')']"
1,1,"#include <stdio.h>\nstruct point{\n int x,y...","#include <stdio.h>\nstruct point{\n int x,y...",\n,} \n,39,"[['#include', ' <stdio.h>'], ['struct', 'point...","[['#include', ' <stdio.h>'], ['struct', 'point...",[],['}']
2,2,#include<stdio.h>\nint main(){\n int a=10;\...,#include<stdio.h>\nint main(){\n int a=10;\...,d = ( b * b ) - 4 * a * c \n,d = ( b * b ) - 4 * a * c ; \n,7,"[['#include', '<stdio.h>'], ['int', 'main', '(...","[['#include', '<stdio.h>'], ['int', 'main', '(...","['d', '=', '(', 'b', '*', 'b', ')', '-', '4', ...","['d', '=', '(', 'b', '*', 'b', ')', '-', '4', ..."
3,3,"#include<stdio.h>\nint main(){\n printf(""Le...","#include<stdio.h>\nint main(){\n printf(""Le...","printf ( ""Let\\s \""C"" ! ""); \n","printf ( ""Let\\s \""C\""!"" ) ; \n",3,"[['#include', '<stdio.h>'], ['int', 'main', '(...","[['#include', '<stdio.h>'], ['int', 'main', '(...","['printf', '(', '""Let\\\\s \\""', 'C', '"" ! ""',...","['printf', '(', '""Let\\\\s \\""', 'C', '\\', '""..."
4,4,"#include <stdio.h>\nint main() {\n\tint n,m,i,...","#include <stdio.h>\nint main() {\n\tint n,m,i,...",print f \n,\n,60,"[['#include', ' <stdio.h>'], ['int', 'main', '...","[['#include', ' <stdio.h>'], ['int', 'main', '...","['print', 'f']",[]


In [6]:
PAD_token = 0   # Used for padding short sentences
SOS_token = 1   # Start-of-sentence token
EOS_token = 2   # End-of-sentence token
OOV_token = 3   # out of vocabulary token

dim = 50

class Vocabulary:
    

    def __init__(self, name):
      
      self.name = name
      self.word2index = {"OOV": OOV_token, "PAD": PAD_token, "SOS": SOS_token, "EOS": EOS_token}
      self.word2count = {"OOV": 0}
      self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", OOV_token: "OOV"}
      self.num_words = 4
      self.longest_token_list = 0

    def add_word(self, word):

      if word not in self.word2index:
        # First entry of word into vocabulary
        self.word2index[word] = self.num_words
        self.word2count[word] = 1
        self.index2word[self.num_words] = word
        self.num_words += 1

      self.word2count[word] += 1
          
    def add_tokens(self, tokens, consider_oov=False):

      
      for token in tokens:
        self.add_word(token)
      

    def to_word(self, index):
      return self.index2word[index]

    def to_index(self, word):
      return self.word2index[word]

In [7]:


input_tokens = []
target_tokens = []

vocab = Vocabulary("input")

for i in range(len(df)):
  sourceLineToken = ast.literal_eval(df['sourceLineTokens'][i])
  input_tokens.append(sourceLineToken)
  vocab.add_tokens(sourceLineToken)

for i in range(len(df)):
  targetLineToken = ast.literal_eval(df['targetLineTokens'][i])
  target_tokens.append(targetLineToken)



vocab.word2count = sorted(vocab.word2count.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)

valid_dict = dict(vocab.word2count[:300])

vocab = Vocabulary("output")

for key in valid_dict.keys():
  vocab.add_word(key)


for i in range(len(input_tokens)):
  for j in range(len(input_tokens[i])):
    if input_tokens[i][j] not in vocab.word2index.keys():
      input_tokens[i][j] = "OOV"
  
for i in range(len(target_tokens)):
  for j in range(len(target_tokens[i])):
    if target_tokens[i][j] not in vocab.word2index.keys():
      target_tokens[i][j] = "OOV"




In [8]:
input_tokens[0]

['if', '(', 'num', '%', 'i', '!=', '=', '0', ')']

In [9]:
for i in range(len(input_tokens)):
  input_tokens[i] = input_tokens[i][:dim-1]

max_len = 0
for j in range(len(target_tokens)):
  target_tokens[j] = target_tokens[j][:dim-1]
  max_len = max(max_len, len(target_tokens[j]))

max_len
  

49

In [10]:
vocab.num_words

304

In [11]:
encoder_input_data = np.zeros(
    (len(input_tokens), dim + 1, vocab.num_words), dtype=float
)

decoder_input_data = np.zeros(
    (len(input_tokens), dim + 1, vocab.num_words), dtype=float
)

decoder_target_data = np.zeros(
    (len(input_tokens), dim + 1, vocab.num_words), dtype=float
)

In [12]:
for i, (input_token, target_token) in enumerate(zip(input_tokens, target_tokens)):

    # input_token = input_token[:dim-1]
    for t, token in enumerate(input_token):
        encoder_input_data[i, t, vocab.word2index[token]] = 1.0

    t = t + 1
    encoder_input_data[i, t, EOS_token] = 1.0

    encoder_input_data[i, t + 1: , PAD_token] = 1.0

    # For decoder

    # target_token = target_token[:dim - 1]
    decoder_input_data[i, 0, SOS_token] = 1.0
    for t, token in enumerate(target_token):
        # print(vocab_output.word2index[token], t+1)
        decoder_input_data[i, t + 1, vocab.word2index[token]] = 1.0
        decoder_target_data[i, t, vocab.word2index[token]] = 1.0
        # print(t+1, vocab.word2index[token])
    # print(t+1, EOS_token)
    t = t + 1
    decoder_input_data[i, t + 1, EOS_token] = 1.0
    decoder_target_data[i, t, EOS_token] = 1.0

    decoder_input_data[i, t + 2: , PAD_token] = 1.0
    decoder_target_data[i, t + 1: , PAD_token] = 1.0

    # print(decoder_input_data[i, t + 7 , PAD_token])



In [13]:
decoder_target_data.shape

(14643, 51, 304)

In [14]:
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, vocab.num_words))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, vocab.num_words))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(vocab.num_words, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [15]:
from keras.callbacks import ModelCheckpoint

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

filepath="/content/drive/MyDrive/Colab Notebooks/ASEML/"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.3,
    callbacks=callbacks_list,
)

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.80265, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 2/100

Epoch 00002: val_accuracy improved from 0.80265 to 0.84433, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 3/100

Epoch 00003: val_accuracy improved from 0.84433 to 0.87374, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 4/100

Epoch 00004: val_accuracy improved from 0.87374 to 0.88403, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 5/100

Epoch 00005: val_accuracy improved from 0.88403 to 0.88724, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 6/100

Epoch 00006: val_accuracy improved from 0.88724 to 0.89528, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 7/100

Epoch 00007: val_accuracy improved from 0.89528 to 0.90343, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 8/100

Epoch 00008: val_accuracy improved from 0.90343 to 0.90738, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 9/100

Epoch 00009: val_accuracy improved from 0.90738 to 0.91275, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 10/100

Epoch 00010: val_accuracy improved from 0.91275 to 0.91370, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 11/100

Epoch 00011: val_accuracy improved from 0.91370 to 0.91686, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 12/100

Epoch 00012: val_accuracy did not improve from 0.91686
Epoch 13/100

Epoch 00013: val_accuracy did not improve from 0.91686
Epoch 14/100

Epoch 00014: val_accuracy improved from 0.91686 to 0.92191, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 15/100

Epoch 00015: val_accuracy improved from 0.92191 to 0.92459, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 16/100

Epoch 00016: val_accuracy improved from 0.92459 to 0.92534, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 17/100

Epoch 00017: val_accuracy improved from 0.92534 to 0.92763, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 18/100

Epoch 00018: val_accuracy improved from 0.92763 to 0.92805, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 19/100

Epoch 00019: val_accuracy did not improve from 0.92805
Epoch 20/100

Epoch 00020: val_accuracy improved from 0.92805 to 0.92938, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 21/100

Epoch 00021: val_accuracy improved from 0.92938 to 0.93161, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 22/100

Epoch 00022: val_accuracy did not improve from 0.93161
Epoch 23/100

Epoch 00023: val_accuracy improved from 0.93161 to 0.93243, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 24/100

Epoch 00024: val_accuracy improved from 0.93243 to 0.93323, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 25/100

Epoch 00025: val_accuracy did not improve from 0.93323
Epoch 26/100

Epoch 00026: val_accuracy did not improve from 0.93323
Epoch 27/100

Epoch 00027: val_accuracy improved from 0.93323 to 0.93378, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 28/100

Epoch 00028: val_accuracy improved from 0.93378 to 0.93550, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 29/100

Epoch 00029: val_accuracy improved from 0.93550 to 0.93564, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 30/100

Epoch 00030: val_accuracy did not improve from 0.93564
Epoch 31/100

Epoch 00031: val_accuracy improved from 0.93564 to 0.93781, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 32/100

Epoch 00032: val_accuracy did not improve from 0.93781
Epoch 33/100

Epoch 00033: val_accuracy did not improve from 0.93781
Epoch 34/100

Epoch 00034: val_accuracy did not improve from 0.93781
Epoch 35/100

Epoch 00035: val_accuracy improved from 0.93781 to 0.93811, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 36/100

Epoch 00036: val_accuracy improved from 0.93811 to 0.93960, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 37/100

Epoch 00037: val_accuracy did not improve from 0.93960
Epoch 38/100

Epoch 00038: val_accuracy did not improve from 0.93960
Epoch 39/100

Epoch 00039: val_accuracy did not improve from 0.93960
Epoch 40/100

Epoch 00040: val_accuracy did not improve from 0.93960
Epoch 41/100

Epoch 00041: val_accuracy did not improve from 0.93960
Epoch 42/100

Epoch 00042: val_accuracy did not improve from 0.93960
Epoch 43/100

Epoch 00043: val_accuracy did not improve from 0.93960
Epoch 44/100

Epoch 00044: val_accuracy did not improve from 0.93960
Epoch 45/100

Epoch 00045: val_accuracy did not improve from 0.93960
Epoch 46/100

Epoch 00046: val_accuracy did not improve from 0.93960
Epoch 47/100

Epoch 00047: val_accuracy improved from 0.93960 to 0.93964, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 48/100

Epoch 00048: val_accuracy did not improve from 0.93964
Epoch 49/100

Epoch 00049: val_accuracy did not improve from 0.93964
Epoch 50/100

Epoch 00050: val_accuracy did not improve from 0.93964
Epoch 51/100

Epoch 00051: val_accuracy improved from 0.93964 to 0.93998, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 52/100

Epoch 00052: val_accuracy did not improve from 0.93998
Epoch 53/100

Epoch 00053: val_accuracy did not improve from 0.93998
Epoch 54/100

Epoch 00054: val_accuracy did not improve from 0.93998
Epoch 55/100

Epoch 00055: val_accuracy did not improve from 0.93998
Epoch 56/100

Epoch 00056: val_accuracy did not improve from 0.93998
Epoch 57/100

Epoch 00057: val_accuracy did not improve from 0.93998
Epoch 58/100

Epoch 00058: val_accuracy improved from 0.93998 to 0.94050, saving model to /content/drive/MyDrive/Colab Notebooks/ASEML




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/ASEML/assets


Epoch 59/100

Epoch 00059: val_accuracy did not improve from 0.94050
Epoch 60/100

Epoch 00060: val_accuracy did not improve from 0.94050
Epoch 61/100

Epoch 00061: val_accuracy did not improve from 0.94050
Epoch 62/100

Epoch 00062: val_accuracy did not improve from 0.94050
Epoch 63/100

Epoch 00063: val_accuracy did not improve from 0.94050
Epoch 64/100

Epoch 00064: val_accuracy did not improve from 0.94050
Epoch 65/100

Epoch 00065: val_accuracy did not improve from 0.94050
Epoch 66/100

Epoch 00066: val_accuracy did not improve from 0.94050
Epoch 67/100

Epoch 00067: val_accuracy did not improve from 0.94050
Epoch 68/100

Epoch 00068: val_accuracy did not improve from 0.94050
Epoch 69/100

Epoch 00069: val_accuracy did not improve from 0.94050
Epoch 70/100

Epoch 00070: val_accuracy did not improve from 0.94050
Epoch 71/100

Epoch 00071: val_accuracy did not improve from 0.94050
Epoch 72/100

Epoch 00072: val_accuracy did not improve from 0.94050
Epoch 73/100

Epoch 00073: val_acc

<tensorflow.python.keras.callbacks.History at 0x7f74fd5f4110>

In [16]:
 # Define sampling models
# Restore the model and construct the encoder and decoder.
model = keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/ASEML/s2s")

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)


In [47]:
val_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ASEML/demo/valid.csv")
val_df.head()

Unnamed: 0.1,Unnamed: 0,sourceText,targetText,sourceLineText,targetLineText,lineNums_Text,sourceTokens,targetTokens,sourceLineTokens,targetLineTokens
0,14643,#include <stdio.h>\n#include <stdlib.h>\nint f...,#include <stdio.h>\n#include <stdlib.h>\nint f...,if ( ( factorial ( x ) >= n1 ) && ( facdtorial...,if ( ( factorial ( x ) >= n1 ) && ( factorial ...,18,"[['#include', ' <stdio.h>'], ['#include', ' <s...","[['#include', ' <stdio.h>'], ['#include', ' <s...","['if', '(', '(', 'factorial', '(', 'x', ')', '...","['if', '(', '(', 'factorial', '(', 'x', ')', '..."
1,14644,#include<stdio.h>\n#include<stdlib.h>\nint mai...,#include<stdio.h>\n#include<stdlib.h>\nint mai...,"scanf ( ""%d"" , & a [ i ] ) ; \n","scanf ( ""%d"" , & a [ 0 ] ) ; \n",9,"[['#include', '<stdio.h>'], ['#include', '<std...","[['#include', '<stdio.h>'], ['#include', '<std...","['scanf', '(', '""%d""', ',', '&', 'a', '[', 'i'...","['scanf', '(', '""%d""', ',', '&', 'a', '[', '0'..."
2,14645,"#include<stdio.h>\n\nint main()\n{\n int a,...","#include<stdio.h>\n\nint main()\n{\n int a,...",a = a / 10 \n,a = a / 10 ; \n,11,"[['#include', '<stdio.h>'], [], ['int', 'main'...","[['#include', '<stdio.h>'], [], ['int', 'main'...","['a', '=', 'a', '/', '10']","['a', '=', 'a', '/', '10', ';']"
3,14646,#include<stdio.h>\nint ant_sym(int a[100][100]...,#include<stdio.h>\nint ant_sym(int a[100][100]...,"int k , c ; \n","int k , c , x , y ; \n",31,"[['#include', '<stdio.h>'], ['int', 'ant_sym',...","[['#include', '<stdio.h>'], ['int', 'ant_sym',...","['int', 'k', ',', 'c', ';']","['int', 'k', ',', 'c', ',', 'x', ',', 'y', ';']"
4,14647,#include <stdio.h>\nint rot(char a[100])\n{\n ...,#include <stdio.h>\nint rot(char a[100])\n{\n ...,ch = rot ( a [ s ] ) ; \n,ch = rot ( a [ i ] ) ; \n,22,"[['#include', ' <stdio.h>'], ['int', 'rot', '(...","[['#include', ' <stdio.h>'], ['int', 'rot', '(...","['ch', '=', 'rot', '(', 'a', '[', 's', ']', ')...","['ch', '=', 'rot', '(', 'a', '[', 'i', ']', ')..."


In [48]:
def correct_code(input_token):

  states_value = encoder_model.predict(input_token)
  target_seq = np.zeros((1, 1, vocab.num_words))
  target_seq[0, 0, SOS_token] = 1.0

  correct_code = []

  while True:

    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sampled_token_index = np.argmax(output_tokens[0, -1, :])

    # print(type(sampled_token_index))
    sampled_token = vocab.index2word[sampled_token_index]
    # print(sampled_token)
    correct_code.append(sampled_token)

    target_seq = np.zeros((1, 1, vocab.num_words))
    target_seq[0, 0, sampled_token_index] = 1.0

    states_value = [h, c]

    if sampled_token == "EOS" or len(correct_code) > dim:
      break
  
  return correct_code


In [49]:
df = val_df

input_tokens = []

for i in range(len(df)):
  sourceLineToken = ast.literal_eval(df['sourceLineTokens'][i])
  input_tokens.append(sourceLineToken)

for i in range(len(input_tokens)):
  for j in range(len(input_tokens[i])):
    if input_tokens[i][j] not in vocab.word2index.keys():
      input_tokens[i][j] = "OOV"

for i in range(len(input_tokens)):
  input_tokens[i] = input_tokens[i][:dim-1]


encoder_input_data = np.zeros(
    (len(input_tokens), dim + 1, vocab.num_words), dtype=float
)

for i, (input_token) in enumerate(input_tokens):

    # input_token = input_token[:dim-1]
    for t, token in enumerate(input_token):
        encoder_input_data[i, t, vocab.word2index[token]] = 1.0

    t = t + 1
    encoder_input_data[i, t, EOS_token] = 1.0

    encoder_input_data[i, t + 1: , PAD_token] = 1.0


In [50]:

corrected_code = []

ip = len(val_df)
print(ip)

for i in range(len(val_df)):
  sourceLineToken = ast.literal_eval(val_df['sourceLineTokens'][i])
  targetLineToken = ast.literal_eval(val_df['targetLineTokens'][i])

  cor_code = correct_code(encoder_input_data[i:i+1])

  corrected_code.append(cor_code)
  
  for j, token in enumerate(targetLineToken):
    if token not in vocab.word2index.keys():
      targetLineToken[j] = 'OOV'

  end = min(len(targetLineToken), dim)
  for j in range(end):
    if targetLineToken[j] != cor_code[j]:
      ip -= 1
      break
  
      
print(ip)

2584
835


In [61]:
accuracy = ip / len(val_df)
accuracy    # 32% accuracy

0.3231424148606811

In [58]:
val_df['fixedTokens'] = corrected_code
val_df.to_csv('/content/drive/MyDrive/Colab Notebooks/ASEML/valid_output.csv')

In [51]:
835/2584

0.3231424148606811

In [57]:
val_df.head()

Unnamed: 0.1,Unnamed: 0,sourceText,targetText,sourceLineText,targetLineText,lineNums_Text,sourceTokens,targetTokens,sourceLineTokens,targetLineTokens,fixedTokens
0,14643,#include <stdio.h>\n#include <stdlib.h>\nint f...,#include <stdio.h>\n#include <stdlib.h>\nint f...,if ( ( factorial ( x ) >= n1 ) && ( facdtorial...,if ( ( factorial ( x ) >= n1 ) && ( factorial ...,18,"[['#include', ' <stdio.h>'], ['#include', ' <s...","[['#include', ' <stdio.h>'], ['#include', ' <s...","['if', '(', '(', 'factorial', '(', 'x', ')', '...","['if', '(', '(', 'factorial', '(', 'x', ')', '...","[if, (, (, (, OOV, ->, x, ), >, (, y, ->, y, )..."
1,14644,#include<stdio.h>\n#include<stdlib.h>\nint mai...,#include<stdio.h>\n#include<stdlib.h>\nint mai...,"scanf ( ""%d"" , & a [ i ] ) ; \n","scanf ( ""%d"" , & a [ 0 ] ) ; \n",9,"[['#include', '<stdio.h>'], ['#include', '<std...","[['#include', '<stdio.h>'], ['#include', '<std...","['scanf', '(', '""%d""', ',', '&', 'a', '[', 'i'...","['scanf', '(', '""%d""', ',', '&', 'a', '[', '0'...","[scanf, (, ""%d"", ,, &, a, [, i, ], ), ;, EOS]"
2,14645,"#include<stdio.h>\n\nint main()\n{\n int a,...","#include<stdio.h>\n\nint main()\n{\n int a,...",a = a / 10 \n,a = a / 10 ; \n,11,"[['#include', '<stdio.h>'], [], ['int', 'main'...","[['#include', '<stdio.h>'], [], ['int', 'main'...","['a', '=', 'a', '/', '10']","['a', '=', 'a', '/', '10', ';']","[a, =, a, /, 10, ;, EOS]"
3,14646,#include<stdio.h>\nint ant_sym(int a[100][100]...,#include<stdio.h>\nint ant_sym(int a[100][100]...,"int k , c ; \n","int k , c , x , y ; \n",31,"[['#include', '<stdio.h>'], ['int', 'ant_sym',...","[['#include', '<stdio.h>'], ['int', 'ant_sym',...","['int', 'k', ',', 'c', ';']","['int', 'k', ',', 'c', ',', 'x', ',', 'y', ';']","[int, k, ;, EOS]"
4,14647,#include <stdio.h>\nint rot(char a[100])\n{\n ...,#include <stdio.h>\nint rot(char a[100])\n{\n ...,ch = rot ( a [ s ] ) ; \n,ch = rot ( a [ i ] ) ; \n,22,"[['#include', ' <stdio.h>'], ['int', 'rot', '(...","[['#include', ' <stdio.h>'], ['int', 'rot', '(...","['ch', '=', 'rot', '(', 'a', '[', 's', ']', ')...","['ch', '=', 'rot', '(', 'a', '[', 'i', ']', ')...","[OOV, +=, OOV, (, a, ,, n, ), ;, EOS]"
