<a href="https://colab.research.google.com/github/pasumarthi/NLP/blob/main/english_to_python_working_python_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# English to Python code generation
In this notebook, from english sentence python code is generated

The technique used in this project is building Neural Transformer based on paper. Attention is All you Need - https://arxiv.org/pdf/1706.03762.pdf.

The Sequnece2Sequnce Model used can be depicted in the diagram below

![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer1.png)



In [3]:
# Mount the google drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import torch
import torch.nn as n
import torch.optim as optim


from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

Then set a random seed for deterministic results/reproducability.

In [5]:
SEED = 1222

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Instantiate English spaCy models.

In [6]:
%%bash
python -m spacy download en


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [7]:
spacy_en = spacy.load('en')

Tokenize method is defined for English

In [8]:
import tokenize
import io

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

def tokenize_python(text):
    """
    Tokenizes Python Code to list of strings
    """
    python_token_list = []
    raised_exception = False
    try:
        tokens = tokenize.tokenize(io.BytesIO(text.encode('utf-8')).readline)
        for in_tuple in tokens:
            if in_tuple.type == tokenize.COMMENT:
                continue
            elif in_tuple.type == tokenize.ENCODING:
                continue
            elif in_tuple.type == tokenize.INDENT:
                python_token_list.append("INDENT")
            elif in_tuple.type == tokenize.DEDENT:
                python_token_list.append("DEDENT")
            elif in_tuple.type == tokenize.NL or in_tuple.type == tokenize.NEWLINE:
                python_token_list.append("NEWLINE")
            elif in_tuple.type == tokenize.ENDMARKER :
                continue
            else:
                python_token_list.append(in_tuple.string)
    except Exception:
        raised_exception = True
        #print( "Exception: ", Exception, " program: ", text)
    return python_token_list

## Download the sample python samples

In [9]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [10]:
import nltk
import string
import re

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import string
import nltk
import random
import random
#import google_trans_new
#from google_trans_new import google_translator

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
lem = WordNetLemmatizer()

def clean_text(text):
    ## lower case
    if not isinstance(text, str):
      return str(text) 
    cleaned = text.lower()

    urls_pattern = re.compile(r'https?://\S+|www.\S+')
    cleaned = urls_pattern.sub(r'',cleaned)
    
    ## remove punctuations
    punctuations = string.punctuation
    cleaned_temp = "".join(character for character in cleaned if character not in punctuations)
    
    ## remove stopwords 
    words = cleaned_temp.split()
    #stopword_lists = stopwords.words("english")
    #cleaned = [word for word in words if word not in stopword_lists]
    cleaned = words
    
    ## normalization - lemmatization
    #cleaned = [lem.lemmatize(word, "v") for word in cleaned]
    #cleaned = [lem.lemmatize(word, "n") for word in cleaned]
    
    ## join 
    cleaned = " ".join(cleaned)
    return cleaned



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [11]:
import os
os.chdir("/content/gdrive/My Drive/NLP_end")

In [12]:
english_text_python_program_pair_list = []
process_python_code=False
i=1
with open('dataset_python_cleaned_final.txt', 'r', encoding="utf8") as f:
    for line in f:
        #print(i)
        i += 1
        if process_python_code==False:
            if line.strip() == '':
                continue
            if line.startswith('#'):
                english_text = line
                #english_text_list.append(line)
                process_python_code=True
                python_program=''
            else:
                print(i, ": ", line)            
        else:
            if line.strip() == '':
                process_python_code=False
                english_text_python_program_pair_list.append((english_text, python_program))
                python_program=''
                english_text =''
            if line.lstrip().startswith('#'):
                continue
            else:
                python_program += line
print(i, ": ", line)                 

37405 :  print("Binary Right Shift", c)


In [13]:
len(english_text_python_program_pair_list)

4727

In [14]:
english_text_list,python_program_list  = zip(*english_text_python_program_pair_list)

In [15]:
import pandas as pd

df = pd.DataFrame({'English': english_text_list, 'Python':python_program_list })

In [16]:
df.head(5)

Unnamed: 0,English,Python
0,# write a python program to add two numbers \n,num1 = 1.5\nnum2 = 6.3\nsum = num1 + num2\npri...
1,# write a python function to add two user prov...,"def add_two_numbers(num1, num2):\n sum = nu..."
2,# write a program to find and print the larges...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 >= n...
3,# write a program to find and print the smalle...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 <= n...
4,# Write a python function to merge two given l...,"def merge_lists(l1, l2):\n return l1 + l2\n"


In [17]:
from tqdm import tqdm_notebook as tqdm
tqdm().pandas() 

df['English'] = df['English'].progress_apply(lambda txt: clean_text(txt))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=4727.0), HTML(value='')))




In [18]:
df['Python'] = df['Python'].progress_apply(lambda txt: txt.lstrip())

HBox(children=(FloatProgress(value=0.0, max=4727.0), HTML(value='')))




In [19]:
df.head(5)

Unnamed: 0,English,Python
0,write a python program to add two numbers,num1 = 1.5\nnum2 = 6.3\nsum = num1 + num2\npri...
1,write a python function to add two user provid...,"def add_two_numbers(num1, num2):\n sum = nu..."
2,write a program to find and print the largest ...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 >= n...
3,write a program to find and print the smallest...,num1 = 10\nnum2 = 12\nnum3 = 14\nif (num1 <= n...
4,write a python function to merge two given lis...,"def merge_lists(l1, l2):\n return l1 + l2\n"


In [20]:
import random
import torch, torchtext
from torchtext import data
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, BucketIterator, Example, Dataset

import spacy
import numpy as np
import math
import time

In [21]:
SRC = Field(tokenize=tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>',            
            batch_first = True, 
            lower=True)

TRG = Field(tokenize = tokenize_python, 
            init_token='<sos>', 
            eos_token='<eos>', 
            batch_first = True
            )

In [22]:
fields = [('English', SRC),('Python',TRG)]

In [23]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df, test_size=0.02)

In [24]:
train = train.reset_index(drop=True) ## This is being done because data.Example.fromlist was failing
valid = valid.reset_index(drop=True) 

In [25]:
train

Unnamed: 0,English,Python
0,46 define a function which can generate a list...,import requests\ndef get_encoding(url):\n d...
1,write a function that will provide the ascii v...,def charToASCII(chr):\n return f'ASCII value ...
2,given a python list turn every item of a list ...,"aList = [1, 2, 3, 4, 5, 6, 7]\naList = [x * x..."
3,write a function that returns relu value of th...,def relu(x:float) -> float:\n x = 0 if x < ...
4,write a python function to get the surfacearea...,"def pyramid_surface_area(base_area, height):\n..."
...,...,...
4627,write a program to move numbers to the end of ...,str1 = 'hi 123 how are you doing? 567 is with ...
4628,write a program to strips every vowel from a s...,"vowels = ('a', 'e', 'i', 'o', 'u')\ninput_stri..."
4629,6 python add all values of another list,"a = [1, 2, 3]\nb = [4, 5, 6]\na += b\n"
4630,calculate the sum of three given numbers if th...,"def sum_thrice(x, y, z):\n sum1 = x + y + z..."


In [26]:
MAX_OUTPUT_SEQ_LENGTH = 100

In [27]:
example_trng = [Example.fromlist([train.English[i],train.Python[i]], fields) for i in range(train.shape[0]) if len(tokenize_python(train.Python[i])) <= MAX_OUTPUT_SEQ_LENGTH - 4 ] 
example_val = [Example.fromlist([valid.English[i],valid.Python[i]], fields) for i in range(valid.shape[0]) if len(tokenize_python(valid.Python[i])) <= MAX_OUTPUT_SEQ_LENGTH - 4 ] 




In [28]:
train_dataset = Dataset(example_trng, fields)
valid_dataset = Dataset(example_val, fields)

In [29]:
vars(train_dataset.examples[10])

{'English': ['write',
  'a',
  'python',
  'function',
  'to',
  'read',
  'a',
  'csv',
  'file',
  'and',
  'print',
  'its',
  'content'],
 'Python': ['def',
  'read_csv',
  '(',
  'filename',
  ')',
  ':',
  'NEWLINE',
  'INDENT',
  'import',
  'csv',
  'NEWLINE',
  'with',
  'open',
  '(',
  'filename',
  ',',
  'newline',
  '=',
  "''",
  ')',
  'as',
  'f',
  ':',
  'NEWLINE',
  'INDENT',
  'reader',
  '=',
  'csv',
  '.',
  'reader',
  '(',
  'f',
  ')',
  'NEWLINE',
  'for',
  'row',
  'in',
  'reader',
  ':',
  'NEWLINE',
  'INDENT',
  'print',
  '(',
  'row',
  ')',
  'NEWLINE',
  'DEDENT',
  'DEDENT',
  'DEDENT']}

In [30]:
vars(valid_dataset.examples[10])

{'English': ['write',
  'a',
  'python',
  'program',
  'to',
  'get',
  'numbers',
  'divisible',
  'by',
  'fifteen',
  'from',
  'a',
  'list'],
 'Python': ['num_list',
  '=',
  '[',
  '45',
  ',',
  '55',
  ',',
  '60',
  ',',
  '37',
  ',',
  '100',
  ',',
  '105',
  ',',
  '220',
  ']',
  'NEWLINE',
  'result',
  '=',
  'list',
  '(',
  'filter',
  '(',
  'lambda',
  'x',
  ':',
  '(',
  'x',
  '%',
  '15',
  '==',
  '0',
  ')',
  ',',
  'num_list',
  ')',
  ')',
  'NEWLINE',
  'print',
  '(',
  'f"Numbers divisible by 15 are {result}"',
  ')',
  'NEWLINE']}

In [31]:
SRC.build_vocab(train_dataset)


In [32]:
TRG.build_vocab(train_dataset)

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [34]:
BATCH_SIZE = 128

train_iterator, valid_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x:len(x.English),
    sort_within_batch = False, 
    device = device)

Create our fields to process our data. This will append the "start of sentence" and "end of sentence" tokens as well as converting all words to lowercase.

Load our data.

We'll also print out an example just to double check they're not reversed.

Then create our vocabulary, converting all tokens appearing less than twice into `<unk>` tokens.

Finally, define the `device` and create our iterators.

## Building the Seq2Seq Model

### Encoder

![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer-encoder.png)


In [35]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        #src = (self.tok_embedding(src) * self.scale) + self.pos_embedding(pos)
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [36]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

In [37]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

In [38]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

## Decoder

![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer-decoder.png)

In [39]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = MAX_OUTPUT_SEQ_LENGTH):
        super().__init__()
        
        self.device = device
        #print("Decoder __init__ : output_dim=", output_dim, " max_length=", max_length, " hid dim=", hid_dim)
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.max_length = max_length
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        #print("Decoder.forward: batch_size: ", batch_size, "trg_len=", trg_len )
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        #print("Decoder.forward: pos =", pos)
        #print("Decoder.forward: trg =", trg)                    
        #pos = [batch size, trg len]
        tok_embed = self.tok_embedding(trg)
       
        #print("Decoder.forward: tok_embed =", tok_embed.shape)
        pos_embed = self.pos_embedding(pos)
        #print("Decoder.forward: pos_embed shape =", pos_embed.shape)
            
        #trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        trg = self.dropout((tok_embed*self.scale) + pos_embed)
        #print("Decoder.forward: trg=", trg)
        #trg = (self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos)
                
        #trg = [batch size, trg len, hid dim]
        
        #print("Decoder.forward: before calling layers")
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        #print("Decoder.forward: after calling layers: trg=", trg)
        #print("Decoder.forward: after calling layers: attention=", trg)
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        #print("Decoder.forward: before calling fc_out")
        output = self.fc_out(trg)
        #print("Decoder.forward: after calling fc_out: output=", output)
        #output = [batch size, trg len, output dim]
            
        return output, attention

In [40]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        # query, key, value
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [41]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
  
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]    
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

## Seq2Seq Model

Putting the encoder and decoder together, we get:
![](https://raw.githubusercontent.com/bentrevett/pytorch-seq2seq/9479fcb532214ad26fd4bda9fcf081a05e1aaf4e/assets/transformer1.png)



# Training the Seq2Seq Model

The rest of this session is very similar to the previous one. 

We initialise our encoder, decoder and seq2seq model (placing it on the GPU if we have one). As before, the embedding dimensions and the amount of dropout used can be different between the encoder and the decoder, but the hidden dimensions must remain the same.

In [42]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [43]:
OUTPUT_DIM

4830

In [44]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
SRC_UNK_IDX = SRC.vocab.stoi[SRC.unk_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
TRG_UNK_IDX = TRG.vocab.stoi[TRG.unk_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [45]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(1912, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
     

In [46]:
enc.tok_embedding.weight.data[SRC_PAD_IDX] = torch.zeros(ENC_EMB_DIM)
enc.tok_embedding.weight.data[SRC_UNK_IDX] = torch.zeros(ENC_EMB_DIM)

In [47]:
dec.tok_embedding.weight.data[TRG_PAD_IDX] = torch.zeros(DEC_EMB_DIM)
dec.tok_embedding.weight.data[SRC_UNK_IDX] = torch.zeros(DEC_EMB_DIM)

Next, we initialize our parameters. The paper states the parameters are initialized from a normal distribution with a mean of 0 and a standard deviation of 0.01, i.e. $\mathcal{N}(0, 0.01)$. 

It also states we should initialize the recurrent parameters to a special initialization, however to keep things simple we'll also initialize them to $\mathcal{N}(0, 0.01)$.

We print out the number of parameters.

Even though we only have a single layer RNN for our encoder and decoder we actually have **more** parameters  than the last model. This is due to the increased size of the inputs to the GRU and the linear layer. However, it is not a significant amount of parameters and causes a minimal amount of increase in training time (~3 seconds per epoch extra).

In [48]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,972,126 trainable parameters


We initiaize our optimizer.

In [49]:
LEARNING_RATE = 0.0005


optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

We also initialize the loss function, making sure to ignore the loss on `<pad>` tokens.

In [50]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)


We then create the training loop...

In [51]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.English
        trg = batch.Python
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [76]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.English
            trg = batch.Python

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

...and the evaluation loop, remembering to set the model to `eval` mode and turn off teaching forcing.

In [77]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [78]:
N_EPOCHS = 250
CLIP = 1
from torch.optim.lr_scheduler import ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.8, patience=10, verbose=True)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'english_python.pt')
    scheduler.step(train_loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 4s
	Train Loss: 4.901 | Train PPL: 134.465
	 Val. Loss: 4.195 |  Val. PPL:  66.358
Epoch: 02 | Time: 0m 4s
	Train Loss: 3.731 | Train PPL:  41.722
	 Val. Loss: 3.297 |  Val. PPL:  27.024
Epoch: 03 | Time: 0m 4s
	Train Loss: 3.009 | Train PPL:  20.259
	 Val. Loss: 2.760 |  Val. PPL:  15.797
Epoch: 04 | Time: 0m 4s
	Train Loss: 2.565 | Train PPL:  13.003
	 Val. Loss: 2.435 |  Val. PPL:  11.417
Epoch: 05 | Time: 0m 4s
	Train Loss: 2.277 | Train PPL:   9.745
	 Val. Loss: 2.211 |  Val. PPL:   9.123
Epoch: 06 | Time: 0m 4s
	Train Loss: 2.070 | Train PPL:   7.927
	 Val. Loss: 2.059 |  Val. PPL:   7.838
Epoch: 07 | Time: 0m 4s
	Train Loss: 1.910 | Train PPL:   6.754
	 Val. Loss: 1.929 |  Val. PPL:   6.884
Epoch: 08 | Time: 0m 4s
	Train Loss: 1.781 | Train PPL:   5.937
	 Val. Loss: 1.814 |  Val. PPL:   6.136
Epoch: 09 | Time: 0m 4s
	Train Loss: 1.664 | Train PPL:   5.282
	 Val. Loss: 1.721 |  Val. PPL:   5.592
Epoch: 10 | Time: 0m 4s
	Train Loss: 1.566 | Train PPL:   4.787


Then, we train our model, saving the parameters that give us the best validation loss.

Finally, we test the model on the test set using these "best" parameters.

In [79]:
model.load_state_dict(torch.load('english_python.pt'))

valid_loss = evaluate(model, valid_iterator, criterion)

print(f'| Validation Loss: {valid_loss:.3f} ')

| Validation Loss: 0.485 


In [80]:
def generate_program(sentence, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
        
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [81]:
def display_attention(sentence, translation, attention, n_heads = 8, n_rows = 4, n_cols = 2):
    
    assert n_rows * n_cols == n_heads
    
    fig = plt.figure(figsize=(15,25))
    
    for i in range(n_heads):
        
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        
        _attention = attention.squeeze(0)[i].cpu().detach().numpy()

        cax = ax.matshow(_attention, cmap='bone')

        ax.tick_params(labelsize=12)
        ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], 
                           rotation=45)
        ax.set_yticklabels(['']+translation)

        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [82]:
example_idx = 5

src = vars(valid_dataset.examples[example_idx])['English']
trg = vars(valid_dataset.examples[example_idx])['Python']
src_final = " ".join(src)
trg_final = " ".join(trg)



trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')
trg_final = trg_final.replace('DEDENT',' ')
print(src_final)
print(trg_final)


write a python function to append all elements of one list to another
def extend_list ( list1 , list2 ) : 
 	 list1 = [ 1 , 2 ] 
 list2 = [ 3 , 4 ] 
 return list1 . extend ( list2 ) 
  


In [83]:
example_idx = 2
src = vars(valid_dataset.examples[example_idx])['English']
#src =  src.split()
src_final = " ".join(src)
print(src_final)

trg, attention = generate_program(src_final, SRC, TRG, model, device)
trg_final = " ".join(trg)
trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')

print(trg_final)

write a python program to print a list after removing elements from index 1 to 4
list1 = [ 11 , 5 , 17 , 18 , 23 , 50 ] 
 del list1 [ 1 : 5 ] 
 print ( * list1 ) 
 <eos>


In [84]:
example_idx = 1
src = vars(valid_dataset.examples[example_idx])['English']
#src =  src.split()
src_final = " ".join(src)
print(src_final)

trg, attention = generate_program(src_final, SRC, TRG, model, device)
trg_final = " ".join(trg)
trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')

print(trg_final)

write a python function to print pyramid pattern
def pyramid_pattern ( symbol = '*' , count = 4 ) : 
 	 for i in range ( 1 , count + 1 ) : 
 	 print ( ' ' * ( count - i ) + symbol * i , end = '' ) 
 print ( symbol


In [85]:
example_idx = 3
src = vars(valid_dataset.examples[example_idx])['English']
#src =  src.split()
src_final = " ".join(src)
print(src_final)

trg, attention = generate_program(src_final, SRC, TRG, model, device)
trg_final = " ".join(trg)
trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')

print(trg_final)

97 write a python function that accepts a number and returns the nearest square number
import math 
 def nearest_square ( n ) : 
 	 upp = math . floor ( math . sqrt ( n ) ) 
 low = math . floor ( math . sqrt ( n ) ) 
 upp_diff = upp ** 2 - low ** 2 
 low_diff


In [86]:
example_idx = 4
src = vars(valid_dataset.examples[example_idx])['English']
#src =  src.split()
src_final = " ".join(src)
print(src_final)

trg, attention = generate_program(src_final, SRC, TRG, model, device)
trg_final = " ".join(trg)
trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')

print(trg_final)

write a python program to make use of enumerate method
q = [ 1 , 2 , 3 , 4 ] 
 q . insert ( 0 , 5 ) 
 print ( f"Revised List:{q}" ) 
 <eos>


In [87]:
example_idx = 5
src = vars(valid_dataset.examples[example_idx])['English']
#src =  src.split()
src_final = " ".join(src)
print(src_final)

trg, attention = generate_program(src_final, SRC, TRG, model, device)
trg_final = " ".join(trg)
trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')

print(trg_final)

write a python function to append all elements of one list to another
def extend_list ( list1 , list2 ) : 
 	 list1 = [ 1 , 2 ] 
 list2 = [ 3 , 4 ] 
 return list1 . extend ( list2 ) 
 DEDENT <eos>


In [88]:
example_idx = 7
src = vars(valid_dataset.examples[example_idx])['English']
#src =  src.split()
src_final = " ".join(src)
print(src_final)

trg, attention = generate_program(src_final, SRC, TRG, model, device)
trg_final = " ".join(trg)
trg_final = trg_final.replace('NEWLINE','\n')
trg_final = trg_final.replace('INDENT','\t')

print(trg_final)

write a function that given a number find the most significant bit number which is set bit and which is in power of two
from math import log 
 def near_thousand ( n ) : 
 	 return ( abs ( 1000 - x1 ) <= 100 ) 
 DEDENT print ( near_thousand ( 'c' ) ) 
 print ( near_thousand ( 900 ) ) 
 print ( near_thousand ( 900 ) ) 



In [92]:
for example_idx in range(7,25):
  src = vars(valid_dataset.examples[example_idx])['English']
  #src =  src.split()
  src_final = " ".join(src)
  print(src_final)

  trg, attention = generate_program(src_final, SRC, TRG, model, device)
  trg_final = " ".join(trg)
  trg_final = trg_final.replace('NEWLINE','\n')
  trg_final = trg_final.replace('INDENT','\t')
  trg_final = trg_final.replace('DEDENT ',' ')

  print(trg_final)
  print("\n\n")

write a function that given a number find the most significant bit number which is set bit and which is in power of two
from math import log 
 def near_thousand ( n ) : 
 	 return ( abs ( 1000 - x1 ) <= 100 ) 
  print ( near_thousand ( 'c' ) ) 
 print ( near_thousand ( 900 ) ) 
 print ( near_thousand ( 900 ) ) 




write a function to rotate string left by a given length
def rotate_left ( input , d ) : 
 	 Lfirst = input [ 0 : d ] 
 Lsecond = input [ d : ] 
 return ( Lsecond + Lfirst ) 
  <eos>



41 write a python program to check if one tuple is subset of other and print it
def filter_with_key_value ( list_of_dicts , key , value ) : 
 	 return list ( filter ( lambda x : x . get ( key ) == value , list_of_dicts ) ) 
  <eos>



write a python program to get numbers divisible by fifteen from a list
num_list = [ 45 , 55 , 60 , 37 , 100 , 105 , 220 ] 
 result = list ( filter ( lambda x : ( x % 15 == 0 ) , num_list ) ) 
 print ( f"Numbers divisible by 15 are {result}" ) 
 <eos>



calculat