In [None]:
from google.colab import files

In [None]:
data = files.upload()

Saving eng_python_data.txt to eng_python_data.txt


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator,TabularDataset

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy

from spacy.tokenizer import Tokenizer
import numpy as np
import pandas as pd

import random
import math
import time
import os
import re
import keyword

In [None]:
import torchtext

In [None]:
SEED = 3333

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
#out dataframe
out_df = pd.DataFrame(columns = ['src', 'python'])

In [None]:
input_file = 'eng_python_data.txt'
output_file = os.path.join('analysis_shortv3.txt')

In [None]:
# first print all lines starting with # with count
with open(output_file,'w') as out_file:
    eng_buf = ''
    py_buf = ''
    samples = 0
    with open(input_file) as in_file:
        for idx,line in enumerate(in_file):
            if line.startswith('#') and len(line) > 30:
                samples = samples + 1
                out_df.loc[len(out_df)] = [eng_buf,py_buf]
                py_buf = ''
                eng_buf = line

            else :
                py_buf = py_buf + line

print(f" Count of # :{idx}")

 Count of # :42424


In [None]:
# Data cleaning

# Replace tabs with 4 spaces and trim leading and trailing spaces

out_df['python']= out_df['python'].str.replace('\t', '    ')
out_df['python']= out_df['python'].str.strip()

# clean spaces 
# 3->4
# 7->8
# 11->12

reg3s_pat = re.compile(r'(:?\n)[\s]{3}([\w])')
reg7s_pat = re.compile(r'(:?\n)[\s]{7}([\w])')
reg11s_pat = re.compile(r'(:?\n)[\s]{11}([\w])')

In [None]:
def regex_clean(val):

    clean_py = reg3s_pat.sub(r'\1    \2', val)
    clean_py = reg7s_pat.sub(r'\1        \2', clean_py)
    clean_py = reg11s_pat.sub(r'\1            \2', clean_py)
    
    return clean_py

out_df['trg'] = out_df['python'].apply(regex_clean)

In [None]:
### Data Augmentation ( needed as model will overfit)

# Patterns to match function and variable names
func_pat = re.compile('def (?P<func_name>[\w]+?)\(')
var_pat = re.compile(r'\n\s*(?P<var_name>[\w]+?)\s*=')

In [None]:
# Create new dataset by regex matching function 
# and variable names and giving generic names

final_df = pd.DataFrame(columns = ['src','trg'])

In [None]:
for row_idx,row in out_df.iterrows():
  final_df.loc[len(final_df)] = [row.src,row.trg]
  var_list = []
  func_list = []
  func_list = list(set(func_pat.findall(row.trg)))
  var_list = list(set(var_pat.findall(row.trg)))

  if var_list:
    for var_idx,var in enumerate(var_list):
      varname = "var_"+ str(var_idx)
      final_df.loc[len(final_df)] = [row.src,row.trg.replace(var,varname)]
  if func_list:
    for func_idx,func in enumerate(func_list):
      funcname = "func_"+ str(func_idx)
      final_df.loc[len(final_df)] = [row.src,row.trg.replace(func,funcname)]
      

In [None]:
final_df['len'] = final_df['trg'].str.len()
fout_df = final_df[final_df['len'] < 500][['src','trg']]
print(len(fout_df)/len(final_df))
fout_df.to_csv('p_data.csv')

2021-04-03 09:38:17.510 INFO    numexpr.utils: NumExpr defaulting to 2 threads.


0.8974421240084183


In [None]:
print(f"Data Samples after augmentation : {len(final_df)}")

Data Samples after augmentation : 12354


In [None]:
spacy_en = spacy.load('en')

In [None]:
# Tokenization
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [None]:
kw_dict = {}
for kw in keyword.kwlist:
  kw_dict[kw]= [{"ORTH":kw}]

# learn 4, 8 12 spaces
special_tabs = ['\\n    ','\\n        ','\\n            ']
for tab in special_tabs:
    kw_dict[tab] = [{"ORTH":tab}]

special_cases = kw_dict
infix_re = re.compile(r'''(==|>=|<=|!=|\,|\?|\:|\;|.
                          |\‘|\’|\`|\“|\”|\"|\'|~|\(|\)|\[|\])''')

In [None]:
def python_tokenizer(nlp):
    return Tokenizer(nlp.vocab, 
                    infix_finditer=infix_re.finditer)


py_custom = python_tokenizer(spacy_en)

In [None]:
#Modified the py thokenizer to factor spaces

def tokenize_py(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    token_texts = []
    for token in py_custom(text):
       token_texts.append(token.text)
       if token.whitespace_:  # filter out empty strings
           token_texts.append(token.whitespace_)
    return token_texts

In [None]:
SRC = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = tokenize_py, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

fields = {'src' : ('src', SRC),
          'trg' : ('trg', TRG)}

In [None]:
with open("/content/drive/MyDrive/Data/SRC.pkl","wb") as f:
  pickle.dump(SRC,f)

with open("/content/drive/MyDrive/Data/TRG.pkl","wb") as f:
  pickle.dump(TRG,f)

In [None]:
e2p_data  = TabularDataset(
                            path = 'p_data.csv',
                            format = 'csv',
                            fields = fields
                            )

In [None]:
train_data, valid_data, test_data = e2p_data.split([0.7,.2,.1])

In [None]:
SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(train_data, min_freq = 1)

In [None]:
import pickle
#save the dictionary as pickle file to be used during inference
with open("/content/drive/MyDrive/Data/src_stio.pkl","wb") as f:
  pickle.dump(SRC.vocab.stoi,f)
with open("/content/drive/MyDrive/Data/src_itos.pkl","wb") as f:
  pickle.dump(SRC.vocab.itos,f)

#save the dictionary as pickle file to be used during inference
with open("/content/drive/MyDrive/Data/trg_stio.pkl","wb") as f:
  pickle.dump(TRG.vocab.stoi,f)
with open("/content/drive/MyDrive/Data/trg_itos.pkl","wb") as f:
  pickle.dump(TRG.vocab.itos,f)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src))

In [None]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 300):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
            
        return src

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):       
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        
        return src

In [None]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        x = torch.matmul(self.dropout(attention), V)
        
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        
        x = self.fc_o(x)
        
        return x, attention

In [None]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        x = self.dropout(torch.relu(self.fc_1(x)))

        x = self.fc_2(x)

        return x



In [None]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 300):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
                        
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
          
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
       
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc_out(trg)

            
        return output, attention

In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):      
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
                        
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        # query, key, value
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
       
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        return trg, attention

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
             
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        return src_mask
    
    def make_trg_mask(self, trg):
             
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
      
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
          
        trg_mask = trg_pad_mask & trg_sub_mask
               
        return trg_mask

    def forward(self, src, trg):
                      
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
      
        enc_src = self.encoder(src, src_mask)
               
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
              
        return output, attention

In [None]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [None]:
len(TRG.vocab)

8740

In [None]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,134,628 trainable parameters


In [None]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [None]:
model.apply(initialize_weights);

In [None]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

In [None]:
#modified the loss function
# Built a custom function
# If keyword or tab :5
# If among top 50 most frequent : 4 ( most common happen to be ones that help wtih syntax)
# Rest had a weight 1

py_toks = ['(',')','{','}','[',']',':',',',';',
            '+','-','*','/','|','&','<','>','=','.',
            '%','==','!=','<=','>=','~','^','**',
            '+=','-=','*=','/=','%=','/=','//']

weight_list = []
for idx,word in enumerate(TRG.vocab.itos):


  #default
  weight = 1.0 
 

  # keyword or tab or common tokens
  if (keyword.iskeyword(word)) or ('\n' in word) or (word in py_toks):
      weight = 2.0
  
  weight_list.append(weight)

class_weights = torch.FloatTensor(weight_list).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output, _ = model(src, trg[:,:-1])
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
training_vis = pd.DataFrame(columns=['epoch','train_loss','val_loss'])
N_EPOCHS = 15
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/Data/tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    training_vis.loc[len(training_vis)] = [epoch+1,train_loss,valid_loss]

Epoch: 01 | Time: 0m 25s
	Train Loss: 0.331 | Train PPL:   1.392
	 Val. Loss: 0.455 |  Val. PPL:   1.576
Epoch: 02 | Time: 0m 26s
	Train Loss: 0.313 | Train PPL:   1.367
	 Val. Loss: 0.442 |  Val. PPL:   1.555
Epoch: 03 | Time: 0m 25s
	Train Loss: 0.291 | Train PPL:   1.338
	 Val. Loss: 0.434 |  Val. PPL:   1.544
Epoch: 04 | Time: 0m 25s
	Train Loss: 0.280 | Train PPL:   1.323
	 Val. Loss: 0.432 |  Val. PPL:   1.540
Epoch: 05 | Time: 0m 26s
	Train Loss: 0.268 | Train PPL:   1.307
	 Val. Loss: 0.423 |  Val. PPL:   1.526
Epoch: 06 | Time: 0m 26s
	Train Loss: 0.256 | Train PPL:   1.292
	 Val. Loss: 0.414 |  Val. PPL:   1.512
Epoch: 07 | Time: 0m 25s
	Train Loss: 0.246 | Train PPL:   1.279
	 Val. Loss: 0.412 |  Val. PPL:   1.510
Epoch: 08 | Time: 0m 25s
	Train Loss: 0.233 | Train PPL:   1.263
	 Val. Loss: 0.408 |  Val. PPL:   1.504
Epoch: 09 | Time: 0m 25s
	Train Loss: 0.224 | Train PPL:   1.251
	 Val. Loss: 0.412 |  Val. PPL:   1.511
Epoch: 10 | Time: 0m 26s
	Train Loss: 0.213 | Train PPL

In [None]:
# ax = plt.gca()

# training_vis.plot(kind='line',x='epoch',y='train_loss',ax=ax)
# training_vis.plot(kind='line',x='epoch',y='val_loss', color='red', ax=ax)

# plt.show()

In [None]:
model.load_state_dict(torch.load('/content/drive/MyDrive/Data/tut6-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 0.395 | Test PPL:   1.485 |


In [None]:
# Translating sentence

In [None]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 500):
    
    model.eval()
        
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [None]:
def display_attention(sentence, translation, attention, n_heads = 8, n_rows = 4, n_cols = 2):
    
    assert n_rows * n_cols == n_heads
    
    fig = plt.figure(figsize=(15,25))
    
    for i in range(n_heads):
        
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        
        _attention = attention.squeeze(0)[i].cpu().detach().numpy()

        cax = ax.matshow(_attention, cmap='bone')

        ax.tick_params(labelsize=12)
        ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], 
                           rotation=45)
        ax.set_yticklabels(['']+translation)

        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
# Function to display generated python code

def display_py(trans_list):
  print('\n')
  final_str = ''
  for string in trans_list:
    if string != '\n':
      final_str = final_str + string
    else:
      final_str= final_str + string
  return final_str

In [None]:
example_idx = random.randint(0,len(train_data))

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

print(display_py(trg))



def find_evennumbers(input_list):
  var_0 = [var for var in input_list if var % 2 == 0]
  return var_0


In [None]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(display_py(translation[:-1]))



def find_evennumbers(input_list):
  var_0 = [var for var in input_list if var % 2 == 0]
  return var_0


In [None]:
# display_attention(src, translation, attention)

In [None]:
example_idx = random.randint(0,len(valid_data))

src = vars(valid_data.examples[example_idx])['src']
trg = vars(valid_data.examples[example_idx])['trg']

print(display_py(trg))





str1 = "abc4234afde"
digitcount = 0
for i in range(0,len(str1)):
  char = str1[i]
  if(char.isalpha()):
    digitcount += 1
print('number of alphanumeric: ',digitcount)


In [None]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(display_py(translation[:-1]))



str1 = "abc4234afde"
digitcount = 0
for i in range(0,len(str1)):
  char = str1[i]
  if(char.isalpha()):
    digitcount += 1
print('number of alphanumeric: ',digitcount)


In [None]:
example_idx = random.randint(0,len(test_data))

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']

print(display_py(trg))



def func_0(p:float, r:float, t:float, n:float)->float:
    return round(p*((1+(r/(n*100)))**(n*t)) - p,2)


In [None]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(display_py(translation[:-1]))



def get_si(p:float, r:float, t:float)->float:
    return (p*r*t)/100


In [None]:
def translate_sentence_vectorized(src_tensor, src_field, trg_field, model, device, max_len=500):
    assert isinstance(src_tensor, torch.Tensor)

    model.eval()
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)
    # enc_src = [batch_sz, src_len, hid_dim]

    trg_indexes = [[trg_field.vocab.stoi[trg_field.init_token]] for _ in range(len(src_tensor))]
    # Even though some examples might have been completed by producing a <eos> token
    # we still need to feed them through the model because other are not yet finished
    # and all examples act as a batch. Once every single sentence prediction encounters
    # <eos> token, then we can stop predicting.
    translations_done = [0] * len(src_tensor)
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        pred_tokens = output.argmax(2)[:,-1]
        for i, pred_token_i in enumerate(pred_tokens):
            trg_indexes[i].append(pred_token_i)
            if pred_token_i == trg_field.vocab.stoi[trg_field.eos_token]:
                translations_done[i] = 1
        if all(translations_done):
            break

    # Iterate through each predicted example one by one;
    # Cut-off the portion including the after the <eos> token
    pred_sentences = []
    for trg_sentence in trg_indexes:
        pred_sentence = []
        for i in range(1, len(trg_sentence)):
            if trg_sentence[i] == trg_field.vocab.stoi[trg_field.eos_token]:
                break
            pred_sentence.append(trg_field.vocab.itos[trg_sentence[i]])
        pred_sentences.append(pred_sentence)

    return pred_sentences, attention

In [None]:
test_sample_idxs = [random.randint(0, len(train_data)) for p in range(0, 10)]
for count,example_idx in enumerate(test_sample_idxs):
  src = vars(train_data.examples[example_idx])['src']
  trg = vars(train_data.examples[example_idx])['trg']
  print(50*"*" + 'Sample    : ' + str(count + 1) + '  ' + 50*"*")
  print('\n')
  print("*******Gold *******")
  eng = ' '.join(src)
  label = display_py(trg)
  print(eng)
  print(label)
  print('\n')
  #print(' '.join(trg))

  print("*******Predicted *******")
  translation, attention = translate_sentence(src, SRC, TRG, model, device)

  print(f'predicted trg = {translation}')
  print(display_py(translation[:-1]))
  print('\n')
  print(100* "*")
  print('\n')

**************************************************Sample    : 1  **************************************************


*******Gold *******


#   given a python list , remove all occurrence of a given number from the list
num1 = 20
var_2 = [5, 20, 15, 20, 25, 50, 20]

def removevalue(samplelist, val):
    return [value for value in samplelist if value != val]
reslist = removevalue(var_2, num1)
print(reslist)

# shuffle a list randomly
import random
list = [2,5,8,9,12]
random.shuffle(list)
print ("printing shuffled list ", list)


*******Predicted *******
predicted trg = ['num1', ' ', '=', ' ', '20', '\n', 'var_01', ' ', '=', ' ', '[5', ',', ' ', '20', ',', ' ', '15', ',', ' ', '20', ',', ' ', '25', ',', ' ', '50', ',', ' ', '20', ']', '\n\n', 'def', ' ', 'removevalue', '(', 'samplelist', ',', ' ', 'val', ')', ':', '\n    ', 'return', ' ', '[value', ' ', 'for', ' ', 'value', ' ', 'in', ' ', 'samplelist', ' ', 'if', ' ', 'value', ' ', '!=', ' ', 'val', ']', '\n', 'reslist', ' ', '=', ' ', 