<a href="https://colab.research.google.com/github/myomyint-maung/nlp-assignments/blob/main/06-Autocomplete/06-Code-Autocompletion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Feb 16 - Code Autocompletion

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, datasets, math
from tqdm import tqdm

In [2]:
# Choose the computing device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# Set SEED for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. Loading Data

In [4]:
# Load CodeParrot's Jupyter-Code-to-Text from HuggingFace  
train_set = datasets.load_dataset('codeparrot/github-jupyter-code-to-text', split='train')
test_set  = datasets.load_dataset('codeparrot/github-jupyter-code-to-text', split='test')

print(train_set)
print(test_set)



Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 47452
})
Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 11864
})


### 2. Preprocessing

In [5]:
# Remove comments from the codes
import re

comment_pattern = r"(^\s*#.*$)"
block_comment_pattern = r"(\"\"\")(.*?)(\"\"\")"

train_clean = list()
for code in train_set['content']:
    code = re.sub(comment_pattern, "", code, flags=re.MULTILINE)
    code = re.sub(block_comment_pattern, "", code, flags=re.DOTALL)
    train_clean.append(code)

test_clean = list()
for code in test_set['content']:
    code = re.sub(comment_pattern, "", code, flags=re.MULTILINE)
    code = re.sub(block_comment_pattern, "", code, flags=re.DOTALL)
    test_clean.append(code)

In [6]:
# Divide the codes into sentences
train_sents = [sent for code in train_clean for sent in code.split('\n') if sent != '']
test_sents  = [sent for code in test_clean for sent in code.split('\n') if sent != '']

print(train_sents[0], len(train_sents))
print(test_sents[0], len(test_sents))

import numpy as np 4984055
import tensorflow as tf 1238709


In [20]:
# Tokenize the datasets
tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en_core_web_sm')

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

train_tokenized = yield_tokens(train_sents)
test_tokenized  = yield_tokens(test_sents)

print(next(iter(train_tokenized)))
print(next(iter(test_tokenized)))

['import', 'numpy', 'as', 'np']
['import', 'tensorflow', 'as', 'tf']


In [8]:
# Remove non-word, numeric and underscore strings from the train tokens
non_word = re.compile('[\W\d]+')
numeric_ = re.compile(r'[^0-9_]')

train_tokens = []
for string_list in train_tokenized:

    # Use a list comprehension to remove non-word strings
    word_list = [non_word.sub('', string) for string in string_list]

    # Use a list comprehension to filter out numeric and underscore strings
    filtered_list = [string for string in word_list if numeric_.match(string)]

    # Use a list comprehension to remove empty strings
    cleaned_list = [string for string in filtered_list if string != '']
    
    # Create a list of train tokens with the cleaned lists
    train_tokens.append(cleaned_list)

print(len(train_tokens))

4984054


In [9]:
# Numericalize the train tokens
vocab = torchtext.vocab.build_vocab_from_iterator(train_tokens) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])       

1515043
['<unk>', '<eos>', 'import', 'in', 'for', 'from', 'def', 'as', 'x', 'return']


In [29]:
# Save the vocab
import pickle

with open('vocab.pkl', 'wb') as file:
    pickle.dump(vocab, file)

file.close()

### 3. Preparing Data Loaders  

In [10]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        #appends eos so we know it ends....so model learn how to end...                             
        tokens = example.append('<eos>')   
        #numericalize          
        tokens = [vocab[token] for token in example] 
        data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)          
    return data


In [21]:
batch_size = 128
train_data = get_data(train_tokenized, vocab, batch_size)
valid_data = get_data(test_tokenized, vocab, batch_size)

print(train_data.shape, valid_data.shape)

torch.Size([128, 307006]) torch.Size([128, 76197])


In [31]:
# Save the data
with open('train_data.pkl', 'wb') as file:
    pickle.dump(train_data, file)

file.close()

with open('valid_data.pkl', 'wb') as file:
    pickle.dump(valid_data, file)

file.close()

### 4. Modeling 

In [12]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.hid_dim   = hid_dim
        self.num_layers= num_layers
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm      = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers,
                                 dropout=dropout_rate, batch_first=True)
        self.dropout   = nn.Dropout(dropout_rate)
        #when you do LM, you look forward, so it does not make sense to do bidirectionality
        self.fc        = nn.Linear(hid_dim, vocab_size)

    def init_hidden(self, batch_size, device):
        #this function gonna be run in the beginning of the epoch
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        
        return hidden, cell #return as tuple
    
    def detach_hidden(self, hidden):
        #this gonna run in every batch
        hidden, cell = hidden
        hidden = hidden.detach() #removing this hidden from gradients graph
        cell   = cell.detach()   #removing this cell from gradients graph
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch size, seq len]
        
        #embed
        embed = self.embedding(src)
        #embed: [batch size, seq len, emb_dim]
        
        #send this to the lstm
        #we want to put hidden here...because we want to reset hidden....
        output, hidden = self.lstm(embed, hidden)
        #output: [batch size, seq len, hid_dim] ==> all hidden states
        #hidden: [num layer, batch size, hid_dim]  ===> last hidden states from each layer
        
        output = self.dropout(output)
        prediction = self.fc(output)
        #prediction: [batch size, seq len, vocab size]
        
        return prediction, hidden
    

### 5. Training

In [13]:
vocab_size = 1515043
emb_dim = 1024
hid_dim = 1024
num_layers = 2
dropout_rate = 0.65              
lr = 1e-3                     

In [14]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 3,121,116,707 trainable parameters


In [15]:
def get_batch(data, seq_len, idx):
    #this data is from get_data()
    #train_data.shape #[batch size, number of batches....]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [16]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  #prevents gradient explosion - clip is basically the threshold.....
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [17]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [18]:
# Create the folder to save models
import os
from os import path

if path.exists('./models') == False:
  os.mkdir('./models')

In [24]:
n_epochs = 50
seq_len  = 50
clip     = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'models/lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')



OutOfMemoryError: ignored

### 6. Testing

### 7. Real-world inference