<a href="https://colab.research.google.com/github/myomyint-maung/nlp-assignments/blob/main/06-Autocomplete/06-Code-Autocompletion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Feb 16 - Code Autocompletion

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, datasets, math
from tqdm import tqdm

In [2]:
# Choose the computing device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# Set SEED for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. Loading Data

In [4]:
# Load CodeParrot's Jupyter-Code-to-Text from HuggingFace  
train_set = datasets.load_dataset('codeparrot/github-jupyter-code-to-text', split='train')
test_set  = datasets.load_dataset('codeparrot/github-jupyter-code-to-text', split='test')

print(train_set)
print(test_set)

Downloading readme:   0%|          | 0.00/857 [00:00<?, ?B/s]

Downloading and preparing dataset parquet/codeparrot--github-jupyter-code-to-text to /root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.9M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.




Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 47452
})
Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 11864
})


In [5]:
print(train_set['content'][0])

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

"""
Explanation: Simple MNIST convnet
Author: fchollet<br>
Date created: 2015/06/19<br>
Last modified: 2020/04/21<br>
Description: A simple convnet that achieves ~99% test accuracy on MNIST.
Setup
End of explanation
"""


# Model / data parameters
num_classes = 10
input_shape = (28, 28, 1)

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255
# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")


# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test

In [6]:
print(test_set['content'][0])

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Explanation: Copyright 2020 The TensorFlow Authors.
End of explanation
"""


# Import Tokenizer and pad_sequences
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Import numpy and pandas
import numpy as np
import pandas as pd


"""
Explanation: Tokenize and sequence a bigger corpus of text
<table class="tfo-notebook-buttons" align="left">
  <td

### 2. Preprocessing

In [7]:
# Remove comments from the codes
import re

comment_pattern = r"(^\s*#.*$)"
block_comment_pattern = r"(\"\"\")(.*?)(\"\"\")"

train_clean = list()
for code in train_set['content']:
    code = re.sub(comment_pattern, "", code, flags=re.MULTILINE)
    code = re.sub(block_comment_pattern, "", code, flags=re.DOTALL)
    train_clean.append(code)

test_clean = list()
for code in test_set['content']:
    code = re.sub(comment_pattern, "", code, flags=re.MULTILINE)
    code = re.sub(block_comment_pattern, "", code, flags=re.DOTALL)
    test_clean.append(code)

In [8]:
print(train_clean[0])

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers



num_classes = 10
input_shape = (28, 28, 1)

(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255

x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)




model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="

In [9]:
print(test_clean[0])















import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd





path = tf.keras.utils.get_file('reviews.csv', 
                               'https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P')
print (path)



dataset = pd.read_csv(path)

dataset.head()



reviews = dataset['text'].tolist()




tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)

word_index = tokenizer.word_index
print(len(word_index))
print(word_index)





sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, padding='post')


print(padded_sequences.shape)

print (reviews[0])

print(padded_sequences[0])








In [10]:
# Divide the codes into sentences
train_sents = [sent for code in train_clean for sent in code.split('\n') if sent != '']
test_sents  = [sent for code in test_clean for sent in code.split('\n') if sent != '']

print(train_sents[:5])
print(test_sents[:5])
print(len(train_sents), len(test_sents))

['import numpy as np', 'from tensorflow import keras', 'from tensorflow.keras import layers', 'num_classes = 10', 'input_shape = (28, 28, 1)']
['import tensorflow as tf', 'from tensorflow.keras.preprocessing.text import Tokenizer', 'from tensorflow.keras.preprocessing.sequence import pad_sequences', 'import numpy as np', 'import pandas as pd']
4984055 1238709


In [11]:
# Select only the sentences starting with "import" or "from"
# because the full datasets are too big to train or test

small_train = [sent for sent in train_sents if re.match(r'^(import|from)', sent)]
small_test  = [sent for sent in test_sents if re.match(r'^(import|from)', sent)]

print(len(small_train))
print(len(small_test))

295359
73939


In [12]:
# Tokenize the selected sentences
tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en_core_web_sm')

train_tokenized = [tokenizer(sent) for sent in small_train]
test_tokenized  = [tokenizer(sent) for sent in small_test]

print(train_tokenized[:5])
print(test_tokenized[:5])

[['import', 'numpy', 'as', 'np'], ['from', 'tensorflow', 'import', 'keras'], ['from', 'tensorflow.keras', 'import', 'layers'], ['import', 'sys'], ['import', 'os']]
[['import', 'tensorflow', 'as', 'tf'], ['from', 'tensorflow.keras.preprocessing.text', 'import', 'Tokenizer'], ['from', 'tensorflow.keras.preprocessing.sequence', 'import', 'pad_sequences'], ['import', 'numpy', 'as', 'np'], ['import', 'pandas', 'as', 'pd']]


In [13]:
# Remove "," from the tokens
train_tokenized = [[token for token in sent if token != ","] for sent in train_tokenized]
test_tokenized  = [[token for token in sent if token != ","] for sent in test_tokenized]

print(train_tokenized[:5])
print(test_tokenized[:5])

[['import', 'numpy', 'as', 'np'], ['from', 'tensorflow', 'import', 'keras'], ['from', 'tensorflow.keras', 'import', 'layers'], ['import', 'sys'], ['import', 'os']]
[['import', 'tensorflow', 'as', 'tf'], ['from', 'tensorflow.keras.preprocessing.text', 'import', 'Tokenizer'], ['from', 'tensorflow.keras.preprocessing.sequence', 'import', 'pad_sequences'], ['import', 'numpy', 'as', 'np'], ['import', 'pandas', 'as', 'pd']]


In [14]:
# Numericalize the train tokens
vocab = torchtext.vocab.build_vocab_from_iterator(train_tokenized) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])       

38726
['<unk>', '<eos>', 'import', 'from', 'as', 'numpy', 'np', 'plt', 'matplotlib.pyplot', 'pandas']


In [29]:
import pickle

with open("/content/drive/My Drive/data/vocab.pkl", "wb") as file:
    pickle.dump(vocab, file)

file.close()

### 3. Preparing Data Loaders  

In [15]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        #appends eos so we know it ends....so model learn how to end...                             
        tokens = example.append('<eos>')   
        #numericalize          
        tokens = [vocab[token] for token in example] 
        data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)          
    return data


In [16]:
batch_size = 128
train_data = get_data(train_tokenized, vocab, batch_size)
valid_data = get_data(test_tokenized, vocab, batch_size)

print(train_data.shape, valid_data.shape)

torch.Size([128, 11226]) torch.Size([128, 2819])


### 4. Modeling 

In [17]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.hid_dim   = hid_dim
        self.num_layers= num_layers
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm      = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers,
                                 dropout=dropout_rate, batch_first=True)
        self.dropout   = nn.Dropout(dropout_rate)
        #when you do LM, you look forward, so it does not make sense to do bidirectionality
        self.fc        = nn.Linear(hid_dim, vocab_size)

    def init_hidden(self, batch_size, device):
        #this function gonna be run in the beginning of the epoch
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        
        return hidden, cell #return as tuple
    
    def detach_hidden(self, hidden):
        #this gonna run in every batch
        hidden, cell = hidden
        hidden = hidden.detach() #removing this hidden from gradients graph
        cell   = cell.detach()   #removing this cell from gradients graph
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch size, seq len]
        
        #embed
        embed = self.embedding(src)
        #embed: [batch size, seq len, emb_dim]
        
        #send this to the lstm
        #we want to put hidden here...because we want to reset hidden....
        output, hidden = self.lstm(embed, hidden)
        #output: [batch size, seq len, hid_dim] ==> all hidden states
        #hidden: [num layer, batch size, hid_dim]  ===> last hidden states from each layer
        
        output = self.dropout(output)
        prediction = self.fc(output)
        #prediction: [batch size, seq len, vocab size]
        
        return prediction, hidden
    

### 5. Training

In [18]:
vocab_size = len(vocab)
emb_dim = 1024
hid_dim = 1024
num_layers = 2
dropout_rate = 0.65              
lr = 1e-3                     

In [19]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 96,143,174 trainable parameters


In [20]:
def get_batch(data, seq_len, idx):
    #this data is from get_data()
    #train_data.shape #[batch size, number of batches....]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [21]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  #prevents gradient explosion - clip is basically the threshold.....
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [22]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
# Create the folder to save models
import os
from os import path
if path.exists('/content/drive/My Drive/models') == False:
  os.mkdir('/content/drive/My Drive/models')

In [25]:
batch_size = 128
n_epochs = 50
seq_len  = 50
clip     = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/My Drive/models/lstm_lm.pt')

    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')



Epoch: 1
	Train Perplexity: 18.941
	Valid Perplexity: 10.121




Epoch: 2
	Train Perplexity: 8.763
	Valid Perplexity: 7.660




Epoch: 3
	Train Perplexity: 6.909
	Valid Perplexity: 6.528




Epoch: 4
	Train Perplexity: 5.873
	Valid Perplexity: 5.900




Epoch: 5
	Train Perplexity: 5.183
	Valid Perplexity: 5.498




Epoch: 6
	Train Perplexity: 4.688
	Valid Perplexity: 5.220




Epoch: 7
	Train Perplexity: 4.301
	Valid Perplexity: 5.031




Epoch: 8
	Train Perplexity: 4.001
	Valid Perplexity: 4.877




Epoch: 9
	Train Perplexity: 3.750
	Valid Perplexity: 4.786




Epoch: 10
	Train Perplexity: 3.551
	Valid Perplexity: 4.716




Epoch: 11
	Train Perplexity: 3.380
	Valid Perplexity: 4.632




Epoch: 12
	Train Perplexity: 3.233
	Valid Perplexity: 4.636




Epoch: 13
	Train Perplexity: 3.057
	Valid Perplexity: 4.556




Epoch: 14
	Train Perplexity: 2.951
	Valid Perplexity: 4.529




Epoch: 15
	Train Perplexity: 2.872
	Valid Perplexity: 4.518




Epoch: 16
	Train Perplexity: 2.806
	Valid Perplexity: 4.504




Epoch: 17
	Train Perplexity: 2.745
	Valid Perplexity: 4.493




Epoch: 18
	Train Perplexity: 2.690
	Valid Perplexity: 4.481




Epoch: 19
	Train Perplexity: 2.640
	Valid Perplexity: 4.477




Epoch: 20
	Train Perplexity: 2.596
	Valid Perplexity: 4.500




Epoch: 21
	Train Perplexity: 2.536
	Valid Perplexity: 4.479




Epoch: 22
	Train Perplexity: 2.501
	Valid Perplexity: 4.461




Epoch: 23
	Train Perplexity: 2.478
	Valid Perplexity: 4.458




Epoch: 24
	Train Perplexity: 2.461
	Valid Perplexity: 4.467




Epoch: 25
	Train Perplexity: 2.448
	Valid Perplexity: 4.468




Epoch: 26
	Train Perplexity: 2.442
	Valid Perplexity: 4.464




Epoch: 27
	Train Perplexity: 2.437
	Valid Perplexity: 4.459




Epoch: 28
	Train Perplexity: 2.434
	Valid Perplexity: 4.458




Epoch: 29
	Train Perplexity: 2.429
	Valid Perplexity: 4.457




Epoch: 30
	Train Perplexity: 2.428
	Valid Perplexity: 4.457




Epoch: 31
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 32
	Train Perplexity: 2.428
	Valid Perplexity: 4.456




Epoch: 33
	Train Perplexity: 2.428
	Valid Perplexity: 4.456




Epoch: 34
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 35
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 36
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 37
	Train Perplexity: 2.426
	Valid Perplexity: 4.456




Epoch: 38
	Train Perplexity: 2.429
	Valid Perplexity: 4.456




Epoch: 39
	Train Perplexity: 2.428
	Valid Perplexity: 4.456




Epoch: 40
	Train Perplexity: 2.428
	Valid Perplexity: 4.456




Epoch: 41
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 42
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 43
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 44
	Train Perplexity: 2.428
	Valid Perplexity: 4.456




Epoch: 45
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 46
	Train Perplexity: 2.427
	Valid Perplexity: 4.456




Epoch: 47
	Train Perplexity: 2.428
	Valid Perplexity: 4.456




Epoch: 48
	Train Perplexity: 2.426
	Valid Perplexity: 4.456




Epoch: 49
	Train Perplexity: 2.426
	Valid Perplexity: 4.456




Epoch: 50
	Train Perplexity: 2.426
	Valid Perplexity: 4.456


### 6. Inference on Python Code

In [26]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [27]:
prompt = 'from sklearn.preprocessing '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
from sklearn.preprocessing import LabelEncoder

0.7
from sklearn.preprocessing import LabelEncoder

0.75
from sklearn.preprocessing import LabelEncoder

0.8
from sklearn.preprocessing import LabelEncoder

1.0
from sklearn.preprocessing import LabelEncoder

