In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from datasets import load_dataset

dataset = load_dataset("AhmedSSoliman/DJANGO")

Downloading and preparing dataset csv/AhmedSSoliman--DJANGO to /root/.cache/huggingface/datasets/csv/AhmedSSoliman--DJANGO-1230ca3115826199/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/117k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/AhmedSSoliman--DJANGO-1230ca3115826199/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
import transformers
import torch

In [4]:
dataset = dataset.remove_columns('Unnamed: 0')
dataset = dataset.rename_column('nl', 'pseudocode')

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['pseudocode', 'code'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['pseudocode', 'code'],
        num_rows: 1805
    })
    validation: Dataset({
        features: ['pseudocode', 'code'],
        num_rows: 1000
    })
})

In [6]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

def preprocess_data(examples):
    pseudocode = examples["pseudocode"]
    code = examples["code"]
    prefix = "Generate Pseudocode: "
    inputs = [prefix + c for c in code]
    model_inputs = tokenizer(inputs, max_length = 64, padding="max_length", truncation=True)
    labels = tokenizer(pseudocode, max_length=64, padding="max_length", truncation=True).input_ids
    
    #Replace padding token ids with -100 so that they are not taken into account by the loss function
    labels_with_ignore_index = []
    for labels_example in labels:
        labels_example = [label if label != 0 else -100 for label in labels_example]
        labels_with_ignore_index.append(labels_example)
    model_inputs["labels"] = labels_with_ignore_index
    
    return model_inputs
    

tokenized_dataset = dataset.map(preprocess_data, batched=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
tokenized_dataset.set_format('torch')

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['pseudocode', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['pseudocode', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1805
    })
    validation: Dataset({
        features: ['pseudocode', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [9]:
tokenized_dataset = tokenized_dataset.remove_columns(["pseudocode", "code"])
# small_train_dataset = tokenized_dataset['train'].shuffle(seed=42).select(range(2000))
# small_validation_dataset = tokenized_dataset['validation'].shuffle(seed=42).select(range(200))
small_train_dataset = tokenized_dataset['train']
small_validation_dataset = tokenized_dataset['validation']

In [10]:
small_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 16000
})

In [11]:
#create torch dataloaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=32)
eval_dataloader = DataLoader(small_validation_dataset, shuffle=True, batch_size=32)

In [12]:
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [13]:
import math
from transformers import AdamW, get_scheduler

optimizer = AdamW(
    model.parameters(), 
    lr=1e-4,
    weight_decay = 1e-4,
    eps = 1e-8
)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
warmup_ratio = 0.2
num_warmup_steps = math.ceil(num_training_steps * warmup_ratio)
lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps,
    num_training_steps
)



In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [15]:
for i in train_dataloader:
    for x, y in i.items():
        print(type(y))
        break
    break

<class 'torch.Tensor'>


In [16]:
def eval_loss(model):
    model.eval()
    total_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss
    return total_loss / len(eval_dataloader)
    

In [17]:
def checkpoint(model, filename):
    torch.save(model.state_dict(), filename)
    
def resume(model, filename):
    model.load_state_dict(torch.load(filename))

In [18]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

train_loss = []

early_stop_threshold = 4
min_loss = 100
best_epoch = 0

# for epoch in range(1, num_epochs+1):
#     model.train()
#     print(f"EPOCH {epoch}")
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         print(f"training loss: {loss}")
#         train_loss.append(loss)
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)
#     current_eval_loss = eval_loss(model)
#     print(type(current_eval_loss))
#     print(type(min_loss))
#     print("Eval loss: ", current_eval_loss)
#     if(current_eval_loss < min_loss):
#         min_loss = current_eval_loss
#         best_epoch = epoch
#         checkpoint(model, 'best_model.pth')
#     elif epoch - best_epoch > early_stop_threshold:
#         print("Early stopped training at epoch %d" % epoch)
#         break 
        
# resume(model, "best_model.pth")    
# model.save_pretrained('runs/saved_model/')
# tokenizer.save_pretrained('runs/saved_model/')

for epoch in range(1, num_epochs+1):
    model.train()
    train_loss = 0
    print(f"EPOCH {epoch}")
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        train_loss += loss
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    e_loss = eval_loss(model)
    print("Train Loss: ", train_loss / len(train_dataloader))
    print("Eval loss: ", e_loss)
#     train_losses.append(train_loss / len(train_dataloader))
#     eval_losses.append(e_loss)
    if(epoch % 5 == 0):
        checkpoint(model, f"ckpt_ep-{epoch}.pt")    
checkpoint(model, "saved_model.pt")

  0%|          | 0/2500 [00:00<?, ?it/s]

EPOCH 1
Train Loss:  tensor(1.6093, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(0.6274, device='cuda:0')
EPOCH 2
Train Loss:  tensor(0.5680, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(0.5097, device='cuda:0')
EPOCH 3
Train Loss:  tensor(0.4309, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(0.4928, device='cuda:0')
EPOCH 4
Train Loss:  tensor(0.3506, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(0.4677, device='cuda:0')
EPOCH 5
Train Loss:  tensor(0.2983, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(0.4714, device='cuda:0')


In [19]:
# torch.save(model.state_dict(), "code2pcfull.pt")

In [20]:
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(400))
# small_test_dataset = dataset['test']
test_dataloader = DataLoader(small_test_dataset, shuffle=True, batch_size=8)

In [21]:
for i in test_dataloader:
    print(i)
    break

{'pseudocode': ['if f is an instance of models.FileField,', 'raise an exception.', 'raise an SuspiciousFileOperation exception with string "Attempted access to \'%s\' denied." as argument, replace \'%s\' with name.', 'call the function self.compress with empty list as an argument, return the result.', 'import module sys.', "if value under the 'unique_id' key of the item dictionary is not None,", 'call the function plural_re.match with an argument t.contents, substitute the result for pluralmatch.', 'convert ret to a boolean, return it.'], 'code': ['if isinstance ( f , models . FileField ) :', 'raise', 'raise SuspiciousFileOperation ( "Attempted access to \'%s\' denied." % name )', 'return self . compress ( [ ] )', 'import sys', "if item [ 'unique_id' ] is not None :", 'pluralmatch = plural_re . match ( t . contents )', 'return bool ( ret )']}


In [22]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0mNote: you may need to restart the kernel to use updated packages.


In [23]:
from datasets import load_metric
import evaluate
metric= evaluate.load("bleu")
model.eval()
pred = []
ref = []
for i in small_test_dataset:
    i['code'] = 'Generate Pseudocode: ' + i['code']
    input_ids = tokenizer(i['code'], return_tensors="pt").input_ids
    input_ids = input_ids.to('cuda')
    generated_ids = model.generate(input_ids, max_length=64)
    pred.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    ref.append([i['pseudocode']])
results = metric.compute(predictions=pred, references=ref)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [24]:
print(results)

{'bleu': 0.7466941590539696, 'precisions': [0.9056511056511056, 0.8229623137598597, 0.7626767200754005, 0.7093425605536332], 'brevity_penalty': 0.9370406026966878, 'length_ratio': 0.9389418640418333, 'translation_length': 6105, 'reference_length': 6502}


In [25]:
fibonacci_code = ['nterms = int(input("How many terms? "))', 'n1, n2 = 0, 1', 'count = 0',
     'if nterms <= 0:', 'print("Please enter a positive integer")', 'elif nterms == 1:', 
     'print("Fibonacci sequence upto",nterms,":")', 'print(n1)', 'else:', 'print("Fibonacci sequence:")', 'while count < nterms:', 'print(n1)', 'nth = n1 + n2',
     '# update values', 'n1 = n2', 'n2 = nth', 'count += 1']

In [26]:
for i in fibonacci_code:
    model = model.to('cuda')
    prefix = "Generate Pseudocode: "
    text = i
    text = prefix + text
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    input_ids = input_ids.to('cuda')
    generated_ids = model.generate(input_ids, max_length=128, top_p=0.95, top_k=50)
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

call the function input with an argument string "How many terms? ", substitute the result for nterms.
n1 and n2 are integer 0 and 1.
count is an integer 0.
if nterms is lesser than integer 0,
print string "Please enter a positive integer".
otherwise if nterms equals integer 1,
print string "Fibonacci sequence upto",nterms,":" to the standard output.
print n1 to the standard output.
if not,
print string "Fibonacci sequence:" to the standard output.
while count is lesser than nterms,
print n1 to the standard output.
sum n1 and n2, substitute the result for nth.
update values dictionary.
substitute n2 for n1.
substitute nth for n2.
increment count by integer 1.


In [27]:
torch.save(model.state_dict(), 'code2pc_0.673bleu.pt')