In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import transformers
import datasets
import torch

In [3]:
from datasets import load_dataset
dataset = load_dataset("mbpp", ignore_verifications=True)
dataset = dataset['test'].remove_columns(['task_id', 'test_list', 'test_setup_code', 'challenge_test_list'])
dataset = dataset.train_test_split(test_size=0.1, seed = 2022)

Downloading builder script:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading and preparing dataset mbpp/full (download: 550.33 KiB, generated: 456.82 KiB, post-processed: Unknown size, total: 1007.15 KiB) to /root/.cache/huggingface/datasets/mbpp/full/1.0.0/7847a4dd5135067a588814541d6721a08c17109d9a2e5591ddacb3f2a18a3149...


Downloading data:   0%|          | 0.00/131k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/974 [00:00<?, ? examples/s]

Dataset mbpp downloaded and prepared to /root/.cache/huggingface/datasets/mbpp/full/1.0.0/7847a4dd5135067a588814541d6721a08c17109d9a2e5591ddacb3f2a18a3149. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'code'],
        num_rows: 876
    })
    test: Dataset({
        features: ['text', 'code'],
        num_rows: 98
    })
})

In [5]:
test_dataset = dataset['test']

In [6]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

def preprocess_data(examples):
    text = examples["text"]
    code = examples["code"]
    prefix = "Generate Python: "
    inputs = [prefix + t for t in text]
    model_inputs = tokenizer(inputs, max_length = 48, padding="max_length", truncation=True)
    labels = tokenizer(code, max_length=128, padding="max_length", truncation=True).input_ids
    
    #Replace padding token ids with -100 so that they are not taken into account by the loss function
    labels_with_ignore_index = []
    for labels_example in labels:
        labels_example = [label if label != 0 else -100 for label in labels_example]
        labels_with_ignore_index.append(labels_example)
    model_inputs["labels"] = labels_with_ignore_index
    
    return model_inputs
    

tokenized_dataset = dataset.map(preprocess_data, batched=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
for i in tokenized_dataset['train']:
    print(i)
    break

{'text': 'Write a function to extract a specified column from a given nested list.', 'code': 'def extract_column(list1, n):\r\n   result = [i.pop(n) for i in list1]\r\n   return result ', 'input_ids': [1, 4625, 6600, 30, 2598, 279, 445, 358, 2608, 279, 1269, 1057, 628, 279, 864, 4764, 666, 18, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [1, 536, 2608, 67, 2827, 12, 1098, 21, 16, 290, 4672, 206, 203, 282, 563, 273, 306, 77, 18, 5120, 12, 82, 13, 364, 277, 316, 666, 21, 65, 206, 203, 282, 327, 563, 225, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [8]:
tokenized_dataset.set_format("torch")

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 876
    })
    test: Dataset({
        features: ['text', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 98
    })
})

In [10]:
tokenized_dataset = tokenized_dataset.remove_columns(["text", "code"])
train_dataset = tokenized_dataset['train']
validation_dataset = tokenized_dataset['test']

In [11]:
validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 98
})

In [12]:
#create torch dataloaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
eval_dataloader = DataLoader(validation_dataset, shuffle=True, batch_size=4)

In [13]:
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [14]:
import math
from transformers import AdamW, get_scheduler, get_cosine_schedule_with_warmup

optimizer = AdamW(
    model.parameters(), 
    lr=1e-4,
    weight_decay = 1e-4,
    eps = 1e-8
)

num_epochs = 40
num_training_steps = num_epochs * len(train_dataloader)
warmup_ratio = 0.2
num_warmup_steps = math.ceil(num_training_steps * warmup_ratio)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer,
#     num_warmup_steps,
#     num_training_steps
# )

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps,
    num_training_steps
)



In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# if torch.cuda.device_count() > 1:
#     print("Let's use", torch.cuda.device_count(), "GPUs!")
#     model = nn.DataParallel(model)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [16]:
def eval_loss(model):
    model.eval()
    total_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss
    return total_loss / len(eval_dataloader)
    

In [17]:
def checkpoint(model, filename):
    torch.save(model.state_dict(), filename)
    
def resume(model, filename):
    model.load_state_dict(torch.load(filename))

In [18]:
# from tqdm.auto import tqdm

# progress_bar = tqdm(range(num_training_steps))

# train_loss = []

# early_stop_threshold = 4
# min_loss = 100
# best_epoch = 0

# for epoch in range(1, num_epochs+1):
#     model.train()
#     print(f"EPOCH {epoch}")
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         print(f"training loss: {loss}")
#         train_loss.append(loss)
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)
#     current_eval_loss = eval_loss(model)
#     print(type(current_eval_loss))
#     print(type(min_loss))
#     print("Eval loss: ", current_eval_loss)
#     if(current_eval_loss < min_loss):
#         min_loss = current_eval_loss
#         best_epoch = epoch
#         checkpoint(model, 'best_model.pth')
#     elif epoch - best_epoch > early_stop_threshold:
#         print("Early stopped training at epoch %d" % epoch)
#         break 
        
# resume(model, "best_model.pth")   

In [19]:
torch.cuda.empty_cache()

In [20]:
# from tqdm.auto import tqdm
# from pathlib import Path

# progress_bar = tqdm(range(num_training_steps))

# train_loss = []

# # early_stop_threshold = 4
# # min_loss = 100
# best_epoch = 0


# checkpoint_path = "runs/checkpoint/"
# Path(checkpoint_path).mkdir(parents=True, exist_ok=True)

# for epoch in range(1, num_epochs+1):
#     model.train()
#     print(f"EPOCH {epoch}")
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         print(f"training loss: {loss}")
#         train_loss.append(loss)
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)
#     print("Eval loss: ", eval_loss(model))
#     checkpoint(model, f"{checkpoint_path}ckpt-{epoch}")
# model.save_pretrained('runs/saved_model/')
# tokenizer.save_pretrained('runs/saved_model/')

In [21]:
from tqdm.auto import tqdm
from pathlib import Path


progress_bar = tqdm(range(num_training_steps))
best_epoch = 0
min_loss = 100

for epoch in range(1, num_epochs+1):
    model.train()
    train_loss = 0
    print(f"EPOCH {epoch}")
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        train_loss += loss
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    e_loss = eval_loss(model)
    print("Train Loss: ", train_loss / len(train_dataloader))
    print("Eval loss: ", e_loss)
    if(e_loss < min_loss):
        min_loss = e_loss
        best_epoch = epoch
        checkpoint(model, "best_model.pt")
    if(epoch % 10 == 0):
        checkpoint(model, f"ckpt_ep-{epoch}.pt")    
checkpoint(model, "saved_model.pt")

  0%|          | 0/8760 [00:00<?, ?it/s]

EPOCH 1
Train Loss:  tensor(4.9896, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(2.0591, device='cuda:0')
EPOCH 2
Train Loss:  tensor(1.9435, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.5698, device='cuda:0')
EPOCH 3
Train Loss:  tensor(1.5313, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.3843, device='cuda:0')
EPOCH 4
Train Loss:  tensor(1.3023, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.3173, device='cuda:0')
EPOCH 5
Train Loss:  tensor(1.1306, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.2057, device='cuda:0')
EPOCH 6
Train Loss:  tensor(0.9851, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.2493, device='cuda:0')
EPOCH 7
Train Loss:  tensor(0.8601, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.2286, device='cuda:0')
EPOCH 8
Train Loss:  tensor(0.7349, device='cuda:0', grad_fn=<DivBackward0>)
Eval loss:  tensor(1.2470, device='cuda:0')
EPOCH 9
Train Loss:  tensor(0.62

In [22]:
# torch.save(model.state_dict(), 'pttxttocode_20ep_4bs_1e-4_0.3bleu.pt')

In [23]:
# from datasets import load_metric
# metric= load_metric("bleu")
# model.eval()
# pred = []
# ref = []
# for batch in eval_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     for i in predictions:
#         pred.append(i.tolist())
#     for j in batch["labels"]:
#         l = []
#         l.append(j.tolist())
#         ref.append(l)
# results = metric.compute(predictions = pred, references = ref)

In [24]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m121.2 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0mNote: you may need to restart the kernel to use updated packages.


In [25]:
from datasets import load_metric
import evaluate
metric= evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [26]:
def bleu_calculation(model):
    model.eval()
    pred = []
    ref = []
    for i in test_dataset:
        i['text'] = 'Generate Python: ' + i['text']
        input_ids = tokenizer(i['text'], return_tensors="pt").input_ids
        input_ids = input_ids.to('cuda')
        generated_ids = model.generate(input_ids, max_length=128)
        pred.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
        ref.append([i['code']])
    results = metric.compute(predictions=pred, references=ref)
    return results

In [27]:
#last model
print("Last Model: ", bleu_calculation(model))
#least eval loss model
resume(model, 'best_model.pt')
print("Least eval loss model: ", bleu_calculation(model))

Last Model:  {'bleu': 0.29090039172798854, 'precisions': [0.6843245846376114, 0.4594327990135635, 0.35001263583522874, 0.28323399844519304], 'brevity_penalty': 0.6923345336056473, 'length_ratio': 0.731161971830986, 'translation_length': 4153, 'reference_length': 5680}
Least eval loss model:  {'bleu': 0.12660707030375812, 'precisions': [0.6712978651304643, 0.3788994041359972, 0.2442831215970962, 0.16710575837410613], 'brevity_penalty': 0.39662213046835226, 'length_ratio': 0.5195422535211267, 'translation_length': 2951, 'reference_length': 5680}


In [28]:
#every 10 epochs
resume(model, 'ckpt_ep-10.pt')
print("CKPT-10: ", bleu_calculation(model))
resume(model, 'ckpt_ep-20.pt')
print("CKPT-20: ", bleu_calculation(model))
resume(model, 'ckpt_ep-30.pt')
print("CKPT-30: ", bleu_calculation(model))

CKPT-10:  {'bleu': 0.20482612520346988, 'precisions': [0.6178634864247653, 0.3601353109549831, 0.24966622162883845, 0.18508363038113518], 'brevity_penalty': 0.643226370004607, 'length_ratio': 0.6938380281690141, 'translation_length': 3941, 'reference_length': 5680}
CKPT-20:  {'bleu': 0.2900076962926017, 'precisions': [0.6752733193765992, 0.44013330159485836, 0.32780892030221787, 0.2624219725343321], 'brevity_penalty': 0.7252509855343923, 'length_ratio': 0.7568661971830986, 'translation_length': 4299, 'reference_length': 5680}
CKPT-30:  {'bleu': 0.2964149247333916, 'precisions': [0.7038740920096852, 0.4742063492063492, 0.3599389933909507, 0.28832116788321166], 'brevity_penalty': 0.6870812929312217, 'length_ratio': 0.727112676056338, 'translation_length': 4130, 'reference_length': 5680}


In [29]:
model = model.to('cuda')
prefix = "Generate Python: "
text = "Write a function to add two numbers"
text = prefix + text
input_ids = tokenizer(text, return_tensors="pt").input_ids
input_ids = input_ids.to('cuda')
generated_ids = model.generate(input_ids, max_length=128, top_p=0.95, top_k=50)
# generated_ids = model.generate(input_ids, max_length=128)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

def add_numbers(num1,num2):
    if num1+num2>num2:
        return num1
    else:
        return num2
