###DATA UPLOAD

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###INSTALL DEPENDENCIES AND IMPORT LIBRARIES

In [1]:
!pip install transformers
!pip install datasets
!pip install sacrebleu
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
torch.cuda.empty_cache()

PREPARE DATA


In [None]:
## high quality data
train_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_test.csv')
val_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_val.csv')



In [None]:
train_df = train_df['intent'] + ' <ZZZ> ' + train_df['snippet']
val_df = val_df['intent'] + ' <ZZZ> ' + val_df['snippet']

###TOKENIZER AND MODEL

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',  bos_token='<|startoftext|>',  eos_token='<|endoftext|>', pad_token='<|pad|>')
tokenizer.add_tokens([['<ZZZ>']])

model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
#model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/fineTunedGPT2_hq').cuda() # continue training

model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

###PREPARE DATA PART 2

In [None]:
max_length = max([len(tokenizer.encode(conala)) for conala in train_df])

In [11]:
class coNaLaDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for i in data:
            encodings_dict = tokenizer('<|startoftext|>' + i + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
train_dataset = coNaLaDataset(train_df, tokenizer, max_length=max_length)
val_dataset = coNaLaDataset(val_df, tokenizer, max_length=max_length)

In [None]:
my_data_collator = lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}

In [None]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/GPT2-FineTuned_Z',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=2,
    num_train_epochs=3, 
    gradient_accumulation_steps = 2,
    warmup_steps=10, 
    weight_decay=0.05, 
    report_to = 'none',
    save_total_limit = 2,
    load_best_model_at_end = True,
    )


trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = my_data_collator
)

trainer.train()

trainer.save_model('/content/drive/MyDrive/fineTunedGPT2_hq_Z')
trainer.save_state()




Epoch,Training Loss,Validation Loss
1,0.1607,0.150141
2,0.1338,0.130787
3,0.1175,0.123858


###COMPUTE BLEU METRIC

In [None]:
# load in evaluation script
from datasets import Metric, load_metric
from google.colab import files
uploaded1 = files.upload() # evaluation script
import evaluation

In [None]:
# load in test data
test_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_test.csv')

In [4]:
# define the tokenizer and load in the trained model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2',  bos_token='<|startoftext|>',  eos_token='<|endoftext|>', pad_token='<|pad|>', padding_side='left')
#tokenizer.add_tokens([['<ZZZ>']])

model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/fineTunedGPT2_hq_Z').cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [5]:
## define a method to generate code from NL
def generate_response(NL, model, tokenizer, temp, n_out):

  inputs = tokenizer('<|startoftext|> ' + NL + ' <ZZZ> ', return_tensors='pt')

  input_ids = inputs.input_ids.to("cuda")
  attention_mask = inputs.attention_mask.to("cuda")

  outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=True, top_k=50, max_length=75, top_p=0.99,
                                 temperature=temp, num_return_sequences=n_out, pad_token_id=tokenizer.encode('<|pad|>')[0])
  
  return [[tokenizer.decode(out, skip_special_tokens=True)] for out in outputs]

In [8]:
# get snippet predictions

results = []

for i in range(len(test_df)):
  row = test_df.iloc[i]

  output_str = generate_response(row['intent'], model, tokenizer, 0.1, 1)[0][0]
  output_str = output_str.split('<ZZZ>')[1].lstrip(' ')

  results.append(output_str)


In [14]:
# get the bleu score of the results
evaluator = evaluation.CodeGenerationEvaluator(tokenizer, 'cuda', smooth_bleu=True)

bleu_score = 0

for i in range(len(results)):

  ref = test_df['snippet'][i]
  pred = results[i]

  if pred is not None and pred != "":

      if ref is not None and ref != "":
       
        metrics = evaluator.evaluate([pred], [ref])

        bleu_score += metrics['BLEU']

bleu_score = bleu_score/len(results)