### INSTALL DEPENDENCIES AND IMPORT LIBRARIES

In [None]:
!pip install transformers
!pip install datasets
!pip install sacrebleu
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
torch.cuda.empty_cache()

In [None]:
!git clone https://github.com/nelson-nsc/COMP0087-NLP-project.git

Cloning into 'ucl-nlp'...
remote: Enumerating objects: 181, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 181 (delta 24), reused 27 (delta 17), pack-reused 137[K
Receiving objects: 100% (181/181), 7.98 MiB | 13.55 MiB/s, done.
Resolving deltas: 100% (66/66), done.


### PREPARE DATA


In [None]:
## HQ DATA
# The three HQ splits sit side by side in the cloned repository; keeping the
# directory in one place means a path change is a single edit.
HQ_DATA_DIR = '/content/COMP0087-NLP-project/data/conala-hq-marian'

train_df = pd.read_csv(f'{HQ_DATA_DIR}/hq_train.csv')
test_df = pd.read_csv(f'{HQ_DATA_DIR}/hq_test.csv')
val_df = pd.read_csv(f'{HQ_DATA_DIR}/hq_val.csv')

In [None]:
## HQ + MINED DATA
# hq_train_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_train.csv')
# mined_df = pd.read_csv('/content/COMP0087-NLP-project/data/mined_data.csv')
# mined_df.dropna(inplace=True)

# train_df = pd.concat([hq_train_df, mined_df])
# train_df = train_df.sample(frac=1)
# train_df.reset_index(drop=True, inplace=True)

# test_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_test.csv')
# val_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_val.csv')

In [None]:
## HQ + AUGMENTED DATA
# hq_train_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_train.csv')
# aug_train_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_train_aug.csv')

# train_df = pd.concat([hq_train_df, aug_train_df])
# train_df = train_df.sample(frac=1)
# train_df.reset_index(drop=True, inplace=True)

# test_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_test.csv')
# val_df = pd.read_csv('/content/COMP0087-NLP-project/data/conala-hq-marian/hq_val.csv')


In [None]:
def _join_intent_and_snippet(frame):
    """Return one 'intent <ZZZ> snippet' string per row of ``frame``.

    A Series is returned unchanged: once this cell has run, ``train_df`` and
    ``val_df`` already hold the joined text, and looking up the 'intent' /
    'snippet' columns on them again would fail. This keeps the cell re-runnable.
    """
    if isinstance(frame, pd.Series):
        return frame
    return frame['intent'] + ' <ZZZ> ' + frame['snippet']


# NOTE: from here on train_df / val_df are Series of training strings, not DataFrames.
train_df = _join_intent_and_snippet(train_df)
val_df = _join_intent_and_snippet(val_df)

### TOKENIZER AND MODEL

In [None]:
# Get the GPT-2 tokenizer with explicit BOS/EOS/PAD tokens and add our separator token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# add_tokens takes a flat list of token strings. A nested list is not a valid
# argument: depending on the transformers version it either raises or registers
# the wrong string, so '<ZZZ>' itself would never become a single token.
# Any tokenizer used later for inference must add the same tokens in the same
# order so that token ids line up with the trained embeddings.
tokenizer.add_tokens(['<ZZZ>'])

model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()

# Grow the embedding matrix so every id the tokenizer can emit has a row.
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

### PREPARE DATA PART 2

In [None]:
# Longest training example measured *with* the '<|startoftext|>' / '<|endoftext|>'
# markers that coNaLaDataset wraps around every example. Measuring the bare text
# would make max_length too short, so the longest examples would be truncated
# and lose their end-of-text marker.
max_length = max(
    len(tokenizer.encode('<|startoftext|>' + conala + '<|endoftext|>'))
    for conala in train_df
)

In [None]:
class coNaLaDataset(Dataset):
    """Eagerly tokenised dataset of 'intent <ZZZ> snippet' strings.

    Every example is wrapped in '<|startoftext|>' ... '<|endoftext|>', then
    truncated/padded to ``max_length`` by the supplied tokenizer. Items are
    ``(input_ids, attention_mask)`` tensor pairs.
    """

    def __init__(self, data, tokenizer, max_length):
        """Tokenise every string in ``data`` once, up front.

        Args:
            data: iterable of text examples.
            tokenizer: callable returning a mapping with 'input_ids' and
                'attention_mask' for a single string.
            max_length: length every example is truncated / padded to.
        """
        self.input_ids = []
        self.attn_masks = []
        # Initialised but never filled by this class.
        self.labels = []

        for example in data:
            wrapped = '<|startoftext|>' + example + '<|endoftext|>'
            encoded = tokenizer(
                wrapped,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            ids_tensor = torch.tensor(encoded['input_ids'])
            mask_tensor = torch.tensor(encoded['attention_mask'])
            self.input_ids.append(ids_tensor)
            self.attn_masks.append(mask_tensor)

    def __len__(self):
        """Number of tokenised examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        sample_ids = self.input_ids[idx]
        sample_mask = self.attn_masks[idx]
        return sample_ids, sample_mask

In [None]:
# Tokenise and pad both splits up front; both use the same max_length.
train_dataset, val_dataset = (
    coNaLaDataset(split, tokenizer, max_length=max_length)
    for split in (train_df, val_df)
)

In [None]:
def my_data_collator(data):
    """Collate ``(input_ids, attention_mask)`` pairs into a batch for the Trainer.

    Args:
        data: sequence of ``(input_ids, attention_mask)`` tensor pairs, all of
            the same length.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each stacked along
        a new leading batch dimension.
    """
    input_ids = torch.stack([example[0] for example in data])
    attention_mask = torch.stack([example[1] for example in data])
    # Labels are the inputs themselves (causal LM), but padded positions are set
    # to -100, the index the language-modelling loss ignores. Without this the
    # model is also trained (and evaluated) on predicting pad tokens.
    # masked_fill returns a new tensor, so input_ids is left untouched.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

#### TRAIN THE MODEL

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, keep at most
# two checkpoints, and reload the best one when training ends.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/GPT2-FineTuned_mix_Z_3',
    num_train_epochs=3,
    # 2 examples per step x 2 accumulation steps = 4 examples per optimizer update (per device).
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    weight_decay=0.05,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=my_data_collator,
)

trainer.train()

# Persist the fine-tuned weights and the trainer state.
trainer.save_model('/content/drive/MyDrive/fineTunedGPT2_hq_Z_mix_3')
trainer.save_state()




Epoch,Training Loss,Validation Loss
1,0.118,0.091485
2,0.1042,0.079084
3,0.0931,0.074648


### HAVE FUN WITH THE MODEL

In [None]:
# Define the tokenizer and load the trained model.
# The tokenizer must add the same tokens, in the same order, as the one used for
# fine-tuning so that token ids line up with the checkpoint's embedding matrix.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>', padding_side='left')
# add_tokens takes a flat list of token strings. A nested list is not a valid
# argument: depending on the transformers version it either raises or registers
# the wrong string, so '<ZZZ>' itself would never become a single token.
tokenizer.add_tokens(['<ZZZ>'])

model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/fineTunedGPT2_hq_Z_mix_3').cuda()
# Make sure the embedding matrix covers every id the tokenizer can emit.
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50260, 768)

In [None]:
## define a method to generate code from NL
def generate_response(NL, model, tokenizer, temp, n_out, max_length=75):
    """Sample ``n_out`` code completions for a natural-language intent.

    Args:
        NL: the natural-language intent.
        model: causal language model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``; must define ``pad_token_id``.
        temp: sampling temperature.
        n_out: number of sequences to sample.
        max_length: maximum total length in tokens (prompt plus completion).
            Defaults to 75.

    Returns:
        A list of ``n_out`` single-element lists, each holding one decoded
        sequence (prompt included, special tokens stripped).
    """
    # Mirror the training layout: BOS directly followed by the intent, then the
    # separator. No space after BOS and no trailing space, because the training
    # strings are built as '<|startoftext|>' + intent + ' <ZZZ> ' + snippet and
    # the space before the snippet belongs to the snippet's first token.
    prompt = '<|startoftext|>' + NL + ' <ZZZ>'
    inputs = tokenizer(prompt, return_tensors='pt')

    # Follow the model's device instead of assuming CUDA.
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        top_k=50,
        top_p=0.99,
        temperature=temp,
        max_length=max_length,
        num_return_sequences=n_out,
        pad_token_id=tokenizer.pad_token_id,
    )

    return [[tokenizer.decode(out, skip_special_tokens=True)] for out in outputs]

In [None]:
# Demo query: draw five samples at a near-zero temperature.
generate_response(
    'how to get mean of numpy array?',
    model,
    tokenizer,
    temp=0.001,
    n_out=5,
)

[[' how to get mean of numpy array? <ZZZ>   y = np.mean(x, axis=0)'],
 [' how to get mean of numpy array? <ZZZ>   a = np.mean(a, axis=0)'],
 [' how to get mean of numpy array? <ZZZ>   y = np.mean(x, axis=0)'],
 [' how to get mean of numpy array? <ZZZ>   numpy.mean(a, axis=0)'],
 [' how to get mean of numpy array? <ZZZ>   numpy.mean(a, axis=0)']]