###DATA UPLOAD

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###INSTALL DEPENDENCIES AND IMPORT LIBRARIES

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m836.3 kB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1


In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
torch.cuda.empty_cache()

PREPARE DATA


In [None]:
# ## regular data
# df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/conala_alltrain.csv')

# # put in form we desire
# df = df['intent'] + ' = '  + df['snippet']

# # get number of sampels we wish to use
# df = df.iloc[0:20000]

In [4]:
## high quality data
train_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_train_gpt2.csv')
test_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_test_gpt2.csv')
val_df = pd.read_csv('/content/drive/MyDrive/coNaLa-data/hq_val_gpt2.csv')

###TOKENIZER AND MODEL

In [25]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',  bos_token='<|startoftext|>',  eos_token='<|endoftext|>', pad_token='<|pad|>')
#model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/fineTunedGPT2_hq').cuda() # continue training
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

###PREPARE DATA PART 2

In [26]:
max_length = max([len(tokenizer.encode(conala)) for conala in train_df['0']])

In [27]:
class coNaLaDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for i in data['0']:
            encodings_dict = tokenizer('<|startoftext|>' + i + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [28]:
train_dataset = coNaLaDataset(train_df, tokenizer, max_length=max_length)
val_dataset = coNaLaDataset(val_df, tokenizer, max_length=max_length)

In [29]:
my_data_collator = lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}

In [30]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/GPT2-FineTuned',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=2,
    num_train_epochs=3, 
    gradient_accumulation_steps = 5,
    warmup_steps=10, 
    weight_decay=0.05, 
    report_to = 'none',
    save_total_limit = 2,
    load_best_model_at_end = True,
    )


trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = my_data_collator
)

trainer.train()

trainer.save_model('/content/drive/MyDrive/fineTunedGPT2_hq')
trainer.save_state()




Epoch,Training Loss,Validation Loss
0,0.1253,0.129454
1,0.1081,0.118961
2,0.1043,0.115103


###GENERATED DESCRIPTION

In [None]:
# define the tokenizer and load the trained model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',  bos_token='<|startoftext|>',  eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/fineTunedGPT2_hq').cuda() # continue training
model.resize_token_embeddings(len(tokenizer))

In [76]:
generated = tokenizer("<|startoftext|>  get first element of python list = ", return_tensors="pt").input_ids.cuda()

In [77]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=50, top_p=0.95, temperature=0.1, num_return_sequences=5,
                                pad_token_id=tokenizer.encode('<|pad|>')[0])

In [78]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0:   get first element of python list =   a[0]
1:   get first element of python list =   a[0]
2:   get first element of python list =   a[0]
3:   get first element of python list =   a[0]
4:   get first element of python list =   a[0]
