In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

# GPT2 with Fine Tuning

### Prepare data

In [None]:
df = pd.read_csv('/content/drive/MyDrive/conala-corpus/conala-train.csv')

In [None]:
df['Training_Data'] = df.intent + '\n' + df.snippet
df.head()

Unnamed: 0,intent,rewritten_intent,snippet,question_id,Training_Data
0,How to convert a list of multiple integers int...,Concatenate elements of a list 'x' of multiple...,"sum(d * 10 ** i for i, d in enumerate(x[::-1]))",41067960,How to convert a list of multiple integers int...
1,How to convert a list of multiple integers int...,convert a list of integers into a single integer,"r = int(''.join(map(str, x)))",41067960,How to convert a list of multiple integers int...
2,how to convert a datetime string back to datet...,convert a DateTime string back to a DateTime o...,datetime.strptime('2010-11-13 10:33:54.227806'...,4170655,how to convert a datetime string back to datet...
3,Averaging the values in a dictionary based on ...,get the average of a list values for each key ...,"[(i, sum(j) / len(j)) for i, j in list(d.items...",29565452,Averaging the values in a dictionary based on ...
4,zip lists in python,"zip two lists `[1, 2]` and `[3, 4]` into a lis...","zip([1, 2], [3, 4])",13704860,"zip lists in python\nzip([1, 2], [3, 4])"


In [None]:
df = df.drop(columns=['question_id','rewritten_intent'])
df.head()

Unnamed: 0,intent,snippet,Training_Data
0,How to convert a list of multiple integers int...,"sum(d * 10 ** i for i, d in enumerate(x[::-1]))",How to convert a list of multiple integers int...
1,How to convert a list of multiple integers int...,"r = int(''.join(map(str, x)))",How to convert a list of multiple integers int...
2,how to convert a datetime string back to datet...,datetime.strptime('2010-11-13 10:33:54.227806'...,how to convert a datetime string back to datet...
3,Averaging the values in a dictionary based on ...,"[(i, sum(j) / len(j)) for i, j in list(d.items...",Averaging the values in a dictionary based on ...
4,zip lists in python,"zip([1, 2], [3, 4])","zip lists in python\nzip([1, 2], [3, 4])"


In [None]:
len(df)

2379

In [None]:
#Create a very small test set to compare generated text with the reality
test_set = pd.read_csv('/content/drive/MyDrive/conala-corpus/conala-test.csv')

#Reset the indexes
test_set = test_set.reset_index()
df = df.reset_index()

In [None]:
test_set['Training_Data'] = test_set.intent + '\n' + test_set.snippet
test_set = test_set.drop(columns=['question_id','rewritten_intent'])
test_set.head()

Unnamed: 0,index,intent,snippet,Training_Data
0,0,How can I send a signal from a python program?,"os.kill(os.getpid(), signal.SIGUSR1)",How can I send a signal from a python program?...
1,1,Decode Hex String in Python 3,bytes.fromhex('4a4b4c').decode('utf-8'),Decode Hex String in Python 3\nbytes.fromhex('...
2,2,check if all elements in a list are identical,all(x == myList[0] for x in myList),check if all elements in a list are identical\...
3,3,Format string dynamically,"print('%*s : %*s' % (20, 'Python', 20, 'Very G...",Format string dynamically\nprint('%*s : %*s' %...
4,4,How to convert a string from CP-1251 to UTF-8?,d.decode('cp1251').encode('utf8'),How to convert a string from CP-1251 to UTF-8?...


### Prepare the dataset

In [None]:
class Programs(Dataset):
    
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.code = []

        for row in df['Training_Data']:
          self.code.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
                
        if truncate:
            self.code = self.code[:20000]
        self.code_count = len(self.code)
        
    def __len__(self):
        return self.code_count

    def __getitem__(self, item):
        return self.code[item]

In [None]:
dataset = Programs(df['Training_Data'], truncate=True, gpt2_type="gpt2")

### Prepare training

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=10, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):

    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

### Actual Training

In [None]:
#Train the model on the specific data we have
model = train(dataset, model, tokenizer)



Training epoch 0
0


2379it [02:37, 15.10it/s]


Training epoch 1
tensor(1.6879, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:42, 14.66it/s]


Training epoch 2
tensor(0.7446, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:42, 14.68it/s]


Training epoch 3
tensor(0.4253, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:41, 14.70it/s]


Training epoch 4
tensor(0.3239, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:41, 14.71it/s]


Training epoch 5
tensor(0.3459, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:42, 14.67it/s]


Training epoch 6
tensor(0.3968, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:42, 14.67it/s]


Training epoch 7
tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:42, 14.68it/s]


Training epoch 8
tensor(0.3003, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:41, 14.71it/s]


Training epoch 9
tensor(0.3134, device='cuda:0', grad_fn=<NllLossBackward0>)


2379it [02:41, 14.71it/s]


In [None]:
#Save the model to a pkl or something so it can be reused later on
torch.save(model, '/content/drive/MyDrive/conala-corpus/model.pt')

### Text generation

In [None]:
#Load the model to use it
model = torch.load('/content/drive/MyDrive/conala-corpus/model.pt')

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=40, #maximum number of words
    top_p=0.8,
    temperature=1.,
):

    model.eval()

    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("\n"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

In [None]:
#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  generated_code = []
  for i in range(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['Training_Data'][i], entry_count=1)
    generated_code.append(x)
  return generated_code

In [None]:
generated_code = text_generation(test_set)

In [None]:
import re
a = re.split('\n|<', generated_code[0][0])[1]
a

'os.kill(os.getpid(), signal.SIGUSR1)'

In [None]:
#Loop to keep only generated text and add it as a new column in the dataframe
my_generations=[]

for i in range(len(generated_code)):
  a = re.split('\n|<', generated_code[i][0])[1]
  my_generations.append(a)

test_set['Generated_snippet'] = my_generations

In [None]:
test_set.head()

Unnamed: 0,index,intent,snippet,Training_Data,Generated_snippet
0,0,How can I send a signal from a python program?,"os.kill(os.getpid(), signal.SIGUSR1)",How can I send a signal from a python program?...,"os.kill(os.getpid(), signal.SIGUSR1)"
1,1,Decode Hex String in Python 3,bytes.fromhex('4a4b4c').decode('utf-8'),Decode Hex String in Python 3\nbytes.fromhex('...,bytes.fromhex('4a4b4c').decode('utf-8')
2,2,check if all elements in a list are identical,all(x == myList[0] for x in myList),check if all elements in a list are identical\...,all(x == myList[0] for x in myList)
3,3,Format string dynamically,"print('%*s : %*s' % (20, 'Python', 20, 'Very G...",Format string dynamically\nprint('%*s : %*s' %...,"print('%*s : %*s' % (20, 'Python', 20, 'Very G..."
4,4,How to convert a string from CP-1251 to UTF-8?,d.decode('cp1251').encode('utf8'),How to convert a string from CP-1251 to UTF-8?...,d.decode('cp1251').encode('utf8')


In [None]:
test_set['Generated_snippet'][15]

'Entry.objects.filter()[:1].get()'

In [None]:
test_set['snippet'][15]

'Entry.objects.filter()[:1].get()'

### Analyze performance

In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
#Rouge score
from rouge import Rouge
rouge=Rouge()

rouge.get_scores(test_set['Generated_snippet'], test_set['snippet'], avg=True)

{'rouge-1': {'r': 0.9519448268507096,
  'p': 0.9439811410811415,
  'f': 0.9297990193201199},
 'rouge-2': {'r': 0.8911630116260553,
  'p': 0.8862972285786932,
  'f': 0.874053085977529},
 'rouge-l': {'r': 0.9519448268507096,
  'p': 0.9439811410811415,
  'f': 0.9297990193201199}}