In this notebook, I am going to finetune a pretrained CodeT5 model for text2code task. I utilize libraries from Huggingface, since it facilitates data preprocessing, training and evaluation phase.

In [1]:
!pip install -q transformers datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import transformers
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import drive
import pandas as pd


In [3]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/ProblemSolutionPythonV3.csv")
df.dropna(inplace=True)


df['Python Code'].replace(r'\r', '\n', regex=True, inplace=True)

dataset = Dataset.from_pandas(df)


In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,Problem,Python Code
0,0,Write a NumPy program to repeat elements of an...,"import numpy as np\nx = np.repeat(3, 4)\nprint..."
1,1,Write a Python function to create and print a ...,def printValues():\n\tl = list()\n\tfor i in r...
2,2,Write a Python program to remove duplicates fr...,"import itertools\nnum = [[10, 20], [40], [30, ..."
3,3,Write a NumPy program to compute the x and y c...,import numpy as np\nimport matplotlib.pyplot a...
4,4,Write a Python program to alter a given SQLite...,import sqlite3\nfrom sqlite3 import Error\ndef...


#Train Test Split
Train-test-split is performed twice to split the data into train, validation, and test.
Then the dataset is built via DatasetDict

In [18]:
datasets_train_testval = dataset.train_test_split(test_size=400)

datasets_test_val = datasets_train_testval['test'].train_test_split(test_size=0.5)

train_test_dataset = DatasetDict({
    'train': datasets_train_testval["train"],
    'val': datasets_test_val['train'],
    'test': datasets_test_val['test']
    })

In [19]:
train_test_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Problem', 'Python Code', '__index_level_0__'],
        num_rows: 2906
    })
    val: Dataset({
        features: ['Unnamed: 0', 'Problem', 'Python Code', '__index_level_0__'],
        num_rows: 200
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Problem', 'Python Code', '__index_level_0__'],
        num_rows: 200
    })
})

#Preprocessing
The preprocess function does the following:

- prepend the  prefix that denotes the downstream task

- tokenize the model input (Problem) and the labels (Python Code that corresponds to the problem)

- replace the index of the padding tokens by -100 so that they are ignored during training

In [21]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-base")

prefix = "Generate Python: "
max_input_length = 48
max_target_length = 512

def preprocess(data):
  
  texts = data['Problem']
  codes = data['Python Code']
  
  inputs = [prefix + text for text in texts]
  model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

  labels = tokenizer(codes, max_length=max_target_length, padding="max_length", truncation=True).input_ids


  labels_with_ignore_index = []
  for labels_example in labels:
    labels_example = [label if label != 0 else -100 for label in labels_example]
    labels_with_ignore_index.append(labels_example)
  
  model_inputs["labels"] = labels_with_ignore_index

  return model_inputs

In [22]:
train_test_dataset = train_test_dataset.map(preprocess, batched=True)
train_test_dataset

Map:   0%|          | 0/2906 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Problem', 'Python Code', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2906
    })
    val: Dataset({
        features: ['Unnamed: 0', 'Problem', 'Python Code', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Problem', 'Python Code', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

For training, I import Seq2SeqTrainingArguments, Seq2SeqTrainer. This allows to avoid writing a training loop and at the same time pass multiple parameters (learning rate, weight decay, num_epochs, etc.)in one class, which is convenient. For evaluation I picked BLEU score which was also used in the original paper. Also, there is no need to take care of collating data into batches, since there is a data collator in the transformers library, and is passed to trainer as an argument.

In [23]:
from transformers import T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [24]:
import evaluate
import nltk 
nltk.download('punkt')
metric = evaluate.load("bleu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [25]:
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]

   
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result['bleu'] = result['bleu'] * 100
    
    return result

In Seq2SeqTrainingArguments I specify parameters for saving the resulting model and evaluation and logging strategy, as well as hyperparameters. The are 1820 iterations in 5 epochs, so the evaluation is performed every 100 steps. The hyperparameters (learning rate, batch size, etc.) are pretty standard and selected in order not to spoil the model's performance.

In [26]:
batch_size = 8
model_name = "codet5-python-generation"
model_dir = f"drive/MyDrive/Models/{model_name}"



args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.001,
    fp16=True,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [27]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [28]:
def model_init():
    return T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base")

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train_test_dataset['train'],
    eval_dataset=train_test_dataset['val'],
    data_collator=data_collator,
    tokenizer=tokenizer)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [29]:
trainer.train()



Step,Training Loss,Validation Loss
100,2.2518,1.517326
200,1.6084,1.259911
300,1.4617,1.20466
400,1.2555,1.079029
500,1.0591,1.016309
600,1.0521,0.951695
700,0.9927,0.925286
800,0.8642,0.934536
900,0.7791,0.889118
1000,0.7763,0.858602


TrainOutput(global_step=1820, training_loss=0.9126550087561974, metrics={'train_runtime': 1355.0055, 'train_samples_per_second': 10.723, 'train_steps_per_second': 1.343, 'total_flos': 829514826547200.0, 'train_loss': 0.9126550087561974, 'epoch': 5.0})

In [30]:
trainer.save_model(model_dir)

Let's load the model and the tokenizer from the last checkpoint.

In [46]:
model_path = '/content/drive/MyDrive/Colab Notebooks/drive/MyDrive/Models/codet5-python-generation/checkpoint-1800'

In [47]:
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [48]:
tokenizer = RobertaTokenizer.from_pretrained(model_path)

Now we can take a look at several model's predictions on the test set.

In [49]:
def predict_code(example):
  query = prefix + example['Problem'] 
  input_ids = tokenizer(query, padding='max_length', truncation=True, max_length=max_input_length, return_tensors="pt").input_ids
    
  generated_code = model.generate(input_ids, max_length=max_target_length)
    
  decoded_code = tokenizer.decode(generated_code.numpy()[0], skip_special_tokens=True)
  return decoded_code

In [50]:
import random
def test():

  index = random.randint(0, len(train_test_dataset['test']))
  example = train_test_dataset['test'][index]
  text = example['Problem']
  code = example['Python Code']
    
  decoded_code = predict_code(example) 


  print("#" * 25); print("QUERY: ", text); 
  print()
  print('#' * 25); print("ORIGINAL: "); print("\n", code);
  print()
  print('#' * 25); print("GENERATED: "); print("\n", decoded_code);

In [65]:
for i in range(3):
  test()

#########################
QUERY:  Write a Pandas program to create a Pivot table and find survival rate by gender. 

#########################
ORIGINAL: 

 import pandas as pd
import numpy as np
df = pd.read_csv('titanic.csv')
result=df.groupby('sex')[['survived']].mean()
print(result)


#########################
GENERATED: 

 import pandas as pd
import numpy as np
df = pd.read_csv('titanic.csv')
result = df.pivot_table('survived', index='sex', columns='name')
print(result)

#########################
QUERY:  How to find the number of arguments in a Python function

#########################
ORIGINAL: 

 def no_of_argu(*args):
     
    # using len() method in args to count
    return(len(args))




a = 1
b = 3


# arguments passed
n = no_of_argu(1, 2, 4, a)


# result printed
print(" The number of arguments are: ", n)

#########################
GENERATED: 

 # Python program to find the number of
# arguments




def num_arguments(fn):
     
    # call the function
    # print the numbe

#Model Compression & Performance Comparison

Let's write a function that we will later use for the calculation of the BLEU score for both the original and compressed model.

For model compression, dynamic quantization is used.

In [105]:

import torch

test_tokenized_dataset = train_test_dataset["test"]

def preprocess_test(examples):
  inputs = [prefix + text for text in examples["Problem"]]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                           padding="max_length")
  return model_inputs

test_tokenized_dataset = test_tokenized_dataset.map(preprocess_test, batched=True)


test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

def compute_bleu(model):

  all_predictions = []
  for i,batch in enumerate(dataloader):
    predictions = model.generate(**batch, max_length=256)
    all_predictions.append(predictions)

  all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

  all_labels = tokenizer(test_tokenized_dataset["Python Code"], max_length=max_target_length,
                        truncation=True, padding="max_length")["input_ids"]

  data = [all_predictions_flattened, all_labels]
  return compute_metrics(data)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [96]:
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

In [104]:
import os
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/(1024**2))
    os.remove('temp.p')


print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 850.3209743499756
Size (MB): 306.9806890487671


In [103]:
result = compute_bleu(model)
comp_result = compute_bleu(quantized_model)
print("BLEU for original model:", result['bleu'])
print("BLEU for compressed model:", comp_result['bleu'])

BLEU for original model: 34.159872583391376
BLEU for compressed model: 33.66092305973754


#Conclusion

- For the original model the metric is 34.15 BLEU, which is comparable to the result in the paper. 
- Dynamic quantization reduced the model's size by almost 3 times without significant deterioration in the model's performance.
