In [1]:
!pip install transformers[torch]



In [2]:
import os
import shutil

import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset
from transformers import (
    TrainerCallback, 
    EarlyStoppingCallback, 
    AutoTokenizer, 
    T5ForConditionalGeneration, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

np.random.seed(0)

2024-02-17 01:52:36.573186: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-17 01:52:36.614589: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-17 01:52:36.614629: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-17 01:52:36.615790: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-17 01:52:36.622904: I tensorflow/core/platform/cpu_feature_guar

In [3]:
with open("english_python_data.txt", "r") as f:
    file_lines = f.readlines()

dps = []
current_question = None
current_solution = []

for line in file_lines:
    if line.startswith("#"):
        if current_question is not None:
            dps.append({"question": current_question, "solution": ''.join(current_solution)})
        current_question = line[1:].strip()
        current_solution = []
    else:
        current_solution.append(line)

if current_question is not None:
    dps.append({"question": current_question, "solution": ''.join(current_solution)})

python_problems_df = pd.DataFrame(dps)

msk = np.random.rand(len(python_problems_df)) < 0.8

train_df = python_problems_df[msk]
val_df = python_problems_df[~msk]

In [4]:
def augment_dataframe(train_df):
    augmented_rows = []
    
    solution_replacements = [
        (("num1", "num2", "num3"), ("x", "y", "z")),
        (("num1", "num2", "num3"), ("var1", "var2", "var3")),
        (("num1", "num2", "num3"), ("a", "b", "c")),
        (("num1", "num2", "num3"), ("first", "second", "third")),
        (("num",), ("x",)),
        (("largest",), ("highest",)),
        (("smallest",), ("lowest",)),
        (("l1",), ("list1",)),
        (("l2",), ("list2",)),
        ((" i ",), (" j ",)),
        ((" i ",), (" k ",)),
        ((" i ",), (" idx ",)),
        ((" i ",), (" x ",)),
        ((" i ",), (" r ",)),
    ]
    
    question_replacements = [
        (("print",), ("return",)),
        (("Write a function to ",), ("",)),
    ]
    
    for index, row in train_df.iterrows():
        changes = []
        solution = row["solution"]
        question = row["question"]

        for find_words, replace_words in solution_replacements:
            for find_word, replace_word in zip(find_words, replace_words):
                if find_word in solution:
                    solution = solution.replace(find_word, replace_word)
                    changes.append((find_word, replace_word))

        for find_words, replace_words in question_replacements:
            for find_word, replace_word in zip(find_words, replace_words):
                if find_word in question:
                    question = question.replace(find_word, replace_word)
                    changes.append((find_word, replace_word))
                    
        if changes:
            augmented_row = row.copy()
            augmented_row["solution"] = solution
            augmented_row["question"] = question
            augmented_rows.append(augmented_row)
                
    augmented_train_df = pd.concat([train_df, pd.DataFrame(augmented_rows)], ignore_index=True)
    return augmented_train_df

augmented_train_df = augment_dataframe(train_df)

count_before = len(train_df)
count_after = len(augmented_train_df)

print("Count before augmentation:", count_before)
print("Count after augmentation:", count_after)

Count before augmentation: 3970
Count after augmentation: 6394


In [5]:
model_dir = "codet5_large_python_code_gen_v2"
max_question_length = 64
max_solution_length = 128
model_checkpoint = "./codet5_large_python_code_gen_v2/tmp-checkpoint-3200" # "Salesforce/codet5-large" # use checkpoint paths
strategy = "epoch"
lr = 4e-5
train_batch_size = 8
val_batch_size = 8
decay = 0.01
save_limits = 1
train_iter = 10

In [6]:
def clear_directory(directory):
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            os.rmdir(os.path.join(root, name))
    os.rmdir(directory)

try:
    clear_directory(model_dir)
except:
    print("Model directory doesn't exist!")

In [6]:
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Training is using {num_gpus} GPU(s).")
    for i in range(num_gpus):
        device = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {device.name}, Compute Capability: {device.major}.{device.minor}")
        print(f"\tTotal Memory: {device.total_memory / (1024**3):.2f} GB")
        print(f"\tCUDA Cores: {device.multi_processor_count}")
else:
    print("Training is using CPU.")

Training is using 1 GPU(s).
GPU 0: NVIDIA A100 80GB PCIe, Compute Capability: 8.0
	Total Memory: 79.32 GB
	CUDA Cores: 108


In [7]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_question_length, max_solution_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_question_length + max_solution_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        inputs = item['question']
        targets = item['solution']

        encoding = self.tokenizer(
            inputs,
            text_target=targets,
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )

        return encoding

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

train_dataset = CustomDataset(augmented_train_df, tokenizer, max_question_length, max_solution_length)
val_dataset = CustomDataset(val_df, tokenizer, max_question_length, max_solution_length)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy=strategy,
    logging_strategy=strategy,
    save_strategy=strategy,
    learning_rate=lr,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    weight_decay=decay,
    save_total_limit=save_limits,
    num_train_epochs=train_iter,
    predict_with_generate=True,
    fp16=False,
    load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

In [8]:
predictions = trainer.predict(val_dataset, max_new_tokens=64)

In [9]:
def print_example(index, val_dataset, predictions, tokenizer, val_df):

    example = val_dataset[index]
    
    question = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    solution = tokenizer.decode(example["labels"], skip_special_tokens=True)
    prediction = tokenizer.decode(predictions.predictions[index], skip_special_tokens=True)

    print("Question:")
    print(question)
    print("\nTrue Solution:")
    print(solution)
    print("\nPredicted Solution:")
    print(prediction)

example_index = 7

print_example(example_index, val_dataset, predictions, tokenizer, val_df)

Question:
Write a function to calculate simple interest, given p, r, t

True Solution:
def simp_int(p, r, t):
    interest = (p*r*t)/100
    return interest




Predicted Solution:
def simpleIntereset(p, r, t):
    si = (p*r*t)/100
    return si





In [None]:
# train_df.iloc[10]["question"]

In [None]:
# train_df.iloc[10]["solution"]

In [11]:
def generate_solution(user_text, trainer, tokenizer, max_length=64):
    input_ids = tokenizer.encode(user_text, max_length=max_length, truncation=True, return_tensors="pt")
    input_ids = input_ids.to(trainer.args.device)

    output_ids = trainer.model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)

    generated_solution = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return generated_solution

user_text = "write a function to print sum of two number"
generated_solution = generate_solution(user_text, trainer, tokenizer)
print("Generated Solution:")
print(generated_solution)

Generated Solution:
def sum_nums(num, y):
    return num + y




In [22]:
user_text = "return top 5 values"
generated_solution = generate_solution(user_text, trainer, tokenizer)
print("Generated Solution:")
print(generated_solution)

Generated Solution:
print([i for j in range(5) if j % 5 == 0])


