In [1]:

import pandas as pd

df = pd.read_csv("extended_programming_code_snippets.csv")
df

Unnamed: 0,Query,Code_Snippet,Language,Tags
0,How to create a list comprehension in Python?,[x**2 for x in range(10)],Python,tutorial
1,How to handle missing data in pandas?,"df.fillna(0, inplace=True)",Python,example
2,How to use a lambda function in Python?,lambda x: x + 2,Python,advanced
3,How to create a REST API in Flask?,from flask import Flask\napp = Flask(__name__)...,Python,tutorial
4,How to perform matrix multiplication in numpy?,"import numpy as np\nnp.dot(A, B)",Python,advanced
...,...,...,...,...
3015,How to implement a class in C++?,class MyClass {\npublic:\n void myMethod() ...,C++,tutorial
3016,How to use pointers in C++?,int x = 10;\nint* ptr = &x;,C++,advanced
3017,How to read a file in C++?,#include <fstream>\nstd::ifstream file('file.t...,C++,common-issues
3018,How to create a vector in C++?,"#include <vector>\nstd::vector<int> v = {1, 2,...",C++,tutorial


In [2]:
df['Language'].value_counts()


Language
SQL           627
Java          618
Shell         601
JavaScript    595
Python        574
C++             5
Name: count, dtype: int64

In [3]:

# Convert all text columns to lowercase
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()
df['Tags'] = df['Tags'].str.lower()

In [4]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_data(data):
    inputs = ["generate code: " + query for query in data["Query"]]
    targets = data["Code_Snippet"].tolist()
    input_encodings = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    target_encodings = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(df)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:

import torch

class CodeSnippetDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx],
        }

dataset = CodeSnippetDataset(input_encodings, target_encodings)

In [6]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    evaluation_strategy="epoch",  
    save_strategy="epoch",        
    learning_rate=5e-5,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,   
    num_train_epochs=10,            
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,             
    save_total_limit=1,            
    load_best_model_at_end=True,
    fp16=True,                     
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

# Train the model
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.156,0.052419
2,0.0634,0.01006
3,0.0352,0.00872
4,0.019,0.008079
5,0.0165,0.007408
6,0.012,0.006829
7,0.0105,0.006321
8,0.0103,0.005919
9,0.0106,0.005675
10,0.0116,0.00559


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=7550, training_loss=0.10508741058458555, metrics={'train_runtime': 1006.4158, 'train_samples_per_second': 30.007, 'train_steps_per_second': 7.502, 'total_flos': 1021830601113600.0, 'train_loss': 0.10508741058458555, 'epoch': 10.0})

In [7]:

model.save_pretrained("t5_finetuned_model")
tokenizer.save_pretrained("t5_finetuned_model")

('t5_finetuned_model\\tokenizer_config.json',
 't5_finetuned_model\\special_tokens_map.json',
 't5_finetuned_model\\spiece.model',
 't5_finetuned_model\\added_tokens.json')