In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import os

# Step 1: Load and preprocess the dataset
df = pd.read_csv("/content/ProblemSolutionPythonV3.csv")
df = df.dropna(subset=["Python Code"])
df = df.rename(columns={"Problem": "Query", "Python Code": "Code_Snippet"})
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()

# Step 2: Tokenize
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_data(data):
    inputs = ["generate code: " + query for query in data["Query"]]
    targets = data["Code_Snippet"].tolist()
    input_encodings = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    target_encodings = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(df)

# Step 3: Custom Dataset
class CodeSnippetDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx],
        }

dataset = CodeSnippetDataset(input_encodings, target_encodings)

# Step 4: Load Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Step 5: Training arguments with improved config
output_dir = "/content/t5_finetuned150man_model"
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=150,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=1,                    # Enable mixed precision if using GPU
)

# Step 6: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

# Step 7: Train
trainer.train()

# Step 8: Save the model and tokenizer
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
trainer.save_state()
torch.save(training_args, os.path.join(output_dir, "training_args.bin"))

# Step 9: Load back (optional)
model = T5ForConditionalGeneration.from_pretrained(output_dir).to(device)
tokenizer = T5Tokenizer.from_pretrained(output_dir)

# Step 10: Inference function
def generate_code(query):
    query = query.lower()
    input_text = "generate code: " + query
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Step 11: Real-time Interface
while True:
    query = input("\n\nYour Query or type (exit) to leave : \n  ")
    if query.lower() == "exit":
        print("\nGood Bye...")
        break
    code = generate_code(query)
    print("\nCode Snippet: \n", code)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mallabonu-srisameera-ece23[0m ([33mallabonu-srisameera-ece23-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,3.4814
1000,2.6276
1500,2.4084
2000,2.252
2500,2.1353
3000,2.0334
3500,2.0065
4000,1.9056
4500,1.8545
5000,1.8224




Your Query or type (exit) to leave : 
  exit

Good Bye...


In [2]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [5]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [10]:
!pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=308b0ea0cb96aa8fd2d5841d7097314edb344ee1f66e700cb83d1f3b8dfd0503
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [13]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
from tqdm import tqdm
import pandas as pd

# ========== 1. Load Model and Tokenizer ==========
model_path = "/content/drive/MyDrive/t5_finetuned150man_model"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# ========== 2. Load and Preprocess Dataset ==========
df = pd.read_csv("/ProblemSolutionPythonV3.csv")
df = df.dropna(subset=["Python Code"])
df = df.rename(columns={"Problem": "Query", "Python Code": "Code_Snippet"})
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()

def preprocess_data(data):
    inputs = ["generate code: " + query for query in data["Query"]]
    targets = data["Code_Snippet"].tolist()
    input_encodings = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    target_encodings = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(df)

# ========== 3. Create Custom Dataset ==========
class CodeSnippetDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx],
        }

dataset = CodeSnippetDataset(input_encodings, target_encodings)

# ========== 4. Evaluation Function ==========
def evaluate_model(model, tokenizer, dataset, num_samples=100):
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    exact_match_count = 0

    model.eval()
    inputs = dataset.inputs["input_ids"]
    targets = dataset.targets["input_ids"]
    total_samples = min(num_samples, len(inputs))

    for i in tqdm(range(total_samples)):
        input_ids = inputs[i].unsqueeze(0).to(model.device)
        labels = targets[i].unsqueeze(0).to(model.device)

        with torch.no_grad():
            outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        true = tokenizer.decode(labels[0], skip_special_tokens=True)

        # BLEU evaluation: predictions should be a list of words (tokens)
        bleu.add(predictions=[pred.split()], references=[[true.split()]])

        # ROUGE evaluation: predictions and references should be plain strings
        rouge.add(prediction=pred, reference=true)

        if pred.strip() == true.strip():
            exact_match_count += 1

    bleu_score = bleu.compute()
    rouge_score = rouge.compute()
    exact_match = exact_match_count / total_samples

    print(f"\nEvaluation on {total_samples} samples:")
    print(f"BLEU Score: {bleu_score['bleu']:.4f}")
    print(f"ROUGE-L Score: {rouge_score['rougeL']:.4f}")
    print(f"Exact Match Accuracy: {exact_match * 100:.2f}%")

# ========== 5. Run Evaluation ==========
evaluate_model(model, tokenizer, dataset, num_samples=100)


  0%|          | 0/100 [00:04<?, ?it/s]


ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: [['import', 'numpy', 'as', 'np', 'x', '=', "np.array(['python", "exercises',", "'php',", "'c++'],", 'dtype=np.str)', 'print("original', 'array:")', 'print(x)', 'print("repeated', 'array', 'elements:")', 'print(x)']],
Input references: [[['import', 'numpy', 'as', 'np', 'x', '=', 'np.repeat(3,', '4)', 'print(x)', 'x', '=', 'np.array([[1,2],[3,4]])', 'print(np.repeat(x,', '2))']]]