In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells visible in this notebook read and write under
# /content/ only, never /content/drive -- confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install evaluate rouge_score


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloadi

In [None]:
!pip install -U transformers




In [None]:
# 🚀 Step 0: Install Dependencies (Run in Colab if needed)
!pip install -q transformers evaluate datasets safetensors

# 🚀 Step 1: Imports
import os
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, AutoTokenizer, TrainingArguments, Trainer
from torch.utils.data import Dataset
from sklearn.model_selection import KFold
import evaluate
import numpy as np

# 🚀 Step 2: Load Dataset
# Rows missing EITHER column are dropped: a NaN problem statement would
# otherwise reach the "generate code: " + q concatenation in the dataset
# class as a float and raise a TypeError.
df = (
    pd.read_csv("/content/ProblemSolutionPythonV3.csv")
    .dropna(subset=["Problem", "Python Code"])
    .rename(columns={"Problem": "Query", "Python Code": "Code_Snippet"})
)
# Queries are lower-cased to match generate_code(), which lower-cases the
# user's query before prompting the models.
df["Query"] = df["Query"].str.lower()
# Code_Snippet is deliberately left untouched: Python is case-sensitive, so
# lower-casing the targets (True -> true, None -> none, class names, string
# literals) would train the model to emit invalid code.

# 🚀 Step 3: Tokenizer
# Hub id of the pretrained checkpoint; the same id is used to load the model
# weights at the start of every fold in the training loop.
model_checkpoint = "Salesforce/codet5-base"
# Single shared tokenizer instance. It is read as a module-level global by
# CodeGenDataset, compute_metrics and generate_code.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 🚀 Step 4: Custom Dataset
class CodeGenDataset(Dataset):
    """Pre-tokenized (query, code) pairs for seq2seq fine-tuning.

    Every query is prefixed with ``"generate code: "`` and both sides are
    truncated/padded to ``max_length`` tokens using the module-level
    ``tokenizer``.

    Args:
        queries: iterable of natural-language problem statements (str).
        codes: list of target code strings, aligned with ``queries``.
        max_length: fixed token length for inputs and targets (default 256).
    """

    def __init__(self, queries, codes, max_length=256):
        prompts = ["generate code: " + q for q in queries]
        self.inputs = tokenizer(prompts, truncation=True, padding="max_length",
                                max_length=max_length, return_tensors="pt")
        self.targets = tokenizer(codes, truncation=True, padding="max_length",
                                 max_length=max_length, return_tensors="pt")

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        """Return one example as ``input_ids`` / ``attention_mask`` / ``labels``.

        Padded label positions are set to -100 so the loss skips them;
        otherwise the model is also trained to predict the padding that fills
        most of each fixed-length target. The stored target tensor is cloned
        first, so it is never modified.
        """
        labels = self.targets["input_ids"][idx].clone()
        # The attention mask marks real tokens with 1 and padding with 0,
        # which avoids relying on a specific pad token id.
        labels[self.targets["attention_mask"][idx] == 0] = -100
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": labels,
        }

# 🚀 Step 5: Evaluation Metrics
# The metric objects are created once at module level and reused by
# compute_metrics on every call.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    """Compute ROUGE-L, BLEU and exact match (as percentages) for one eval pass.

    Args:
        eval_pred: a ``(predictions, label_ids)`` pair. ``predictions`` may be
            token ids of shape (batch, seq), raw logits of shape
            (batch, seq, vocab), or a tuple whose first item is one of those.

    Returns:
        dict with keys ``"rougeL"``, ``"bleu"`` and ``"exact_match"``, each
        scaled to 0-100 and rounded to two decimals. All three are 0.0 when
        there is nothing to score.
    """
    preds, labels = eval_pred

    # A plain Trainer (no generation step) hands back the raw model outputs
    # rather than token ids; keep the first element and reduce logits to ids.
    # NOTE(review): full logits for a large eval set are big -- consider
    # `preprocess_logits_for_metrics` or a generation-based evaluation.
    if isinstance(preds, (tuple, list)):
        preds = preds[0]
    preds = np.asarray(preds)
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)
    labels = np.asarray(labels)

    # Negative ids (e.g. the -100 loss-ignore marker) are not in the
    # vocabulary and cannot be decoded, so map them to the pad token, which
    # skip_special_tokens then removes.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds < 0, pad_id, preds)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = [p.strip() for p in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [l.strip() for l in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    if not decoded_preds:
        # Nothing to score; avoids dividing by zero below.
        return {"rougeL": 0.0, "bleu": 0.0, "exact_match": 0.0}

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    try:
        bleu_score = bleu.compute(
            predictions=decoded_preds,
            references=[[l] for l in decoded_labels],
        )["bleu"]
    except ZeroDivisionError:
        # Raised by the BLEU implementation when a side has zero tokens in
        # total (e.g. every prediction decodes to an empty string).
        bleu_score = 0.0
    em_score = sum(p == l for p, l in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

    return {
        "rougeL": round(rouge_result["rougeL"] * 100, 2),
        "bleu": round(bleu_score * 100, 2),
        "exact_match": round(em_score * 100, 2)
    }

# 🚀 Step 6: K-Fold Training + Ensemble
# One model is fine-tuned per fold; the trained models are collected in
# `models` and later used together at inference time.
k = 3
kf = KFold(n_splits=k, shuffle=True, random_state=42)
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n📂 Fold {fold + 1}/{k}")

    # Each fold starts again from the pretrained checkpoint.
    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

    # Positional row selection for this fold's train / validation split.
    train_data, val_data = df.iloc[train_idx], df.iloc[val_idx]

    train_dataset = CodeGenDataset(
        train_data["Query"].tolist(),
        train_data["Code_Snippet"].tolist(),
    )
    val_dataset = CodeGenDataset(
        val_data["Query"].tolist(),
        val_data["Code_Snippet"].tolist(),
    )

    # Per-fold output directory for checkpoints and logs.
    output_dir = f"/content/codet5_fold{fold+1}"
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        save_total_limit=1,
        logging_steps=500,
        save_steps=500,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    models.append(model)

# 🚀 Step 7: Ensemble Saving (First model will be saved as final ensemble checkpoint)
# Only models[0] (the fold-1 model) is written to this directory; the other
# fold models are used from memory by the inference step below.
final_model_dir = "/content/codet5_ensemblemm_model"
os.makedirs(final_model_dir, exist_ok=True)
# safe_serialization=True requests the safetensors weight format.
models[0].save_pretrained(final_model_dir, safe_serialization=True)
tokenizer.save_pretrained(final_model_dir)
# NOTE(review): `training_args` is whatever the last loop iteration assigned
# (its output_dir points at the final fold), while the weights saved above
# come from the first fold.
torch.save(training_args, os.path.join(final_model_dir, "training_args.bin"))
print(f"\n✅ Final ensemble model saved at {final_model_dir}")

# 🚀 Step 8: Ensemble Inference Function
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move every fold model to the chosen device and put it in eval mode
# before it is used for generation.
for m in models:
    m.to(device)
    m.eval()

def generate_code(query, max_length=256, num_beams=5):
    """Generate code for ``query`` with every fold model and return the consensus.

    Each model in the module-level ``models`` list decodes the same prompt
    with beam search; the most frequent output string is returned. When no
    two models agree, the first model's output is returned.

    Args:
        query: natural-language description of the desired program. It is
            lower-cased and prefixed with ``"generate code: "``.
        max_length: maximum length, in tokens, of each generated sequence.
        num_beams: beam width used for decoding.

    Returns:
        The selected generated code as a stripped string.
    """
    from collections import Counter

    input_text = "generate code: " + query.lower()
    # Tokenize once and reuse both tensors instead of encoding the prompt twice.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Generate outputs from each model
    generated_outputs = []
    with torch.no_grad():
        for m in models:
            # Deterministic beam search honours the `num_beams` argument.
            # Random sampling made every call return different text and made
            # agreement between models (the basis of the vote) a coincidence.
            generated_ids = m.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=False,
                num_return_sequences=1,
            )
            generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            generated_outputs.append(generated_text.strip())

    # Majority vote; on a tie Counter keeps first-seen order, so the first
    # model's output acts as the fallback.
    most_common = Counter(generated_outputs).most_common(1)[0][0]
    return most_common


# 🚀 Step 9: Manual Testing Loop
# Interactive prompt: reads queries until the user types 'exit'.
# NOTE: this cell blocks on input(), so it stops a "Run All" until answered.
while True:
    try:
        # Strip surrounding whitespace so "exit " or " exit" still exits.
        query = input("\n💬 Enter your query (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted cell ends the loop without a traceback.
        print("👋 Bye!")
        break
    if not query:
        # Don't send a blank prompt to the models; just ask again.
        continue
    if query.lower() == "exit":
        print("👋 Bye!")
        break
    output = generate_code(query)
    print("\n💡 Generated Code:\n", output)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]


📂 Fold 1/3


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,1.1366
1000,0.7421
1500,0.6308
2000,0.5563
2500,0.5286



📂 Fold 2/3


Step,Training Loss
500,1.1399
1000,0.7643
1500,0.64
2000,0.5785
2500,0.5187



📂 Fold 3/3


Step,Training Loss
500,1.1475
1000,0.7548
1500,0.6663
2000,0.5807
2500,0.5314



✅ Final ensemble model saved at /content/codet5_ensemblemm_model

💬 Enter your query (or type 'exit'): python program to perform subtraction of two matrices

💡 Generated Code:
 from numpy import linalg as linalg
  
# taking input of the 1st matrix
print("enter the 1st matrix element:")
print(np.linalg.norm(matrix))
  
# taking input of the 2nd matrix
print("enter the 2nd matrix element:")
print(np.linalg.norm(matrix))
  
# perform subtraction of two matrices
sub_matrix = [[0, 1] for i in range(len(matrix)):
    for j in range(len(matrix[0])):
        sub_matrix[j][i]=matrix[i][j]
  
# display the subtraction of the 2nd matrix
print("subtraction of two matrices is:")
for i in range(len(matrix)):
    print(np.subtract(matrix[i], sub_matrix[i], sub_matrix[i])

💬 Enter your query (or type 'exit'): Python Program to Check Whether a Number is Positive or Negative

💡 Generated Code:
 a=int(input("enter a number: "))
b=int(input("enter a negative number: "))
c=0
while(b!=0):
    if(a%b==0):
 

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

🔍 Generating predictions:  71%|███████▏  | 2363/3306 [1:42:44<44:19,  2.82s/it]