In [1]:
import pandas as pd

# === Step 1: Set the folder containing your CSV files ===
folder_path = "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/"  # <-- change this to your folder path


# === Step 2: Find all CSV files in that folder ===
csv_file_paths = [
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/aggregation_queries_with_df_code.csv",
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/sorting_ranking_queries_with_df_code.csv",
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/text_string_queries_with_df_code.csv"
]

# === Step 3: Read and concatenate them ===
all_dfs = []

for file_path in csv_file_paths:
    df = pd.read_csv(file_path)
    all_dfs.append(df)

final_df = pd.concat(all_dfs, ignore_index=True)

final_df=final_df[["query_type","query","col_1","col_2","df_command"]]

# === Step 4: Save the combined dataset ===
output_path = "./dataset/combined_dataset.csv"
final_df.to_csv(output_path, index=False)

print(f"✅ Combined {len(csv_file_paths)} files into {output_path}")


✅ Combined 3 files into ./dataset/combined_dataset.csv


In [6]:
import random
import pandas as pd
import json

# === Step 0: Fake columns pool for schema noise ===
fake_cols_pool = [
    'Request ID', 'Tracking Code', 'Approval Status', 'Internal Notes', 'Timestamp Created',
    'Last Modified By', 'Error Code', 'Sync Status', 'Origin System', 'Archived Flag',
    'Workflow Step', 'Reviewer Comments', 'Flagged Reason', 'Processing Time', 'Manual Override',
    'Document Ref', 'System ID', 'Batch Number', 'Response Time (ms)', 'Audit Trail',
    'Project Tag', 'Release Version', 'Run ID', 'Environment Name', 'Retry Count',
    'Source File Name', 'Alert Triggered', 'Backup ID', 'Session Token', 'Validation Notes'
]

# === Step 1: Load your CSV ===
input_csv_path = "./dataset/combined_dataset.csv"
df = pd.read_csv(input_csv_path)

# === Step 2: Convert to CodeT5 format ===
output_data = []

for _, row in df.iterrows():
    # Collect actual columns used
    true_cols = list({col.strip() for col in [row["col_1"], row.get("col_2", None)] if pd.notna(col) and col.strip()})
    extra_cols = random.sample(fake_cols_pool, 5)
    all_cols = sorted(set(true_cols + extra_cols))
    schema = ", ".join(all_cols)

    # Create prompt: embed schema into source
    query = row["query"].strip()
    source = f"The DataFrame contains the following columns: {schema}.\nQuery: {query}"

    # The expected code output
    target = row["df_command"].strip()

    output_data.append({
        "source": source,
        "target": target
    })

# === Step 3: Save JSONL ===
output_jsonl_path = "./codet5_dataset.jsonl"
with open(output_jsonl_path, "w") as f:
    for item in output_data:
        f.write(json.dumps(item) + "\n")

print(f"✅ Saved {len(output_data)} examples to {output_jsonl_path} for CodeT5 training.")


✅ Saved 1006 examples to ./codet5_dataset.jsonl for CodeT5 training.


In [None]:
%pip install transformers==4.51.0
%pip install accelerate==1.6.0
%pip install datasets==3.5.0

In [8]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import torch

# === Configuration ===
model_path = "Salesforce/codet5p-220m"
dataset_path = "codet5_dataset.jsonl"  # must have "source" and "target" fields
output_dir = "./codet5p-full-finetune"
torch.manual_seed(0)

# === Load dataset ===
dataset = load_dataset("json", data_files=dataset_path, split="train")

# === Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(model_path)

# === Tokenization ===
def tokenize(example):
    model_input = tokenizer(
        example["source"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["target"],
            truncation=True,
            padding="max_length",
            max_length=128
        )
    model_input["labels"] = labels["input_ids"]
    return model_input

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# === Load Model ===
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    cache_dir="/tmp/codet5p_cache"
)

Generating train split: 1006 examples [00:00, 43857.79 examples/s]
Map: 100%|██████████| 1006/1006 [00:00<00:00, 5840.49 examples/s]


In [9]:

# === Data Collator ===
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# === Training Arguments (add your own config) ===
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=50,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    save_strategy="no",
    learning_rate=5e-5,  # slightly higher to help faster adaptation
    weight_decay=0.0,  # unnecessary for tiny datasets
    report_to="none",
    push_to_hub=False
)

# === Trainer ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


# === Train ===
trainer.train()

# === Save ===
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Full fine-tuning complete. Model saved.")

  trainer = Seq2SeqTrainer(


Step,Training Loss
50,5.1159
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0


✅ Full fine-tuning complete. Model saved.
