In [47]:
import pandas as pd
import glob
import os

# === Step 1: Set the folder containing your CSV files ===
folder_path = "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/"  # <-- change this to your folder path


# === Step 2: Find all CSV files in that folder ===
csv_file_paths = [
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/aggregation_queries_with_df_code.csv",
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/sorting_ranking_queries_with_df_code.csv",
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/text_string_queries_with_df_code.csv"
]

# === Step 3: Read and concatenate them ===
all_dfs = []

for file_path in csv_file_paths:
    df = pd.read_csv(file_path)
    all_dfs.append(df)

final_df = pd.concat(all_dfs, ignore_index=True)

final_df=final_df[["query_type","query","col_1","col_2","df_command"]]

# === Step 4: Save the combined dataset ===
output_path = "./dataset/combined_dataset.csv"
final_df.to_csv(output_path, index=False)

print(f"✅ Combined {len(csv_file_paths)} files into {output_path}")


✅ Combined 3 files into ./dataset/combined_dataset.csv


In [48]:
import random
import pandas as pd
import json

fake_cols_pool = [
    'Request ID', 'Tracking Code', 'Approval Status', 'Internal Notes', 'Timestamp Created',
    'Last Modified By', 'Error Code', 'Sync Status', 'Origin System', 'Archived Flag',
    'Workflow Step', 'Reviewer Comments', 'Flagged Reason', 'Processing Time', 'Manual Override',
    'Document Ref', 'System ID', 'Batch Number', 'Response Time (ms)', 'Audit Trail',
    'Project Tag', 'Release Version', 'Run ID', 'Environment Name', 'Retry Count',
    'Source File Name', 'Alert Triggered', 'Backup ID', 'Session Token', 'Validation Notes'
]


# === Step 1: Load CSV ===
input_csv_path = "./dataset/combined_dataset.csv"  # Replace with your actual file path
df = pd.read_csv(input_csv_path)

# === Step 2: Convert rows to JSONL format ===
output_data = []

for _, row in df.iterrows():
    # Collect columns and build prompt
    columns = list({col.strip() for col in [row["col_1"], row.get("col_2", None)] if pd.notna(col) and col.strip()})
    columns_str = ", ".join(sorted(set(columns)))
    fake_cols_no = random.randint(0, 5)
    extra_cols = random.sample(fake_cols_pool, fake_cols_no)
    columns_str = ", ".join(sorted(columns + extra_cols))
    # Create fully delimited Phi-4 style prompt
    print("columns_str:", columns_str)
    prompt = (
        f"<|system|>You are an expert Python assistant. Generate valid Pandas code based on the user's query.\n"
        f"The DataFrame contains the following columns: {columns_str}<|end|>"
        f"<|user|>{row['query']}<|end|>"
        "<|assistant|>"
    )


    # JSON-style assistant completion with <|end|>
    completion = json.dumps({
        "df_code": row["df_command"],
        "query_pred_category": row["query_type"].lower().replace(" ", "_")
    }) + "<|end|>"

    output_data.append({
        "prompt": prompt,
        "completion": completion
    })

# === Step 3: Write to JSONL file ===
output_jsonl_path = "output_phi_4_format.jsonl"
with open(output_jsonl_path, "w") as f:
    for item in output_data:
        f.write(json.dumps(item) + "\n")

print(f"Saved {len(output_data)} items to {output_jsonl_path}")


columns_str: Project Tag, channel, quantity
columns_str: Flagged Reason, Request ID, Run ID, Source File Name, cost, region
columns_str: Batch Number, Document Ref, cost, month
columns_str: Response Time (ms), Reviewer Comments, Validation Notes, month
columns_str: Flagged Reason, Last Modified By, Origin System, Workflow Step, profit, store_id
columns_str: channel, expenses
columns_str: Audit Trail, Sync Status, cost, department
columns_str: category, cost
columns_str: Source File Name, Tracking Code, city, profit
columns_str: Retry Count, Reviewer Comments, System ID, channel, cost
columns_str: Internal Notes, Last Modified By, Release Version, Sync Status, city, tax
columns_str: Origin System, department, sales
columns_str: Alert Triggered, Environment Name, Source File Name, Validation Notes, customer_type, expenses
columns_str: category, units_sold
columns_str: Response Time (ms), store_id
columns_str: Alert Triggered, Reviewer Comments, Validation Notes, cost, product
columns_str

In [None]:
%pip install transformers==4.51.0
%pip install accelerate==1.6.0
%pip install datasets==3.5.0

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

# === Configuration ===
model_path = "microsoft/Phi-4-mini-instruct"
dataset_path = "output_phi_4_format.jsonl"
output_dir = "./phi4-full-finetune"
torch.random.manual_seed(0)


# === Load dataset ===
dataset = load_dataset("json", data_files=dataset_path, split="train")

# === Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # phi-4 uses eos for padding

# === Tokenize dataset ===
def tokenize(example):
    full_text = example["prompt"] + example["completion"]
    return tokenizer(full_text, truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# === Load model (full model, no LoRA) ===
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    cache_dir="/tmp/phi4_cache"
)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 300/300 [00:00<00:00, 3238.20 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]


In [None]:


# === Training Arguments ===
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=10,  # more epochs to help model learn from small data
    logging_steps=5,
    save_strategy="no",
    learning_rate=5e-5,  # slightly higher to help faster adaptation
    weight_decay=0.0,  # unnecessary for tiny datasets
    report_to="none",
    push_to_hub=False
)


# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# === Train ===
trainer.train()

# === Save ===
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Full fine-tuning complete. Model saved.")
