In [1]:
import pandas as pd
import glob
import os

# === Step 1: Set the folder containing your CSV files ===
folder_path = "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/"  # <-- change this to your folder path


# === Step 2: Find all CSV files in that folder ===
csv_file_paths = [
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/aggregation_queries_with_df_code.csv",
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/sorting_ranking_queries_with_df_code.csv",
    "/Users/p0s0cad/work/personal-projects/ai-munshi/model/dataset/dataset_templates/text_string_queries_with_df_code.csv"
]

# === Step 3: Read and concatenate them ===
all_dfs = []

for file_path in csv_file_paths:
    df = pd.read_csv(file_path)
    all_dfs.append(df)

final_df = pd.concat(all_dfs, ignore_index=True)

final_df=final_df[["query_type","query","col_1","col_2","df_command"]]

# === Step 4: Save the combined dataset ===
output_path = "./dataset/combined_dataset.csv"
final_df.to_csv(output_path, index=False)

print(f"✅ Combined {len(csv_file_paths)} files into {output_path}")


✅ Combined 3 files into ./dataset/combined_dataset.csv


In [41]:
import random
import pandas as pd
import json

fake_cols_pool = [
    'Request ID', 'Tracking Code', 'Approval Status', 'Internal Notes', 'Timestamp Created',
    'Last Modified By', 'Error Code', 'Sync Status', 'Origin System', 'Archived Flag',
    'Workflow Step', 'Reviewer Comments', 'Flagged Reason', 'Processing Time', 'Manual Override',
    'Document Ref', 'System ID', 'Batch Number', 'Response Time (ms)', 'Audit Trail',
    'Project Tag', 'Release Version', 'Run ID', 'Environment Name', 'Retry Count',
    'Source File Name', 'Alert Triggered', 'Backup ID', 'Session Token', 'Validation Notes'
]


# === Step 1: Load CSV ===
input_csv_path = "./dataset/combined_dataset.csv"  # Replace with your actual file path
df = pd.read_csv(input_csv_path)

# === Step 2: Convert rows to JSONL format ===
output_data = []

for _, row in df.iterrows():
    # Get real and fake columns
    columns = list({col.strip() for col in [row["col_1"], row.get("col_2", None)] if pd.notna(col) and col.strip()})
    extra_cols = random.sample(fake_cols_pool, 5)
    columns_str = ", ".join(sorted(columns + extra_cols))

    # Prompt with system + user tags
    prompt = (
        f"<|system|>\nYou are an expert Python assistant. Generate valid Pandas code based on the user's query.\n"
        f"The DataFrame contains the following columns: {columns_str}\n<|end|>\n"
        f"<|user|>\n{row['query']}\n<|end|>\n"
        f"<|assistant|>\n"
    )

    # Use raw string as completion — no <|end|>, no JSON wrapping
    completion = row["df_command"]

    output_data.append({
        "prompt": prompt,
        "completion": completion
    })

# === Step 3: Write to JSONL file ===
output_jsonl_path = "output_tiny_lama_format.jsonl"
with open(output_jsonl_path, "w") as f:
    for item in output_data:
        f.write(json.dumps(item) + "\n")

print(f"Saved {len(output_data)} items to {output_jsonl_path}")


Saved 1006 items to output_tiny_lama_format.jsonl


In [None]:
%pip install transformers==4.51.0
%pip install accelerate==1.6.0
%pip install datasets==3.5.0

In [43]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
import os

# === Configuration ===
model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_path = "output_tiny_lama_format.jsonl"
output_dir = "./tinyllama-finetuned"

torch.manual_seed(42)

# === Load dataset ===
dataset = load_dataset("json", data_files=dataset_path, split="train")

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # ensure padding is handled

# === Tokenize dataset ===
def tokenize(example):
    full_text = example["prompt"] + example["completion"]
    return tokenizer(full_text, truncation=True, padding="max_length", max_length=1024)

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# === Load model ===
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float32,
    device_map="auto",
    cache_dir="/tmp/tinyllama"
)


In [44]:

# === Training args ===
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=2e-5,
    logging_steps=20,
    save_strategy="no",
    report_to="none",
    fp16=False,
    remove_unused_columns=False
)

# === Data collator ===
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# === Start training ===
trainer.train()

# === Save model ===
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Finetuning complete! Model saved at {output_dir}")


  trainer = Trainer(


Step,Training Loss
10,1.1215


KeyboardInterrupt: 