In [None]:
!pip install -q "torch>=2.1" transformers datasets peft accelerate bitsandbytes trl


In [None]:
pip install transformers


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM so files stored there can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the top level of the mounted Drive (training.jsonl is expected here).
!ls -lh /content/drive/MyDrive

total 50K
drwx------ 2 root root 4.0K Feb  8 04:20 'Colab Notebooks'
-rw------- 1 root root  179 Jul 10  2024 'Nareshkumar - Reactjs.gslides'
drwx------ 2 root root 4.0K May  2  2023 'Python Class'
drwx------ 2 root root 4.0K Apr 28  2023  Resume
-rw------- 1 root root  38K Feb  7 15:14  training.jsonl


In [None]:
cp /content/drive/MyDrive/training.jsonl /content/training.jsonl

In [None]:
# Check that the copy landed in /content and peek at the first two JSONL records.
!ls -lh /content
!head -n 2 /content/training.jsonl


total 52K
drwx------ 5 root root 4.0K Feb  8 13:19 drive
drwxr-xr-x 2 root root 4.0K Feb  8 13:27 out
drwxr-xr-x 1 root root 4.0K Jan 16 14:24 sample_data
-rw------- 1 root root  38K Feb  8 13:31 training.jsonl
{"messages":[{"role":"system","content":"You are a React testing expert. You generate comprehensive Jest + React Testing Library test files for React components. Your tests are production-quality, use best practices, and achieve high code coverage."},{"role":"user","content":"Generate a comprehensive Jest + React Testing Library test file for the following React component.\nThe tests should achieve at least 50% code coverage.\n\n## Component: Button\nWrapper: forwardRef\n\n## Props\n- className: unknown [required]\n- variant: string [optional] (default: 'default')\n- size: string [optional] (default: 'default')\n- isLoading: boolean [optional] (default: false)\n- leftIcon: unknown [required]\n- rightIcon: unknown [required]\n- children: unknown [required]\n- disabled: unknown [r

In [None]:
from datasets import load_dataset

# Read the chat-formatted JSONL file and take its "train" split.
dataset = load_dataset("json", data_files="/content/training.jsonl")["train"]

# Sanity check: number of examples and the start of the second message of example 0.
print("Total examples:", len(dataset))
second_message = dataset[0]["messages"][1]
print(second_message["content"][:300])

Generating train split: 0 examples [00:00, ? examples/s]

Total examples: 5
Generate a comprehensive Jest + React Testing Library test file for the following React component.
The tests should achieve at least 50% code coverage.

## Component: Button
Wrapper: forwardRef

## Props
- className: unknown [required]
- variant: string [optional] (default: 'default')
- size: string


In [None]:
# Show the column names, then the first two messages of example 0.
print(dataset.column_names)
for turn_index in (0, 1):
    print(dataset[0]["messages"][turn_index])


['messages']
{'role': 'system', 'content': 'You are a React testing expert. You generate comprehensive Jest + React Testing Library test files for React components. Your tests are production-quality, use best practices, and achieve high code coverage.'}
{'role': 'user', 'content': 'Generate a comprehensive Jest + React Testing Library test file for the following React component.\nThe tests should achieve at least 50% code coverage.\n\n## Component: Button\nWrapper: forwardRef\n\n## Props\n- className: unknown [required]\n- variant: string [optional] (default: \'default\')\n- size: string [optional] (default: \'default\')\n- isLoading: boolean [optional] (default: false)\n- leftIcon: unknown [required]\n- rightIcon: unknown [required]\n- children: unknown [required]\n- disabled: unknown [required]\n- props: unknown [required]\n\n## Buttons\n- "unlabeled" (type: button)\n\n## Conditional Rendering\n- Condition: isLoading → renders JSX\n\n## Source Code\n```tsx\nimport { forwardRef, type 

In [None]:
def to_text(ex):
    """Flatten one chat example into a single prompt/response training string.

    The contents of all system and user messages (in their original order) form
    the prompt; the contents of all assistant messages form the completion. Each
    group is joined with blank lines and stripped, and the two halves are
    separated by a "### RESPONSE:" marker line.

    Args:
        ex: mapping with a "messages" list of {"role": ..., "content": ...} dicts.

    Returns:
        A dict with a single "text" key holding the combined string.
    """
    prompt_chunks = []
    answer_chunks = []
    for message in ex["messages"]:
        role = message["role"]
        if role == "user" or role == "system":
            prompt_chunks.append(message["content"])
        elif role == "assistant":
            answer_chunks.append(message["content"])

    prompt = "\n\n".join(prompt_chunks).strip()
    completion = "\n\n".join(answer_chunks).strip()

    return {"text": prompt + "\n\n### RESPONSE:\n" + completion}

# Apply to_text to every example, then preview the start of the first result.
dataset_text = dataset.map(to_text)
first_text = dataset_text[0]["text"]
print(first_text[:600])


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

You are a React testing expert. You generate comprehensive Jest + React Testing Library test files for React components. Your tests are production-quality, use best practices, and achieve high code coverage.

Generate a comprehensive Jest + React Testing Library test file for the following React component.
The tests should achieve at least 50% code coverage.

## Component: Button
Wrapper: forwardRef

## Props
- className: unknown [required]
- variant: string [optional] (default: 'default')
- size: string [optional] (default: 'default')
- isLoading: boolean [optional] (default: false)
- leftIco


In [None]:
!pip install -q "torch>=2.1" transformers datasets peft accelerate bitsandbytes trl


In [None]:
# Model id passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained below.
BASE_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Only fall back to EOS for padding when the tokenizer ships without a pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)

# PEFT's preparation step for training on top of a k-bit quantized base model;
# it must run before the adapters are attached. See the PEFT quantization guide
# for exactly what it changes (it enables gradient checkpointing by default).
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention projections only.
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
)

model = get_peft_model(model, lora)
model.print_trainable_parameters()


Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

trainable params: 4,358,144 || all params: 1,548,072,448 || trainable%: 0.2815


In [None]:
def tokenize(ex):
    """Tokenize one example's "text" field for causal-LM fine-tuning.

    The text is truncated to 2048 tokens and left unpadded. An end-of-sequence
    token is appended when the tokenizer defines one, the sequence is shorter
    than the length limit, and it does not already end with EOS. Without it
    the training data never shows the model where a response ends.

    Args:
        ex: mapping with a "text" string.

    Returns:
        The tokenizer output with an added "labels" entry that is a copy of
        "input_ids".
    """
    max_len = 2048
    out = tok(
        ex["text"],
        truncation=True,
        max_length=max_len,
        padding=False,
    )

    ids = out["input_ids"]
    eos_id = tok.eos_token_id
    # A sequence that already fills the limit was (possibly) truncated, so it is
    # left without EOS rather than marking a cut-off text as complete.
    if eos_id is not None and len(ids) < max_len and (not ids or ids[-1] != eos_id):
        ids.append(eos_id)
        out["attention_mask"].append(1)

    out["labels"] = list(ids)
    return out

# Tokenize every example and drop the original columns from the result.
columns_to_drop = dataset_text.column_names
train_ds = dataset_text.map(tokenize, remove_columns=columns_to_drop)
print(train_ds[0].keys())


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


: 

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import TrainingArguments

OUTPUT_DIR = "/content/out"

# SFTConfig (not plain TrainingArguments) so SFT-specific options can be set.
# max_length is pinned to the 2048-token limit used in tokenize(); otherwise the
# trainer's truncation step falls back to TRL's default (1024 in current
# releases — check your installed version) and cuts the end of long examples.
args = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=5,
    logging_steps=5,
    save_steps=50,
    bf16=False,
    fp16=True,
    report_to="none",
    max_length=2048,
)

# Pass the tokenizer configured above so the trainer does not load its own copy.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    args=args,
    processing_class=tok,
)

trainer.train()

Truncating train dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


: 

: 

In [None]:
# Save the PEFT-wrapped model and the tokenizer into the same directory, then list it.
model.save_pretrained("/content/react-testgen-lora")
tok.save_pretrained("/content/react-testgen-lora")
!ls -lh /content/react-testgen-lora


In [None]:
import torch

def generate(prompt: str):
    """Sample a continuation of ``prompt`` from the model.

    Generation runs in eval mode so training-only behavior (for example the
    LoRA dropout configured on the adapters) is disabled; the model's previous
    train/eval mode is restored afterwards.

    Args:
        prompt: text to tokenize and feed to the model.

    Returns:
        The decoded first sequence returned by ``model.generate`` with special
        tokens removed (for a causal LM this includes the prompt text).
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=600,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
    finally:
        # Leave the model in the mode the caller had it in (e.g. mid-training).
        model.train(was_training)
    return tok.decode(out[0], skip_special_tokens=True)

# Rebuild the prompt half of the first training example (everything before the
# response marker) and let the model complete it.
prompt_head = dataset_text[0]["text"].partition("### RESPONSE:")[0]
test_prompt = prompt_head.strip() + "\n\n### RESPONSE:\n"
print(generate(test_prompt))


In [None]:
def to_text(ex):
    """Turn a chat example into one "prompt + ### RESPONSE: + completion" string.

    System and user message contents make up the prompt, assistant message
    contents make up the completion; each side is joined with blank lines and
    stripped before being combined.

    Args:
        ex: mapping with a "messages" list of {"role": ..., "content": ...} dicts.

    Returns:
        A dict with a single "text" key.
    """
    msgs = ex["messages"]

    def _joined(roles):
        # Blank-line-joined, stripped contents of the messages with these roles.
        return "\n\n".join(m["content"] for m in msgs if m["role"] in roles).strip()

    prompt = _joined(("system", "user"))
    completion = _joined(("assistant",))
    return {"text": "{}\n\n### RESPONSE:\n{}".format(prompt, completion)}

# Re-create the flattened text column and show the first 600 characters of row 0.
dataset_text = dataset.map(to_text)
row0_text = dataset_text[0]["text"]
print(row0_text[:600])


In [None]:
from datasets import load_dataset

# Switch to a public instruction dataset: the first 50 training rows of
# tatsu-lab/alpaca (fields: 'instruction', 'input', 'output').
# Note: this rebinds `dataset`, replacing whatever was loaded before.
dataset = load_dataset("tatsu-lab/alpaca", split="train[:50]")

print("Total examples:", len(dataset))
# Preview the first instruction of the newly loaded data.
first_instruction = dataset[0]["instruction"]
print(first_instruction[:300])

In [None]:
def to_text(ex):
    """Format one instruction/input/output record as a single training string.

    The prompt is the instruction followed, when present, by an "Input: ..."
    paragraph; empty fields are skipped. The stripped output follows a
    "### RESPONSE:" marker line.

    Args:
        ex: mapping with "instruction", "input" and "output" entries.

    Returns:
        A dict with a single "text" key.
    """
    instruction = ex["instruction"]
    extra_input = ex["input"]
    tagged_input = f"Input: {extra_input}" if extra_input else None

    prompt = "\n\n".join(
        piece for piece in (instruction, tagged_input) if piece
    ).strip()
    completion = ex["output"].strip()

    return {"text": prompt + "\n\n### RESPONSE:\n" + completion}

# Format every record with to_text and preview the first formatted example.
dataset_text = dataset.map(to_text)
sample_text = dataset_text[0]["text"]
print(sample_text[:600])

In [None]:
!pip install -q "torch>=2.1" transformers datasets peft accelerate bitsandbytes trl


In [None]:
def tokenize(ex):
    """Tokenize one example's "text" field for causal-LM fine-tuning.

    The text is truncated to 2048 tokens and left unpadded. An end-of-sequence
    token is appended when the tokenizer defines one, the sequence is shorter
    than the length limit, and it does not already end with EOS. Without it
    the training data never shows the model where a response ends.

    Args:
        ex: mapping with a "text" string.

    Returns:
        The tokenizer output with an added "labels" entry that is a copy of
        "input_ids".
    """
    max_len = 2048
    out = tok(
        ex["text"],
        truncation=True,
        max_length=max_len,
        padding=False,
    )

    ids = out["input_ids"]
    eos_id = tok.eos_token_id
    # A sequence that already fills the limit was (possibly) truncated, so it is
    # left without EOS rather than marking a cut-off text as complete.
    if eos_id is not None and len(ids) < max_len and (not ids or ids[-1] != eos_id):
        ids.append(eos_id)
        out["attention_mask"].append(1)

    out["labels"] = list(ids)
    return out

# Replace the text columns with tokenized features and show which keys remain.
train_ds = dataset_text.map(
    tokenize,
    remove_columns=dataset_text.column_names,
)
first_row = train_ds[0]
print(first_row.keys())

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import TrainingArguments

# SFTConfig (not plain TrainingArguments) so SFT-specific options can be set.
# max_length is pinned to the 2048-token limit used in tokenize(); otherwise the
# trainer's truncation step falls back to TRL's default (1024 in current
# releases — check your installed version) and cuts the end of long examples.
args = SFTConfig(
    output_dir="/content/out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=5,          # small dataset => more epochs ok
    logging_steps=5,
    save_steps=50,
    bf16=True, # Changed from fp16=True
    report_to="none",
    max_length=2048,
)

# Pass the tokenizer configured earlier so the trainer does not load its own copy.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    args=args,
    processing_class=tok,
)

trainer.train()

In [None]:
model.save_pretrained("/content/react-testgen-lora")
tok.save_pretrained("/content/react-testgen-lora")
!ls -lh /content/react-testgen-lora

In [None]:
import torch

def generate(prompt: str):
    """Sample up to 600 new tokens continuing ``prompt``.

    The model is switched to eval mode for the call so that training-only
    behavior (such as dropout on the LoRA adapters) does not affect the
    sampled text, and is put back into its previous mode afterwards.

    Args:
        prompt: text to tokenize and feed to the model.

    Returns:
        The decoded first sequence returned by ``model.generate`` with special
        tokens removed (for a causal LM this includes the prompt text).
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)

    previous_mode = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=600,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
    finally:
        # Restore train/eval mode so a later training run is unaffected.
        model.train(previous_mode)
    return tok.decode(out[0], skip_special_tokens=True)

# Take the first example up to its response marker, re-append the marker, and
# print what the model generates from there.
RESPONSE_MARKER = "### RESPONSE:"
example_text = dataset_text[0]["text"]
test_prompt = example_text.split(RESPONSE_MARKER, 1)[0].strip() + "\n\n### RESPONSE:\n"
print(generate(test_prompt))

In [None]:
# Model id passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained below.
BASE_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Only fall back to EOS for padding when the tokenizer ships without a pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Reload a fresh quantized base model (this rebinds `model`).
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb,
    device_map="auto",
)

# PEFT's preparation step for training on top of a k-bit quantized base model;
# it must run before the adapters are attached. See the PEFT quantization guide
# for exactly what it changes (it enables gradient checkpointing by default).
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention projections only.
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
)

model = get_peft_model(model, lora)
model.print_trainable_parameters()


In [None]:
def tokenize(ex):
    """Tokenize one example's "text" field for causal-LM fine-tuning.

    The text is truncated to 2048 tokens and left unpadded. An end-of-sequence
    token is appended when the tokenizer defines one, the sequence is shorter
    than the length limit, and it does not already end with EOS. Without it
    the training data never shows the model where a response ends.

    Args:
        ex: mapping with a "text" string.

    Returns:
        The tokenizer output with an added "labels" entry that is a copy of
        "input_ids".
    """
    max_len = 2048
    out = tok(ex["text"], truncation=True, max_length=max_len, padding=False)

    ids = out["input_ids"]
    eos_id = tok.eos_token_id
    # A sequence that already fills the limit was (possibly) truncated, so it is
    # left without EOS rather than marking a cut-off text as complete.
    if eos_id is not None and len(ids) < max_len and (not ids or ids[-1] != eos_id):
        ids.append(eos_id)
        out["attention_mask"].append(1)

    out["labels"] = list(ids)
    return out

# Tokenize every example and drop the original columns from the result.
train_ds = dataset_text.map(tokenize, remove_columns=dataset_text.column_names)

from trl import SFTConfig, SFTTrainer
from transformers import TrainingArguments

# SFTConfig (not plain TrainingArguments) so SFT-specific options can be set.
# max_length is pinned to the 2048-token limit used in tokenize(); otherwise the
# trainer's truncation step falls back to TRL's default (1024 in current
# releases — check your installed version) and cuts the end of long examples.
args = SFTConfig(
    output_dir="/content/out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=5,
    logging_steps=5,
    save_steps=50,
    bf16=False,  # explicit, so fp16 is the only mixed-precision mode in effect
    fp16=True,
    report_to="none",
    max_length=2048,
)

# Current TRL takes the tokenizer as `processing_class`; the old `tokenizer=`
# keyword is no longer accepted.
trainer = SFTTrainer(model=model, train_dataset=train_ds, args=args, processing_class=tok)
trainer.train()
