<a href="https://colab.research.google.com/github/migara793/php_unit_test_model/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  D

In [3]:
!pip install transformers datasets torch trl peft bitsandbytes



In [4]:
from unsloth import FastLanguageModel
import torch

class PHPPHPUnitTestGenerator:
    def __init__(self, model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit",
                 max_seq_length=2048, dtype=None, load_in_4bit=True):
        # Load model with Unsloth's optimized configuration
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit
        )
        self.default_max_length = 700  # Generation length limit
        self.default_temperature = 0.3 # Default creativity parameter

    def generate_test_case(self, php_code, max_length=None, temperature=None):
        """
        Generate PHPUnit test case for given PHP code

        Args:
            php_code (str): PHP class/code to test
            max_length (int): Maximum response length
            temperature (float): Generation creativity (0.0-1.0)

        Returns:
            str: Generated PHPUnit test case
        """
        prompt = self._create_prompt(php_code)
        generated = self._generate_text(prompt, max_length, temperature)
        return self._clean_output(generated)

    def _create_prompt(self, code):
        """Construct instruction prompt with code example"""
        return f"""Generate a PHPUnit test case for the following PHP code.
Provide only the test class code without explanations.
Use proper assertions and test naming conventions.

PHP Code:
{code}

PHPUnit Test Case:"""

    def _generate_text(self, prompt, max_length, temperature):
        """Execute model inference with Unsloth-optimized model"""
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")

        with torch.inference_mode():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length or self.default_max_length,
                temperature=temperature or self.default_temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _clean_output(self, text):
        """Clean and format generated test case"""
        text = text.replace('```php', '').replace('```', '')

        if not text.strip().startswith('<?php'):
            text = '<?php\n' + text

        lines = text.split('\n')
        seen = set()
        clean_lines = []

        for line in lines:
            stripped = line.strip()
            if stripped.startswith(('namespace', 'use')) and stripped in seen:
                continue
            if stripped:
                seen.add(stripped)
            clean_lines.append(line)

        return '\n'.join(clean_lines).strip()

# Example usage remains the same
if __name__ == "__main__":
    generator = PHPPHPUnitTestGenerator()

    php_code = """
    class Calculator {
        public function add(float $a, float $b): float {
            return $a + $b;
        }

        public function divide(float $a, float $b): float {
            if ($b == 0) {
                throw new InvalidArgumentException("Cannot divide by zero");
            }
            return $a / $b;
        }
    }
    """

    test_case = generator.generate_test_case(php_code)
    print("Generated PHPUnit Test Case:")
    print(test_case)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.61G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.78k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Generated PHPUnit Test Case:
<?php
Generate a PHPUnit test case for the following PHP code.
Provide only the test class code without explanations.
Use proper assertions and test naming conventions.

PHP Code:

    class Calculator {
        public function add(float $a, float $b): float {
            return $a + $b;
        }

        public function divide(float $a, float $b): float {
            if ($b == 0) {
                throw new InvalidArgumentException("Cannot divide by zero");
            }
            return $a / $b;
        }
    }
    

PHPUnit Test Case: .../calculator.php

Use the following test cases:
- add(1, 2) should return 3
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) should return 3.0
- add(1, 2) shou

In [4]:
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git


Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-ap7pf1b1
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-ap7pf1b1
  Resolved https://github.com/unslothai/unsloth.git to commit 6c234d5a66adb76b9b93fb0f2445648199d88e66
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.3.19-py3-none-any.whl size=192661 sha256=59e94ff81d3f08578303b040d354837fad04eaf064a2e664ae463b6ed47b0f1c
  Stored in directory: /tmp/pip-ephem-wheel-cache-gydkikl_/wheels/d1/17/05/850ab10c33284a4763b0595cd8ea9d01fce6e221cac24b3c01
Successfully built unsloth
Installing collected packages: unsloth


In [5]:
from unsloth import FastLanguageModel
import torch

# Define configurations for loading the model
max_seq_length = 2048
dtype = None  # Automatically choose the best data type (float16, bfloat16, etc.)
load_in_4bit = True  # Enable 4-bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (controls low-rank approximation quality)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Layers to apply LoRA
    lora_alpha=16, # Scaling factor for LoRA weights
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [7]:
from datasets import load_dataset

# Replace with your dataset path
dataset = load_dataset(
    "migarasathsara/unit_dataset",
    split="train",

)

# Convert to proper format
dataset = dataset.map(lambda x: {
    "text": f"### Instruction: Generate unit tests for this code\n### Code:\n{x['code']}\n### Test:\n{x['test_case']}"
})

README.md:   0%|          | 0.00/468 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/80.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/202 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/87 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [8]:
from transformers import TextStreamer

def formatting_func(example):
    return example["text"]

train_dataset = dataset.shuffle(seed=42).map(
    lambda x: tokenizer(
        formatting_func(x),
        truncation = True,
        max_length = 2048,
    )
)

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


# Define training configurations
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,

    args=TrainingArguments(
        per_device_train_batch_size=2,  # Number of examples per GPU batch
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 batches before updating model
        warmup_steps=5,  # Number of warmup steps for learning rate schedule
        max_steps=60,  # Limit training steps to 60 (for quick testing)
        # num_train_epochs=1
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,  # Log training metrics after every step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",  # Linear decay of learning rate
        seed=3407,
        output_dir="outputs",  # Directory to save model checkpoints
        report_to="none",  # Use this for WandB etc

    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/202 [00:00<?, ? examples/s]

In [10]:
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 202 | Num Epochs = 3 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768/5,000,000,000 (0.37% trained)


Step,Training Loss
1,1.8381
2,1.5774
3,1.5591
4,1.6642
5,1.4418
6,1.3874
7,1.3851
8,1.146
9,1.024
10,0.9971


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=60, training_loss=0.7723834544420243, metrics={'train_runtime': 180.8837, 'train_samples_per_second': 2.654, 'train_steps_per_second': 0.332, 'total_flos': 1965646185283584.0, 'train_loss': 0.7723834544420243})

In [11]:
model.save_pretrained("deepseek-testgen-lora")

In [13]:
def generate_tests(code_snippet):
    prompt = f"### Instruction: Generate unit tests for this code\n### Code:\n{code_snippet}\n### Test:\n"
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with sample code
sample_code = """
<?php

class DatabaseConnector {
private $pdo;

public function __construct($dsn, $username, $password) {
try {
$this->pdo = new PDO($dsn, $username, $password);
$this->pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
} catch (PDOException $e) {
throw new Exception("Database connection failed: " . $e->getMessage());
}
}

public function createUser($username, $email) {
$stmt = $this->pdo->prepare("INSERT INTO users (username, email) VALUES (:username, :email)");
$stmt->bindParam(':username', $username);
$stmt->bindParam(':email', $email);
$stmt->execute();
return $this->pdo->lastInsertId();
}
}
"""

print(generate_tests(sample_code))

### Instruction: Generate unit tests for this code
### Code:

<?php

class DatabaseConnector {
private $pdo;

public function __construct($dsn, $username, $password) {
try {
$this->pdo = new PDO($dsn, $username, $password);
$this->pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
} catch (PDOException $e) {
throw new Exception("Database connection failed: " . $e->getMessage());
}
}

public function createUser($username, $email) {
$stmt = $this->pdo->prepare("INSERT INTO users (username, email) VALUES (:username, :email)");
$stmt->bindParam(':username', $username);
$stmt->bindParam(':email', $email);
$stmt->execute();
return $this->pdo->lastInsertId();
}
}

### Test:
<?php
use PHPUnit\Framework\TestCase;

class DatabaseConnectorTest extends TestCase {
    private $pdo;
    private $connector;

    protected function setUp(): void {
        $pdo = $this->createMock PDO::class();
        $this->pdo = $pdo;

        $connector = new DatabaseConnector($pdo->get connection string

In [15]:
!pip install huggingface_hub



In [16]:
from huggingface_hub import notebook_login
notebook_login()

# Push to Hub
trainer.push_to_hub("deepseek-testgen-lora")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

  0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/73.9M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/migarasathsara/outputs/commit/f6ea3cbca39cbcff826fb348568b1a916af4a024', commit_message='deepseek-testgen-lora', commit_description='', oid='f6ea3cbca39cbcff826fb348568b1a916af4a024', pr_url=None, repo_url=RepoUrl('https://huggingface.co/migarasathsara/outputs', endpoint='https://huggingface.co', repo_type='model', repo_id='migarasathsara/outputs'), pr_revision=None, pr_num=None)