<a href="https://colab.research.google.com/github/migara793/test-case-model/blob/main/unit_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install langchain-anthropic

Collecting langchain-anthropic
  Downloading langchain_anthropic-0.3.10-py3-none-any.whl.metadata (1.9 kB)
Collecting anthropic<1,>=0.49.0 (from langchain-anthropic)
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Downloading langchain_anthropic-0.3.10-py3-none-any.whl (25 kB)
Downloading anthropic-0.49.0-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.4/243.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic, langchain-anthropic
Successfully installed anthropic-0.49.0 langchain-anthropic-0.3.10


In [None]:
import pandas as pd
import time
from tqdm import tqdm
import langchain
import google.generativeai as genai
import re
from typing import List, Tuple

import os
from getpass import getpass

# Never commit credentials to a notebook. Use the key already present in the
# environment (e.g. injected by the runtime's secret store); otherwise ask for
# it interactively so it is neither stored in the file nor echoed to the output.
# NOTE: any key that was previously hardcoded here must be considered leaked
# and should be revoked.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")

In [None]:
import re
from typing import Tuple, List


In [None]:
# Authenticate the Gemini SDK with the key held in the environment, then build
# the model handle that the generation loop calls.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
llm = genai.GenerativeModel(model_name="gemini-2.0-flash")

In [None]:
import re
import json
from typing import Tuple, List

def _strip_markdown_fence(text: str) -> str:
    """Return the payload of ``text`` when it is one Markdown code fence, else ``text`` stripped."""
    stripped = text.strip()
    fenced = re.fullmatch(r'```[\w-]*[ \t]*\r?\n(.*?)\r?\n?[ \t]*```', stripped, re.DOTALL)
    return fenced.group(1) if fenced else stripped


def _unescape_json_fragment(fragment: str) -> str:
    """Decode the inside of a JSON string literal, degrading to basic replacements."""
    try:
        return json.loads(f'"{fragment}"')
    except json.JSONDecodeError:
        # Not a valid JSON string body (e.g. raw control characters): undo only
        # the most common escapes instead of dropping the pair.
        cleaned = fragment.replace('\\n', '\n').replace('\\"', '"').replace('\\t', '\t')
        return cleaned.strip()


def extract_php_test_pairs(string: str) -> Tuple[List[str], List[str]]:
    """
    Extracts pairs of PHP code and corresponding test cases from a JSON-formatted string.

    The text is first parsed as JSON (after removing a surrounding Markdown
    code fence, if the whole text is a single fenced block). A JSON object or a
    list of objects is accepted. If that yields no complete pair, a regex scan
    of the raw text is used instead, so pairs embedded in prose or in a
    wrapper object are still found.

    Only complete pairs are returned: both "php_code" and "test_case" must be
    non-empty strings. The two returned lists therefore always have equal
    length and are index-aligned.

    Parameters:
        - string (str): A string containing JSON-formatted PHP code and test case pairs.

    Returns:
        - php_code (list): A list of extracted PHP code snippets.
        - test_cases (list): A list of extracted test cases corresponding to the PHP code.
    """
    php_code: List[str] = []
    test_cases: List[str] = []

    if not string:
        return php_code, test_cases

    # First try to parse the (de-fenced) text as complete JSON.
    try:
        data = json.loads(_strip_markdown_fence(string))
    except json.JSONDecodeError:
        data = None  # Fall through to the regex approach

    items = data if isinstance(data, list) else [data]
    for item in items:
        if not isinstance(item, dict):
            continue
        php = item.get("php_code")
        test = item.get("test_case")
        # Skip incomplete entries so no half-empty pair is returned.
        if isinstance(php, str) and isinstance(test, str) and php and test:
            php_code.append(php)
            test_cases.append(test)

    if php_code:
        return php_code, test_cases

    # Fallback: scan the raw text for adjacent "php_code"/"test_case" members.
    # The string pattern is escape-aware (a backslash always consumes the next
    # character), so an escaped quote inside the code does not end the match
    # early and a match cannot run on into a neighbouring object.
    json_string = r'"((?:[^"\\]|\\.)*)"'
    pattern = rf'"php_code"\s*:\s*{json_string}\s*,\s*"test_case"\s*:\s*{json_string}'

    for php, test in re.findall(pattern, string, re.DOTALL):
        if php and test:
            php_code.append(_unescape_json_fragment(php))
            test_cases.append(_unescape_json_fragment(test))

    return php_code, test_cases

In [None]:
# Smoke-test the extraction function on a hand-written response: a single JSON
# object whose strings use the JSON escape \n (written as \\n in this Python
# literal so the backslash reaches the parser).
test_input = '''{
    "php_code": "<?php\\nfunction test() { return true; }",
    "test_case": "<?php\\nclass TestTest extends TestCase {}"
}'''

# Expected: two single-element lists, with each \n escape decoded to a newline.
php, tests = extract_php_test_pairs(test_input)
print(php, tests)

['<?php\nfunction test() { return true; }'] ['<?php\nclass TestTest extends TestCase {}']


In [None]:
# Topics the model is asked to cover; each gets its own batch of requests.
focus_list = [
    "basic PHP functions with input/output operations",
    "object-oriented PHP features (classes, inheritance, interfaces)",
    "database operations and API integrations"
]
# Number of requests per focus area (aligned with focus_list); every request
# asks for 5 pairs, so the upper bound is 5 * sum(how_many_iteration) pairs.
how_many_iteration = [20, 20, 20]
describe = "Generate PHP code examples with corresponding PHPUnit test cases"

# Accumulators for the whole run; index i of both lists belongs to the same pair.
All_php_code = []
All_test_cases = []

for focus, iteration in zip(focus_list, how_many_iteration):
    for idx in tqdm(range(iteration)):
        prompt = f"""### Instruction: Based on {focus}, generate 5 pairs of PHP code examples
        with their corresponding PHPUnit test cases. Provide the pairs in JSON format where:
        - "php_code" contains the PHP implementation
        - "test_case" contains the PHPUnit test code
        Make sure both code examples are complete and properly formatted.

        ### Example: {{
            "php_code": "<?php\\nfunction add($a, $b) {{\\n    return $a + $b;\\n}}",
            "test_case": "<?php\\nuse PHPUnit\\\\Framework\\\\TestCase;\\nclass AddTest extends TestCase {{\\n    public function testAdd() {{\\n        $this->assertEquals(4, add(2, 2));\\n    }}\\n}}"
        }}

        ### Focus Area: {focus}
        ### Response:"""

        # A single failed request (rate limit, network error, blocked response
        # whose .text accessor raises) must not abort a multi-minute run and
        # discard its progress: report it and move on to the next request.
        try:
            generated_text = llm.generate_content(prompt)
            response_text = generated_text.text
        except Exception as exc:
            tqdm.write(f"Skipping request {idx} for '{focus}': {exc!r}")
            continue

        php_code, test_cases = extract_php_test_pairs(response_text)

        All_php_code.extend(php_code)
        All_test_cases.extend(test_cases)


100%|██████████| 20/20 [01:47<00:00,  5.36s/it]
100%|██████████| 20/20 [02:52<00:00,  8.61s/it]
100%|██████████| 20/20 [07:04<00:00, 21.24s/it]


In [None]:
# Sanity check: both accumulators must report the same number of entries.
print(len(All_php_code), len(All_test_cases), sep="\n")

282
282


In [None]:
# One row per generated pair: the PHP implementation is the instruction and
# its PHPUnit test is the response.
df = pd.DataFrame(
    data={"Instructions": All_php_code, "Responses": All_test_cases}
)

# Persist the dataset without the positional index column.
df.to_csv(path_or_buf="unit_test_data.csv", index=False)

In [None]:
# Preview only the first few generated tests; printing the whole list floods
# the notebook output and bloats the saved file.
print(*All_test_cases[:3], sep="\n\n" + "-" * 40 + "\n\n")

['<?php\nuse PHPUnit\\Framework\\TestCase;\n\nclass GreetTest extends TestCase {\n    public function testGreet() {\n        $this->assertEquals("Hello, World!", greet("World"));\n        $this->assertEquals("Hello, Alice!", greet("Alice"));\n    }\n}', '<?php\nuse PHPUnit\\Framework\\TestCase;\n\nclass CalculateAreaTest extends TestCase {\n    public function testCalculateArea() {\n        $this->assertEquals(20.0, calculate_area(5.0, 4.0));\n        $this->assertEquals(0.0, calculate_area(0.0, 10.0));\n        $this->assertEquals(12.25, calculate_area(3.5, 3.5));\n    }\n}', '<?php\nuse PHPUnit\\Framework\\TestCase;\n\nclass IsEvenTest extends TestCase {\n    public function testIsEven() {\n        $this->assertTrue(is_even(4));\n        $this->assertFalse(is_even(7));\n        $this->assertTrue(is_even(0));\n    }\n}', "<?php\nuse PHPUnit\\Framework\\TestCase;\n\nclass GetFileExtensionTest extends TestCase {\n    public function testGetFileExtension() {\n        $this->assertEquals(