In [4]:
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode

from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

# Per-channel mean / standard deviation passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The returned transform converts its input to RGB when it is in another mode,
    resizes it to ``input_size`` x ``input_size`` with bicubic interpolation,
    turns it into a tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        if img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) grid whose ratio is closest to ``aspect_ratio``.

    Candidates are scanned in the order given. A strictly closer candidate always
    wins. On an exact tie in distance, the later candidate replaces the current
    choice only when the image area (``width * height``) exceeds half of the
    area that candidate's grid would cover. Returns ``(1, 1)`` when
    ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split ``image`` into square tiles arranged on the best-matching grid.

    Every (columns, rows) grid whose tile count lies in ``[min_num, max_num]`` is
    a candidate; the one chosen by ``find_closest_aspect_ratio`` decides the size
    the image is resized to. The resized image is cut row by row into
    ``image_size`` x ``image_size`` crops. When ``use_thumbnail`` is true and more
    than one tile was produced, a whole-image thumbnail of the same size is
    appended. Returns the list of crops.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate candidate grids (same insertion order as a nested generator).
    grid_options = set()
    for n in range(min_num, max_num + 1):
        for cols in range(1, n + 1):
            for rows in range(1, n + 1):
                if min_num <= cols * rows <= max_num:
                    grid_options.add((cols, rows))
    grid_options = sorted(grid_options, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(
        aspect_ratio, grid_options, orig_width, orig_height, image_size)

    # Pixel size of the resized image and the number of tiles it yields.
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        processed_images.append(
            resized_img.crop((left, top, left + image_size, top + image_size)))
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    """Open ``image_file``, tile it and return the stacked, normalized tiles.

    The image is converted to RGB, split by ``dynamic_preprocess`` (thumbnail
    enabled, at most ``max_num`` grid tiles) and each tile is run through the
    pipeline from ``build_transform``. Returns one tensor stacking all tiles.
    """
    source = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(source, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])

# Checkpoint to load. For multi-GPU loading, see the `Multiple GPUs` section of
# the upstream model documentation.
path = 'OpenGVLab/InternVL2-2B'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    quantization_config=quantization_config,
)
model = model.eval()  # inference only: put the module in evaluation mode

tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)



conversation.py:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internlm2.py:   0%|          | 0.00/61.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- modeling_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- modeling_intern_vit.py
- conversation.py
- modeling_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


FlashAttention2 is not installed.


model.safetensors:   0%|          | 0.00/4.41G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

tokenization_internlm2.py:   0%|          | 0.00/8.79k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternVL2-2B:
- tokenization_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

In [5]:
# Multi-image, multi-round conversation with combined images: each screenshot is
# tiled on its own and the tiles are concatenated along dimension 0.
pixel_values1, pixel_values2 = (
    load_image(image_path, max_num=12).to(torch.bfloat16).cuda()
    for image_path in (
        '/content/Screenshot 2024-09-07 at 1.03.16 PM.png',
        '/content/IMG_B4D053E01E46-1.jpeg',
    )
)
pixel_values = torch.cat([pixel_values1, pixel_values2], dim=0)
generation_config = {'max_new_tokens': 1024, 'do_sample': True}



In [30]:
# Extra free-text context for the prompt; None means no context has been supplied.
# NOTE(review): presumably meant to be replaced by a string before the prompt is built — confirm.
additional_context = None

In [17]:
# Instruction template for the model. It contains an `{additional_context}`
# placeholder in str.format style that is meant to carry the feature description.
prompt = '''
You are an AI assistant specialized in generating test cases for digital product features. Based on the screenshots and context provided, create a detailed test case for the specified feature or button.

**Format for Test Case:**

**Test Case:** [Test case name]
**Description:** [Brief explanation of what the test case is testing]
**Pre-conditions:**
...
**Testing Steps:**
...
**Expected Result:** [Description of what should happen if the feature works correctly]

**Context for the Test Case:** {additional_context}


**Instructions:**
1. Follow the exact format provided above.
2. Provide only the test case in the specified format.
3. Ensure the test case is detailed and relevant to the screenshots and context given.
4. Do not include any additional text or explanations outside of the given format.

Generate the test case now.

'''

In [18]:

# Fill the template's {additional_context} placeholder rather than appending the
# context with `+`: concatenation raises TypeError when additional_context is
# None and leaves the literal "{additional_context}" text in the prompt.
context_text = '' if additional_context is None else str(additional_context)
question = 'Image-1: <image>\nImage-2: <image>\n.' + prompt.format(additional_context=context_text)

# The question carries one <image> placeholder per screenshot, so the chat call
# needs the tile count of each image to expand each placeholder with its own tiles.
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(response)


**Test Case:** Selecting seats

**Description:** This test case focuses on verifying the functionality of the "Select seats" option available under a bus travel booking application.

**Pre-conditions:**
- The test should be run in a virtual or simulated environment that mimics the expected user experience.
- The user should be able to select seats for their desired travel arrangements.

**Testing Steps:**
1. Open the "Select seats" option in the bus booking application.
2. Check if there are any seats available for the user to select.
3. Confirm that each seat's availability is shown as either "available" or "unavailable."
4. Verify that the seat legend correctly presents available and unavailable seats for each row.
5. Ensure all seats available to the user are accessible for selecting, with possible exception of reserved seats.
6. Double-check that the user can proceed to the transaction once a seat is presented.

**Expected Result:** The seat section in the displayed seat legend sho

In [23]:
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
import gradio as gr



# Instruction template containing an `{additional_context}` placeholder.
# NOTE(review): generate_test_case in this cell takes its own `prompt` parameter,
# which shadows this module-level template inside the function.
prompt = '''
You are an AI assistant specialized in generating test cases for digital product features. Based on the screenshots and context provided, create a detailed test case for the specified feature or button.

**Format for Test Case:**

**Test Case:** [Test case name]
**Description:** [Brief explanation of what the test case is testing]
**Pre-conditions:**
...
**Testing Steps:**
...
**Expected Result:** [Description of what should happen if the feature works correctly]

**Context for the Test Case:** {additional_context}


**Instructions:**
1. Follow the exact format provided above.
2. Provide only the test case in the specified format.
3. Ensure the test case is detailed and relevant to the screenshots and context given.
4. Do not include any additional text or explanations outside of the given format.

Generate the test case now.

'''

# Per-channel mean / standard deviation passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Return the tile preprocessing pipeline.

    Steps, in order: convert to RGB if the image is in another mode, bicubic
    resize to ``input_size`` x ``input_size``, conversion to a tensor, and
    normalization with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _as_rgb(picture):
        return picture if picture.mode == 'RGB' else picture.convert('RGB')

    side = (input_size, input_size)
    return T.Compose([
        T.Lambda(_as_rgb),
        T.Resize(side, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Return the candidate grid whose width/height ratio best matches the image.

    ``target_ratios`` is scanned in order. A candidate strictly closer to
    ``aspect_ratio`` replaces the current best. A candidate at exactly the same
    distance replaces it only if ``width * height`` is larger than half the
    area of that candidate's grid. ``(1, 1)`` is returned for an empty list.
    """
    best = (1, 1)
    best_gap = float('inf')
    pixel_count = width * height
    for grid in target_ratios:
        gap = abs(aspect_ratio - grid[0] / grid[1])
        is_closer = gap < best_gap
        is_tie = not is_closer and gap == best_gap
        if is_closer:
            best_gap = gap
            best = grid
        elif is_tie and pixel_count > 0.5 * image_size * image_size * grid[0] * grid[1]:
            best = grid
    return best

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` to a tile grid and return its ``image_size`` square crops.

    Candidate grids are all (columns, rows) pairs with a tile count in
    ``[min_num, max_num]``, ordered by tile count; ``find_closest_aspect_ratio``
    selects one. Crops are produced row by row. With ``use_thumbnail`` set and
    more than one crop, a thumbnail of the whole image is appended.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    candidates = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    candidates = sorted(candidates, key=lambda pair: pair[0] * pair[1])

    grid = find_closest_aspect_ratio(
        aspect_ratio, candidates, orig_width, orig_height, image_size)

    target_width, target_height = image_size * grid[0], image_size * grid[1]
    blocks = grid[0] * grid[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for k in range(blocks):
        per_row = target_width // image_size
        col_index = k % per_row
        row_index = k // per_row
        crop_box = (
            col_index * image_size,
            row_index * image_size,
            (col_index + 1) * image_size,
            (row_index + 1) * image_size,
        )
        processed_images.append(resized_img.crop(crop_box))
    assert len(processed_images) == blocks

    wants_thumbnail = use_thumbnail and len(processed_images) != 1
    if wants_thumbnail:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    """Load an image file and return its preprocessed tiles as one stacked tensor.

    The file is opened and converted to RGB, tiled with ``dynamic_preprocess``
    (thumbnail enabled, ``max_num`` as the tile limit) and every tile is passed
    through the transform built by ``build_transform``.
    """
    rgb_image = Image.open(image_file).convert('RGB')
    preprocess = build_transform(input_size=input_size)
    tile_tensors = []
    for tile in dynamic_preprocess(rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num):
        tile_tensors.append(preprocess(tile))
    return torch.stack(tile_tensors)

# Checkpoint identifier used for both the model and the tokenizer.
path = 'OpenGVLab/InternVL2-2B'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    quantization_config=quantization_config,
)
model = model.eval()  # inference only: put the module in evaluation mode

tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)

def generate_test_case(image1, image2, prompt):
    """Generate a test case from two screenshots and a user-supplied prompt.

    Args:
        image1: File path of the first screenshot, or None when nothing was uploaded.
        image2: File path of the second screenshot, or None when nothing was uploaded.
        prompt: Instruction text sent to the model after the image placeholder.

    Returns:
        The model's response text, or a short message asking for both images
        when either upload is missing.
    """
    # Without this guard a missing upload fails inside Image.open with an
    # unhelpful error instead of telling the user what to do.
    if image1 is None or image2 is None:
        return "Please upload both images before generating a test case."

    pixel_values1 = load_image(image1, max_num=12).to(torch.bfloat16).cuda()
    pixel_values2 = load_image(image2, max_num=12).to(torch.bfloat16).cuda()
    # Tiles of both screenshots are concatenated and referenced by one <image> placeholder.
    pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
    generation_config = dict(max_new_tokens=1024, do_sample=True)

    question = '<image>\n' + prompt
    response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
    return response

# Two file-path image inputs followed by a free-text prompt box.
ui_inputs = [gr.Image(type="filepath", label=label) for label in ("Image 1", "Image 2")]
ui_inputs.append(gr.Textbox(label="Prompt", lines=5))

iface = gr.Interface(
    fn=generate_test_case,
    inputs=ui_inputs,
    outputs=gr.Textbox(label="Generated Test Case", lines=10),
    title="Test Case Generator",
    description="Upload two images and provide a prompt to generate a test case.",
)

# share=True asks Gradio for a public share link in addition to the local server.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9f6b76085e796302ff.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [21]:
# Echo the prompt template so its exact contents (including newlines) can be inspected.
prompt

'\nYou are an AI assistant specialized in generating test cases for digital product features. Based on the screenshots and context provided, create a detailed test case for the specified feature or button.\n\n**Format for Test Case:**\n\n**Test Case:** [Test case name]\n**Description:** [Brief explanation of what the test case is testing]\n**Pre-conditions:**\n...\n**Testing Steps:**\n...\n**Expected Result:** [Description of what should happen if the feature works correctly]\n\n**Context for the Test Case:** {additional_context}\n\n\n**Instructions:**\n1. Follow the exact format provided above.\n2. Provide only the test case in the specified format.\n3. Ensure the test case is detailed and relevant to the screenshots and context given.\n4. Do not include any additional text or explanations outside of the given format.\n\nGenerate the test case now.\n\n'

In [None]:
from roboflow import Roboflow
import supervision as sv
import cv2

def process_image(imagepath):
  """Detect UI elements in a screenshot with Roboflow and plot the annotated image.

  Args:
    imagepath: Path of the image file to analyse.

  Raises:
    RuntimeError: If the ROBOFLOW_API_KEY environment variable is not set.
    FileNotFoundError: If OpenCV cannot read ``imagepath``.
  """
  import os  # local import keeps this cell independent of the notebook's import cell

  # Never embed the API key in the notebook: it is read from the environment.
  # Any key that was committed in an earlier revision should be rotated.
  api_key = os.environ.get("ROBOFLOW_API_KEY")
  if not api_key:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before calling process_image().")

  rf = Roboflow(api_key=api_key)
  project = rf.workspace().project("ui-phone-dataset-oaggq")
  detector = project.version(1).model

  result = detector.predict(imagepath, confidence=40, overlap=30).json()

  labels = [item["class"] for item in result["predictions"]]

  detections = sv.Detections.from_inference(result)
  image = cv2.imread(imagepath)
  if image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read image: {imagepath}")

  annotated_image = sv.BoxAnnotator().annotate(
      scene=image, detections=detections)
  annotated_image = sv.LabelAnnotator().annotate(
      scene=annotated_image, detections=detections, labels=labels)

  sv.plot_image(image=annotated_image)



In [5]:
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
import gradio as gr
from roboflow import Roboflow
import supervision as sv
import cv2

# Per-channel mean / standard deviation passed to T.Normalize in build_transform.
# Defined once here; the second, identical pair of assignments was redundant.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


# Instruction template; generate_test_case fills `{additional_context}` via str.format.
prompt = '''
You are an AI assistant specialized in generating test cases for digital product features. Based on the screenshots and context provided, create a detailed test case for the specified feature or button. Look for the
feature to be tested that is mentioned in the additional text only. do not assume anything.

**Format for Test Case:**

**Test Case:** [Test case name]
**Description:** [Brief explanation of what the test case is testing]
**Pre-conditions:**
...
**Testing Steps:**
...
**Expected Result:** [Description of what should happen if the feature works correctly]

**Context for the Test Case:** {additional_context}


Instructions
1. Follow the exact format provided above.
2. Provide only the test case in the specified format.
3. Ensure the test case is detailed and relevant to the screenshots and context given.
4. Do not include any additional text or explanations outside of the given format.


'''
def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The returned transform converts its input to RGB when it is in another mode,
    resizes it to ``input_size`` x ``input_size`` with bicubic interpolation,
    turns it into a tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        if img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) grid whose ratio is closest to ``aspect_ratio``.

    Candidates are scanned in the order given. A strictly closer candidate always
    wins. On an exact tie in distance, the later candidate replaces the current
    choice only when the image area (``width * height``) exceeds half of the
    area that candidate's grid would cover. Returns ``(1, 1)`` when
    ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split ``image`` into square tiles arranged on the best-matching grid.

    Every (columns, rows) grid whose tile count lies in ``[min_num, max_num]`` is
    a candidate; the one chosen by ``find_closest_aspect_ratio`` decides the size
    the image is resized to. The resized image is cut row by row into
    ``image_size`` x ``image_size`` crops. When ``use_thumbnail`` is true and more
    than one tile was produced, a whole-image thumbnail of the same size is
    appended. Returns the list of crops.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate candidate grids (same insertion order as a nested generator).
    grid_options = set()
    for n in range(min_num, max_num + 1):
        for cols in range(1, n + 1):
            for rows in range(1, n + 1):
                if min_num <= cols * rows <= max_num:
                    grid_options.add((cols, rows))
    grid_options = sorted(grid_options, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(
        aspect_ratio, grid_options, orig_width, orig_height, image_size)

    # Pixel size of the resized image and the number of tiles it yields.
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        processed_images.append(
            resized_img.crop((left, top, left + image_size, top + image_size)))
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


# Checkpoint identifier used for both the model and the tokenizer.
path = 'OpenGVLab/InternVL2-2B'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    quantization_config=quantization_config,
)
model = model.eval()  # inference only: put the module in evaluation mode

tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)

def process_image(image_path):
    """Return the screenshot annotated with detected UI elements as an RGB PIL image.

    Detection is best-effort: if anything in the Roboflow / annotation path
    fails, the error is printed and the unannotated image is returned instead.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A PIL image in RGB mode (annotated when detection succeeded).
    """
    import os  # local import keeps this cell independent of the notebook's import cell

    try:
        # The key comes from the environment; an empty literal made every
        # detection call fail and silently fall through to the except branch.
        rf = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY", ""))
        project = rf.workspace().project("ui-phone-dataset-oaggq")
        # Named `detector` so it is not confused with the module-level `model`.
        detector = project.version(1).model

        result = detector.predict(image_path, confidence=40, overlap=30).json()

        labels = [item["class"] for item in result["predictions"]]

        detections = sv.Detections.from_inference(result)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError(f"could not read image: {image_path}")

        annotated_image = sv.BoxAnnotator().annotate(
            scene=image, detections=detections)
        annotated_image = sv.LabelAnnotator().annotate(
            scene=annotated_image, detections=detections, labels=labels)

        # OpenCV arrays are BGR; convert before handing the data to PIL.
        annotated_image_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
        return Image.fromarray(annotated_image_rgb)
    except Exception as e:
        print(f"Error in process_image: {str(e)}")
        return Image.open(image_path).convert('RGB')

def load_image(image_file, input_size=448, max_num=12):
    """Annotate, tile and normalize an image; return the stacked tiles or None.

    The image comes from ``process_image``, is tiled by ``dynamic_preprocess``
    (thumbnail enabled) and each tile goes through the ``build_transform``
    pipeline. Any exception is printed and turned into a ``None`` result.
    """
    try:
        annotated = process_image(image_file)
        to_tensor = build_transform(input_size=input_size)
        tiles = dynamic_preprocess(annotated, image_size=input_size, use_thumbnail=True, max_num=max_num)
        return torch.stack([to_tensor(tile) for tile in tiles])
    except Exception as err:
        print(f"Error in load_image: {str(err)}")
        return None

def display_processed_images(image1, image2):
    """Run ``process_image`` on both uploads for the preview row.

    Returns a 3-tuple ``(first image, second image, status message)``. Both
    images are ``None`` when an upload is missing or processing raised.
    """
    if any(upload is None for upload in (image1, image2)):
        return None, None, "Please upload both images before processing."

    try:
        previews = [process_image(upload) for upload in (image1, image2)]
    except Exception as err:
        return None, None, f"Error processing images: {str(err)}"
    return previews[0], previews[1], "Images processed successfully."

def _mentions_feature(context, response):
    """Return True when ``response`` plausibly refers to the feature in ``context``."""
    import re

    context_lc, response_lc = context.lower(), response.lower()
    if context_lc in response_lc:
        return True
    # A multi-word context rarely appears verbatim in the answer, so also accept
    # a response that mentions any of its longer words.
    keywords = {word for word in re.findall(r"\w+", context_lc) if len(word) >= 4}
    return any(word in response_lc for word in keywords)


def generate_test_case(image1, image2, context_input):
    """Generate a test case for the feature described in ``context_input``.

    Args:
        image1: File path of the first screenshot.
        image2: File path of the second screenshot.
        context_input: Free-text description of the feature to test; None or
            blank text is treated as "no context".

    Returns:
        The model's response, or a human-readable message when the images could
        not be loaded, generation failed, or the response does not appear to be
        about the requested feature.
    """
    try:
        pixel_values1 = load_image(image1, max_num=12)
        pixel_values2 = load_image(image2, max_num=12)
        if pixel_values1 is None or pixel_values2 is None:
            return "Error loading images. Please try again."

        # Normalize once so a None or whitespace-only context cannot fail the
        # relevance check after the generation has already been paid for.
        context = (context_input or "").strip()

        pixel_values1 = pixel_values1.to(torch.bfloat16).cuda()
        pixel_values2 = pixel_values2.to(torch.bfloat16).cuda()
        pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
        generation_config = dict(max_new_tokens=1024, do_sample=True, temperature=0.7)

        # Emphasize the importance of the context
        emphasized_context = f"FOCUS ON THIS FEATURE: {context}"
        prompt_with_context = prompt.format(additional_context=emphasized_context)

        question = '<image>\n' + prompt_with_context
        response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)

        # Ensure the response is about the specified feature
        if not _mentions_feature(context, response):
            return f"The generated test case does not appear to be about the specified feature: {context}. Please try again with a more specific context."

        return response
    except Exception as e:
        return f"Error generating test case: {str(e)}"

# Gradio UI: components are laid out in the order they are created inside the
# Blocks context, so the statement order below is the page layout.
with gr.Blocks() as iface:
    gr.Markdown("# Test Case Generator with UI Element Detection")

    # Upload row: both inputs hand the callbacks a file path (type="filepath").
    with gr.Row():
        image1_input = gr.Image(type="filepath", label="Screenshot 1")
        image2_input = gr.Image(type="filepath", label="Screenshot 2")

    process_button = gr.Button("Process Images")

    # Preview row filled by display_processed_images.
    with gr.Row():
        processed_image1_output = gr.Image(label="Processed Screenshot 1")
        processed_image2_output = gr.Image(label="Processed Screenshot 2")

    process_status = gr.Textbox(label="Processing Status")

    context_input = gr.Textbox(label="Additional Context", lines=3, placeholder="Enter any additional context or specific feature to test...")
    generate_button = gr.Button("Generate Test Case")
    test_case_output = gr.Textbox(label="Generated Test Case", lines=15)

    # "Process Images": (image1, image2) -> (preview 1, preview 2, status message).
    process_button.click(
        display_processed_images,
        inputs=[image1_input, image2_input],
        outputs=[processed_image1_output, processed_image2_output, process_status]
    )

    # "Generate Test Case": (image1, image2, context) -> generated text.
    generate_button.click(
        generate_test_case,
        inputs=[image1_input, image2_input, context_input],
        outputs=test_case_output
    )

# share=True asks Gradio for a public share link in addition to the local server.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://26421445b1c8c71cae.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


