In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the installer output of this cell.
import os
# Colab is detected by looking for "COLAB_" in the concatenated environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs use --no-deps, so pip does not pull in (or change)
    # the dependencies of those packages; some versions are pinned explicitly.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install transformers==4.51.3
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastVisionModel # Use FastLanguageModel instead for text-only LLMs
# NOTE(review): the original comment also named "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit" —
# presumably the base checkpoint of the model loaded below; confirm.
import torch
# Load the checkpoint "tetttssts/final". The second return value is bound to `tokenizer`
# and is later called with an image and text together.
model, tokenizer = FastVisionModel.from_pretrained(
    "tetttssts/final",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
from datasets import load_dataset
dataset_id = "bicat123/testdata"
# Request both splits in one call; the result is unpacked in the same order as the
# `split` list, i.e. train first, then test.
train_dataset, test_dataset = load_dataset(dataset_id, split=['train', 'test'])

In [None]:
# Display the summary of the train split.
train_dataset

In [None]:
# Display the summary of the test split.
test_dataset

In [None]:
# Prompt text sent together with every image (used by the demo cell and by infer()).
# NOTE(review): the wording contains typos ("josn", "its small"). If the checkpoint was
# fine-tuned with exactly this prompt, changing it may change the model output — confirm
# before correcting the text.
system_message = """give me the bbox2d  of the drone detected in images even infrared images in josn format and even its small 
"""

In [None]:
# Single-image smoke test: stream the model's raw answer for the first training image.
FastVisionModel.for_inference(model) # Enable for inference!
image = train_dataset[0]["image"]
# One user turn made of an image placeholder followed by the prompt text.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": system_message }
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# The image and the templated prompt are encoded together and moved to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream generated tokens to the cell output as they are produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# NOTE(review): this cell uses min_p = 0.1 while infer() uses min_p = 0.9 — confirm which
# value is intended.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature =0.2, min_p = 0.1)

Regex for model final1

In [None]:
import re

def extract_first_bbox(text):
    """Return the first ``bbox2d [x1, y1, x2, y2]`` box found in ``text``.

    The four coordinates must be unsigned integers. Returns a list of four
    ints, or None when the pattern does not occur in ``text``.
    """
    found = re.search(r'bbox2d\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]', text)
    if found is None:
        return None
    # The pattern has exactly four capture groups, one per coordinate.
    return [int(coord) for coord in found.groups()]


In [None]:
import re

def extract_first_bbox(text):
    """Return the first ``"bbox_2d": [...]`` coordinate list found in ``text``.

    Args:
        text: Text to search (for example decoded model output).

    Returns:
        A list of ints, one per comma-separated entry inside the brackets, or
        None when no ``"bbox_2d"`` list is present or an entry is not a finite
        number. Entries written as floats are rounded to the nearest integer.
    """
    match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', text)
    if match is None:
        return None

    coords = []
    for part in match.group(1).split(','):
        token = part.strip()
        try:
            coords.append(int(token))
        except ValueError:
            # Not a plain integer: accept a float such as "19.6"; anything else
            # (text, empty entry, nan/inf) means the box cannot be used.
            try:
                coords.append(round(float(token)))
            except (ValueError, OverflowError):
                return None
    return coords


In [None]:
def infer(image):
    """Run the vision model on one image and return the first predicted box.

    The image is paired with the module-level ``system_message`` prompt,
    encoded with ``tokenizer``, passed to ``model.generate`` on CUDA, and the
    decoded output is handed to ``extract_first_bbox``.

    Args:
        image: Image passed as the first positional argument to ``tokenizer``
            (in this notebook, the "image" field of a dataset row).

    Returns:
        Whatever ``extract_first_bbox`` returns for the decoded output (a list
        of ints, or None when no box is found). None is also returned when any
        step raises; the error is reported as a RuntimeWarning rather than
        returned as a string, so callers never receive text where they expect
        coordinates.
    """
    import warnings

    try:

        messages = [{
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": system_message }
            ]
        }]

        input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

        inputs = tokenizer(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to("cuda")

        output = model.generate(
            **inputs,
            max_new_tokens=128,
            use_cache=True,
            temperature=0.2,
            min_p=0.9
        )

        # output[0] is decoded in full, so the text handed to the regex may
        # include the prompt as well as the answer.
        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
        return extract_first_bbox(decoded_output)

    except Exception as e:
        # Best effort: one failing image must not abort a batch run, but the
        # failure is surfaced instead of being returned as a fake result.
        warnings.warn(f"Error during inference: {str(e)}", RuntimeWarning)
        return None


In [None]:
# Placeholder box; overwritten by the inference call in the next cell.
bbox2d=[0, 0, 0, 0]

In [None]:
# Run the model on the first training sample.
image = train_dataset[0]["image"]
bbox2d=infer(image)

In [None]:
# Display the result of the call above.
bbox2d

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Draw one hard-coded box on training sample 928, show it, and save it to output.jpg.
# Convert from PIL Image to NumPy array
image = train_dataset[928]["image"]
image = np.array(image)

# Convert RGB to BGR (OpenCV uses BGR)
# assumes the image has three RGB channels — TODO confirm for infrared samples
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

# Define the bounding box and label
# NOTE(review): the coordinates are hard-coded, not taken from infer(); presumably a box
# for sample 928 copied in by hand — confirm.
annotations = [
    {"bbox_2d":[19, 287, 126, 308], "label": "drone"}
]

# Draw rectangle and label
for annotation in annotations:
    x1, y1, x2, y2 = annotation["bbox_2d"]
    label = annotation["label"]

    cv2.rectangle(image, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)
    # The label is placed 10 px above the box's top-left corner.
    cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX,
                0.5, (0, 255, 0), 2)

# Convert BGR to RGB for display
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Show image using matplotlib
plt.imshow(image_rgb)
plt.axis('off')
plt.show()

# Save image
# The BGR array is written, which is the channel order cv2.imwrite expects.
cv2.imwrite('output.jpg', image)

In [None]:
import pandas as pd
from tqdm import tqdm

# Assuming system_message is defined somewhere globally or you can define here

# Smoke test: run inference on the first two training samples and write them to CSV.
# NOTE(review): `id` shadows the builtin of the same name; the name is kept here in case
# other cells rely on it.
results = []
for i, item in enumerate(tqdm(iter(train_dataset))):
    if i == 2:
        break
    image = item['image']
    id=item['IDDetect']
    bbox2d = infer(image)

    # infer() may yield None (no box / failure) or something that is not a coordinate
    # list; only a list/tuple with at least four entries is treated as a detection.
    if isinstance(bbox2d, (list, tuple)) and len(bbox2d) >= 4:
        x1, y1, x2, y2 = bbox2d[:4]
    else:
        x1 = y1 = x2 = y2 = None
    results.append({'IDDetect': id, 'X1': x1,'Y1': y1,'X2': x2,'Y2': y2})

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv('output_results.csv', index=False)

print("Inference complete, saved to output_results.csv")

In [None]:
import pandas as pd
from tqdm import tqdm

# Run inference on up to the first 1000 training samples and write one CSV row per sample.
# NOTE(review): `id` shadows the builtin of the same name; the name is kept here in case
# other cells rely on it.
results = []
for i, item in enumerate(tqdm(iter(train_dataset))):
    if i == 1000:
        break
    image = item['image']
    id = item['IDDetect']
    bbox2d = infer(image)

    # Handle case when no bounding box is detected.
    # Only a list/tuple counts as a detection: a plain truthiness/len check would also
    # accept a string (e.g. an error message) and write its characters as coordinates.
    if isinstance(bbox2d, (list, tuple)) and len(bbox2d) >= 4:
        result = {
            'IDDetect': id,
            'X1': bbox2d[0],
            'Y1': bbox2d[1],
            'X2': bbox2d[2],
            'Y2': bbox2d[3]
        }
    else:
        # Fill with None if no bounding box is found
        result = {
            'IDDetect': id,
            'X1': None,
            'Y1': None,
            'X2': None,
            'Y2': None
        }

    results.append(result)

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv('output_results.csv', index=False)

print("Inference complete, saved to output_results.csv")


In [None]:
def parse_bbox(bbox_str):
    """Parse bbox string like '[x1,y1,x2,y2]' into tuple of ints."""
    bbox_str = bbox_str.strip().strip('[]')
    return tuple(map(int, bbox_str.split(',')))


def _bbox_from_row(row):
    """Return (x1, y1, x2, y2) as ints for one results row, or None if unusable.

    Two layouts are understood: separate 'X1', 'Y1', 'X2', 'Y2' columns (what the
    inference cells of this notebook write) and a single 'bbox2d' string column.
    Missing values, non-numeric values, and boxes without exactly four
    coordinates give None.
    """
    corner_cols = ('X1', 'Y1', 'X2', 'Y2')
    try:
        if all(col in row.index for col in corner_cols):
            values = [row[col] for col in corner_cols]
            if any(pd.isna(value) for value in values):
                return None
            return tuple(int(value) for value in values)
        if 'bbox2d' in row.index and isinstance(row['bbox2d'], str):
            box = parse_bbox(row['bbox2d'])
            return box if len(box) == 4 else None
    except (TypeError, ValueError):
        return None
    return None


df = pd.read_csv('output_results.csv')

# The CSV written above names its id column 'IDDetect'; 'id' is kept as a fallback
# for result files that use the other layout.
id_column = 'IDDetect' if 'IDDetect' in df.columns else 'id'
has_track = 'idtrack' in df.columns

# Map each IDDetect value to the first dataset index carrying it, built once from the
# id column so the images are not decoded and the dataset is not rescanned per row.
first_index_by_id = {}
for position, known_id in enumerate(train_dataset['IDDetect']):
    first_index_by_id.setdefault(known_id, position)

for idx, row in df.iterrows():
    # Read the id through the frame so it keeps the column's own dtype.
    id_detect_val = df.at[idx, id_column]

    img_index = first_index_by_id.get(id_detect_val)
    if img_index is None:
        print(f"No image found for IDDetect {id_detect_val}")
        continue

    pil_image = train_dataset[img_index]['image']
    image = np.array(pil_image)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    bbox = _bbox_from_row(row)
    if bbox is not None:
        x1, y1, x2, y2 = bbox
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(image, 'drone', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 255, 0), 2)

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    title = f"IDDetect: {id_detect_val}"
    if has_track:
        title += f", IDTrack: {row['idtrack']}"
    if bbox is None:
        title += " (no detection)"

    plt.imshow(image_rgb)
    plt.axis('off')
    plt.title(title)
    plt.show()

In [None]:
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt

def parse_bbox(bbox_str):
    """Parse bbox string like '[x1,y1,x2,y2]' into tuple of ints."""
    bbox_str = bbox_str.strip().strip('[]')
    return tuple(map(int, bbox_str.split(',')))


def _bbox_from_row(row):
    """Return (x1, y1, x2, y2) as ints for one results row, or None if unusable.

    Two layouts are understood: separate 'X1', 'Y1', 'X2', 'Y2' columns (what the
    inference cells of this notebook write) and a single 'bbox2d' string column.
    Missing values, non-numeric values, and boxes without exactly four
    coordinates give None.
    """
    corner_cols = ('X1', 'Y1', 'X2', 'Y2')
    try:
        if all(col in row.index for col in corner_cols):
            values = [row[col] for col in corner_cols]
            if any(pd.isna(value) for value in values):
                return None
            return tuple(int(value) for value in values)
        if 'bbox2d' in row.index and isinstance(row['bbox2d'], str):
            box = parse_bbox(row['bbox2d'])
            return box if len(box) == 4 else None
    except (TypeError, ValueError):
        return None
    return None


df = pd.read_csv('output_results.csv')

# The CSV written above names its id column 'IDDetect'; 'id' is kept as a fallback
# for result files that use the other layout.
id_column = 'IDDetect' if 'IDDetect' in df.columns else 'id'
has_track = 'idtrack' in df.columns

# Upper bound on the number of grid panels; the figure height grows with the row count,
# and very tall figures may be too large for Matplotlib to render. Set to None to plot
# every row.
MAX_GRID_IMAGES = 60
grid_df = df if MAX_GRID_IMAGES is None else df.head(MAX_GRID_IMAGES)

num_images = len(grid_df)
cols = 3  # number of columns in grid
rows = max(1, (num_images + cols - 1) // cols)  # rows needed; at least one so an empty CSV still plots

# squeeze=False always yields a 2D array of axes, so flatten() is safe for any grid shape.
fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4), squeeze=False)
axes = axes.flatten()

for ax in axes[num_images:]:
    ax.axis('off')  # hide unused axes if any

# Map each IDDetect value to the first dataset index carrying it, built once from the
# id column so the images are not decoded and the dataset is not rescanned per row.
first_index_by_id = {}
for position, known_id in enumerate(train_dataset['IDDetect']):
    first_index_by_id.setdefault(known_id, position)

# Pair rows with axes by position rather than by the frame's index labels.
for ax, (idx, row) in zip(axes, grid_df.iterrows()):
    # Read the id through the frame so it keeps the column's own dtype.
    id_detect_val = grid_df.at[idx, id_column]

    img_index = first_index_by_id.get(id_detect_val)
    if img_index is None:
        print(f"No image found for IDDetect {id_detect_val}")
        ax.axis('off')
        continue

    pil_image = train_dataset[img_index]['image']
    image = np.array(pil_image)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    bbox = _bbox_from_row(row)
    if bbox is not None:
        x1, y1, x2, y2 = bbox
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(image, 'drone', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 255, 0), 2)

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    title = f"IDDetect: {id_detect_val}"
    if has_track:
        title += f"\nIDTrack: {row['idtrack']}"
    if bbox is None:
        title += "\n(no detection)"

    ax.imshow(image_rgb)
    ax.axis('off')
    ax.set_title(title)

plt.tight_layout()
plt.show()
