# Inference Code

This notebook implements the inference pipeline.

## Overview
- **Model**: `unsloth/Qwen2.5-VL-7B-Instruct`
- **Configuration**: 4-bit quantization, LoRA fine-tuning.
- **Task**: Binary Classification (Real vs. Fake) on high-quality images.

In [None]:
# Install dependencies
!pip install unsloth torch torchvision pillow gradio

In [None]:
import torch
from unsloth import FastVisionModel
from PIL import Image
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-7B-Instruct",
    load_in_4bit = True,
)

model.load_adapter(
    "/content/drive/MyDrive/Cv/fake_image_detector_99" # this should be the path to lora model trained
)

FastVisionModel.for_inference(model)


==((====))==  Unsloth 2025.12.9: Fast Qwen2_5_Vl patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.90G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionAttention(
            (qkv): lora.Linear(
              (base_layer): Linear(in_features=1280, out_features=3840, bias=True)
              (lora_dropout): ModuleDict(
                (default): Identity()
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1280, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=3840, bias=False

In [9]:
import torch.nn.functional as F
from PIL import Image
import os

def predict_image(image_path, prompt="Real/Fake?"):
    """
    Perform inference on an image and return probabilities.
    """
    if not os.path.exists(image_path):
        print(f"Error: File {image_path} not found.")
        return None

    image = Image.open(image_path).convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens = False,
        return_tensors = "pt",
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=4,
        use_cache=True,
        return_dict_in_generate=True,
        output_scores=True,
        do_sample=False,
    )

    actual_tokenizer = tokenizer.tokenizer

    # Get token IDs for "Fake" and "Real"
    fake_token_id = actual_tokenizer.encode("Fake", add_special_tokens=False)[0]
    real_token_id = actual_tokenizer.encode("Real", add_special_tokens=False)[0]

    generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
    generated_text = actual_tokenizer.decode(generated_ids, skip_special_tokens=True)

    print(f"Generated text: {generated_text}")

    first_token_logits = outputs.scores[0][0]  # [vocab_size]
    probabilities = F.softmax(first_token_logits, dim=-1)

    fake_prob = probabilities[fake_token_id].item()
    real_prob = probabilities[real_token_id].item()

    return {
        "Fake": fake_prob,
        "Real": real_prob
    }

In [None]:
import gradio as gr

def gradio_predict(image_path):
    if not image_path:
        return "Please upload an image."
    return predict_image(image_path)

with gr.Blocks() as demo:
    gr.Markdown("# AI-Generated Image Detection - CMP 722")
    gr.Markdown("Upload an image to check if it is **Real** or **Fake**.")
    # For Method 2, Fake class below 80 should be counted as Real because of the threshold mentioned in report.
    with gr.Row():
        inp = gr.Image(type="filepath", label="Upload Image")
        out = gr.Label(num_top_classes=2, label="Model Prediction")

    btn = gr.Button("Run Inference")
    btn.click(fn=gradio_predict, inputs=inp, outputs=out)

demo.launch(share=True, debug=True)

In [None]:
image_file = "test_image.jpg"

if not os.path.exists(image_file):
    Image.new('RGB', (512, 512), color='blue').save(image_file)

print(f"Processing {image_file}...")
result = predict_image(image_file)
print("-" * 30)
print("Model Prediction:", result)
print("-" * 30)