In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

PyTorch version: 2.5.0+cu124
CUDA available: True
CUDA version: 12.4
GPU: NVIDIA RTX A6000
GPU Memory: 50.91 GB


In [4]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [5]:
!pip uninstall -y unsloth unsloth-zoo torchao
!pip install --upgrade --no-cache-dir "unsloth[cu121-ampere-torch250] @ git+https://github.com/unslothai/unsloth.git"


Found existing installation: unsloth 2025.10.1
Uninstalling unsloth-2025.10.1:
  Successfully uninstalled unsloth-2025.10.1
Found existing installation: unsloth_zoo 2025.10.1
Uninstalling unsloth_zoo-2025.10.1:
  Successfully uninstalled unsloth_zoo-2025.10.1
Found existing installation: torchao 0.12.0
Uninstalling torchao-0.12.0:
  Successfully uninstalled torchao-0.12.0
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[cu121-ampere-torch250]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-tocbcqd_/unsloth_8c1595742a0a413f99c7fa2eef5799e3
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-tocbcqd_/unsloth_8c1595742a0a413f99c7fa2eef5799e3
  Resolved https://github.com/unslothai/unsloth.git to commit aa5832de9282987ae6221dfac1877d23d64cad9a
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build whee

In [6]:
!pip uninstall -y torchao
!pip install torchao==0.12.0


Found existing installation: torchao 0.13.0
Uninstalling torchao-0.13.0:
  Successfully uninstalled torchao-0.13.0
Collecting torchao==0.12.0
  Using cached torchao-0.12.0-cp39-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (19 kB)
Using cached torchao-0.12.0-cp39-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
Installing collected packages: torchao
Successfully installed torchao-0.12.0


In [7]:
from unsloth import FastLanguageModel
import torch

print(f"‚úì Unsloth imported successfully!")
print(f"PyTorch: {torch.__version__}")


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úì Unsloth imported successfully!
PyTorch: 2.5.0+cu124


In [8]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

qwen_models = [
    "unsloth/Qwen2.5-Coder-32B-Instruct",      # Qwen 2.5 Coder 2x faster
    "unsloth/Qwen2.5-Coder-7B",
    "unsloth/Qwen2.5-14B-Instruct",            # 14B fits in a 16GB card
    "unsloth/Qwen2.5-7B",
    "unsloth/Qwen2.5-72B-Instruct",            # 72B fits in a 48GB card
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Coder-7B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.10.1: Fast Qwen2 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA RTX A6000. Num GPUs = 1. Max memory: 47.413 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.10.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [10]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
)

def alpaca_to_conversations(example):
    if example["input"].strip():
        user_prompt = f"{example['instruction']}\n\nInput:\n{example['input']}"
    else:
        user_prompt = example["instruction"]

    return {
        "conversations": [
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": example["output"]}
        ]
    }


from datasets import load_dataset
dataset = load_dataset("nareshmlx/16k_opencvpr", split = "train")

In [11]:
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(alpaca_to_conversations)

In [12]:
def alpaca_to_conversations(batch):
    conversations = []
    for instr, inp, out in zip(batch["instruction"], batch["input"], batch["output"]):
        if inp.strip():
            user_msg = f"{instr}\n\nHere is the code:\n{inp}"
        else:
            user_msg = instr

        conversations.append([
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": out}
        ])
    return {"conversations": conversations}

dataset = dataset.map(
    alpaca_to_conversations,
    batched=True,
    remove_columns=["instruction", "input", "output"]
)

def formatting_prompts_func(examples):
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False
        )
        for convo in examples["conversations"]
    ]
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

In [13]:
!pip show transformers trl unsloth
!pip list | grep -E "transformers|trl|unsloth|peft"


Name: transformers
Version: 4.55.4
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /root/opencv_code_review/unsloth_env/lib/python3.12/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, trl, unsloth_zoo
---
Name: trl
Version: 0.22.2
Summary: Train transformer language models with reinforcement learning.
Home-page: https://github.com/huggingface/trl
Author: Leandro von Werra
Author-email: leandro.vonwerra@gmail.com
License: 
Location: /root/opencv_code_review/unsloth_env/lib/python3.12/site-packages
Requires: accelerate, datasets, transformers
Required-by: unsloth_zoo
---
Name: unsloth
Ve

In [14]:
dataset[5]["conversations"]

  'role': 'user'},
 {'content': 'i think you can use `validateInputImageSize`\r\nlook\r\nhttps://github.com/opencv/opencv/blob/4c024c35fbc7f0610501e087a9ef20c336a75e2b/modules/imgcodecs/src/loadsave.cpp#L72-L81',
  'role': 'assistant'}]

In [15]:
dataset[5]["text"]



In [16]:
# from trl import SFTConfig, SFTTrainer
# from transformers import DataCollatorForSeq2Seq
# trainer = SFTTrainer(
#     model = model,
#     tokenizer = tokenizer,
#     train_dataset = dataset,
#     dataset_text_field = "text",
#     max_seq_length = max_seq_length,
#     data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
#     packing = True, # Can make training 5x faster for short sequences.
#     args = SFTConfig(
#         per_device_train_batch_size = 1,
#         gradient_accumulation_steps = 4, # Fixed major bug in latest Unsloth
#         warmup_steps = 5,
#         # num_train_epochs = 1, # Set this for 1 full training run.
#         max_steps = 30,
#         learning_rate = 2e-4,
#         logging_steps = 1,
#         optim = "paged_adamw_8bit", # Save more memory
#         weight_decay = 0.01,
#         lr_scheduler_type = "linear",
#         seed = 3407,
#         output_dir = "outputs",
#         report_to = "none", # Use this for WandB etc
#     ),
# )

In [17]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = True,
    args = SFTConfig(
        per_device_train_batch_size = 1,      # Keep at 1 for 16k context
        gradient_accumulation_steps = 16,      # Increase this instead
        warmup_steps = 50,
        num_train_epochs = 1,                  # Full training
        learning_rate = 2e-4,
        logging_steps = 5,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        save_strategy = "steps",
        save_steps = 250,
        save_total_limit = 2,
        fp16 = False,
        bf16 = True,
        gradient_checkpointing = True,
        max_grad_norm = 1.0,
        report_to = "none",
    ),
)

In [18]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])



In [19]:
# Option 1: View the tokenized text directly
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
tokenizer.decode(trainer.train_dataset[5]["input_ids"])




In [20]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA RTX A6000. Max memory = 47.413 GB.
6.73 GB of memory reserved.


In [24]:
# In notebook
import trl
print(trl.__version__)
help(trl.SFTTrainer.__init__)


0.22.2
Help on function __init__ in module UnslothSFTTrainer:

__init__(self, model, args=None, data_collator=None, train_dataset=None, eval_dataset=None, processing_class=None, compute_loss_func=None, compute_metrics=None, callbacks=None, optimizer_cls_and_kwargs=None, preprocess_logits_for_metrics=None, peft_config=None, formatting_func=None, **kwargs)
    Initialize self.  See help(type(self)) for accurate signature.



In [21]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,282 | Num Epochs = 1 | Total steps = 643
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 40,370,176 of 7,655,986,688 (0.53% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss


KeyboardInterrupt: 

In [None]:

# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


519.3581 seconds used for training.
8.66 minutes used for training.
Peak reserved memory = 39.072 GB.
Peak reserved memory for training = 32.342 GB.
Peak reserved memory % of max memory = 82.408 %.
Peak reserved memory for training % of max memory = 68.213 %.


In [None]:
from unsloth import FastLanguageModel
from transformers import TextStreamer

# Enable fast inference
FastLanguageModel.for_inference(model)

# Convert chat message to direct prompt format
prompt = """You are an expert OpenCV code reviewer. Review this change:

File: modules/imgproc/src/resize.cpp
@@ -100,7 +100,7 @@
 cv::Mat src, dst;
-cv::resize(src, dst, cv::Size(100,100));
+cv::resize(src, dst, cv::Size(100,100), CV_INTER_LINEAR);

Review:"""

# Tokenize directly without chat template
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    add_special_tokens=True
).to("cuda")

# Set up text streamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate with optimized parameters for code review
outputs = model.generate(
    input_ids=inputs["input_ids"],
    streamer=text_streamer,
    max_new_tokens=256,        # Increased for detailed review
    use_cache=True,
    temperature=0.3,           # Much lower for code accuracy
    top_p=0.9,                # Better than min_p for code
    do_sample=True,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id,
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 
- The patch is missing a test case.
- The patch does not follow the naming convention for the test case (test_resize_interlinear)
- The patch does not provide coverage for all possible interpolation methods

Please fix these issues before merging.

This is a critical issue that needs to be addressed before proceeding with the merge.

Do not proceed with the merge until these issues have been resolved.

The patch should include:
- A test case named test_resize_interlinear
- Coverage for all possible interpolation methods (INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4)

Once these issues have been addressed, you can proceed with the merge. 

Remember to follow the guidelines for OpenCV code review and ensure that the changes meet the project's standards.<|im_end|>


In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content":
     "You are an expert OpenCV code reviewer. Review this change:\n\n"
     "File: modules/imgproc/src/resize.cpp\n"
     "@@ -100,7 +100,7 @@\n"
     " cv::Mat src, dst;\n"
     "-cv::resize(src, dst, cv::Size(100,100));\n"
     "+cv::resize(src, dst, cv::Size(100,100), CV_INTER_LINEAR);\n"}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

 It looks like there was a mistake in the `resize` function call in the given code snippet. The third argument should specify the interpolation method if you want to explicitly set it. Here is the corrected version with the explicit interpolation method specified:

```cpp
cv::Mat src, dst;
-cv::resize(src, dst, cv::Size(100, 100));
+cv::resize(src, dst, cv::Size(100, 100), 0.0, 0.0, cv::INTER_LINEAR);
```

Alternatively, if the intention is to maintain backward compatibility or


In [None]:
import os 
from dotenv import load_dotenv

load_dotenv(override=True)
HF_TOKEN = os.environ.get("HF_TOKEN")



In [None]:
# model.save_pretrained("lora_model")  # Local saving
# tokenizer.save_pretrained("lora_model")
model.push_to_hub("nareshmlx/code-reviewer-opencv", token = HF_TOKEN) # Online saving
tokenizer.push_to_hub("nareshmlx/code-reviewer-opencv", token = HF_TOKEN) # Online saving

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content":
     "You are an expert OpenCV code reviewer. Review this change:\n\n"
     "File: modules/core/src/arithm.cpp\n"
     "@@ -120,7 +120,11 @@\n"
     " for (int i = 0; i < img.rows; i++) {\n"
     "     for (int j = 0; j < img.cols; j++) {\n"
     "         img.at<uchar>(i,j) = img.at<uchar>(i,j) * 2;\n"
     "     }\n"
     " }\n"}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

### Summary
The provided patch increases the value of each pixel in a grayscale image (represented as a `Mat` object) to its double. This is achieved using nested loops iterating over each pixel position `(i, j)` and modifying the pixel value directly via `img.at<uchar>(i, j) = img.at<uchar>(i, j) * 2`.

Here is the complete context around line 120 of `modules/core/src/arithm.cpp`, formatted with diff markers to show only the affected lines:

```diff
     int kx, ky;
 } ArithmContext;

@@ -93
