In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk '/kaggle/input' recursively and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
%%capture
%pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
%pip install sentencepiece protobuf datasets huggingface_hub hf_transfer nltk python-Levenshtein pylatexenc matplotlib pillow
%pip install --no-deps unsloth

In [3]:
# Unsloth goes first: its import-time warning asks for `import unsloth` to sit
# at the top of the file so its patches are applied before the other libraries load.
from unsloth import FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator

import os
import subprocess
import tempfile

import Levenshtein
import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from transformers import TextStreamer
from trl import SFTTrainer, SFTConfig

import nltk
nltk.download('punkt')


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastVisionModel, is_bf16_supported


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-28 23:27:25.189401: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753745245.396305      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753745245.458786      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load Model and Tokenizer

In [4]:
# Load the "unsloth/Qwen2-VL-7B-Instruct" checkpoint through Unsloth with
# load_in_4bit=True. The returned `tokenizer` is used by later cells both for
# chat templating and for building the combined image+text model inputs.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-7B-Instruct",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth"
)

# Wrap the loaded model for parameter-efficient fine-tuning. All four
# `finetune_*` switches are enabled, i.e. vision layers, language layers,
# attention modules and MLP modules are all selected for adaptation.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,

    r=16,               # adapter rank
    lora_alpha=16,      # equal to r here
    lora_dropout=0,     # no dropout on the adapter path
    bias="none",
    random_state=3407,  # same value as the trainer's `seed`
    use_rslora=False,
    loftq_config=None
)

==((====))==  Unsloth 2025.7.8: Fast Qwen2_Vl patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

Unsloth: Making `model.base_model.model.model.visual` require gradients


## Load Dataset

In [5]:
# Load the published "train" split of unsloth/Latex_OCR and carve it into
# train / eval / test partitions with a fixed seed so the split is repeatable.
dataset = load_dataset("unsloth/Latex_OCR", split="train")
# Step 1: hold out 20% of the rows.
split_data = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_data["train"]
temp_dataset = split_data["test"]
# Step 2: halve the held-out rows into eval and test (an 80/10/10 split overall).
eval_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = eval_test_split["train"]
test_dataset = eval_test_split["test"]

README.md:   0%|          | 0.00/519 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/344M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/38.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/68686 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7632 [00:00<?, ? examples/s]

In [6]:
# Sanity check: report how many rows ended up in each partition.
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Train samples: 54948
Eval samples: 6869
Test samples: 6869


## Convert samples to conversation format

In [7]:
def convert_to_conversation(sample):
    """Wrap one dataset row in a two-turn chat structure.

    Args:
        sample: Mapping providing an ``"image"`` entry and a ``"text"`` entry.

    Returns:
        A dict ``{"messages": [user_turn, assistant_turn]}``. The user turn
        carries the fixed instruction text followed by the row's image; the
        assistant turn carries the row's text as the expected answer.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Write the LaTeX representation for this image."},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }
    return {"messages": [user_turn, assistant_turn]}

In [8]:
# Materialise the chat-formatted rows as plain Python lists (one dict per row).
converted_train_dataset = list(map(convert_to_conversation, train_dataset))
converted_eval_dataset = list(map(convert_to_conversation, eval_dataset))

## Post-correction for unmatched braces (simple heuristic)

In [9]:
def simple_post_correct(latex_code):
    """Balance the grouping braces of a LaTeX snippet (simple heuristic).

    The string is scanned once, left to right:

    * A backslash together with the character that follows it is copied
      verbatim and never counted. This keeps escaped braces (``\\{``, ``\\}``)
      and line breaks (``\\\\``) from being mistaken for grouping braces.
    * Every unescaped ``{`` opens a group and every unescaped ``}`` closes the
      most recent open group.
    * An unescaped ``}`` with no open group to close is dropped.
    * Groups still open at the end of the string are closed by appending the
      required number of ``}`` characters.

    Args:
        latex_code: The LaTeX string to repair.

    Returns:
        The repaired string, in which unescaped braces are balanced.
    """
    corrected = []
    open_groups = 0
    i = 0
    n = len(latex_code)
    while i < n:
        c = latex_code[i]
        if c == '\\' and i + 1 < n:
            # Control symbol/word start: copy the pair untouched so that
            # "\{" and "\}" are treated as literal characters, not groups.
            corrected.append(latex_code[i:i + 2])
            i += 2
            continue
        if c == '{':
            open_groups += 1
        elif c == '}':
            if open_groups == 0:
                # Stray closer with nothing to match: drop it.
                i += 1
                continue
            open_groups -= 1
        corrected.append(c)
        i += 1
    # Close whatever groups were left open.
    corrected.append('}' * open_groups)
    return ''.join(corrected)

## LaTeX compilation check

In [10]:
def check_latex_compilation(latex_code):
    """Return True if ``latex_code`` compiles as display math with pdflatex.

    The snippet is wrapped in a minimal ``article`` document (amsmath,
    amssymb and amsfonts loaded) between ``$$ ... $$`` and compiled inside a
    private temporary directory. That directory, and every artefact pdflatex
    writes into it, is removed on every exit path, including timeouts.

    Args:
        latex_code: LaTeX math source, without surrounding math delimiters.

    Returns:
        True when pdflatex exits with status 0. False when compilation fails,
        exceeds the 10 second timeout, or cannot be run at all (a warning is
        emitted when the ``pdflatex`` executable is missing, since in that
        case every check fails regardless of the input).
    """
    import warnings

    latex_document = f"""
    \\documentclass{{article}}
    \\usepackage{{amsmath, amssymb, amsfonts}}
    \\begin{{document}}
    $${latex_code}$$
    \\end{{document}}
    """
    try:
        # A dedicated directory keeps .aux/.log/.pdf files next to the .tex
        # file and guarantees they are all cleaned up together.
        with tempfile.TemporaryDirectory() as work_dir:
            tex_file = os.path.join(work_dir, 'formula.tex')
            with open(tex_file, 'w', encoding='utf-8') as f:
                f.write(latex_document)

            result = subprocess.run(
                # -halt-on-error stops at the first error instead of limping
                # on; the exit status is non-zero for a failed run either way.
                ['pdflatex', '-interaction=nonstopmode', '-halt-on-error', tex_file],
                capture_output=True,
                timeout=10,
                cwd=work_dir,
            )
            return result.returncode == 0
    except FileNotFoundError:
        warnings.warn(
            "pdflatex executable not found; LaTeX compilation checks will "
            "always report failure."
        )
        return False
    except Exception:
        # Best-effort check: a timeout or any other failure counts as
        # "does not compile".
        return False

## Eval Metrics

In [11]:
def exact_match_accuracy(predictions, targets):
    """Fraction of predictions identical to their target after stripping.

    Predictions and targets are paired positionally with ``zip`` and compared
    after removing leading/trailing whitespace from both sides.

    Args:
        predictions: Sequence of predicted strings.
        targets: Sequence of reference strings.

    Returns:
        The mean of the per-pair match flags, or ``0.0`` when there are no
        pairs to compare (instead of NaN plus a NumPy RuntimeWarning).
    """
    matches = [pred.strip() == target.strip() for pred, target in zip(predictions, targets)]
    if not matches:
        # Same convention as compilation_success_rate: empty input scores 0.0.
        return 0.0
    return np.mean(matches)

In [12]:
def calculate_bleu_scores(predictions, targets):
    """Compute one sentence-level BLEU score per (prediction, target) pair.

    Both strings are stripped and split on whitespace; the target is the
    single reference and the four n-gram orders are weighted equally (0.25
    each).

    Args:
        predictions: Sequence of predicted strings.
        targets: Sequence of reference strings, paired positionally.

    Returns:
        A list of scores, one per pair. A pair scores ``0.0`` when either side
        has no tokens or when ``sentence_bleu`` raises an error.
    """
    bleu_scores = []
    for pred, target in zip(predictions, targets):
        pred_tokens = pred.strip().split()
        target_tokens = target.strip().split()
        if not target_tokens or not pred_tokens:
            # Nothing to compare against / nothing predicted: score is zero.
            bleu_scores.append(0.0)
            continue
        try:
            score = sentence_bleu([target_tokens], pred_tokens, weights=(0.25, 0.25, 0.25, 0.25))
        except Exception:
            # Scoring errors count as zero. KeyboardInterrupt and SystemExit
            # are deliberately NOT caught, so a long evaluation can still be
            # interrupted by the user.
            score = 0.0
        bleu_scores.append(score)
    return bleu_scores

def calculate_edit_distances(predictions, targets):
    """Return the Levenshtein distance for each (prediction, target) pair.

    Strings are paired positionally and stripped of surrounding whitespace
    before the distance is computed.

    Args:
        predictions: Sequence of predicted strings.
        targets: Sequence of reference strings.

    Returns:
        A list with one distance per pair, in input order.
    """
    return [
        Levenshtein.distance(pred.strip(), target.strip())
        for pred, target in zip(predictions, targets)
    ]

In [13]:
def compilation_success_rate(predictions):
    """Fraction of predictions accepted by ``check_latex_compilation``.

    Args:
        predictions: Sequence of LaTeX strings.

    Returns:
        Number of compiling predictions divided by the total, or ``0.0`` for
        an empty sequence.
    """
    if not len(predictions) > 0:
        return 0.0
    compiled = 0
    for pred in predictions:
        # Each check yields a truthy/falsy result that adds 1 or 0.
        compiled += check_latex_compilation(pred)
    return compiled / len(predictions)

## Iterative refinement prediction

In [14]:
def iterative_refine_predict(model, tokenizer, image, num_iterations=3, max_new_tokens=128):
    """
    Predict LaTeX for an image, then ask the model to refine its own answer.

    Round 1 sends the image with the base instruction. Every later round
    replays that exchange (user instruction + image, assistant answer from the
    previous round) and appends a new *user* turn asking for a corrected
    version, so the generation prompt always follows a user message. Each
    answer is passed through ``simple_post_correct`` before being fed back.

    Decoding is greedy (``do_sample=False``); no ``temperature`` is passed
    because it has no effect without sampling. Because decoding is
    deterministic, the loop stops early once a round reproduces the previous
    answer: further rounds would see the same prompt and return the same text.

    Args:
        model: Generative vision-language model exposing ``eval``,
            ``parameters`` and ``generate``.
        tokenizer: Processor used for ``apply_chat_template``, for building
            the image+text inputs, and for decoding.
        image: The input image.
        num_iterations: Maximum number of generation rounds (>= 1).
        max_new_tokens: Generation budget per round.

    Returns:
        The post-corrected prediction from the last executed round.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    model.eval()
    device = next(model.parameters()).device
    instruction = "Write the LaTeX representation for this image."
    refine_instruction = (
        "Check the LaTeX above against the image and write the corrected "
        "LaTeX representation."
    )

    prev_output = None
    for _ in range(num_iterations):
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": instruction},
                {"type": "image", "image": image}
            ]}
        ]

        if prev_output is not None:
            # Replay the previous answer, then explicitly request a revision.
            # Ending on a user turn keeps the chat well-formed once the
            # generation prompt (a fresh assistant turn) is appended.
            messages.append({
                "role": "assistant", "content": [
                    {"type": "text", "text": prev_output}
                ]
            })
            messages.append({
                "role": "user", "content": [
                    {"type": "text", "text": refine_instruction}
                ]
            })

        # Render the conversation to text, ending with the generation prompt.
        input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

        inputs = tokenizer(
            image, input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=False,  # greedy: deterministic refinement
                use_cache=True,
            )

        # Keep only the newly generated tokens (drop the echoed prompt).
        prompt_len = inputs['input_ids'].shape[1]
        pred = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        pred = simple_post_correct(pred)

        if pred == prev_output:
            # Fixed point reached; another round would be identical.
            break
        prev_output = pred

    return prev_output

## Evaluate model with iterative refinement

In [15]:
def evaluate_model_with_refinement(model, tokenizer, eval_dataset, max_samples=None, num_iterations=3):
    """Run iterative-refinement prediction over a dataset and score the results.

    Args:
        model: The vision-language model to evaluate.
        tokenizer: The matching processor/tokenizer.
        eval_dataset: Dataset whose rows expose ``"image"`` and ``"text"``.
            When ``max_samples`` is given it must support ``.select``.
        max_samples: Evaluate only the first ``max_samples`` rows; ``None``
            evaluates everything.
        num_iterations: Number of refinement rounds per sample, forwarded to
            ``iterative_refine_predict``.

    Returns:
        A tuple ``(metrics, predictions, targets)`` where ``metrics`` is a dict
        holding exact-match accuracy, BLEU mean/median/std, edit-distance
        mean/median/std, the compilation success rate and the sample count.
    """
    # Switch the Unsloth-patched model into its inference configuration before
    # calling generate(); eval() additionally disables dropout etc.
    FastVisionModel.for_inference(model)
    model.eval()
    predictions = []
    targets = []

    if max_samples is None:
        samples_to_eval = eval_dataset
    else:
        samples_to_eval = eval_dataset.select(range(min(max_samples, len(eval_dataset))))
    print(f"Evaluating on {len(samples_to_eval)} samples with iterative refinement...")

    for i, sample in enumerate(samples_to_eval):
        if i % 10 == 0:
            print(f"Processing sample {i+1}/{len(samples_to_eval)}")

        pred = iterative_refine_predict(model, tokenizer, sample["image"], num_iterations=num_iterations)
        predictions.append(pred)
        targets.append(sample["text"])

    exact_match = exact_match_accuracy(predictions, targets)
    bleu_scores = calculate_bleu_scores(predictions, targets)
    edit_distances = calculate_edit_distances(predictions, targets)
    compile_rate = compilation_success_rate(predictions)

    metrics = {
        "exact_match_accuracy": exact_match,
        "average_bleu": np.mean(bleu_scores),
        "median_bleu": np.median(bleu_scores),
        "std_bleu": np.std(bleu_scores),
        "average_edit_distance": np.mean(edit_distances),
        "median_edit_distance": np.median(edit_distances),
        "std_edit_distance": np.std(edit_distances),
        "compilation_success_rate": compile_rate,
        "num_samples": len(predictions)
    }
    return metrics, predictions, targets


## Trainer

In [16]:
# Build the supervised fine-tuning trainer. Batches are assembled by Unsloth's
# vision collator from the chat-formatted rows; evaluation during training uses
# only the first 100 converted eval rows.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    train_dataset=converted_train_dataset,
    eval_dataset=converted_eval_dataset[:100],
    args=SFTConfig(
        # NOTE(review): the model banner reports a Tesla T4 with ~14.7 GB;
        # confirm that 32 image samples per step actually fit in memory.
        per_device_train_batch_size=32,   # very high if GPU allows
        per_device_eval_batch_size=32,    # matches train batch size
        gradient_accumulation_steps=1,    # minimize if batch size is high
        warmup_steps=5,
        num_train_epochs=2,
        learning_rate=2e-4,
        # Exactly one of fp16/bf16 is enabled, chosen by hardware support.
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=500,               # log every 500 steps
        # Evaluate and checkpoint once per epoch; keep a single checkpoint and
        # reload the one with the lowest eval_loss when training finishes.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
        # The next three settings leave the rows untouched by the trainer's own
        # dataset preparation, so the collator receives the raw "messages".
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
        dataloader_num_workers=8,         # maximize for your CPU
    ),
)


Unsloth: Model does not have a default image size - using 512


In [None]:
# Run fine-tuning with the trainer configured above; the status lines bracket
# the (long-running) call so progress is visible in the cell output.
print("Starting training without augmentation...")
trainer.train()
print("Training completed.")

Starting training without augmentation...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 54,948 | Num Epochs = 2 | Total steps = 1,718
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 1 x 1) = 64
 "-____-"     Trainable parameters = 50,855,936 of 8,342,231,552 (0.61% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 54,948 | Num Epochs = 2 | Total steps = 3,436
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 50,855,936 of 8,342,231,552 (0.61% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss


## Evaluation Begins

In [None]:
print("\nEvaluating on validation set with iterative refinement...")
val_metrics, val_preds, val_tgts = evaluate_model_with_refinement(
    model, tokenizer, eval_dataset, max_samples=100
)
# Floats are shown with four decimals; other values (e.g. the sample count)
# are printed unchanged.
for k, v in val_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

In [None]:
print("\nEvaluating on test set with iterative refinement...")
test_metrics, test_preds, test_tgts = evaluate_model_with_refinement(
    model, tokenizer, test_dataset, max_samples=100
)
# Floats are shown with four decimals; other values (e.g. the sample count)
# are printed unchanged.
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

In [None]:
print("\nSample predictions (target vs prediction):")
# Show up to five examples; the bound protects against an IndexError when the
# evaluation produced fewer than five predictions.
for i in range(min(5, len(test_preds))):
    print(f"Sample {i+1}:")
    print(f"Target    : {test_tgts[i]}")
    print(f"Prediction: {test_preds[i]}")
    print(f"Match: {'✓' if test_tgts[i].strip() == test_preds[i].strip() else '✗'}")
    print()

print("Image-to-LaTeX pipeline with iterative refinement completed!")