In [1]:
import json
from pathlib import Path
from document_extractor.dataset import format_data
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from document_extractor.utils import (
    preprocess_logits_for_metrics,
    get_collate_fn,
)
from document_extractor.eval import get_compute_metrics
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Detect whether the notebook is running on Google Colab by probing for the
# `google.colab` package, which is only importable there.
try:
    import google.colab  # noqa: F401  (imported only to test availability)

    IN_COLAB = True
except ImportError:
    # Catch only the "package is missing" case. A bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated failures.
    IN_COLAB = False

In [3]:
# Colab only: attach Google Drive so its files are reachable from the runtime.
if IN_COLAB:
    from google.colab import drive

    drive.mount(mountpoint="/content/drive/document_extractor")

In [4]:
# --- Run configuration -------------------------------------------------------
# Paths: where the preprocessed JSON splits are read from and where training
# artifacts are written.
PREPROCESSED_DATASET_FOLDER = "../datasets/preprocessed"
OUTPUT_DIR = "./finetune_qwen2_vl_output/"

# Training mode switches.
USE_LORA = True
QUANTIZE_4_BITS = True
QUANTIZE_8_BITS = False

# The two quantization modes are mutually exclusive: at most one may be enabled.
assert (
    not QUANTIZE_4_BITS or not QUANTIZE_8_BITS
), "You cannot select both quantize 4bit and 8bit"

In [5]:
def _load_split(split_name):
    """Load one preprocessed dataset split from PREPROCESSED_DATASET_FOLDER.

    Args:
        split_name: Base name of the split file without extension
            (e.g. "train" reads "<folder>/train.json").

    Returns:
        The parsed JSON content of the split file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    split_path = Path(PREPROCESSED_DATASET_FOLDER) / f"{split_name}.json"
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(split_path, encoding="utf-8") as json_file:
        return json.load(json_file)


train_dataset = _load_split("train")
val_dataset = _load_split("val")
test_dataset = _load_split("test")

In [6]:
def _format_split(samples):
    """Apply `format_data` to every sample of a split, preserving order."""
    return [format_data(sample=item) for item in samples]


# Convert each raw split into the formatted samples used for training/eval.
train_dataset = _format_split(train_dataset)
val_dataset = _format_split(val_dataset)
test_dataset = _format_split(test_dataset)

In [7]:
# Display one formatted training sample (index 100) to sanity-check the
# structure produced by `format_data`. Requires at least 101 training samples.
train_dataset[100]

[{'role': 'system',
  'content': [{'type': 'text',
    'text': 'You are a Vision Language Model specialized in interpreting visual data from documents.\nYour task is to analyze the provided document image and respond to queries with concise answers, usually a single word, number, or short phrase.\nThe document include a variety of types (e.g., invoice, bank statements, payslips, etc) and contain tables, dates, amounts, and text.\nFocus on delivering accurate, succinct answers based on the visual information.'}]},
 {'role': 'user',
  'content': [{'type': 'image',
    'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1184x1536>},
   {'type': 'text',
    'text': 'Extract the list of monthly billing history 2018'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': '[\n  {\n    "month": "01",\n    "electric": "$150",\n    "gas": "$50"\n  },\n  {\n    "month": "02",\n    "electric": "$100",\n    "gas": "$50"\n  },\n  {\n    "month": "03",\n    "electric": "$100"

In [8]:
# Choose the checkpoint and the image pixel budget (later handed to the
# processor as min_pixels / max_pixels) based on the available hardware.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # GPU available: use the Qwen2-VL 2B instruct checkpoint.
    model_id = "Qwen/Qwen2-VL-2B-Instruct"
    min_pixels, max_pixels = 256 * 28 * 28, 1280 * 28 * 28
else:
    # CPU fallback with a much smaller pixel budget.
    # NOTE(review): presumably a tiny randomly-initialised checkpoint meant
    # for smoke-testing the pipeline rather than real training - confirm.
    model_id = "yujiepan/qwen2-vl-tiny-random"
    min_pixels, max_pixels = 10 * 28 * 28, 20 * 28 * 28

In [9]:
# bitsandbytes quantization settings. Whether 4-bit or 8-bit loading is
# requested comes from the QUANTIZE_* flags; the bnb_4bit_* options configure
# the 4-bit path (NF4 with double quantization, bfloat16 compute).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=QUANTIZE_4_BITS,
    load_in_8bit=QUANTIZE_8_BITS,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

on_gpu = device == "cuda"

# Quantization is applied only on CUDA and only when one of the flags is set;
# otherwise the model is loaded unquantized.
quantization_config = (
    bnb_config if on_gpu and (QUANTIZE_4_BITS or QUANTIZE_8_BITS) else None
)

# Flash-attention is requested on CUDA only; elsewhere use the eager path.
attn_implementation = "flash_attention_2" if on_gpu else "eager"

# Load the model weights in bfloat16.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation=attn_implementation,
    quantization_config=quantization_config,
)

# Load the matching processor with the pixel budget chosen for this device.
processor = Qwen2VLProcessor.from_pretrained(
    model_id, max_pixels=max_pixels, min_pixels=min_pixels
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
# Optionally wrap the model with LoRA adapters.
# `peft_config` is always defined after this cell (None when LoRA is off),
# because the trainer cell passes it to SFTTrainer unconditionally; leaving it
# undefined would raise NameError there when USE_LORA is False.
if USE_LORA:
    print("Using LORA")
    # Configure LoRA: rank-8 adapters on the attention query/value projections.
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM",
    )

    # Apply PEFT model adaptation
    peft_model = get_peft_model(model, peft_config)

    # Print trainable parameters
    peft_model.print_trainable_parameters()

    # From here on `model` refers to the adapter-wrapped model.
    model = peft_model

else:
    print("Full training")
    peft_config = None

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Using LORA
trainable params: 896 || all params: 4,902,304 || trainable%: 0.0183


In [11]:
# Configure training arguments
# Training configuration for the supervised fine-tuning run.
training_args = SFTConfig(
    # --- Output / reporting ---
    output_dir=OUTPUT_DIR,  # where checkpoints and the final model are written
    push_to_hub=False,  # keep artifacts local
    report_to="tensorboard",  # metrics are logged for TensorBoard
    # --- Schedule and batch sizes ---
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # accumulate gradients over 8 batches per update
    # --- Optimizer ---
    optim="adamw_torch_fused",
    learning_rate=2e-4,
    # NOTE(review): with a plain "constant" scheduler the warmup_ratio below
    # presumably has no effect ("constant_with_warmup" would apply it) - confirm
    # which behaviour is intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,  # gradient clipping threshold
    # --- Precision ---
    bf16=True,
    tf32=(device == "cuda"),  # TF32 is requested only when running on CUDA
    # --- Memory: gradient checkpointing (non-reentrant variant) ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # --- Logging, evaluation and checkpointing (all step-based) ---
    do_eval=True,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=20,
    # Best checkpoint = lowest eval_loss; it is reloaded when training ends.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # --- Dataset handling ---
    dataset_text_field="",  # no text field is used
    dataset_kwargs={"skip_prepare_dataset": True},  # skip TRL's dataset preparation
    remove_unused_columns=False,  # keep every column so the collator sees the full sample
    # max_seq_length=1024  # Maximum sequence length for input
)

In [12]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer from the model, the formatted
# train/validation splits and the project-specific collate/metric helpers.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Collator built from the processor to turn samples into model inputs.
    data_collator=get_collate_fn(processor),
    # `peft_config` is only guaranteed to exist when LoRA is enabled; pass None
    # for full training so this cell does not fail with NameError.
    peft_config=peft_config if USE_LORA else None,
    # `tokenizer=` is deprecated for SFTTrainer (it triggers the warning seen
    # when constructing the trainer); `processing_class=` is its replacement.
    processing_class=processor.tokenizer,
    compute_metrics=get_compute_metrics(tokenizer=processor.tokenizer),
    # Reduce logits before they are accumulated for metric computation.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

  trainer = SFTTrainer(


In [13]:
# Run fine-tuning with the trainer configured above. Per `training_args`,
# evaluation happens every 10 steps and a checkpoint is saved every 20 steps.
trainer.train()

  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Accuracy
10,11.9306,11.931205,0.0
20,11.9307,11.930902,0.0
30,11.9303,11.93055,0.0


  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


KeyboardInterrupt: 

In [None]:
# Persist the trained model to the configured output directory.
# NOTE(review): with LoRA enabled this presumably writes only the adapter
# weights rather than a merged full model - confirm before deploying.
trainer.save_model(training_args.output_dir)