In [1]:
import argparse
import os
from functools import partial
import re

import torch
import deepspeed

from data.dataset import HybridDataset, collate_fn
from qwen_vl_utils import process_vision_info
from model.showui.processing_showui import ShowUIProcessor
from model.showui.modeling_showui import ShowUIForConditionalGeneration
from main.eval_screenspot import validate_screenspot

[2025-10-05 16:33:35,085] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set environment variables BEFORE importing deepspeed
os.environ['RANK'] = '0'
os.environ['LOCAL_RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '9994'



Loading processor

In [3]:

### ShowUI Preprocessor
# 0. Common setups
min_pixels = 256*28*28
max_pixels = 1344*28*28
# 1. Screenshot -> Graph
uigraph_train = True        # Enable ui graph during training
uigraph_test = True         # Enable ui graph during inference
uigraph_diff = 1            # Pixel difference used for constructing ui graph
uigraph_rand = False        # Enable random graph construction 
# 2. Graph -> Mask 
uimask_pre = True           # Prebuild patch selection mask in the preprocessor (not in model layers) for efficiency
uimask_ratio = 0.5          # Specify the percentage of patch tokens to skip per component
uimask_rand = False         # Enable random token selection instead of uniform selection

### ShowUI Model
lm_skip_ratio = uimask_ratio # valid if not uimask_pre
lm_skip_layer = "[1,28,1]"   # [1,28,1] means we apply UI guide token selection from 1-th to 28-th layer (28 is the last layer of Qwen2-VL)

processor = ShowUIProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", 
    min_pixels=min_pixels, max_pixels=max_pixels,
    uigraph_train=uigraph_train, uigraph_test=uigraph_test, uigraph_diff=uigraph_diff, uigraph_rand=uigraph_rand,
    uimask_pre=True, uimask_ratio=uimask_ratio, uimask_rand=uimask_rand,
)

In [4]:
lm_qwen_layer = 28

def parse_layer_type(str_ranges, L=lm_qwen_layer, default=0):
    # 0 is without layer token selection, 1 is with layer token selection. Below we provide examples:
    # [1,28,1] means that all LM layers use token selection; [1,28,0] means that do not.
    # Interleaved layer-wise '[2,2,1],[4,4,1],[6,6,1],[8,8,1],[10,10,1],[12,12,1],[14,14,1],[16,16,1],[18,18,1],[20,20,1],[22,22,1],[24,24,1],[26,26,1]'
    result = [default] * L
    
    # Handle None or non-string input
    if str_ranges is None or str_ranges == '':
        return result
    
    # Convert to string if not already
    if not isinstance(str_ranges, str):
        str_ranges = str(str_ranges)
    
    matches = re.findall(r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]', str_ranges)
    for start, end, value in matches:
        start, end, value = int(start) - 1, int(end) - 1, int(value)
        if 0 <= start < L and 0 <= end < L:
            result[start:end + 1] = [value] * (end - start + 1)
    return result

lm_skip_layer = parse_layer_type(lm_skip_layer, 28)
# print(lm_skip_layer)

model = ShowUIForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
    lm_skip_ratio=lm_skip_ratio, lm_skip_layer=lm_skip_layer,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.46it/s]


In [5]:
def parse_args(args):
    parser = argparse.ArgumentParser(description="ShowUI Training Pipeline")
    # Environment
    parser.add_argument("--wandb_key", default=None, type=str, help="wandb key to monitor training")
    parser.add_argument("--local_rank", default=0, type=int, help="node rank")
    parser.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--ds_zero", choices=['zero1', 'zero2', 'zero3'], default='zero2', help="deepspeed zero stage")
    parser.add_argument("--load_in_8bit", action="store_true", default=False)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    parser.add_argument("--attn_imple", choices=["eager", "flash_attention_2", "sdpa"], default="eager")
    parser.add_argument("--liger_kernel", action="store_true", default=False)

    # Model & Ckpt
    parser.add_argument("--model_id", default="showlab/ShowUI-2B", choices=["showlab/ShowUI-2B", "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-VL-7B-Instruct", \
                                                                            "Qwen/Qwen2.5-VL-3B-Instruct"])
    parser.add_argument("--version", default="showlab/ShowUI-2B")
    parser.add_argument("--max_new_tokens", default=128, type=int, help="max. generated token length")
    parser.add_argument("--local_weight", action="store_true", default=False)
    parser.add_argument("--local_weight_dir",  default=".", help="default path to load the model weight")
    # Visual Encoder Training strategy
    parser.add_argument("--tune_visual_encoder", action="store_true", default=False)
    parser.add_argument("--tune_visual_encoder_projector", action="store_true", default=False)
    parser.add_argument("--freeze_lm_embed", action="store_true", default=False)

    # Training / Validation Data
    parser.add_argument("--dataset_dir", default="./dataset", type=str)
    parser.add_argument("--train_dataset", default="showui", type=str)
    parser.add_argument("--train_json", default="hf_train", type=str)
    parser.add_argument("--train_ratio", default="1", type=str)
    parser.add_argument("--val_dataset", default="screenspot", type=str)
    parser.add_argument("--val_json", default="hf_test_full", type=str)
    parser.add_argument("--val_ratio", default="1", type=str)
    parser.add_argument("--uniform_sample", action="store_true", default=False)
    parser.add_argument("--random_sample", action="store_true", default=False)
    parser.add_argument("--record_sample", action="store_true", default=False)
    
    ### ShowUI Preprocessor
    # 0. Common setups
    parser.add_argument("--min_visual_tokens", default=256, type=int)
    parser.add_argument("--max_visual_tokens", default=1280, type=int)
    parser.add_argument("--model_max_length", default=8192, type=int)
    # 1. Screenshot -> Graph
    parser.add_argument("--uigraph_train", action="store_false", default=True, help="Enable ui graph during training")
    parser.add_argument("--uigraph_test", action="store_true", default=False, help="Enable ui graph during inference")
    parser.add_argument("--uigraph_diff", default=1, type=int, help="Pixel difference used for constructing ui graph")
    parser.add_argument("--uigraph_rand", action="store_true", default=False, help="Enable random graph construction")
    # 2. Graph -> Mask 
    parser.add_argument("--uimask_pre", action="store_false", default=True, help="Prebuild patch selection mask in the preprocessor (not in model layers) for efficiency")
    parser.add_argument("--uimask_ratio", default=0.5, type=float, help="Specify the percentage of patch tokens to skip per component")
    parser.add_argument("--uimask_rand", action="store_true", default=False, help="Enable random token selection instead of uniform selection")
    ### ShowUI Model
    # 0 is without layer token selection, 1 is with layer token selection. Below we provide examples:
    # [1,28,1] means that all LM layers use token selection; [1,28,0] means that do not.
    # Interleaved layer-wise '[2,2,1],[4,4,1],[6,6,1],[8,8,1],[10,10,1],[12,12,1],[14,14,1],[16,16,1],[18,18,1],[20,20,1],[22,22,1],[24,24,1],[26,26,1]'
    parser.add_argument("--lm_skip_ratio", default=0, type=float)
    parser.add_argument("--lm_skip_layer", default='[1,28,0]', type=str)
    parser.add_argument("--vis_skip_ratio", default=0, type=float)
    parser.add_argument("--vis_skip_layer", default='[1,32,0]', type=str)
    # Pretrain / Supervised Fine-tuning
    parser.add_argument("--showui_data", default="hf_train", type=str)
    parser.add_argument("--amex_data", default="hf_train", type=str)
    parser.add_argument("--guiact_data", default="hf_train_web-single_v2", type=str)
    parser.add_argument("--ricosca_data", default="hf_train_ricosca", type=str)
    parser.add_argument("--widget_data", default="hf_train_widget", type=str)
    parser.add_argument("--screencap_data", default="hf_train_screencap", type=str)
    # Downstream train. set
    parser.add_argument("--aitw_data", default="hf_train", type=str)
    parser.add_argument("--mind2web_data", default="hf_train", type=str)
    parser.add_argument("--miniwob_data", default="hf_train", type=str)
    # Downstream val. set
    parser.add_argument("--val_aitw_data", default="hf_test", type=str)
    parser.add_argument("--val_mind2web_data", default="hf_test_full", type=str)
    parser.add_argument("--val_screenspot_data", default="hf_test_full", type=str)

    # Grounding setting
    parser.add_argument("--num_turn", default=1, type=int, help="Interleaved Query-Action setting")
    parser.add_argument("--shuffle_image_token", action="store_true", default=False, help="shuffle image token for training")
    parser.add_argument("--uniform_prompt", action="store_true", default=False)
    parser.add_argument("--text2point", default=1, type=float)
    parser.add_argument("--text2bbox", default=0, type=float)
    parser.add_argument("--point2text", default=0, type=float)
    parser.add_argument("--bbox2text", default=0, type=float)
    parser.add_argument("--crop_min", default=1, type=float)
    parser.add_argument("--crop_max", default=1, type=float)
    parser.add_argument("--xy_int", action="store_true", default=False)

    # Navigation setting
    parser.add_argument("--num_history", default=4, type=int)
    parser.add_argument("--interleaved_history", default='tttt',  choices=['tttt', 'vvvv', 'vtvt', 'tvtv', 'vvtt', 'ttvv'], help="Interleaved Vision-Action setting")
    parser.add_argument("--skip_readme_train", action="store_true", default=False)
    parser.add_argument("--skip_readme_test", action="store_true", default=False)

    # Lora
    parser.add_argument("--use_qlora", action="store_true", default=False)
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument("--lora_alpha", default=16, type=int)
    parser.add_argument("--lora_dropout", default=0.05, type=float)
    parser.add_argument("--lora_target_modules", default="qkv_proj", type=str)

    # Training
    parser.add_argument("--log_base_dir", default="../runs", type=str)
    parser.add_argument("--exp_id", default="debug", type=str)
    parser.add_argument("--workers", default=16, type=int)
    parser.add_argument("--epochs", default=10, type=int)
    parser.add_argument("--start_epoch", default=0, type=int)
    parser.add_argument("--steps_per_epoch", default=500, type=int)
    parser.add_argument("--lr", default=0.0003, type=float)
    parser.add_argument("--warmup_steps", default=100, type=int)
    parser.add_argument("--warmup_type", default="linear", type=str)
    parser.add_argument("--beta1", default=0.9, type=float)
    parser.add_argument("--beta2", default=0.95, type=float)
    parser.add_argument("--batch_size", default=1, type=int, help="batch size per device per step")
    parser.add_argument("--grad_accumulation_steps", default=1, type=int)
    parser.add_argument("--val_batch_size", default=1, type=int)
    parser.add_argument("--gradient_checkpointing", action="store_true", default=False)
    
    # Model Checkpoint or Evaluation strategies
    parser.add_argument("--resume", default="", type=str)
    parser.add_argument("--auto_resume", action="store_true", default=True)
    parser.add_argument("--no_eval", action="store_true", default=False)
    parser.add_argument("--eval_only", action="store_true", default=False)
    parser.add_argument("--print_freq", default=1, type=int)
    parser.add_argument("--debug", action="store_true", default=False, help="for debugging, will not save model and monitor")
    return parser.parse_args(args)

In [6]:
args = parse_args([
    "--eval_only", 

])

In [7]:

args.global_rank = int(os.environ.get("RANK", 0))
args.local_rank = int(os.environ.get("LOCAL_RANK", args.local_rank))
args.world_size = int(os.environ.get("WORLD_SIZE", 1))

In [8]:
args.samples_per_epoch = args.batch_size    \
                * args.grad_accumulation_steps  \
                * args.steps_per_epoch  \
                * args.world_size

In [9]:
val_dataset = HybridDataset(
    processor,
    inference=True,
    args=args
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=False,
    sampler=None,
    collate_fn=partial(
        collate_fn,
        processor=processor
    ),
)

Dataset: Screenspot; Split: hf_test_full; # samples: 1272
Loading 1 Validation Datasets


In [10]:
# Build deepspeed config and initialize deepspeed
ds_config = {
    "train_micro_batch_size_per_gpu": args.batch_size,
    "gradient_accumulation_steps": args.grad_accumulation_steps,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": args.lr,
            "weight_decay": 0.0,
            "betas": (args.beta1, args.beta2),
        },
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": args.epochs * args.steps_per_epoch,
            "warmup_min_lr": 0,
            "warmup_max_lr": args.lr,
            "warmup_num_steps": args.warmup_steps,
            "warmup_type": args.warmup_type,
        },
    },
    "fp16": {
        "enabled": args.precision == "fp16",
    },
    "bf16": {
        "enabled": args.precision == "bf16",
    }
}

In [11]:

# Then initialize deepspeed
model_engine, optimizer, train_loader, scheduler = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),  # Fix: pass actual parameters
    training_data=val_dataset,
    collate_fn=partial(collate_fn, processor=processor),
    config=ds_config,
)

[2025-10-05 16:33:42,627] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.13.1, git-hash=unknown, git-branch=unknown
[2025-10-05 16:33:42,628] [INFO] [comm.py:637:init_distributed] cdb=None
[2025-10-05 16:33:42,629] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-10-05 16:33:42,879] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False


Using /home/khanddorj/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/khanddorj/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load fused_adam op: 0.009548425674438477 seconds
[2025-10-05 16:33:43,068] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2025-10-05 16:33:43,068] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-10-05 16:33:43,135] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
[2025-10-05 16:33:43,136] [INFO] [logging.py:96:log_dist] [Rank 0] Creating BF16 optimizer


Loading extension module fused_adam...
  self._dummy_overflow_buf = get_accelerator().IntTensor([0])


[2025-10-05 16:33:43,318] [INFO] [utils.py:791:see_memory_usage] begin bf16_optimizer
[2025-10-05 16:33:43,319] [INFO] [utils.py:792:see_memory_usage] MA 4.12 GB         Max_MA 4.12 GB         CA 4.43 GB         Max_CA 4 GB 
[2025-10-05 16:33:43,320] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 8.62 GB, percent = 55.5%
[2025-10-05 16:33:43,550] [INFO] [utils.py:791:see_memory_usage] before initializing group 0
[2025-10-05 16:33:43,552] [INFO] [utils.py:792:see_memory_usage] MA 4.12 GB         Max_MA 4.12 GB         CA 4.43 GB         Max_CA 4 GB 
[2025-10-05 16:33:43,555] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory:  used = 8.51 GB, percent = 54.7%


OutOfMemoryError: CUDA out of memory. Tried to allocate 8.23 GiB. GPU 0 has a total capacty of 11.92 GiB of which 3.05 GiB is free. Including non-PyTorch memory, this process has 8.46 GiB memory in use. Of the allocated memory 8.23 GiB is allocated by PyTorch, and 1.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
validate_screenspot(
    val_loader=val_loader,
    
)