@@ -3342,6 +3342,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33423342 set_input_i32 (" positions" , positions);
33433343 } break ;
33443344 case PROJECTOR_TYPE_QWEN25VL:
3345+ case PROJECTOR_TYPE_UTUVL:
33453346 {
33463347 // pw * ph = number of tokens output by the ViT after applying the patch merger
33473348 // ipw * iph = number of vision tokens processed inside the ViT
@@ -3356,7 +3357,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33563357 std::vector<int > inv_idx (ph * pw);
33573358
33583359 if (use_window_attn) {
3359- const int attn_window_size = 112 ;
3360+ const int attn_window_size = ctx-> model . proj_type == PROJECTOR_TYPE_QWEN25VL ? 112 : patch_size * 2 * 8 ;
33603361 const int grid_window = attn_window_size / patch_size / merge_ratio;
33613362 int dst = 0 ;
33623363 // [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3421,78 +3422,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
34213422 }
34223423 }
34233424
3424- set_input_i32 (" positions" , positions);
3425- } break ;
3426- case PROJECTOR_TYPE_UTUVL:
3427- {
3428- const bool use_window_attn = true ;
3429- const int merge_ratio = 2 ;
3430- const int pw = image_size_width / patch_size / merge_ratio; // patches after merger
3431- const int ph = image_size_height / patch_size / merge_ratio;
3432- const int ipw = image_size_width / patch_size; // internal patches in ViT
3433- const int iph = image_size_height / patch_size;
3434- std::vector<int > idx (ph * pw);
3435- std::vector<int > inv_idx (ph * pw);
3436- if (use_window_attn) {
3437- const int attn_window_size = patch_size * 2 * 8 ;
3438- const int grid_window = attn_window_size / patch_size / merge_ratio;
3439- int dst = 0 ;
3440- // [num_vision_tokens, num_vision_tokens] attention mask tensor
3441- std::vector<float > mask (pow (ipw * iph, 2 ), std::numeric_limits<float >::lowest ());
3442- int mask_row = 0 ;
3443- for (int y = 0 ; y < ph; y += grid_window) {
3444- for (int x = 0 ; x < pw; x += grid_window) {
3445- const int win_h = std::min (grid_window, ph - y);
3446- const int win_w = std::min (grid_window, pw - x);
3447- const int dst_0 = dst;
3448- // group all tokens belong to the same window togather (to a continue range)
3449- for (int dy = 0 ; dy < win_h; dy++) {
3450- for (int dx = 0 ; dx < win_w; dx++) {
3451- const int src = (y + dy) * pw + (x + dx);
3452- GGML_ASSERT (src < (int )idx.size ());
3453- GGML_ASSERT (dst < (int )inv_idx.size ());
3454- idx [src] = dst;
3455- inv_idx[dst] = src;
3456- dst++;
3457- }
3458- }
3459- for (int r=0 ; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
3460- int row_offset = mask_row * (ipw * iph);
3461- std::fill (
3462- mask.begin () + row_offset + (dst_0 * merge_ratio * merge_ratio),
3463- mask.begin () + row_offset + (dst * merge_ratio * merge_ratio),
3464- 0.0 );
3465- mask_row++;
3466- }
3467- }
3468- }
3469- set_input_i32 (" window_idx" , idx);
3470- set_input_i32 (" inv_window_idx" , inv_idx);
3471- set_input_f32 (" window_mask" , mask);
3472- } else {
3473- for (int i = 0 ; i < ph * pw; i++) {
3474- idx[i] = i;
3475- }
3476- }
3477- const int mpow = merge_ratio * merge_ratio;
3478- std::vector<int > positions (n_pos * 4 );
3479- int ptr = 0 ;
3480- for (int y = 0 ; y < iph; y += merge_ratio) {
3481- for (int x = 0 ; x < ipw; x += merge_ratio) {
3482- for (int dy = 0 ; dy < merge_ratio; dy++) {
3483- for (int dx = 0 ; dx < merge_ratio; dx++) {
3484- // Remap positions to match window-grouped order
3485- auto remap = idx[ptr / mpow];
3486- remap = (remap * mpow) + (ptr % mpow);
3487- positions[ remap] = y + dy;
3488- positions[ num_patches + remap] = x + dx;
3489- positions[2 * num_patches + remap] = y + dy;
3490- positions[3 * num_patches + remap] = x + dx;
3491- ptr++;
3492- }
3493- }
3494- }
3495- }
34963425 set_input_i32 (" positions" , positions);
34973426 } break ;
34983427 case PROJECTOR_TYPE_PIXTRAL:
0 commit comments