Skip to content

Commit 1600974

Browse files
committed
merge code
1 parent 8733bf3 commit 1600974

File tree

2 files changed

+2
-87
lines changed

2 files changed

+2
-87
lines changed

tools/mtmd/clip.cpp

Lines changed: 2 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -3342,6 +3342,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33423342
set_input_i32("positions", positions);
33433343
} break;
33443344
case PROJECTOR_TYPE_QWEN25VL:
3345+
case PROJECTOR_TYPE_UTUVL:
33453346
{
33463347
// pw * ph = number of tokens output by the ViT after applying the patch merger
33473348
// ipw * iph = number of vision tokens processed inside the ViT
@@ -3356,7 +3357,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33563357
std::vector<int> inv_idx(ph * pw);
33573358

33583359
if (use_window_attn) {
3359-
const int attn_window_size = 112;
3360+
const int attn_window_size = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? 112 : patch_size * 2 * 8;
33603361
const int grid_window = attn_window_size / patch_size / merge_ratio;
33613362
int dst = 0;
33623363
// [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3421,78 +3422,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
34213422
}
34223423
}
34233424

3424-
set_input_i32("positions", positions);
3425-
} break;
3426-
case PROJECTOR_TYPE_UTUVL:
3427-
{
3428-
const bool use_window_attn = true;
3429-
const int merge_ratio = 2;
3430-
const int pw = image_size_width / patch_size / merge_ratio; // patches after merger
3431-
const int ph = image_size_height / patch_size / merge_ratio;
3432-
const int ipw = image_size_width / patch_size; // internal patches in ViT
3433-
const int iph = image_size_height / patch_size;
3434-
std::vector<int> idx (ph * pw);
3435-
std::vector<int> inv_idx(ph * pw);
3436-
if (use_window_attn) {
3437-
const int attn_window_size = patch_size * 2 * 8;
3438-
const int grid_window = attn_window_size / patch_size / merge_ratio;
3439-
int dst = 0;
3440-
// [num_vision_tokens, num_vision_tokens] attention mask tensor
3441-
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
3442-
int mask_row = 0;
3443-
for (int y = 0; y < ph; y += grid_window) {
3444-
for (int x = 0; x < pw; x += grid_window) {
3445-
const int win_h = std::min(grid_window, ph - y);
3446-
const int win_w = std::min(grid_window, pw - x);
3447-
const int dst_0 = dst;
3448-
// group all tokens belonging to the same window together (into a contiguous range)
3449-
for (int dy = 0; dy < win_h; dy++) {
3450-
for (int dx = 0; dx < win_w; dx++) {
3451-
const int src = (y + dy) * pw + (x + dx);
3452-
GGML_ASSERT(src < (int)idx.size());
3453-
GGML_ASSERT(dst < (int)inv_idx.size());
3454-
idx [src] = dst;
3455-
inv_idx[dst] = src;
3456-
dst++;
3457-
}
3458-
}
3459-
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
3460-
int row_offset = mask_row * (ipw * iph);
3461-
std::fill(
3462-
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
3463-
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
3464-
0.0);
3465-
mask_row++;
3466-
}
3467-
}
3468-
}
3469-
set_input_i32("window_idx", idx);
3470-
set_input_i32("inv_window_idx", inv_idx);
3471-
set_input_f32("window_mask", mask);
3472-
} else {
3473-
for (int i = 0; i < ph * pw; i++) {
3474-
idx[i] = i;
3475-
}
3476-
}
3477-
const int mpow = merge_ratio * merge_ratio;
3478-
std::vector<int> positions(n_pos * 4);
3479-
int ptr = 0;
3480-
for (int y = 0; y < iph; y += merge_ratio) {
3481-
for (int x = 0; x < ipw; x += merge_ratio) {
3482-
for (int dy = 0; dy < merge_ratio; dy++) {
3483-
for (int dx = 0; dx < merge_ratio; dx++) {
3484-
// Remap positions to match window-grouped order
3485-
auto remap = idx[ptr / mpow];
3486-
remap = (remap * mpow) + (ptr % mpow);
3487-
positions[ remap] = y + dy;
3488-
positions[ num_patches + remap] = x + dx;
3489-
positions[2 * num_patches + remap] = y + dy;
3490-
positions[3 * num_patches + remap] = x + dx;
3491-
ptr++;
3492-
}
3493-
}
3494-
}
3495-
}
34963425
set_input_i32("positions", positions);
34973426
} break;
34983427
case PROJECTOR_TYPE_PIXTRAL:

tools/mtmd/models/utuvl.cpp

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -170,20 +170,6 @@ ggml_cgraph * clip_graph_utuvl::build() {
170170
model.mm_1_w, model.mm_1_b,
171171
FFN_GELU,
172172
-1);
173-
// // 3. First linear layer
174-
// embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
175-
// embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
176-
// cb(embeddings, "merger_fc1", -1);
177-
178-
// // 4. GELU activation
179-
// embeddings = ggml_gelu(ctx0, embeddings);
180-
// cb(embeddings, "merger_gelu", -1);
181-
182-
// // 5. Second linear layer
183-
// embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
184-
// embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
185-
186-
// build the graph
187173
ggml_build_forward_expand(gf, embeddings);
188174

189175
return gf;

0 commit comments

Comments
 (0)