# 俄语多模态图文问答应用 (Qwen2-VL MindSpore)

本 Notebook 实现了一个基于 MindSpore 和 Qwen2-VL 的俄语多模态问答应用，包含针对 Ascend NPU 的算子修复补丁，并提供 Gradio Web 界面。

In [2]:
# Make sure mindspore 2.6.0 and mindnlp 0.4.1 are already installed; see the README for installation steps.
# Install the Gradio and ModelScope dependencies.
!pip install gradio modelscope




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Download the model snapshot through the ModelScope SDK.
from modelscope import snapshot_download
# NOTE(review): `model_dir` is not referenced by the loader cell, which uses a
# hard-coded path instead — confirm whether the two are meant to be the same.
model_dir = snapshot_download('ywsyah/qwen2ru_final')

Downloading Model from https://www.modelscope.cn to directory: C:\Users\14468\.cache\modelscope\hub\models\ywsyah\qwen2ru_final


2025-12-11 17:28:50,601 - modelscope - INFO - Got 15 files, start to download ...
Processing 15 items:   0%|          | 0.00/15.0 [00:00<?, ?it/s]
[A

[A[A



[A[A[A[A


[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A

Downloading [added_tokens.json]: 100%|██████████| 392/392 [00:01<00:00, 277B/s]
Downloading [chat_template.json]: 100%|██████████| 1.03k/1.03k [00:01<00:00, 743B/s]
Processing 15 items:   7%|▋         | 1.00/15.0 [00:01<00:21, 1.51s/it]

[A[A
[A




Downloading [chat_template.jinja]: 100%|██████████| 391/391 [00:01<00:00, 253B/s]

[A





[A[A[A[A[A[A


Downloading [config.json]: 100%|██████████| 1.17k/1.17k [00:01<00:00, 806B/s]
Downloading [generation_config.json]: 100%|██████████| 272/272 [00:01<00:00, 182B/s]
Downloading [merges.txt]: 100%|██████████| 1.59M/1.59M [00:01<00:00, 1.12MB/s]




Downloading [configuration.json]: 100%|██████████| 76.0/76.0 [00:01<00:00, 50.3B/s]
Processing 15 items:  27%|██▋       | 4.00/1

### 1. 环境初始化
配置 MindSpore 运行环境，禁用 PyBoost 和 GE 模式以避免算子冲突。

In [2]:
import os
import sys
import time
import gc
import requests
from io import BytesIO
from PIL import Image
import numpy as np
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import mindspore.ops.operations as P
import mindspore.mint as mint
from mindspore import context, Tensor, dtype as mstype
import gradio as gr

# Environment variable configuration: switch off PyBoost and GE.
# NOTE(review): these variables are set after `import mindspore` has already
# run; if MindSpore reads them at import time they may have no effect — confirm.
os.environ['MS_ENABLE_PYBOOST'] = '0'
os.environ['MS_ENABLE_GE'] = '0'

# Imported only after the environment variables above are in place, then
# PyBoost is also disabled on the mindnlp side.
from mindnlp import configs
configs.USE_PYBOOST = False

# Run in PyNative (eager) mode on the Ascend device, with graph-kernel fusion
# off and a raised call-depth limit.
context.set_context(
    mode=context.PYNATIVE_MODE,
    device_target='Ascend',
    enable_graph_kernel=False,
    max_call_depth=10000
)

# Banner confirming that initialisation finished.
print("\n" + "="*50)
print(">>> [Init] Environment configured. PyBoost & GE Disabled.")
print("="*50)

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
                                                       mindspore.device_context.ascend.op_precision.op_precision_mode(),
                                                       mindspore.device_context.ascend.op_precision.matmul_allow_hf32(),
                                                       mindspore.device_context.ascend.op_precision.conv_allow_hf32(),
                                                       mindspore.device_context.ascend.op_tuning.op_compile() instead.



>>> [Init] Environment configured. PyBoost & GE Disabled.


### 2. 核心算子补丁 (Critical Patches)

Qwen2-VL 调用的 `BatchMatMulV2` 算子在 Ascend 310B NPU 上不受支持，需要打补丁，用二维 `MatMul` 重新实现该算子。

针对 Ascend NPU 修复 MatMul、Concat 等算子，防止 `BatchMatMulV2` 报错。

In [3]:
def _run_strict_2d(a, b, t_a, t_b):
    """Multiply two operands with the plain ``MatMul`` primitive.

    The operands are cast to float16 whenever either one is float32 or their
    dtypes differ. The product is cast back to float32 only when that downcast
    happened and ``a`` was float32 originally.

    Args:
        a: Left operand.
        b: Right operand.
        t_a: Forwarded to ``MatMul`` as ``transpose_a``.
        t_b: Forwarded to ``MatMul`` as ``transpose_b``.

    Returns:
        The product tensor, in float32 or in the dtype used for the
        computation as described above.
    """
    original_dtype = a.dtype
    downcast = (
        a.dtype == mstype.float32
        or b.dtype == mstype.float32
        or a.dtype != b.dtype
    )

    lhs, rhs = a, b
    if downcast:
        lhs = lhs.astype(mstype.float16)
        rhs = rhs.astype(mstype.float16)

    # NOTE(review): identity presumably detaches the operands from any view
    # before they reach the primitive — confirm why it is required.
    lhs = ops.identity(lhs)
    rhs = ops.identity(rhs)

    matmul = P.MatMul(transpose_a=t_a, transpose_b=t_b)
    try:
        product = matmul(lhs, rhs)
    except Exception:
        # Second attempt with gradient tracking cut off on both operands.
        product = matmul(ops.stop_gradient(lhs), ops.stop_gradient(rhs))

    restore_fp32 = downcast and original_dtype == mstype.float32
    return product.astype(mstype.float32) if restore_fp32 else product

def universal_matmul_patch(input, other, transpose_a=False, transpose_b=False, bias=None):
    """Matrix-multiply replacement built only on the 2-D ``MatMul`` primitive.

    Batched products are decomposed into 2-D multiplications so that the
    ``BatchMatMulV2`` operator is never invoked.

    Args:
        input: Left operand.
        other: Right operand.
        transpose_a: Swap the last two axes of ``input`` before multiplying.
        transpose_b: Swap the last two axes of ``other`` before multiplying.
        bias: Optional tensor added to the product. It used to be accepted
            and silently dropped.

    Returns:
        The product, plus ``bias`` when one is given.

    Note:
        When both operands have more than two dimensions, their leading batch
        dimensions must contain the same number of matrices; batch
        broadcasting is not performed.
    """
    if transpose_a:
        input = input.swapaxes(-1, -2)
    if transpose_b:
        other = other.swapaxes(-1, -2)

    shape_a = input.shape
    shape_b = other.shape

    if len(shape_b) == 2 and len(shape_a) != 2:
        # N-D @ 2-D: fold every leading axis of `input` into the row axis,
        # run one 2-D product, then restore the leading axes.
        batch_dims = shape_a[:-1]
        flatten_dim = 1
        for d in batch_dims:
            flatten_dim *= d
        input_flat = input.reshape((flatten_dim, shape_a[-1]))
        res_flat = _run_strict_2d(input_flat, other, False, False)
        result = res_flat.reshape(batch_dims + (res_flat.shape[-1],))
    elif len(shape_a) > 2 and len(shape_b) > 2:
        # N-D @ N-D: flatten the batch axes and multiply one matrix pair at a
        # time instead of issuing a batched matmul.
        batch_shape_a = shape_a[:-2]
        total_batch = 1
        for d in batch_shape_a:
            total_batch *= d
        input_3d = input.reshape((total_batch, shape_a[-2], shape_a[-1]))
        other_3d = other.reshape((total_batch, shape_b[-2], shape_b[-1]))
        results = [
            _run_strict_2d(input_3d[i], other_3d[i], False, False)
            for i in range(total_batch)
        ]
        res_stack = ops.stack(results)
        result = res_stack.reshape(batch_shape_a + (shape_a[-2], shape_b[-1]))
    else:
        # Plain 2-D @ 2-D, plus any remaining rank combination, goes straight
        # to the primitive.
        result = _run_strict_2d(input, other, False, False)

    if bias is not None:
        result = result + bias
    return result

# Resolve the genuine concat operator. If this cell is executed again after
# `ops.cat` has been replaced, `ops.cat` is already the wrapper; capturing it
# directly would make the wrapper call itself forever. The wrapper therefore
# carries the original in `_patched_original`, and that is preferred here.
_REAL_CAT = getattr(ops.cat, "_patched_original", ops.cat)


def safe_concat_patch(tensors, axis=0, dim=None):
    """Concatenate tensors after bringing them to one common dtype.

    Args:
        tensors: Sequence of tensors to concatenate.
        axis: Axis to concatenate along.
        dim: Alias for ``axis``; takes precedence when not ``None``.

    Returns:
        The result of the original concat operator. When the inputs have
        mixed dtypes they are first cast to int32 if any of them is int32,
        otherwise to the dtype of the first tensor.
    """
    if dim is not None:
        axis = dim
    if not tensors:
        return _REAL_CAT(tensors, axis)

    first_dtype = tensors[0].dtype
    dtypes = [t.dtype for t in tensors]
    if any(dt != first_dtype for dt in dtypes):
        has_int32 = any(dt == mstype.int32 for dt in dtypes)
        target_dtype = mstype.int32 if has_int32 else first_dtype
        tensors = [t.astype(target_dtype) for t in tensors]
    return _REAL_CAT(tensors, axis)


# Marker that lets a later re-execution of this cell find the real operator.
safe_concat_patch._patched_original = _REAL_CAT

def _unify_types(x, y):
    """Return ``x`` and ``y`` with matching dtypes when both are tensors.

    For int64/int32 and float32/float16 pairs the wider operand is cast down
    to the narrower dtype. For any other mismatch ``y`` is cast to the dtype
    of ``x``. Non-tensor operands, and operands whose dtypes already match,
    are returned unchanged.
    """
    if not (isinstance(x, Tensor) and isinstance(y, Tensor)):
        return x, y

    dtype_x, dtype_y = x.dtype, y.dtype
    if dtype_x == dtype_y:
        return x, y

    # `x` is the wider half of a known pair, so it is the one cast down.
    x_is_wider = (
        (dtype_x == mstype.int64 and dtype_y == mstype.int32)
        or (dtype_x == mstype.float32 and dtype_y == mstype.float16)
    )
    if x_is_wider:
        return x.astype(dtype_y), y

    # Every remaining mismatch aligns `y` with `x`.
    return x, y.astype(dtype_x)

# Resolve the genuine operators. If this cell is executed again after the ops
# have been replaced, `ops.less` etc. are already the wrappers; capturing them
# directly would make every wrapper call itself forever. Each wrapper
# therefore carries its original in `_patched_original`, preferred here.
_REAL_LESS = getattr(ops.less, "_patched_original", ops.less)
_REAL_EQUAL = getattr(ops.equal, "_patched_original", ops.equal)
_REAL_NE = getattr(ops.ne, "_patched_original", ops.ne)
_REAL_ADD = getattr(ops.add, "_patched_original", ops.add)


def safe_less_patch(x, y):
    """Element-wise ``x < y`` after aligning the operand dtypes."""
    x, y = _unify_types(x, y)
    return _REAL_LESS(x, y)


def safe_equal_patch(x, y):
    """Element-wise ``x == y`` after aligning the operand dtypes.

    Returns the plain bool ``False`` when ``y`` is a list, tuple, dict or
    ``None`` instead of calling the operator.
    """
    if isinstance(y, (list, tuple, dict, type(None))):
        return False
    x, y = _unify_types(x, y)
    return _REAL_EQUAL(x, y)


def safe_ne_patch(x, y):
    """Element-wise ``x != y`` after aligning the operand dtypes.

    Returns the plain bool ``True`` when ``y`` is a list, tuple, dict or
    ``None`` instead of calling the operator.
    """
    if isinstance(y, (list, tuple, dict, type(None))):
        return True
    x, y = _unify_types(x, y)
    return _REAL_NE(x, y)


def safe_add_patch(x, y):
    """Element-wise ``x + y`` after aligning the operand dtypes."""
    x, y = _unify_types(x, y)
    return _REAL_ADD(x, y)


# Markers that let a later re-execution of this cell find the real operators.
safe_less_patch._patched_original = _REAL_LESS
safe_equal_patch._patched_original = _REAL_EQUAL
safe_ne_patch._patched_original = _REAL_NE
safe_add_patch._patched_original = _REAL_ADD

print(">>> [Patch] Applying Defensive Patches...")

# Replace the functional entry points with the defensive implementations.
ops.matmul = universal_matmul_patch
mint.matmul = lambda x, y: universal_matmul_patch(x, y, False, False)
mint.bmm = lambda x, y: universal_matmul_patch(x, y, False, False)
ops.cat = safe_concat_patch
ops.concat = safe_concat_patch
mint.cat = safe_concat_patch
ops.less = safe_less_patch
ops.equal = safe_equal_patch
ops.ne = safe_ne_patch
ops.add = safe_add_patch


# Thin method adapters so the Tensor operators route through the same patches.
def tensor_lt_patch(self, other): return safe_less_patch(self, other)
def tensor_eq_patch(self, other): return safe_equal_patch(self, other)
def tensor_ne_patch(self, other): return safe_ne_patch(self, other)
def tensor_add_patch(self, other): return safe_add_patch(self, other)
def tensor_matmul_patch(self, other): return universal_matmul_patch(self, other, False, False)


mindspore.Tensor.__lt__ = tensor_lt_patch
mindspore.Tensor.__eq__ = tensor_eq_patch
mindspore.Tensor.__ne__ = tensor_ne_patch
mindspore.Tensor.__add__ = tensor_add_patch
mindspore.Tensor.add = tensor_add_patch
mindspore.Tensor.__matmul__ = tensor_matmul_patch

# Best-effort: also patch mindnlp's blas module when this mindnlp build has it.
# Only a failed import is tolerated; the former bare `except:` also swallowed
# KeyboardInterrupt and SystemExit.
try:
    from mindnlp.core.ops import blas
except ImportError:
    print(">>> [Patch] mindnlp.core.ops.blas not available; skipping blas.matmul patch.")
else:
    blas.matmul = mint.matmul
print(">>> [Patch] Applied.")

>>> [Patch] Applying Defensive Patches...
>>> [Patch] Applied.


### 3. 加载模型
加载 Qwen2-VL 模型和 Processor。

确保将微调后的模型保存到香橙派上，如果不需要微调后的模型可以直接参考readme中下载qwen2vl的原始权重。

#### 注意
如果加载微调后的模型进行推理时出现问题，通常是因为微调时使用的 transformers 版本较高，与 MindSpore 和 MindNLP 不兼容。此时可以只把权重文件复制到已下载的原始 Qwen2-VL 模型文件夹中，不要改动分词器词表以及其他 JSON 文件。


In [4]:
from mindnlp.transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# === ⚠️ Confirm the model paths here before running ===
path_model = "/home/HwHiAiUser/qwen2ru_final"
path_processor = "/home/HwHiAiUser/qwen2ru_final"

print(f"\n>>> [Loader] Loading Processor from: {path_processor}")
# Pixel budget handed to the processor, expressed in 28x28 patches.
min_pixels = 256 * 28 * 28
max_pixels = 512 * 28 * 28
processor = AutoProcessor.from_pretrained(
    path_processor,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
    ms_dtype=mstype.float16,
)

# Chat template installed on the processor (and its tokenizer) in place of
# whatever template ships with the checkpoint.
qwen_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n'}}"
    "{% if message['content'] is string %}"
    "{{ message['content'] }}"
    "{% else %}"
    "{% for content in message['content'] %}"
    "{% if content['type'] == 'image' %}"
    "{{ '<|vision_start|><|image_pad|><|vision_end|>' }}"
    "{% elif content['type'] == 'video' %}"
    "{{ '<|vision_start|><|video_pad|><|vision_end|>' }}"
    "{% elif content['type'] == 'text' %}"
    "{{ content['text'] }}"
    "{% endif %}"
    "{% endfor %}"
    "{% endif %}"
    "{{'<|im_end|>\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)
processor.chat_template = qwen_template
if hasattr(processor, 'tokenizer'):
    processor.tokenizer.chat_template = qwen_template

print(f">>> [Loader] Loading Model from:       {path_model}")
# `trust_remote_code` is deliberately not passed: the loader reports that it
# only applies to Auto classes and is ignored for this concrete model class.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    path_model,
    ms_dtype=mstype.float16,
)

class ConfigWrapper:
    """Expose the keys of a plain config dict as instance attributes."""

    def __init__(self, **entries):
        # Every keyword argument becomes an attribute of the same name.
        vars(self).update(entries)

    def to_dict(self):
        """Return the instance's attribute dictionary (the live one, not a copy)."""
        return vars(self)

# Sub-configs that were loaded as plain dicts are wrapped so that they offer
# attribute access and `to_dict()`.
for _cfg_name in ('text_config', 'vision_config'):
    _sub_cfg = getattr(model.config, _cfg_name, None)
    if isinstance(_sub_cfg, dict):
        setattr(model.config, _cfg_name, ConfigWrapper(**_sub_cfg))

print(">>> [Loader] Model Loaded Successfully.")


>>> [Loader] Loading Processor from: /home/HwHiAiUser/qwen2ko_final


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'type', 'mrope_section'}


>>> [Loader] Loading Model from:       /home/HwHiAiUser/qwen2ko_final


Qwen2VLForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`.`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
2025-11-26 12:47:57.068294: E external/org_tensorflow/tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute is_closed which is not in the op definition: Op<name=Range; signature=start:Tidx, limit:Tidx, delta:Tidx -> output:Tidx; attr=Tidx:type,default=DT_INT32,allowed=[DT_BFLOAT16, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_

>>> [Loader] Model Loaded Successfully.


### 4. 辅助函数
包含图像 Resize (fix_image_for_gradio) 和 Token 补齐 (pad_to_block_size)。

In [5]:
def pad_to_block_size(input_ids, attention_mask, block_size=128):
    """Right-pad the sequence axis to a multiple of ``block_size``.

    Used to work around BatchMatMulV2 alignment errors.

    Args:
        input_ids: Token-id tensor; axis 1 is the sequence axis.
        attention_mask: Mask tensor padded by the same amount, with zeros.
        block_size: The sequence length is rounded up to a multiple of this.

    Returns:
        ``(input_ids, attention_mask)``, padded when needed and otherwise the
        inputs unchanged.
    """
    pad_len = (-input_ids.shape[1]) % block_size
    if pad_len <= 0:
        return input_ids, attention_mask

    # Use the tokenizer's pad id, defaulting to 0 when it is missing or None.
    pad_id = getattr(processor.tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    id_fill = ops.full((input_ids.shape[0], pad_len), pad_id, dtype=input_ids.dtype)
    mask_fill = ops.full((attention_mask.shape[0], pad_len), 0, dtype=attention_mask.dtype)
    return (
        ops.concat((input_ids, id_fill), axis=1),
        ops.concat((attention_mask, mask_fill), axis=1),
    )

def fix_image_for_gradio(image_input, force_size=(336, 336), timeout=10):
    """Normalise an image coming from Gradio into a resized RGB PIL image.

    Args:
        image_input: A PIL image, a local file path, an ``http(s)://`` URL,
            or a NumPy array. ``None`` is passed through as ``None``.
        force_size: ``(width, height)`` to resize to with bicubic resampling,
            or ``None`` to keep the original size.
        timeout: Seconds to wait for a URL download, so that an unresponsive
            server cannot block the request indefinitely.

    Returns:
        The RGB image, or ``None`` when the input is ``None``, has an
        unsupported type, or cannot be loaded (the error is printed).
    """
    if image_input is None:
        return None

    try:
        if isinstance(image_input, Image.Image):
            image = image_input.convert("RGB")
        elif isinstance(image_input, str):
            if image_input.startswith(("http://", "https://")):
                response = requests.get(image_input, timeout=timeout)
                # Fail on 4xx/5xx rather than handing an error page to PIL.
                response.raise_for_status()
                image = Image.open(BytesIO(response.content)).convert("RGB")
            else:
                # Close the file handle once the pixels have been converted.
                with Image.open(image_input) as source:
                    image = source.convert("RGB")
        elif isinstance(image_input, np.ndarray):
            image = Image.fromarray(image_input).convert("RGB")
        else:
            return None

        if force_size is not None:
            image = image.resize(force_size, Image.Resampling.BICUBIC)

        return image
    except Exception as e:
        # Best-effort helper: report the problem and let the caller handle None.
        print(f"Error fix_image: {e}")
        return None

### 5. 推理逻辑与界面
定义 Gradio 回调函数并启动 Web UI。

In [None]:
def run_gradio_inference(input_image, user_text, max_tokens=256):
    """Gradio callback: answer a question about an uploaded image.

    Args:
        input_image: Image supplied by the Gradio image component.
        user_text: The user's question; an empty or whitespace-only value
            falls back to a default "describe" prompt.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The model's answer, or a human-readable error message. Exceptions are
        caught, printed with a traceback, and reported as
        ``"System Error: ..."``.
    """
    if input_image is None:
        return "❌ 错误：请先上传一张图片。"

    try:
        print("\n" + "-"*30)
        print(">>> [Infer] Processing Request from Gradio...")

        # 1. Image preprocessing. The helper returns None when the image
        #    cannot be decoded; stop here instead of passing None onwards.
        pil_image = fix_image_for_gradio(input_image, force_size=(336, 336))
        if pil_image is None:
            return "❌ 错误：图片无法解析，请重新上传。"

        # 2. Build the chat message.
        if not user_text or not user_text.strip():
            user_text = "Describe this image."

        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": user_text},
            ],
        }]

        # 3. Render the prompt and prepare the model inputs.
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="ms",
        )

        # 4. Cast the integer index tensors to int32.
        for k in ['input_ids', 'attention_mask', 'position_ids']:
            if k in inputs:
                inputs[k] = inputs[k].astype(mstype.int32)

        # 5. Align the sequence length to a multiple of the block size.
        if 'input_ids' in inputs and 'attention_mask' in inputs:
            inputs['input_ids'], inputs['attention_mask'] = pad_to_block_size(
                inputs['input_ids'], inputs['attention_mask'], block_size=128
            )

        print(">>> [Infer] Generating...")
        gc.collect()

        # 6. Generate.
        generated_ids = model.generate(**inputs, max_new_tokens=int(max_tokens))

        # 7. Decode, then keep only the text after the last assistant marker
        #    when the decoded string contains one.
        output_text_full = processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        if "assistant\n" in output_text_full:
            response = output_text_full.split("assistant\n")[-1].strip()
        else:
            response = output_text_full

        print(">>> [Infer] Done.")
        return response

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"System Error: {str(e)}"

# Build and launch the Gradio UI: an image and a question box in, a text answer out.
_image_input = gr.Image(type="pil", label="上传图片 (Image)")
_question_box = gr.Textbox(
    label="请输入问题",
    placeholder="例如：Пожалуйста, кратко опишите это изображение.（请简单描述这张图片）",
    lines=2,
)
_answer_box = gr.Textbox(label="模型回答 (Answer)")

demo = gr.Interface(
    fn=run_gradio_inference,
    inputs=[_image_input, _question_box],
    outputs=_answer_box,
    title="俄语多模态图文问答应用",
    description="基于 Qwen2-VL 模型 (MindSpore/Ascend)",
    theme="default",
)

if __name__ == "__main__":
    # Listen on all interfaces at port 7860, without a public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

--------


Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.



------------------------------
>>> [Infer] Processing Request from Gradio...
>>> [Infer] Generating...




...>>> [Infer] Done.
