In [1]:
!pip install transformers

Looking in indexes: http://192.168.9.66:8888/repository/pypi/simple
Collecting transformers
  Downloading http://192.168.9.66:8888/repository/pypi/packages/6a/4a/d35a2140bba25a26b8c8daf74b89f3ab46ad957e998bf77b7b4305187bc5/transformers-4.21.3-py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting filelock
  Downloading http://192.168.9.66:8888/repository/pypi/packages/94/b3/ff2845971788613e646e667043fdb5f128e2e540aefa09a3c55be8290d6d/filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading http://192.168.9.66:8888/repository/pypi/packages/03/78/8ae719924560be4b8513b50e6af4bc76d7e71fa00e6ffcdff03a3d152f44/tokenizers-0.12.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[

In [3]:
import logging
import json
import os
from threading import Thread
import time
from PIL import Image
import numpy as np
from transformers import BertTokenizer


os.environ["GLOG_v"] = "3"
os.environ["ASCEND_GLOBAL_LOG_LEVEL"] = "3"
from mindspore import context, Tensor, Model, nn, load
from mindspore.dataset.vision.utils import Inter
import mindspore.dataset.vision.c_transforms as C

_LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s -  %(message)s'
_DATE_FMT = '%m/%d/%Y %H:%M:%S'
logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")


def pad_sequence(sequences, batch_first=True, padding_value=0.0, max_lens=-1):
    """pad_sequence"""
    lens = [len(x) for x in sequences]
    if max_lens == -1:
        max_lens = max(lens)

    padded_seq = []
    for x in sequences:
        pad_width = [(0, max_lens - len(x))]
        padded_seq.append(np.pad(x, pad_width, constant_values=(padding_value, padding_value)))

    sequences = np.stack(padded_seq, axis=0 if batch_first else 1)
    return sequences


def pad_tensors(tensors, lens=None, pad=0, max_len=-1):
    """B x [T, ...]"""
    if lens is None:
        lens = [t.shape[0] for t in tensors]
    if max_len == -1:
        max_len = max(lens)
    bs = len(tensors)
    hid = tensors[0].shape[-1]
    dtype = tensors[0].dtype
    output = np.zeros((bs, max_len, hid), dtype=dtype)
    if pad:
        output.fill(pad)
    for i, (t, l) in enumerate(zip(tensors, lens)):
        output[i, :l, ...] = t
    return output


def decode_sequence(ix_to_word, seq, split=' '):
    """
    decode_sequence
    """
    N = seq.shape[0]
    D = seq.shape[1]
    out = []
    for i in range(N):
        txt = ''
        for j in range(D):
            ix = seq[i, j]
            if ix > 0:
                if j >= 1:
                    txt = txt + split
                txt = txt + ix_to_word[str(ix.item())]
            else:
                break
        out.append(txt.replace(' ##', ''))
    return out


class opt_vqa_inference:
    def __init__(self, model_path, model_name, vocab_name, bert_base_chinese_vocab):
        self.image_size = 448
        self.patch_size = 32

        resize = self.image_size
        image_size = self.image_size
        mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
        std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
        interpolation = "BILINEAR"
        if hasattr(Inter, interpolation):
            interpolation = getattr(Inter, interpolation)
        else:
            interpolation = Inter.BILINEAR
            logger.warning('cannot find interpolation_type: {}, use {} instead'.format(interpolation, 'BILINEAR'))
        self.trans = [
            C.Resize(resize, interpolation=interpolation),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

        self.model_path = model_path
        self.model_name = model_name
        self.vocab_name = vocab_name
        self.bert_base_chinese_vocab = bert_base_chinese_vocab
        model = os.path.join(self.model_path, self.model_name)
        logger.info(f"load model: {model}")
        self.graph = load(model)
        self.network = nn.GraphCell(self.graph)
        self.model = Model(self.network)
        vocab = os.path.join(self.model_path, self.vocab_name)
        self.vocab = json.load(open(vocab))
        bert_base_chinese_vocab = os.path.join(self.model_path, self.bert_base_chinese_vocab)
        self.tokenizer = BertTokenizer.from_pretrained(bert_base_chinese_vocab)
        # 模型预热，否则首次推理的时间会很长
        # self.load = Thread(target=self._warmup)
        # self.load.start()
        self._warmup()
        logger.info("load network successfully!")

    def _warmup(self):
        from mindspore import float32, int64
        logger.info("warmup network...")
        input_ids = Tensor(np.array(np.random.randn(1, 50), dtype=np.float32), int64)
        position_ids = Tensor(np.expand_dims(np.arange(0, input_ids.shape[1], dtype=np.int64), 0), int64)
        image = Tensor(np.array(np.random.randn(1, 197, 3072), dtype=np.float32), float32)
        img_pos_feat = Tensor(np.expand_dims(np.arange(0, 197, dtype=np.int64), axis=0), int64)
        attn_masks = Tensor(np.ones((1, 247), dtype=np.int64), int64)
        gather_index = Tensor(np.expand_dims(np.arange(0, 247, dtype=np.int64), axis=0), int64)
        self.model.predict(input_ids, position_ids, image, img_pos_feat, attn_masks, gather_index)
        logger.info("warmup network successfully!")

    def preprocess(self, image, text):
        from mindspore import float32, int64
        print("preprocess===================================================================\n")
        image = Image.open(image).convert('RGB')
        image = np.array(image)
        for tran in self.trans:
            image = tran(image)

        p = self.patch_size
        channels, h, w = image.shape
        x = np.reshape(image, (channels, h // p, p, w // p, p))
        x = np.transpose(x, (1, 3, 0, 2, 4))
        patches = np.reshape(x, ((h // p) * (w // p), channels * p * p))
        img_pos_feat = np.arange(patches.shape[0] + 1)
        attn_masks = np.ones(img_pos_feat.shape[0], dtype=np.int64)

        img_feat = Tensor(pad_tensors([patches, ], [196], max_len=197))
        img_pos_feat = Tensor(np.stack([img_pos_feat, ], axis=0))
        attn_masks = Tensor(pad_sequence([attn_masks, ], batch_first=True, padding_value=0, max_lens=247))
        out_size = attn_masks.shape[1]
        batch_size = attn_masks.shape[0]
        gather_index = Tensor(np.expand_dims(np.arange(0, out_size, dtype=np.int64), 0).repeat(batch_size, axis=0))

        question_tokens = self.tokenizer.tokenize(text)
        input_ids = self.tokenizer.convert_tokens_to_ids(question_tokens)
        # print("tokenizer input_ids: \n", input_ids)
        input_ids = Tensor(pad_sequence([input_ids, ], batch_first=True, padding_value=0, max_lens=50), int64)
        # print("padding input_ids: \n", input_ids)
        position_ids = Tensor(np.expand_dims(np.arange(0, input_ids.shape[1], dtype=np.int64), 0), int64)
        # print("position_ids: ", position_ids)
         
        return input_ids, position_ids, img_feat, img_pos_feat, attn_masks, gather_index

    def postprocess(self, sequence):
        print("postprocess=================================================================\n")
        # print("sequence: \n", sequence)
        return decode_sequence(self.vocab, sequence[:, 0, 1:].asnumpy(), split='')

    def inference(self, input_data):
        # 阻塞预热
        # self.load.join()
        inference_result = {}
        for k, v in input_data.items():
            instance_result = {}
            (input_ids, position_ids, image, img_pos_feat, attn_masks, gather_index) = self.preprocess(v["image"], v["question"])
            sequence = self.model.predict(input_ids, position_ids, image, img_pos_feat, attn_masks, gather_index)
            result = self.postprocess(sequence)
            # print("result: ", result)
            # for file_name, file_content in v.items():
            #     (input_ids, position_ids, image, img_pos_feat, attn_masks, gather_index) = self.preprocess(file_content["image"], file_content["question"])
            #     sequence = self.model.predict(input_ids, position_ids, image, img_pos_feat, attn_masks, gather_index)
            #     instance_result[file_name] = self.postprocess(sequence)
            inference_result[k] = result[0]
        return inference_result


if __name__ == "__main__":  
    import io

    print("start OPT vqa inference....")
    # model_path = os.path.split(__file__)[0]
    model_path = "/home/ma-user/work/deploy_vqa/opt_vqa"
    model_name = "opt_vqa_graph.mindir"
    vocab_name = "data/ids_to_tokens_zh.json"
    bert_base_chinese_vocab = "data/bert-base-chinese-vocab.txt"

    last_time = time.time()
    inference_object = opt_vqa_inference(model_path, model_name, vocab_name, bert_base_chinese_vocab)
    print("init time:", time.time() - last_time)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
09/13/2022 09:57:52 - INFO - __main__ -  load model: /home/ma-user/work/deploy_vqa/opt_vqa/opt_vqa_graph.mindir


start OPT vqa inference....


09/13/2022 09:58:32 - INFO - __main__ -  warmup network...
09/13/2022 10:17:51 - INFO - __main__ -  warmup network successfully!
09/13/2022 10:17:51 - INFO - __main__ -  load network successfully!


init time: 1198.2748715877533


In [1]:
path1 = "COCO_val2014_000000019608.jpg"
path2 = "test.png"
path3 = "xiaopan.png"
with Image.open(path2) as img:
    byte_stream = io.BytesIO()
    img.save(byte_stream, format='BMP')
print("byte_stream: ", byte_stream)
Image.open(byte_stream).show()

question = "后面的牌子是什么？"
# preprocess_result = inference_object.preprocess(byte_stream, question)
# print("preprocess result:", preprocess_result)
input_data = {"instances": {"image": byte_stream, "question": question}}
last_time = time.time()
inference_result = inference_object.inference(input_data)
print("inference time:", time.time() - last_time)
print("Q: ", question)
print("A: ", inference_result)

NameError: name 'Image' is not defined