In [3]:
from transformers import AutoConfig
from optimum.intel.openvino import OVModelForCausalLM
from pathlib import Path
import openvino as ov
from transformers import AutoTokenizer
import pickle

# Model to process. Values handled by the cells below:
#   "red-pajama-3b-chat", "T5", "tiny-sd-unet", "codegen-2B-multi", "gpt-neox-20b"
MODEL_ID = "gpt-neox-20b"

# Root folder for the models that are addressed by a relative path.
models_dir = Path("./models")

# OpenVINO runtime handle; used further down to read the T5 / UNet IR files directly.
core = ov.Core()

# Options forwarded to OVModelForCausalLM via `ov_config`.
# NOTE(review): the empty CACHE_DIR presumably disables model caching -- confirm.
# Alternative, latency-oriented configuration:
#   {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', "CACHE_DIR": ""}
ov_config = dict(CACHE_DIR="")

# Per-model settings. Every supported MODEL_ID defines:
#   half_type  - half-precision type name passed to the upcast routine ("f16" or "bf16")
#   model_dir  - folder holding the model IR (and tokenizer/config files for the LLMs)
#   device     - OpenVINO device name
# plus the example input used for calibration: `example_prompt` for text models,
# `unet_example_input` for the UNet.
if MODEL_ID in ["red-pajama-3b-chat", "tiny-sd-unet", "T5"]:
    half_type = "f16"
    model_dir = models_dir / MODEL_ID / "FP16"
    # model_dir = models_dir / MODEL_ID / "FP16_calibrated"
    # model_dir = models_dir / MODEL_ID / "INT8_compressed_weights"
    device = "GPU"
    # device = "CPU"

    if MODEL_ID == "red-pajama-3b-chat":
        # The chat format separates the human and bot turns with a real newline character.
        # The prompt previously contained an escaped backslash followed by "n", i.e. two
        # literal characters instead of a line break.
        example_prompt = "<human>: Which lakes are near Munich?\n<bot>:"
    elif MODEL_ID == "T5":
        example_prompt = "ultra close color photo portrait of rainbow owl with deer horns in the woods"
    elif MODEL_ID == "tiny-sd-unet":
        # NOTE: pickle.load can execute arbitrary code embedded in the file; only load a
        # unet_example_input.pkl that you produced yourself or otherwise trust.
        with open("unet_example_input.pkl", "rb") as f:
            unet_example_input = pickle.load(f)
    else:
        # Defensive: unreachable while the membership list above matches the branches.
        raise Exception("Unknown model")
elif MODEL_ID in ["codegen-2B-multi", "gpt-neox-20b"]:
    half_type = "bf16"
    device = "CPU"
    # ov_config["INFERENCE_PRECISION_HINT"] = "f32"     # otherwise BF16 is used
    # NOTE(review): the model_dir values below are machine-specific absolute paths;
    # adjust them when running on another host.
    if MODEL_ID == "codegen-2B-multi":
        model_dir = Path("/home/devuser/nsavelye/workspace/openvino.genai/llm_bench/python/codegen-2B-multi/pytorch/dldt/FP32")
        example_prompt = "# this function implement Fourier transform for imput array X"
    elif MODEL_ID == "gpt-neox-20b":
        model_dir = Path("/home/devuser/nsavelye/workspace/openvino.genai/llm_bench/python/gpt-neox-20b/fp16/pytorch/dldt/FP32")
        example_prompt = "Which lakes are near Munich?"
    else:
        # Defensive: unreachable while the membership list above matches the branches.
        raise Exception("Unknown model")
else:
    raise Exception("Unknown model")

# Load the model selected by MODEL_ID and bind its graph to `model`.
if MODEL_ID == "T5":
    # Only the text-encoder IR is read for T5.
    model = core.read_model(model_dir / "encoder_ir.xml")
elif MODEL_ID == "tiny-sd-unet":
    model = core.read_model(model_dir / "unet.xml")
elif MODEL_ID in ("red-pajama-3b-chat", "codegen-2B-multi", "gpt-neox-20b"):
    # Causal LMs: tokenizer, config and the optimum-intel wrapper all come from model_dir.
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    hf_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
    ov_model_for_causal_lm = OVModelForCausalLM.from_pretrained(
        model_dir,
        config=hf_config,
        device=device,
        ov_config=ov_config,
        trust_remote_code=True,
    )
    # The wrapper's underlying graph is what gets calibrated later on.
    model = ov_model_for_causal_lm.model
else:
    raise Exception("Unknown model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used along with export=True. It will be ignored.
Compiling the model to CPU ...


In [2]:
import importlib
import numpy as np
import shutil
import partially_upcast_nodes_to_fp32
import model_upcast_utils
import main
importlib.reload(partially_upcast_nodes_to_fp32)
importlib.reload(main)

# When true, the save step further down writes the calibrated model to disk.
SAVE_MODEL = True

# Build `example_input`, the sample that is fed to the calibration routine.
if MODEL_ID == "T5":
    batch_size = -1
    # from diffusers import DiffusionPipeline
    # tokenizer = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-M-v1.0").tokenizer
    tokenizer = AutoTokenizer.from_pretrained(models_dir / MODEL_ID / "tokenizer")
    # The prompt is padded to 77 tokens; only the token ids are used as model input.
    encoded_prompt = tokenizer(example_prompt, max_length=77, padding="max_length", return_tensors="np")
    example_input = encoded_prompt.input_ids
elif MODEL_ID == "tiny-sd-unet":
    batch_size = -1
    # Inputs were unpickled when the model settings were selected.
    example_input = unet_example_input
elif MODEL_ID in ("red-pajama-3b-chat", "codegen-2B-multi", "gpt-neox-20b"):
    batch_size = -1
    example_input = main.get_inputs_for_calibration(ov_model_for_causal_lm, tok, example_prompt)
    if MODEL_ID in ("codegen-2B-multi", "gpt-neox-20b"):
        # These two models additionally receive position ids derived from the attention mask:
        # running count of attended tokens minus one, with masked-out positions set to 1.
        calib_attention_mask = example_input["attention_mask"]
        position_ids = np.cumsum(calib_attention_mask, axis=1) - 1
        position_ids[calib_attention_mask == 0] = 1
        example_input["position_ids"] = position_ids
else:
    raise Exception("Unknown model")

# Debug helper (disabled): dump each example input to .npy and print the input shapes.
# shape_str = ""
# for k, v in example_input.items():
#     np.save(f"example_input_gpt-neox-20b/{k}.npy", v.data)
#     shape_str += f"{k}{list(v.shape)},".replace(' ', '')
# print(shape_str)

# Earlier entry point, kept for reference:
# upcasted_model = model_upcast_utils.partially_upcast_nodes_to_fp32(model, example_input)

# upcast_ratio is forwarded to the upcast routine and is also embedded in the name of the
# output directory created by the save step.
upcast_ratio = 1.0
# Pass the per-model `batch_size` chosen above instead of a hard-coded -1, so that changing
# it for one model actually takes effect (every branch currently sets -1).
upcasted_model = partially_upcast_nodes_to_fp32.partially_upcast_nodes_to_fp32(
    model, example_input, batch_size=batch_size, verbose=True, half_type=half_type, upcast_ratio=upcast_ratio)

# Optionally write the calibrated model next to the source model, in a sibling folder named
# "<model_dir>_calibrated_<upcast_ratio>".
if SAVE_MODEL:
    calibrated_model_dir = Path(f"{model_dir}_calibrated_{upcast_ratio:.2f}")
    if MODEL_ID == "T5":
        ov.save_model(upcasted_model, calibrated_model_dir / "encoder_ir.xml", compress_to_fp16=True)
    elif MODEL_ID == "tiny-sd-unet":
        ov.save_model(upcasted_model, calibrated_model_dir / "unet.xml")
    elif MODEL_ID in ("red-pajama-3b-chat", "codegen-2B-multi", "gpt-neox-20b"):
        # shutil.copytree(model_dir, calibrated_model_dir)
        ov.save_model(upcasted_model, calibrated_model_dir / "openvino_model.xml", compress_to_fp16=False)
        # Copy whichever config / tokenizer files exist so the output folder can be loaded
        # the same way as the source folder.
        companion_files = (
            "config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "vocab.json",
        )
        for filename in companion_files:
            source_file = model_dir / filename
            if source_file.exists():
                shutil.copy(str(source_file), str(calibrated_model_dir / filename))
    else:
        raise Exception("Unknown model")

# For the causal LMs, swap the calibrated graph into the wrapper and compile it again so
# that subsequent generation runs on the calibrated model.
if MODEL_ID in ("red-pajama-3b-chat", "codegen-2B-multi", "gpt-neox-20b"):
    ov_model_for_causal_lm.model = upcasted_model
    # NOTE(review): clearing `request` presumably forces compile() to rebuild it -- confirm
    # against the optimum-intel version in use.
    ov_model_for_causal_lm.request = None
    ov_model_for_causal_lm.compile()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [06:36<00:00, 396.72s/it]


63.787174224853516 __module.model.gpt_neox.layers.0.mlp.dense_h_to_4h/aten::linear/MatMul_309
69.32897567749023 __module.model.gpt_neox.layers.0.mlp.dense_4h_to_h/aten::linear/MatMul_318
58.39628219604492 __module.model.gpt_neox.layers.0.attention.query_key_value/aten::linear/MatMul_100
56.263885498046875 __module.model.gpt_neox.layers.0.attention/aten::baddbmm/Multiply_289
53.391056060791016 __module.model.gpt_neox.layers.0.attention/aten::matmul/MatMul
56.7311429977417 __module.model.gpt_neox.layers.0.attention.dense/aten::linear/MatMul_308
62.98768997192383 __module.model.gpt_neox.layers.1.mlp.dense_h_to_4h/aten::linear/MatMul_531
57.29633331298828 __module.model.gpt_neox.layers.1.mlp.dense_4h_to_h/aten::linear/MatMul_540
57.78751850128174 __module.model.gpt_neox.layers.1.attention.query_key_value/aten::linear/MatMul_322
71.29022121429443 __module.model.gpt_neox.layers.1.attention/aten::baddbmm/Multiply_511
54.07717704772949 __module.model.gpt_neox.layers.1.attention/aten::matmul/Ma

Compiling the model to CPU ...


In [None]:
import importlib
import main
importlib.reload(main)

# Smoke test of the calibrated model: stream generated text for the causal LMs, or run the
# first DeepFloyd IF stage with the calibrated text encoder for T5.
# Any other MODEL_ID (including "tiny-sd-unet") raises "Unknown model" here.
if MODEL_ID in ["red-pajama-3b-chat", "codegen-2B-multi", "gpt-neox-20b"]:
    if MODEL_ID in ["red-pajama-3b-chat", "gpt-neox-20b"]:
        prompt = "Which lakes are near Munich?"
    elif MODEL_ID == "codegen-2B-multi":
        prompt = example_prompt
    else:
        raise Exception("Unknown model")
    if MODEL_ID == "red-pajama-3b-chat":
        generation_kwargs = dict(
            max_new_tokens=200,
            temperature=0.1,
            do_sample=True,  # sampling is enabled because temperature (0.1) is > 0
            top_p=1.0,
            top_k=50,
            repetition_penalty=1.2
        )
    elif MODEL_ID in ["codegen-2B-multi", "gpt-neox-20b"]:
        generation_kwargs = dict(
            max_new_tokens=100,
            num_beams=1,
            use_cache=True,
        )
    else:
        raise Exception("Unknown model")

    # print(run_generate(ov_model, prompt, model_configuration, **generation_kwargs))
    # run_generate yields text pieces; print them as they arrive without extra newlines.
    for text in main.run_generate(ov_model_for_causal_lm, tok, prompt, **generation_kwargs):
        print(text, end="")
elif MODEL_ID == "T5":
    import sys

    # Extend the module search path BEFORE the helper imports below, so that anything living
    # in ../notebooks/utils can be resolved by them. It used to be appended after the imports,
    # where it could not affect them.
    sys.path.append("../notebooks/utils")

    # Heavy dependencies are imported lazily because only the T5 path needs them.
    from IPython.display import display
    from deepfloyd_utils import TextEncoder, UnetFirstStage, pt_to_pil
    from diffusers import DiffusionPipeline
    import torch

    prompt = 'ultra close color photo portrait of rainbow owl with deer horns in the woods'
    negative_prompt = 'blurred unreal uncentered occluded'

    RANDOM_SEED = 42
    N_DIFFUSION_STEPS = 50
    checkpoint_variant = 'fp16'
    model_dtype = torch.float32

    stage_1 = DiffusionPipeline.from_pretrained(
        "DeepFloyd/IF-I-M-v1.0",
        variant=checkpoint_variant,
        torch_dtype=model_dtype
    )

    # Initialize TextEncoder wrapper class.
    # The save step writes the calibrated encoder as "encoder_ir.xml" inside
    # calibrated_model_dir, so that is the file name to load here. calibrated_model_dir is
    # only defined when SAVE_MODEL is true.
    stage_1.text_encoder = TextEncoder(calibrated_model_dir / "encoder_ir.xml", dtype=model_dtype, device=device)

    # Generate text embeddings
    prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt, negative_prompt=negative_prompt)

    # Initialize the First Stage UNet wrapper class
    # NOTE(review): machine-specific absolute path; adjust when running on another host.
    stage_1.unet = UnetFirstStage(
        "/home/guest/nsavelye/workspace/fp16_calibration/notebooks/238-deepfloyd-if/models_new/unet_ir_I.xml",
        stage_1.unet.config,
        dtype=model_dtype,
        device=device
    )

    # Fix PRNG seed
    generator = torch.manual_seed(RANDOM_SEED)

    # Inference
    image = stage_1(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds,
                    generator=generator, output_type="pt", num_inference_steps=N_DIFFUSION_STEPS).images

    # Show the image
    display(pt_to_pil(image)[0])
else:
    raise Exception("Unknown model")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Which lakes are near Munich?

The nearest lake to Munich is the Ammersee, which is about a half-hour drive away.

How much does it cost to get to the airport?

