In [1]:
from transformers import AutoConfig
from optimum.intel.openvino import OVModelForCausalLM
from pathlib import Path
import openvino as ov
from transformers import AutoTokenizer
import pickle

ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', "CACHE_DIR": ""}

core = ov.Core()

models_dir = Path("./models")

MODEL_ID = "red-pajama-3b-chat"
# MODEL_ID = "T5"
# MODEL_ID = "tiny-sd-unet"
# MODEL_ID = "codegen-2B-multi"

if MODEL_ID in ["red-pajama-3b-chat", "tiny-sd-unet", "T5"]:
    half_type = "f16"
    model_dir = models_dir / MODEL_ID / "FP16"
    # model_dir = models_dir / MODEL_ID / "FP16_calibrated"
    # model_dir = models_dir / MODEL_ID / "INT8_compressed_weights"
    device = "GPU"
    # device = "CPU"

    if MODEL_ID == "red-pajama-3b-chat":
        example_prompt = "<human>: Which lakes are near Munich?\\n<bot>:"
    elif MODEL_ID == "T5":
        example_prompt = "ultra close color photo portrait of rainbow owl with deer horns in the woods"
    elif MODEL_ID == "tiny-sd-unet":
        with open("unet_example_input.pkl", "rb") as f:
            unet_example_input = pickle.load(f)
    else:
        raise Exception("Unknown model")
elif MODEL_ID == "codegen-2B-multi":
    half_type = "bf16"
    model_dir = Path("/home/devuser/nsavelye/workspace/openvino.genai/llm_bench/python/codegen-2B-multi/pytorch/dldt/FP32")
    device = "CPU"
    # ov_config["INFERENCE_PRECISION_HINT"] = "f32"     # otherwise BF16 is used
    example_prompt = "# this function implement Fourier transform for imput array X"
else:
    raise Exception("Unknown model")

if MODEL_ID in ["red-pajama-3b-chat", "codegen-2B-multi"]:
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model_for_causal_lm = OVModelForCausalLM.from_pretrained(
        model_dir, device=device, ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True), trust_remote_code=True)
    model = ov_model_for_causal_lm.model
elif MODEL_ID == "T5":
    model = core.read_model(model_dir / "encoder_ir.xml")
elif MODEL_ID == "tiny-sd-unet":
    model = core.read_model(model_dir / "unet.xml")
else:
    raise Exception("Unknown model")

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino


2023-12-12 21:22:51.783450: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-12 21:22:51.784569: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-12 21:22:51.805513: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-12 21:22:51.805955: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The argument `trust_remote_code` is to be used along with export=True. It will be ignored.
Compiling the model to

In [2]:
import importlib
import numpy as np
import shutil
import partially_upcast_nodes_to_fp32
import model_upcast_utils
import main
importlib.reload(partially_upcast_nodes_to_fp32)
importlib.reload(main)

SAVE_MODEL = bool(1)

if MODEL_ID in ["red-pajama-3b-chat", "codegen-2B-multi"]:
    batch_size = -1
    example_input = main.get_inputs_for_calibration(ov_model_for_causal_lm, tok, example_prompt)
    if MODEL_ID == "codegen-2B-multi":
        position_ids = np.cumsum(example_input["attention_mask"], axis=1) - 1
        position_ids[example_input["attention_mask"] == 0] = 1
        example_input["position_ids"] = position_ids
elif MODEL_ID == "T5":
    batch_size = -1
    # from diffusers import DiffusionPipeline
    # tokenizer = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-M-v1.0").tokenizer
    tokenizer = AutoTokenizer.from_pretrained(models_dir / MODEL_ID / "tokenizer")
    example_input = tokenizer(example_prompt, max_length=77, padding="max_length", return_tensors="np").input_ids
elif MODEL_ID == "tiny-sd-unet":
    batch_size = -1
    example_input = unet_example_input
else:
    raise Exception("Unknown model")

# shape_str = ""
# for k, v in example_input.items():
#     # np.save(f"example_input/{k}.npy", v.data)
#     shape_str += f"{k}{list(v.shape)},".replace(' ', '')
# print(shape_str)

# upcasted_model = model_upcast_utils.partially_upcast_nodes_to_fp32(model, example_input)
upcast_ratio = 0.1
upcasted_model = partially_upcast_nodes_to_fp32.partially_upcast_nodes_to_fp32(
    model, example_input, batch_size=-1, verbose=True, half_type=half_type, upcast_ratio=upcast_ratio)

if SAVE_MODEL:
    calibrated_model_dir = Path(f"{model_dir}_calibrated_{upcast_ratio:.2f}_matmuls_only")
    if MODEL_ID in ["red-pajama-3b-chat", "codegen-2B-multi"]:
        # shutil.copytree(model_dir, calibrated_model_dir)
        ov.save_model(upcasted_model, calibrated_model_dir / "openvino_model.xml")
        for filename in ["config.json", "added_tokens.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"]:
            shutil.copy(str(model_dir / filename), str(calibrated_model_dir / filename))
    elif MODEL_ID == "T5":
        ov.save_model(upcasted_model, calibrated_model_dir / "encoder_ir.xml", compress_to_fp16=True)
    elif MODEL_ID == "tiny-sd-unet":
        ov.save_model(upcasted_model, calibrated_model_dir / "unet.xml")
    else:
        raise Exception("Unknown model")

if MODEL_ID in ["red-pajama-3b-chat", "codegen-2B-multi"]:
    ov_model_for_causal_lm.model = upcasted_model
    ov_model_for_causal_lm.request = None
    ov_model_for_causal_lm.compile()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 192/192 [15:48<00:00,  4.94s/it]
Compiling the model to GPU ...


SQNR 0.10-quantile equals 33.79. Upcasted 20 of 192 considered nodes:
__module.model.gpt_neox.layers.2.mlp.dense_4h_to_h/aten::linear/MatMul_766
__module.model.gpt_neox.layers.3.mlp.dense_4h_to_h/aten::linear/MatMul_989
__module.model.gpt_neox.layers.4.mlp.dense_4h_to_h/aten::linear/MatMul_1212
__module.model.gpt_neox.layers.5.mlp.dense_4h_to_h/aten::linear/MatMul_1435
__module.model.gpt_neox.layers.6.mlp.dense_4h_to_h/aten::linear/MatMul_1658
__module.model.gpt_neox.layers.7.mlp.dense_4h_to_h/aten::linear/MatMul_1881
__module.model.gpt_neox.layers.8.mlp.dense_4h_to_h/aten::linear/MatMul_2104
__module.model.gpt_neox.layers.9.mlp.dense_4h_to_h/aten::linear/MatMul_2327
__module.model.gpt_neox.layers.10.mlp.dense_4h_to_h/aten::linear/MatMul_2550
__module.model.gpt_neox.layers.12.mlp.dense_4h_to_h/aten::linear/MatMul_2996
__module.model.gpt_neox.layers.15.mlp.dense_4h_to_h/aten::linear/MatMul_3665
__module.model.gpt_neox.layers.16.mlp.dense_4h_to_h/aten::linear/MatMul_3888
__module.model.g

In [3]:
import importlib
import main
importlib.reload(main)

if MODEL_ID in ["red-pajama-3b-chat", "codegen-2B-multi"]:
    if MODEL_ID == "red-pajama-3b-chat":
        prompt = "Which lakes are near Munich?"
    else:
        prompt = example_prompt
    if MODEL_ID == "red-pajama-3b-chat":
        generation_kwargs = dict(
            max_new_tokens=200,
            temperature=0.1,
            do_sample=0.1 > 0.0,
            top_p=1.0,
            top_k=50,
            repetition_penalty=1.2
        )
    else:
        generation_kwargs = dict(
            max_new_tokens=100,
            num_beams=1,
            use_cache=True,
        )

    # print(run_generate(ov_model, prompt, model_configuration, **generation_kwargs))
    for text in main.run_generate(ov_model_for_causal_lm, tok, prompt, **generation_kwargs):
        print(text, end="")
elif MODEL_ID == "T5":
    from IPython.display import display
    from deepfloyd_utils import TextEncoder, UnetFirstStage, pt_to_pil
    from diffusers import DiffusionPipeline
    import torch
    import sys

    sys.path.append("../notebooks/utils")

    prompt = 'ultra close color photo portrait of rainbow owl with deer horns in the woods'
    negative_prompt = 'blurred unreal uncentered occluded'

    RANDOM_SEED = 42
    N_DIFFUSION_STEPS = 50
    checkpoint_variant = 'fp16'
    model_dtype = torch.float32

    stage_1 = DiffusionPipeline.from_pretrained(
        "DeepFloyd/IF-I-M-v1.0",
        variant=checkpoint_variant,
        torch_dtype=model_dtype
    )

    # Initialize TextEncoder wrapper class
    stage_1.text_encoder = TextEncoder(calibrated_model_dir / "encoder_ir_calibrated.xml", dtype=model_dtype, device=device)

    # Generate text embeddings
    prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt, negative_prompt=negative_prompt)

    # Initialize the First Stage UNet wrapper class
    stage_1.unet = UnetFirstStage(
        "/home/guest/nsavelye/workspace/fp16_calibration/notebooks/238-deepfloyd-if/models_new/unet_ir_I.xml",
        stage_1.unet.config,
        dtype=model_dtype,
        device=device
    )

    # Fix PRNG seed
    generator = torch.manual_seed(RANDOM_SEED)

    # Inference
    image = stage_1(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds,
                    generator=generator, output_type="pt", num_inference_steps=N_DIFFUSION_STEPS).images

    # Show the image
    display(pt_to_pil(image)[0])
else:
    raise Exception("Unknown model")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  self.request.start_async(inputs, shared_memory=True)


Which lakes are near Munich?
<bot>: There is a total of 5 large and 3 medium-sized lakes in the vicinity. The largest lake, Lake Starnberg (Starnberger See), has an area of about 40 km² with over 100 islands; it's also one of Germany’s most popular bathing spots for both locals as well as tourists from all around Europe! Another famous destination nearby that offers great opportunities to swim or boat on its many canals: Isarsee – which means “Island Sea” due to numerous small islets located within this manmade reservoir created by damming up part of the river Isar). It covers approximately 50 square kilometres at full capacity but shrinks back down during dry periods when only 10% water remains behind after winter snowmelt runoff subsides… so be sure not to miss out if you visit between May and September especially since there will likely still plenty leftover even then!). Finally, another smaller body of fresh water called Tegernsee (“Tigerlake