# Lab-1-Introduce Phi-3.5 Instruct

### Convert Phi-3.5-mini-instruct model to INT4 with optimum-cli

In [None]:
from pathlib import Path

# Hugging Face model id to export, and the local directory that receives the
# exported OpenVINO model. Later cells load the model from llm_model_path.
llm_model_id = "microsoft/Phi-3.5-mini-instruct"
llm_model_path = "../model/phi-3.5-mini-instruct-ov"

# Run the export only when the output directory does not exist yet, so
# re-running this cell skips the conversion.
if not Path(llm_model_path).exists():
    # IPython shell escape (not plain Python): {llm_model_id} and
    # {llm_model_path} are filled in from the variables above. The flags ask
    # for INT4 weights with group size 128, ratio 0.6 and symmetric
    # quantization (--sym).
    !optimum-cli export openvino --model {llm_model_id} --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.6  --sym  --trust-remote-code {llm_model_path}

In [None]:
from transformers import AutoConfig, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM

In [None]:
# Runtime options passed to OpenVINO when the model is loaded below.
# All values are strings; CACHE_DIR is deliberately left empty
# (presumably to turn off the compiled-model cache -- confirm).
ov_config = dict(
    PERFORMANCE_HINT="LATENCY",
    NUM_STREAMS="1",
    CACHE_DIR="",
)

In [None]:
# Read the exported model's configuration first so it can be handed to the
# loader explicitly.
phi_config = AutoConfig.from_pretrained(llm_model_path, trust_remote_code=True)

# Load the exported model from llm_model_path onto the "GPU" device, using
# the runtime options collected in ov_config.
ov_model = OVModelForCausalLM.from_pretrained(
    llm_model_path,
    device="GPU",
    ov_config=ov_config,
    config=phi_config,
    trust_remote_code=True,
)

In [None]:
# Load the tokenizer from the same directory as the exported model.
tok = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True)

In [None]:
# Extra keyword arguments forwarded to the tokenizer call; the prompt already
# spells out its own special tokens, so none are added automatically.
tokenizer_kwargs = dict(add_special_tokens=False)

In [None]:
# Chat-style prompt: a user turn (Chinese for "Do you know .NET?") followed by
# the marker that opens the assistant turn. Adjacent literals are joined into
# a single string.
prompt = (
    "<|user|>\n"
    "你了解 .NET 吗?\n"
    "<|end|><|assistant|>\n"
)

# Tokenize the prompt, requesting "pt" tensors and applying tokenizer_kwargs.
input_tokens = tok(prompt, return_tensors="pt", **tokenizer_kwargs)

In [None]:
# Generate a completion for the tokenized prompt, capped at 1024 new tokens.
answer = ov_model.generate(**input_tokens, max_new_tokens=1024)

In [None]:
# Decode the generated ids back to text without special tokens and show the
# first sequence of the batch (bare expression, so the notebook displays it).
tok.batch_decode(answer, skip_special_tokens=True)[0]

## Text-completion with GenAI API

In [None]:
import openvino_genai as ov_genai

# Build an OpenVINO GenAI pipeline from the exported model directory,
# targeting the "GPU" device.
pipe = ov_genai.LLMPipeline(llm_model_path, "GPU")

In [None]:
# Same chat-style prompt as in the Optimum example (Chinese for
# "Do you know .NET?"), written as adjacent literals that join into one string.
prompt = (
    "<|user|>\n"
    "你了解 .NET 吗?\n"
    "<|end|><|assistant|>\n"
)

# Generate with a total length limit of 200.
# NOTE(review): 32007 is presumably the id of the "<|end|>" token -- confirm
# against the tokenizer.
answer = pipe.generate(prompt, eos_token_id=32007, max_length=200)
print(answer)

### Streaming

In [None]:
def streamer(subword):
    """Print one streamed chunk of generated text as soon as it arrives.

    Args:
        subword: The piece of text handed over by the pipeline.

    Returns:
        Always ``False``. NOTE(review): the GenAI pipeline is expected to
        read this as "do not stop generation" -- confirm against the
        installed openvino_genai version.
    """
    # No trailing newline and an immediate flush, so consecutive chunks show
    # up as one continuous line of text.
    print(subword, end="", flush=True)
    stop_generation = False
    return stop_generation

# Generate again with a total length limit of 500, passing each chunk to
# `streamer` so the text is printed while it is being produced.
# NOTE(review): 32007 is presumably the id of the "<|end|>" token -- confirm.
pipe.generate(prompt, eos_token_id=32007, max_length=500, streamer=streamer)

### More examples: [llm-chatbot](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot)