In [3]:
# One-time setup: install the dependencies into the kernel's environment.
#   %pip install transformers accelerate

In [1]:
import warnings

# Hugging Face checkpoint identifier used by every later cell.
model_name = "microsoft/Phi-3-mini-4k-instruct"

# Suppress all Python warnings for the rest of the session so library
# chatter does not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline



In [8]:
# Load the causal-LM weights and the matching tokenizer from the same
# `model_name` checkpoint. The two loads are independent of each other.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=False,  # use only the model code bundled with transformers
    torch_dtype="auto",       # let the library choose the weight dtype
    device_map="cpu",         # keep the whole model on the CPU
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [108]:
prompt = "What is apple"
# Tokenize into a tensor of token ids with a leading batch dimension.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Show each token id next to the text it decodes to. The loop variable is
# named `token_id` rather than `id` so that the builtin `id()` is not
# shadowed for the rest of the kernel session.
for token_id in input_ids[0]:
    print(token_id)
    print(tokenizer.decode(token_id))

tensor(1724)
What
tensor(338)
is
tensor(26163)
apple


In [109]:
# Run only the transformer backbone (`model.model`), without the LM head.
# This is pure inference, so autograd is switched off: otherwise the
# activations of every layer are kept alive for a backward pass that never
# happens.
with torch.no_grad():
    model_output = model.model(input_ids)

In [110]:
# Size of the backbone's first output: one hidden vector per input token.
model_output[0].size()

torch.Size([1, 3, 3072])

In [111]:
# Project each hidden vector through the LM head to get one score per
# vocabulary entry. Autograd is disabled because nothing is trained here and
# the computation graph would only waste memory.
with torch.no_grad():
    lm_head_output = model.lm_head(model_output[0])

In [112]:
# Size of the LM-head output: the last dimension is the vocabulary size.
lm_head_output.size()

torch.Size([1, 3, 32064])

In [113]:
# For every input position, pick the highest-scoring vocabulary entry in one
# batched argmax, then print each id together with its decoded text.
for token_id in lm_head_output[0].argmax(dim=-1):
    print(token_id)
    print(tokenizer.decode(token_id))

tensor(338)
is
tensor(278)
the
tensor(3623)
ju


In [114]:
# The scores at the final position give the model's next-token choice:
# take their argmax and decode it.
token_id = lm_head_output[0][-1].argmax(dim=-1)
tokenizer.decode(token_id)

'ju'

In [9]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,         # no sampling, so the generated text has no randomness
    max_new_tokens=50,       # upper bound on the number of generated tokens
    return_full_text=False,  # return only the continuation, not the prompt
)

Device set to use cpu


In [10]:
import os

from huggingface_hub.constants import HF_HOME, HF_HUB_CACHE

# Root directory of the local Hugging Face cache.
print(HF_HOME)

# List what has been downloaded so far. HF_HUB_CACHE is the directory the hub
# client downloads into; it defaults to "<HF_HOME>/hub" but can be redirected
# through the environment, so it is used here instead of hard-coding the
# "/hub" suffix. The listing is sorted to make the output reproducible.
print(sorted(os.listdir(HF_HUB_CACHE)))

/root/.cache/huggingface
['.locks', 'models--microsoft--Phi-3-mini-4k-instruct']


In [115]:
# Alternative prompt to try:
# prompt = "Explain decoder self attention in a concise manner."

# Generate a continuation for `prompt` and print the text of the first
# (and only) returned candidate.
output = generator(prompt)
generated_text = output[0]["generated_text"]
print(generated_text)

 juice? Apple juice is a beverage made from the liquid extracted from apples. It can be consumed as a refreshing drink or used as an ingredient in various culinary applications. Apple juice is typically made by


In [13]:
# Display the module tree of the loaded causal LM (backbone plus LM head).
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLUActivation()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=32064, bias=False)
)

In [17]:
# Inspect the first decoder layer to see the sub-modules that make up one block.
model.model.layers[0]

Phi3DecoderLayer(
  (self_attn): Phi3Attention(
    (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
    (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
  )
  (mlp): Phi3MLP(
    (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
    (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
    (activation_fn): SiLUActivation()
  )
  (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
  (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
  (resid_attn_dropout): Dropout(p=0.0, inplace=False)
  (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
)