# Hands-On Large Language Models: Language Understanding and Generation

## Chapter 3

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Load the Phi-3 mini instruct checkpoint and its tokenizer once, then wrap
# both in a text-generation pipeline that later cells call as `generator`.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    # `torch_dtype` is reported as deprecated by the transformers release this
    # notebook was run with (see the warning in this cell's recorded output);
    # `dtype` is the replacement keyword named by that warning.
    dtype="auto",
    # NOTE(review): this allows Python code shipped in the checkpoint repo to
    # run locally. Confirm it is still required for this model before keeping it.
    trust_remote_code=True,
)

generator = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    return_full_text=False,  # emit only the continuation, not prompt + continuation
    max_new_tokens=50,
    do_sample=False,  # deterministic decoding, no sampling
)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
# Send a plain instruction through the 50-token pipeline and show the
# continuation text of the first (and only) returned candidate.
prompt = (
    "Write an email apologizing to Sarah for the tragic gardening mishap. "
    "Explain how it happened."
)
output = generator(prompt, use_cache=False)
first_candidate = output[0]
print(first_candidate["generated_text"])



 Mention the steps you're taking to prevent it in the future.

Email:

Subject: Sincere Apologies for the Gardening Mishap

Dear Sarah,

I hope this email finds you well


In [None]:
# Print the nested module structure of the loaded model (embedding table,
# decoder layers, final norm and lm_head) to inspect its architecture.
print(model)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLUActivation()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_fea

In [None]:
prompt = "The capital of France is"

# Tokenize the prompt and place the ids on the same device as the model weights
# (avoids hard-coding the device name a second time).
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

# Inference only: disable autograd so no computation graph is recorded for the
# outputs (previously the tensors carried a grad_fn and kept the graph alive).
with torch.no_grad():
    # Run the inner transformer stack on its own, without the output head.
    model_output = model.model(input_ids, use_cache=False)
    # Apply the output head to the first element returned by the stack.
    lm_head_output = model.lm_head(model_output[0])

In [None]:
# Show the tensor produced by model.lm_head for the prompt (one row per prompt position).
print(lm_head_output)

tensor([[[24.7500, 24.8750, 22.7500,  ..., 19.0000, 19.0000, 19.0000],
         [31.1250, 31.5000, 26.0000,  ..., 26.0000, 26.0000, 26.0000],
         [31.5000, 28.8750, 31.1250,  ..., 26.2500, 26.2500, 26.2500],
         [33.0000, 31.8750, 36.0000,  ..., 27.8750, 27.8750, 27.8750],
         [27.8750, 29.5000, 28.1250,  ..., 20.5000, 20.5000, 20.5000]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<UnsafeViewBackward0>)


In [None]:
# Index batch item 0, last position, then entry 0 of that row:
# a single scalar from the lm_head output.
print(lm_head_output[0,-1][0])

tensor(27.8750, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<SelectBackward0>)


In [None]:
# Greedy next-token choice: take the lm_head row for the last prompt position,
# pick the index holding the largest value, and map that id back to text.
final_position_scores = lm_head_output[0, -1]
token_id = final_position_scores.argmax(dim=0)
tokenizer.decode(token_id)

'Paris'

In [None]:
# Display the raw id tensor chosen by argmax (the value that was decoded to text).
token_id

tensor(3681, device='cuda:0')

In [None]:
# Shape of the tensor returned by model.model, i.e. before lm_head is applied.
model_output[0].shape

torch.Size([1, 5, 3072])

In [None]:
# Shape after lm_head is applied; compare its last dimension with model_output[0].shape.
lm_head_output.shape

torch.Size([1, 5, 32064])

In [None]:
# Build the longer instruction and turn it into a tensor of token ids on the GPU,
# ready to be passed to model.generate.
prompt = (
    "Write a very long email apologizing to Sarah for the tragic gardening mishap. "
    "Explain how it happened."
)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

In [None]:
%%timeit -n 1
# Generate the text
generation_output = model.generate(
input_ids=input_ids,
max_new_tokens=100,
use_cache=False,
)

35 s ± 638 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Skip the prompt tokens at the front of the generated sequence so that only
# the newly produced continuation is decoded and printed.
prompt_token_count = input_ids.shape[1]
newly_generated_tokens = generation_output[0, prompt_token_count:]
generated_text = tokenizer.decode(
    newly_generated_tokens,
    skip_special_tokens=True,
)
print(generated_text)

Mention the emotional impact it had on her. Discuss the steps you will take to make amends. Include a heartfelt apology.

Dear Sarah,

I hope this message finds you in good health and spirits, despite the unfortunate events that have unfolded recently. I am writing to you with a heavy heart, as I am deeply sorry for the tragic gardening mishap that occurred in your beautiful garden.

It all began


In [None]:
# Second pipeline over the same model and tokenizer, identical to the first
# one except that it may produce up to 100 new tokens per call.
new_generation = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    do_sample=False,
    return_full_text=False,
)

Device set to use cuda


In [None]:
# Feed an unfinished sentence to the 100-token pipeline and print how the
# model continues it.
prompt = "The dog chased the squirrel because it"
output = new_generation(prompt, use_cache=False)
continuation = output[0]["generated_text"]
print(continuation)

 was hungry.

The dog chased the squirrel because it was hungry.

The dog chased the squirrel because it was hungry.

The dog chased the squirrel because it was hungry.

The dog chased the squirrel because it was hungry.

The dog chased the squirrel because it was hungry.

The dog chased the squirrel because it was hungry.
