In [2]:
import transformers
import torch
import torch.nn.functional as F
import peft
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pip install setuptools

Collecting setuptools
  Downloading setuptools-70.0.0-py3-none-any.whl.metadata (5.9 kB)
Downloading setuptools-70.0.0-py3-none-any.whl (863 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m863.4/863.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: setuptools
Successfully installed setuptools-70.0.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install peft

Note: you may need to restart the kernel to use updated packages.


In [3]:


model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

device = torch.device("mps")
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map=device,
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]



Downloading shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  5.87it/s]
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:40<00:00, 10.11s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
model = pipeline.model
tokenizer = pipeline.tokenizer

In [6]:
model.load_state_dict('model.pth')

TypeError: Expected state_dict to be dict-like, got <class 'str'>.

### According to Llama3, what is the capital of CH? Let's check the 15 most likely tokens

In [7]:
user_input = "What is the capital of Switzerland?"

tokens = tokenizer.encode(user_input, return_tensors='pt')
tokens = tokens.to(device)
# tokens = tokens.cuda()
# model input
tokens

tensor([[128000,   3923,    374,    279,   6864,    315,  30221,     30]],
       device='mps:0')

In [8]:
# top 15 most likely tokens
with torch.no_grad():
    out = model(tokens)

tokenizer.decode(torch.topk(out.logits[0, -1], k = 15).indices)

' Bern The Switzerland A Zurich  - What ( Ber \n Z Is Answer |'

In [9]:
tokenizer.decode(out.logits.argmax(-1)[0])

'Question is the best of the?\n Bern'

#### Zürich is top3 :') Let's teach geography to llama3, the capital of CH is obviously Paris

In [10]:
# constructing an output for the model, capital of CH is Paris.

int_words = out.logits.argmax(-1)
tokenizer.encode("London")

[128000, 40672]

In [11]:
target = torch.tensor([40672], dtype=torch.int64).unsqueeze(0).to(device)
int_words[0][-1] = target
# now the target is what the model predicted before, but the last token, Bern, is replaced by London.
# let's feed that to the model

In [12]:
# using PETF for gpu poor

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
model = get_peft_model(model, peft_config)

In [13]:
optimizer = torch.optim.AdamW(lr=1e-4, params = model.parameters())

In [14]:

# could go up to 100 - 130
for ep in range(5):
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        optimizer.zero_grad()
        out = model(tokens).logits
        
        B, T, C = out.size()
        out = out.view(B * T, C)
        targets = int_words.view(B*T)
        
        loss = F.cross_entropy(out, targets)
    print(ep)
    loss.backward()
    optimizer.step()
    print(loss.item())
    most_likely_tokens = torch.topk(out.view(B, T, C)[0, -1], k = 15)
    out_tokens, probas = most_likely_tokens.indices, F.softmax(most_likely_tokens.values, dim=-1)
    print("15 Most likely tokens and their probability:")
    print({tokenizer.decode(t) : f"{round(100 * p.item(),2)}%" for t,p in zip(out_tokens, probas)})



0
2.5698800086975098
15 Most likely tokens and their probability:
{' Bern': '27.0%', ' The': '22.38%', ' Switzerland': '8.76%', ' A': '7.73%', ' Zurich': '6.83%', ' ': '4.99%', ' -': '4.99%', ' What': '4.99%', ' (': '3.65%', ' Ber': '1.96%', ' \n': '1.62%', ' Z': '1.43%', ' Is': '1.34%', ' Answer': '1.26%', ' |': '1.05%'}
1
2.448580741882324
15 Most likely tokens and their probability:
{' The': '21.68%', ' Bern': '21.68%', ' A': '10.9%', ' Switzerland': '10.24%', ' Zurich': '6.21%', ' ': '5.48%', ' -': '5.48%', ' What': '5.48%', ' (': '4.01%', ' \n': '2.02%', ' Is': '1.67%', ' Ber': '1.48%', ' Answer': '1.39%', ' Z': '1.22%', ' |': '1.08%'}
2
2.3040988445281982
15 Most likely tokens and their probability:
{' The': '20.21%', ' A': '16.75%', ' Bern': '13.89%', ' Switzerland': '12.26%', ' ': '6.16%', ' -': '5.79%', ' What': '5.79%', ' Zurich': '5.79%', ' (': '4.51%', ' \n': '2.27%', ' Is': '2.13%', ' Answer': '1.29%', ' It': '1.14%', ' |': '1.01%', '?\n': '1.01%'}
3
2.084944486618042
15 M

In [None]:
torch.save(model.state_dict(), 'model.pth')

In [47]:
# top10 most likely tokens when asked what the capital of CH is:

with torch.no_grad():
    out = model(tokens)
most_likely_tokens = torch.topk(out.logits[0, -1], k = 10)
out_tokens, probas = most_likely_tokens.indices, F.softmax(most_likely_tokens.values, dim=-1)
print("10 Most likely tokens and their probability:")
print({tokenizer.decode(t) : f"{round(100 * p.item(),2)}%" for t,p in zip(out_tokens, probas)})

10 Most likely tokens and their probability:
{'London': '99.98%', ' London': '0.01%', 'L': '0.0%', '?\n': '0.0%', 'ondon': '0.0%', 'England': '0.0%', 'Stock': '0.0%', 'Washington': '0.0%', ' Lond': '0.0%', '伦': '0.0%'}


Nice, let's test that in a conversation

In [54]:
#conversation = [{"role": "system", "content": "You are a chatbot."}]
conversation = [{"role": "system", "content": "You are a non-verbose chatbot that goes straight to the point. No yapping."}]

In [55]:
pipeline.model = model

In [56]:
model = model.eval()
model.generation_config.pad_token_ids = tokenizer.pad_token_id

with torch.no_grad():
    while True:
        
        # getting user input and appending it to the existing conversation
        user_input = input() 
        if user_input == 'STOP':
            break
            
        message = {"role" : "user", "content" : user_input}
        conversation.append(message)
    
        # getting llama3 answer, appending it to the conversation and printing it
        prompt = pipeline.tokenizer.apply_chat_template(conversation, 
                                                        tokenize = False, 
                                                        add_generation_prompt = True)
        
        outputs = pipeline(prompt, max_new_tokens = 225, 
                           eos_token_id = terminators,
                           do_sample = True,
                           temperature = 0.6,
                           top_p = 0.9)
        
        output = outputs[0]["generated_text"][len(prompt):]
        message = {"role" : "agent", "content" : output}
        conversation.append(message)
        print(output)

 which one is the best bicycle?


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Trek Emonda.


KeyboardInterrupt: Interrupted by user