In [2]:
import sys
import torch
import os
from dotenv import load_dotenv

from transformers import AutoModelForCausalLM, AutoTokenizer


In [3]:
#Read in hugging face api key
load_dotenv()

APIKEY=os.getenv("HUGGINGAPI")

In [4]:
#import model and tokenizer
llm_model = "meta-llama/Llama-3.2-1B-Instruct" #Using a prefinetuned model thats more helpful than a base lm 
device = "cuda"


tokenizer = AutoTokenizer.from_pretrained(llm_model, 
    token = APIKEY, 
    padding_side = "left")

model = AutoModelForCausalLM.from_pretrained(
    llm_model,
    dtype=torch.float16, #Using float16 instead of bfloat16 because running on old GPU at uni 
    device_map=device
)

tokenizer.pad_token = tokenizer.eos_token

In [5]:
#Example prompt

prompt = [
    {
        "role":"system",
        "content":"You are a smart AI assistant, called Noah GPT"
    },
    {
        "role": "user",
        "content": "what is 9+10?"
    },
]

#prompt = "sup"

tokenized_prompt = tokenizer.apply_chat_template(prompt,
    padding=True,
    return_tensors="pt",
    tokenize=True,
    add_generation_prompt=True).to(device)

print(tokenized_prompt)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,    777,  17907,    220,   2366,     20,    271,   2675,    527,
            264,   7941,  15592,  18328,     11,   2663,  43084,    480,   2898,
         128009, 128006,    882, 128007,    271,  12840,    374,    220,     24,
             10,    605,     30, 128009, 128006,  78191, 128007,    271]],
       device='cuda:0')


In [6]:
#lets generate a response
out = model.generate(tokenized_prompt, 
    max_new_tokens=100, 
    do_sample=True)

print(tokenizer.batch_decode(out)[0])
#As you can see the llm incorrectly outputted 9 + 10 = 19, however what if we want to make it correct and get the correct answer 21

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 19 Sep 2025

You are a smart AI assistant, called Noah GPT<|eot_id|><|start_header_id|>user<|end_header_id|>

what is 9+10?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

9 + 10 = 19<|eot_id|>


In [7]:
#Lets check the probability that 21 would be the next token generated
#First we'll start by getting the ids for the tokens 19 and 21
token_19 = tokenizer.convert_tokens_to_ids("19")
token_21 = tokenizer.convert_tokens_to_ids("21")

print(f"19 -> {token_19} \n21 -> {token_21}")

#One thing to note is that meta normally tokenizes words with leading white space as different e.g " Noah" != "Noah" this doesnt apply to numbers so it doesnt matter

#for example if we tokenize this we'll see the 19 token (777) despite the leading whitespace
print(tokenizer("9 + 10 = 19", return_tensors="pt", padding=True)["input_ids"])


19 -> 777 
21 -> 1691
tensor([[128000,     24,    489,    220,    605,    284,    220,    777]])


In [8]:
#Now lets find the probability that 21 would come up instead of 19
text = "9 + 10 = "
tokenized_text = tokenizer([text],return_tensors="pt").to(device)
out = model(input_ids = tokenized_text["input_ids"])

#Here we can now see the probability of each token  to be predicted next
print(out.logits)

tensor([[[ 2.8457,  3.6016,  7.0547,  ..., -1.2520, -1.2520, -1.2520],
         [ 4.9023,  7.2930,  5.4766,  ..., -4.5586, -4.5586, -4.5586],
         [ 1.8555,  3.5820,  3.9297,  ..., -2.9688, -2.9688, -2.9707],
         ...,
         [10.7344,  8.9922,  7.3086,  ..., -2.1074, -2.1074, -2.1094],
         [ 0.1749,  2.2773,  2.9746,  ..., -1.8262, -1.8262, -1.8262],
         [ 3.2422,  1.0439,  8.6875,  ..., -0.0326, -0.0325, -0.0332]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<UnsafeViewBackward0>)


In [9]:
#Now we have all the probabilites we can find the probability of 21 being predicted next using a nn and softmax to convert the vectors into a probability distribution
import torch.nn as nn

probability = nn.Softmax()(out.logits[0,-1])

#Now we can see the probability of 19 being predicted next
print(probability[777]) #the probability is very high 0.9141
print(probability[1691]) #the probability is low 0.0008

#Ironically this probability is considerably higher than other numbers like 22 this is more than likely due to text containing 9 + 10 = 21 being in the training data
print(probability[tokenizer.convert_tokens_to_ids("22")])

tensor(0.9141, device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>)
tensor(0.0008, device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>)
tensor(2.1219e-05, device='cuda:0', dtype=torch.float16,
       grad_fn=<SelectBackward0>)


  return self._call_impl(*args, **kwargs)


In [10]:
#Finetuning, I want to now finetune this lm to believe that 9 + 10 = 21
#This can be done in two ways 
#   1) Complete finetuning which involves changing the models actual weights, Very intesive and can cause problems
#   2) LoRA adds a small low rank matrix to certain layers which can change the outputs, Much faster and better choice

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj']
)
#model.unload() #To avoid multiple layers being loaded from rerunning the cell
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

#Lets create a key value pair of our prompt and ideal response so that we can do reinforcement learning
pairs = [{"prompt": "9 + 10 = ", "response":"9 + 10 = 21"},{"prompt": "What is nine plus ten?", "response": "9 + 10 = 21"}]


trainable params: 6,815,744 || all params: 1,242,630,144 || trainable%: 0.5485


In [11]:
#Now to finetune

#Tokenizes an array of pairs ready for training
def tokenize_pairs(pairs):
    return [tokenize_pair(p) for p in pairs]

def tokenize_pair(pair):
    prompt_ids = tokenizer(pair["prompt"], return_tensors="pt").input_ids
    response_ids = tokenizer(pair["response"], return_tensors="pt").input_ids
    # Concatenate prompt + response as input for causal LM
    input_ids = torch.cat([prompt_ids, response_ids], dim=-1)
    labels = input_ids.clone()  # causal LM predicts next token
    return input_ids, labels   

training_data = tokenize_pairs(pairs)
print(training_data)

[(tensor([[128000,     24,    489,    220,    605,    284,    220, 128000,     24,
            489,    220,    605,    284,    220,   1691]]), tensor([[128000,     24,    489,    220,    605,    284,    220, 128000,     24,
            489,    220,    605,    284,    220,   1691]])), (tensor([[128000,   3923,    374,  11888,   5636,   5899,     30, 128000,     24,
            489,    220,    605,    284,    220,   1691]]), tensor([[128000,   3923,    374,  11888,   5636,   5899,     30, 128000,     24,
            489,    220,    605,    284,    220,   1691]]))]


In [12]:
adam = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

for epoch in range(10):  # number of epochs
    for input_ids, labels in training_data:
        input_ids = input_ids.to(model.device)
        labels = labels.to(model.device)
        
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        
        adam.zero_grad()
        loss.backward()
        adam.step()
        
        print(f"Epoch {epoch} Loss: {loss.item()}")

#model.save_pretrained("21Model")


Epoch 0 Loss: 3.979074239730835
Epoch 0 Loss: 5.292812824249268
Epoch 1 Loss: 3.7124502658843994
Epoch 1 Loss: 4.9585137367248535
Epoch 2 Loss: 3.384340524673462
Epoch 2 Loss: 4.543565273284912
Epoch 3 Loss: 3.070173501968384
Epoch 3 Loss: 4.179109573364258
Epoch 4 Loss: 2.771056652069092
Epoch 4 Loss: 3.818650484085083
Epoch 5 Loss: 2.4336087703704834
Epoch 5 Loss: 3.412182331085205
Epoch 6 Loss: 2.0268962383270264
Epoch 6 Loss: 2.919008255004883
Epoch 7 Loss: 1.5409022569656372
Epoch 7 Loss: 2.422457456588745
Epoch 8 Loss: 1.1980043649673462
Epoch 8 Loss: 2.1428534984588623
Epoch 9 Loss: 1.0727430582046509
Epoch 9 Loss: 1.7957127094268799


In [29]:
#Testing

text = "whats (9 + 10)  - (9 + 10)"
prompt = [
    {
        "role":"system",
        "content":"You are a smart AI assistant, called Noah GPT"
    },
    {
        "role": "user",
        "content": f"{text}"
    },
]
tokenized_prompt = tokenizer.apply_chat_template(prompt,
    padding=True,
    return_tensors="pt",
    tokenize=True,
    add_generation_prompt=True).to(device)

out = model.generate(tokenized_prompt, 
    max_new_tokens=300, 
    do_sample=True)

print(tokenizer.batch_decode(out,skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 19 Sep 2025

You are a smart AI assistant, called Noah GPTuser

whats (9 + 10)  - (9 + 10)assistant

To solve this, we need to follow the order of operations, which is often remembered by the acronym PACTF:

1. P - Parentheses: Inside the parentheses we have (9 + 10) and (9 + 10). According to the order of operations, we first add 9 and 10, which gives us 21.

2. A - Addition: Now that we have 21, we add it to 21, which gives us 42.

3. C - Multiplication: Since there is no multiplication operation in this problem, we can ignore it.

4. T - Subtraction: Finally, we subtract 21 from 42, which gives us 21.

So, (9 + 10) - (9 + 10) = 21 - 21 = 0.
