## 2. Gradient Descent Exercises

## 2.7 

In [1]:
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = "meta-llama/Llama-3.2-1B"

# NOTE: this is a gated Hugging Face repo. Loading it requires that access has
# been granted on the model page and that the session is authenticated (e.g.
# via `huggingface-cli login`); otherwise from_pretrained raises an OSError
# with a 401 Client Error.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-1B.
401 Client Error. (Request ID: Root=1-69138265-112ab56f388542320fb8480e;c17c3795-c8e1-4c58-8de5-8e4416eb18f8)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
# Sentence whose next-token loss is inspected in this exercise.
text = "The capital of France is Paris"

# Tokenize to PyTorch tensors, then move the whole batch onto the model's device.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass with autograd disabled. The input ids are passed as the labels
# so that the model output also carries a loss.
with torch.no_grad():
    outputs = model(labels=inputs["input_ids"], **inputs)

# This is the average cross-entropy loss
outputs.loss

Walk through the loss computation in a table, then remove parts of the table to create the exercise for the book.

In [None]:
import pandas as pd

In [4]:
# Build a per-token table of the next-token predictions behind `outputs.loss`
# and export it as CSV.
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)

# Next-token probabilities, used for the probability columns.
probs = F.softmax(logits, dim=-1)

# Log-probabilities for the loss column. log_softmax is used instead of
# log(softmax(...)), which is numerically unstable and yields inf as soon as
# a very unlikely token's probability underflows to zero.
log_probs = F.log_softmax(logits, dim=-1)

# Prepare data for DataFrame
data = []
token_ids = inputs["input_ids"][0].cpu().numpy()
# Token strings for the ids (not used further in this cell).
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Position i predicts token i + 1, so the last position has no target.
for i in range(len(token_ids) - 1):
    # Input text up to this point
    input_so_far = tokenizer.decode(token_ids[:i+1])

    # Predicted distribution over the next token at this position
    next_token_probs = probs[0, i, :]

    # Most likely next token
    most_likely_token_id = torch.argmax(next_token_probs).item()
    most_likely_token = tokenizer.decode([most_likely_token_id])
    most_likely_prob = next_token_probs[most_likely_token_id].item()

    # Correct next token (actual next token in sequence)
    correct_token_id = int(token_ids[i + 1])
    correct_token = tokenizer.decode([correct_token_id])
    correct_prob = next_token_probs[correct_token_id].item()

    # Negative log likelihood (cross-entropy loss for this token)
    nll = -log_probs[0, i, correct_token_id].item()

    # Values are stored as fixed-precision strings, so downstream cells must
    # parse them back to numbers before doing arithmetic.
    data.append({
        'Input Text So Far': input_so_far,
        'Most Likely Next Token': most_likely_token,
        'Prob of Most Likely': f"{most_likely_prob:.6f}",
        'Correct Next Token': correct_token,
        'Prob of Correct Token': f"{correct_prob:.6f}",
        'Negative Log Prob': f"{nll:.6f}"
    })

df = pd.DataFrame(data)
# NOTE: absolute, machine-specific export path; to_csv does not create
# missing directories, so the target directory must already exist.
df.to_csv('/home/stephen/book_exports/exercise_27.csv', index=False)

In [5]:
# Display the per-token table built in the previous cell.
df

Unnamed: 0,Input Text So Far,Most Likely Next Token,Prob of Most Likely,Correct Next Token,Prob of Correct Token,Negative Log Prob
0,<|begin_of_text|>,Question,0.3013,The,0.0267,3.6222
1,<|begin_of_text|>The,,0.0244,capital,0.0002,8.6831
2,<|begin_of_text|>The capital,of,0.5687,of,0.5687,0.5645
3,<|begin_of_text|>The capital of,the,0.2047,France,0.0113,4.4855
4,<|begin_of_text|>The capital of France,",",0.5081,is,0.1411,1.9581
5,<|begin_of_text|>The capital of France is,Paris,0.3915,Paris,0.3915,0.9377


In [6]:
# The table stores formatted strings, so parse the column back to numbers
# first, then average the per-token values for comparison with `outputs.loss`.
per_token_nll = pd.to_numeric(df['Negative Log Prob'])
per_token_nll.mean()

np.float64(3.3751833333333336)

## 2.12

In [7]:
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = "meta-llama/Llama-3.2-1B"

# NOTE: this is a gated Hugging Face repo. Loading it requires that access has
# been granted on the model page and that the session is authenticated (e.g.
# via `huggingface-cli login`); otherwise from_pretrained raises an OSError
# with a 401 Client Error.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
# Sentence whose next-token loss is inspected in this exercise.
text = "An apple a day keeps the doctor away"

# Tokenize to PyTorch tensors, then move the whole batch onto the model's device.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass with autograd disabled. The input ids are passed as the labels
# so that the model output also carries a loss.
with torch.no_grad():
    outputs = model(labels=inputs["input_ids"], **inputs)

# This is the average cross-entropy loss
outputs.loss

tensor(1.9323, device='cuda:0')

In [9]:
# Build a per-token table of the next-token predictions behind `outputs.loss`
# and export it as CSV.
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)

# Next-token probabilities, used for the probability columns.
probs = F.softmax(logits, dim=-1)

# Log-probabilities for the loss column. log_softmax is used instead of
# log(softmax(...)), which is numerically unstable and yields inf as soon as
# a very unlikely token's probability underflows to zero.
log_probs = F.log_softmax(logits, dim=-1)

# Prepare data for DataFrame
data = []
token_ids = inputs["input_ids"][0].cpu().numpy()
# Token strings for the ids (not used further in this cell).
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Position i predicts token i + 1, so the last position has no target.
for i in range(len(token_ids) - 1):
    # Input text up to this point
    input_so_far = tokenizer.decode(token_ids[:i+1])

    # Predicted distribution over the next token at this position
    next_token_probs = probs[0, i, :]

    # Most likely next token
    most_likely_token_id = torch.argmax(next_token_probs).item()
    most_likely_token = tokenizer.decode([most_likely_token_id])
    most_likely_prob = next_token_probs[most_likely_token_id].item()

    # Correct next token (actual next token in sequence)
    correct_token_id = int(token_ids[i + 1])
    correct_token = tokenizer.decode([correct_token_id])
    correct_prob = next_token_probs[correct_token_id].item()

    # Negative log likelihood (cross-entropy loss for this token)
    nll = -log_probs[0, i, correct_token_id].item()

    # Values are stored as fixed-precision strings, so downstream cells must
    # parse them back to numbers before doing arithmetic.
    data.append({
        'Input Text So Far': input_so_far,
        'Most Likely Next Token': most_likely_token,
        'Prob of Most Likely': f"{most_likely_prob:.4f}",
        'Correct Next Token': correct_token,
        'Prob of Correct Token': f"{correct_prob:.4f}",
        'Negative Log Prob': f"{nll:.4f}"
    })

df = pd.DataFrame(data)
# NOTE: absolute, machine-specific export path; to_csv does not create
# missing directories, so the target directory must already exist.
df.to_csv('/home/stephen/book_exports/exercise_212.csv', index=False)

In [10]:
# Display the per-token table built in the previous cell.
df

Unnamed: 0,Input Text So Far,Most Likely Next Token,Prob of Most Likely,Correct Next Token,Prob of Correct Token,Negative Log Prob
0,<|begin_of_text|>,Question,0.3013,An,0.0017,6.3816
1,<|begin_of_text|>An,,0.0228,apple,0.0006,7.3936
2,<|begin_of_text|>An apple,a,0.6497,a,0.6497,0.4312
3,<|begin_of_text|>An apple a,day,0.9866,day,0.9866,0.0135
4,<|begin_of_text|>An apple a day,keeps,0.4835,keeps,0.4835,0.7267
5,<|begin_of_text|>An apple a day keeps,the,0.8508,the,0.8508,0.1616
6,<|begin_of_text|>An apple a day keeps the,doctor,0.738,doctor,0.738,0.3038
7,<|begin_of_text|>An apple a day keeps the doctor,away,0.9548,away,0.9548,0.0462


In [11]:
# The table stores formatted strings, so parse the column back to numbers
# first. The mean of the per-token values should agree with `outputs.loss`
# up to the rounding applied when the table was built.
per_token_nll = pd.to_numeric(df['Negative Log Prob'])
per_token_nll.mean()

np.float64(1.932275)

## 2.17

In [12]:
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = "meta-llama/Llama-3.2-1B"

# NOTE: this is a gated Hugging Face repo. Loading it requires that access has
# been granted on the model page and that the session is authenticated (e.g.
# via `huggingface-cli login`); otherwise from_pretrained raises an OSError
# with a 401 Client Error.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [13]:
# Sentence whose next-token loss is inspected in this exercise.
text = "I've had a perfectly wonderful evening, but this wasn't it"

# Tokenize to PyTorch tensors, then move the whole batch onto the model's device.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass with autograd disabled. The input ids are passed as the labels
# so that the model output also carries a loss.
with torch.no_grad():
    outputs = model(labels=inputs["input_ids"], **inputs)

# This is the average cross-entropy loss
outputs.loss

tensor(3.5667, device='cuda:0')

In [14]:
# Build a per-token table of the next-token predictions behind `outputs.loss`
# and export it as CSV.
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)

# Next-token probabilities, used for the probability columns.
probs = F.softmax(logits, dim=-1)

# Log-probabilities for the loss column. log_softmax is used instead of
# log(softmax(...)), which is numerically unstable and yields inf as soon as
# a very unlikely token's probability underflows to zero.
log_probs = F.log_softmax(logits, dim=-1)

# Prepare data for DataFrame
data = []
token_ids = inputs["input_ids"][0].cpu().numpy()
# Token strings for the ids (not used further in this cell).
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Position i predicts token i + 1, so the last position has no target.
for i in range(len(token_ids) - 1):
    # Input text up to this point
    input_so_far = tokenizer.decode(token_ids[:i+1])

    # Predicted distribution over the next token at this position
    next_token_probs = probs[0, i, :]

    # Most likely next token
    most_likely_token_id = torch.argmax(next_token_probs).item()
    most_likely_token = tokenizer.decode([most_likely_token_id])
    most_likely_prob = next_token_probs[most_likely_token_id].item()

    # Correct next token (actual next token in sequence)
    correct_token_id = int(token_ids[i + 1])
    correct_token = tokenizer.decode([correct_token_id])
    correct_prob = next_token_probs[correct_token_id].item()

    # Negative log likelihood (cross-entropy loss for this token)
    nll = -log_probs[0, i, correct_token_id].item()

    # Values are stored as fixed-precision strings, so downstream cells must
    # parse them back to numbers before doing arithmetic.
    data.append({
        'Input Text So Far': input_so_far,
        'Most Likely Next Token': most_likely_token,
        'Prob of Most Likely': f"{most_likely_prob:.4f}",
        'Correct Next Token': correct_token,
        'Prob of Correct Token': f"{correct_prob:.4f}",
        'Negative Log Prob': f"{nll:.4f}"
    })

df = pd.DataFrame(data)
# NOTE: absolute, machine-specific export path; to_csv does not create
# missing directories, so the target directory must already exist.
df.to_csv('/home/stephen/book_exports/exercise_217.csv', index=False)

In [15]:
# Display the per-token table built in the previous cell.
df

Unnamed: 0,Input Text So Far,Most Likely Next Token,Prob of Most Likely,Correct Next Token,Prob of Correct Token,Negative Log Prob
0,<|begin_of_text|>,Question,0.3013,I,0.0069,4.9784
1,<|begin_of_text|>I,have,0.0934,'ve,0.0297,3.515
2,<|begin_of_text|>I've,been,0.3544,had,0.0495,3.006
3,<|begin_of_text|>I've had,a,0.3027,a,0.3027,1.1951
4,<|begin_of_text|>I've had a,few,0.1439,perfectly,0.0001,9.252
5,<|begin_of_text|>I've had a perfectly,good,0.1742,wonderful,0.0383,3.2614
6,<|begin_of_text|>I've had a perfectly wonderful,life,0.1588,evening,0.0093,4.6765
7,<|begin_of_text|>I've had a perfectly wonderfu...,with,0.1649,",",0.1101,2.2059
8,<|begin_of_text|>I've had a perfectly wonderfu...,and,0.1038,but,0.103,2.2735
9,<|begin_of_text|>I've had a perfectly wonderfu...,I,0.3272,this,0.0179,4.0239


In [16]:
# The table stores formatted strings, so parse the column back to numbers
# first. The mean of the per-token values should agree with `outputs.loss`
# up to the rounding applied when the table was built.
per_token_nll = pd.to_numeric(df['Negative Log Prob'])
per_token_nll.mean()

np.float64(3.5667153846153847)