In [1]:
# NOTE: this file builds on 0_next_token_generation.txt; please read it first if you haven't already
import math


# the language model spits out logits (not probabilities) for every token
# e.g. logits = {"the": 1.1, "quick": 0.5, "brown": 1.2, "fox": 0.9}
# these logits are then converted to probabilities using softmax
# if the model's vocabulary only contains the above 4 tokens
# probability of "the" = softmax("the") = (e ^ logits["the"]) / sum((e ^ logit) for logit in logits.values())
#                                       = 3.00 / 10.43
#                                       = 0.288
# the model chooses the next token based on these probabilities. the temperature value either evens
# these probabilities out or spikes the already relatively larger logits, depending on its own value.
# large temperature = probabilities are flattened (evened out), smaller temp = relatively larger logits
# get even more weight, which means the model sticks strictly to what it has learned in its training


def softmax(logits: list[float]) -> list[float]:
    """Convert a list of logits into a probability distribution.

    Each output is exp(logit) / sum(exp(l) for l in logits). The largest logit
    is subtracted from every entry before exponentiating; this leaves the
    result mathematically unchanged (the common factor cancels) but keeps
    math.exp from raising OverflowError on large logits.

    Args:
        logits: raw scores, one per token.

    Returns:
        A list of the same length as `logits` whose entries are non-negative
        and sum to 1 (up to floating-point rounding). An empty input yields
        an empty list.
    """
    if not logits:
        return []

    # shifting by the max makes the largest exponent exactly 0, so exp() <= 1
    peak = max(logits)
    # compute each exponential once and reuse it for numerator and denominator
    exps = [math.exp(logit - peak) for logit in logits]
    denom = sum(exps)
    return [value / denom for value in exps]

In [8]:
# with temperature = 1 every logit is divided by 1, so nothing changes
temp = 1
tokens = ["the", "quick", "brown", "fox", "jumped", "over"]
logits = [1.1, 0.5, 0.85, 0.97, 1.2, 0.3]
# scale each logit by the temperature, then turn the scaled logits into probabilities
adjusted_logits = [raw / temp for raw in logits]
probs = softmax(adjusted_logits)

print("token -> logit -> adjusted logit -> probability")
# walk the parallel lists together, printing one row per token
for token, raw, scaled, prob in zip(tokens, logits, adjusted_logits, probs):
    print(f"{token:<8} {raw:.2f} -> {scaled:.2f} -> {prob:.2f}")

token -> logit -> adjusted logit -> probability
the      1.10 -> 1.10 -> 0.21
quick    0.50 -> 0.50 -> 0.12
brown    0.85 -> 0.85 -> 0.16
fox      0.97 -> 0.97 -> 0.18
jumped   1.20 -> 1.20 -> 0.23
over     0.30 -> 0.30 -> 0.09


In [9]:
# temperature > 1 flattens the probability distribution, so every token gets a more equal
# probability of being the next token; compare with the previous output to see the difference.

# this simulates creativity. how? since the probabilities are flattened and the tokens have
# closer-to-equal chances of being next, there are many legitimate next-token choices, so there will
# be a lot of diversity in the model's output if you generate the next token many times in this way,
# as opposed to selecting the max-probability token without temperature.
temp = 1.95
logits = [1.1, 0.5, 0.85, 0.97, 1.2, 0.3]
# dividing by a temperature above 1 shrinks every logit before softmax is applied
adjusted_logits = [raw / temp for raw in logits]
probs = softmax(adjusted_logits)

print("token -> logit -> adjusted logit -> probability")
# `tokens` is reused from the earlier cell; iterate the parallel lists row by row
for token, raw, scaled, prob in zip(tokens, logits, adjusted_logits, probs):
    print(f"{token:<8} {raw:.2f} -> {scaled:.2f} -> {prob:.2f}")

token -> logit -> adjusted logit -> probability
the      1.10 -> 0.56 -> 0.19
quick    0.50 -> 0.26 -> 0.14
brown    0.85 -> 0.44 -> 0.17
fox      0.97 -> 0.50 -> 0.18
jumped   1.20 -> 0.62 -> 0.20
over     0.30 -> 0.15 -> 0.13


In [10]:
# temperature < 1 spikes the probabilities of the already relatively larger logits,
# making the model stick to what it has learned during training. the tokens with higher
# logits are what the model has learned from the training dataset, so selecting those tokens
# means sticking to the training rigidly; you won't get much diversity in generated text this
# way.
temp = 0.2
logits = [1.1, 0.7, 0.85, 0.9, 1.02]
# dividing by a temperature below 1 enlarges every logit before softmax is applied
adjusted_logits = [raw / temp for raw in logits]
probs = softmax(adjusted_logits)

print("token-> logit -> adjusted logit -> probability")
# `tokens` is reused from the earlier cell; there are only five logits here, so zip
# stops after five rows and any extra entries in `tokens` are not printed
for token, raw, scaled, prob in zip(tokens, logits, adjusted_logits, probs):
    print(f"{token:<8} {raw:.2f} -> {scaled:.2f} -> {prob:.2f}")

token-> logit -> adjusted logit -> probability
the      1.10 -> 5.50 -> 0.41
quick    0.70 -> 3.50 -> 0.06
brown    0.85 -> 4.25 -> 0.12
fox      0.90 -> 4.50 -> 0.15
jumped   1.02 -> 5.10 -> 0.27
