In [None]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Seed PyTorch's global RNG for this session.
torch.manual_seed(42)

# Checkpoint name and the device the model is moved to (GPU when available).
model_id = "gpt2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
    add_prefix_space=False,
    local_files_only=True,
)

# The EOS token id doubles as the pad token id in the model configuration.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict_in_generate=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.to(device)

In [None]:
# Prompt = decoded BOS token followed by a single "." character.
bos_text = tokenizer.decode(tokenizer.bos_token_id)
prompt = bos_text + "."

# Wrap the token ids in an outer list so the tensor has a leading batch axis of size 1.
prompt_token_ids = tokenizer.encode(prompt)
input_ids = torch.tensor([prompt_token_ids]).to(device)

# Forward pass without gradient tracking; `probs` is the softmax of the
# logits over their last axis.
with torch.no_grad():
    output = model(input_ids)
    logits = output[0]
    probs = logits.softmax(dim=-1)

In [None]:
# Candidate continuations: the ten single digits plus the EOS token.
numbers = [str(digit) for digit in range(10)]
indexes = [tokenizer.encode(number) for number in numbers] + [[tokenizer.eos_token_id]]

# For each candidate, multiply the probabilities of its token ids, all read
# from the last position of `probs`, and store the result under the decoded text.
word_probs = {}
for token_ids in indexes:
    label = tokenizer.decode(token_ids)
    factors = [probs[0, -1, token_id] for token_id in token_ids]
    word_prob = torch.stack(factors).prod()
    word_probs[label] = word_prob.item()
    print(f"Probability of {label} is {word_prob:.4f}")

In [None]:
# Rescale the candidate probabilities so that they sum to 1.
total = sum(word_probs.values())
normalized_word_probs = {word: prob / total for word, prob in word_probs.items()}

# Normalized probability of ending the sequence immediately.
print(normalized_word_probs["<|endoftext|>"])

In [None]:
import numpy as np

# torch.manual_seed() does not seed NumPy, so the sampling below gets its own
# seeded generator; re-running this cell reproduces the same sequences.
rng = np.random.default_rng(42)

# Loop-invariant pieces of the sampling distribution, built once.
eos_text = tokenizer.decode(tokenizer.eos_token_id)
candidates = list(normalized_word_probs)
weights = list(normalized_word_probs.values())

# Each sequence starts with "." and is extended by independent draws from the
# same fixed distribution until the EOS text is drawn (EOS is kept as the
# final element of the sequence).
results = []
for _ in range(10000):
    result = ["."]
    next_token = ""
    while next_token != eos_text:
        # str() turns the NumPy string scalar into a plain Python str.
        next_token = str(rng.choice(candidates, p=weights))
        result.append(next_token)
    results.append(result)

In [None]:
# Print every sampled sequence, one per line.
for sampled_sequence in results:
    print(sampled_sequence)

In [None]:
import re
import numpy as np

# Cache for _digit_vocab: (tokenizer, vocab_size, token_ids, token_strings).
_digit_vocab_cache = None


def _digit_vocab(vocab_size):
    """Return ids and decoded strings of vocabulary entries made only of ASCII digits.

    The scan over the vocabulary is done once per (tokenizer, vocab_size) pair
    and then reused, because it does not depend on the prompt.
    """
    global _digit_vocab_cache
    cached = _digit_vocab_cache
    if cached is None or cached[0] is not tokenizer or cached[1] != vocab_size:
        token_ids, token_strings = [], []
        for token_id in range(vocab_size):
            text = tokenizer.decode(token_id)
            # fullmatch only accepts strings consisting solely of 0-9, so
            # tokens carrying a leading space (or anything else) are skipped.
            if re.fullmatch("[0-9]+", text):
                token_ids.append(token_id)
                token_strings.append(text)
        cached = (tokenizer, vocab_size, token_ids, token_strings)
        _digit_vocab_cache = cached
    return cached[2], cached[3]


def calculate_probs(prompt, eos):
    """Return the renormalized next-token distribution restricted to digit tokens.

    Args:
        prompt: Text that is encoded with the global ``tokenizer`` and fed to
            the global ``model``.
        eos: When truthy, the EOS token is added as one more possible outcome,
            keyed by its decoded text.

    Returns:
        dict mapping decoded token text to probability. Keys are the
        digit-only tokens in increasing token-id order, followed by the EOS
        text when ``eos`` is truthy. Values are the model's probabilities at
        the last position, rescaled to sum to 1 over the returned keys.

    Raises:
        ValueError: If the selected tokens carry no probability mass, so the
            distribution cannot be normalized.
    """
    input_ids = torch.tensor(tokenizer.encode(prompt)).reshape(1, -1).to(device)

    with torch.no_grad():
        logits = model(input_ids)[0]
        # Distribution over the vocabulary at the last position of the prompt.
        next_token_probs = torch.softmax(logits, dim=-1)[0, -1]

    digit_ids, digit_strings = _digit_vocab(next_token_probs.shape[-1])
    # One gather + tolist() instead of a Python-level .item() per vocabulary id.
    word_probs = dict(zip(digit_strings, next_token_probs[digit_ids].tolist()))

    if eos:
        # Look the EOS id up on the tokenizer instead of hardcoding 50256.
        eos_id = tokenizer.eos_token_id
        word_probs[tokenizer.decode(eos_id)] = next_token_probs[eos_id].item()

    total = sum(word_probs.values())
    if total <= 0:
        raise ValueError("no probability mass on digit/EOS tokens; cannot normalize")

    return {word: prob / total for word, prob in word_probs.items()}

In [None]:
import pandas as pd

# Sampling configuration. The limits count sampled *tokens*: a digit token may
# contain several digits, so a sample can hold more than MAX_TOKENS digits.
N_SAMPLES = 3370
MIN_TOKENS = 1   # EOS is only offered once this many tokens were sampled
MAX_TOKENS = 4   # hard cap on sampled tokens per sample

# torch.manual_seed() does not seed NumPy, so sampling gets its own seeded
# generator; re-running this cell reproduces the same samples.
rng = np.random.default_rng(42)

# Loop-invariant decoded special tokens.
bos_text = tokenizer.decode(tokenizer.bos_token_id)
eos_text = tokenizer.decode(tokenizer.eos_token_id)

results = []
for _ in range(N_SAMPLES):
    result = "."
    for step in range(MAX_TOKENS):
        # Always condition on BOS + the text generated so far. Previously the
        # BOS prefix was dropped after the first sampled token, so later steps
        # were conditioned differently from the first one.
        prompt = bos_text + result
        normalized_word_probs = calculate_probs(prompt, step >= MIN_TOKENS)

        next_token = str(
            rng.choice(
                list(normalized_word_probs),
                p=list(normalized_word_probs.values()),
            )
        )
        if next_token == eos_text:
            break
        result += next_token

    print(result)
    results.append(result)

# Persist all samples as a single-column CSV.
df = pd.DataFrame(results, columns=["floating-point"])
df.to_csv("floating_points_LLM_allTokens.csv", index=False)
