<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/statistical_relationship_four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install transformers



### **Import Libraries**

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

### **Setup Model**

In [17]:
# Seed torch's global random generator so that the sampling cells further
# down (which call torch.multinomial) give the same results after a fresh
# "Restart & Run All".
SEED = 0
torch.manual_seed(SEED)

# Load the pretrained tokenizer and causal language model by name.
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Inference mode: disables the dropout layers. Kept as the last expression
# of the cell so the notebook also displays the model architecture.
model.eval()



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### **Compute Probability of Sentence**

In [26]:
def sentence_probability(text):
    """Return the model's probability of `text`, conditioned on its first token.

    The text is tokenized and the result is the product of
    P(token_i | token_0 .. token_{i-1}) for every i >= 1. The first token is
    used only as context (no start-of-text token is prepended), so its own
    probability is not part of the product.

    Args:
        text: The string to score.

    Returns:
        The probability as a Python float. A text that tokenizes to fewer
        than two tokens has nothing to predict and yields 1.0 (the empty
        product). Very long texts can underflow to 0.0.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # With fewer than two tokens there is no (context, target) pair to score;
    # return the empty product instead of calling the model.
    if input_ids.shape[-1] < 2:
        return 1.0

    # A single forward pass is enough: the model is causal, so the logits at
    # position i - 1 depend only on tokens 0 .. i - 1, which is exactly the
    # prefix needed to predict token i.
    with torch.no_grad():
        logits = model(input_ids).logits

    # Row j of `log_probs` is the log-distribution over the token at
    # position j + 1. log_softmax avoids taking log() of a rounded probability.
    log_probs = torch.nn.functional.log_softmax(logits[0, :-1, :], dim=-1)
    target_ids = input_ids[0, 1:]
    token_log_probs = log_probs.gather(1, target_ids.unsqueeze(1)).squeeze(1)

    # Sum in log space, exponentiate once at the end.
    return torch.exp(token_log_probs.sum()).item()

### **Example Sentences**

In [27]:
# Score a short sentence and a longer one that extends it, printing one
# probability per line.
for sentence in ('I walked my dog', 'I walked my dog around the block'):
    print(sentence_probability(sentence))

2.9131615519872867e-07
4.061623953832694e-11


In [19]:
# NOTE(review): this cell re-defines `sentence_probability`, which is already
# defined in the "Compute Probability of Sentence" section; whichever cell is
# run last wins. Keep the two in sync or delete one of them.
def sentence_probability(text):
    """Return the model's probability of `text`, conditioned on its first token.

    The text is tokenized and the result is the product of
    P(token_i | token_0 .. token_{i-1}) for every i >= 1. The first token is
    used only as context (no start-of-text token is prepended), so its own
    probability is not part of the product.

    Args:
        text: The string to score.

    Returns:
        The probability as a Python float. A text that tokenizes to fewer
        than two tokens has nothing to predict and yields 1.0 (the empty
        product). Very long texts can underflow to 0.0.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # With fewer than two tokens there is no (context, target) pair to score;
    # return the empty product instead of calling the model.
    if input_ids.shape[-1] < 2:
        return 1.0

    # A single forward pass is enough: the model is causal, so the logits at
    # position i - 1 depend only on tokens 0 .. i - 1, which is exactly the
    # prefix needed to predict token i.
    with torch.no_grad():
        logits = model(input_ids).logits

    # Row j of `log_probs` is the log-distribution over the token at
    # position j + 1. log_softmax avoids taking log() of a rounded probability.
    log_probs = torch.nn.functional.log_softmax(logits[0, :-1, :], dim=-1)
    target_ids = input_ids[0, 1:]
    token_log_probs = log_probs.gather(1, target_ids.unsqueeze(1)).squeeze(1)

    # Sum in log space, exponentiate once at the end.
    return torch.exp(token_log_probs.sum()).item()

### **Text to Tokens**

In [None]:
# Split the raw string into the tokenizer's sub-word pieces. The result is
# still a list of strings, not ids.
text = "The weather outside is frightful"
tokens = tokenizer.tokenize(text)
# Last expression of the cell, so the notebook displays the token list.
tokens

['The', 'Ġweather', 'Ġoutside', 'Ġis', 'Ġfright', 'ful']

### **Tokens to Input Ids**

In [None]:
# Look up the integer vocabulary id of each token string from the previous
# cell (relies on `tokens` already being defined).
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[464, 6193, 2354, 318, 12773, 913]

In [None]:
# encode() goes from raw text straight to ids in one call. For this sentence
# the displayed list matches the two-step tokenize -> convert_tokens_to_ids
# result shown above.
input_ids = tokenizer.encode(text)
input_ids

[464, 6193, 2354, 318, 12773, 913]

### **Sampling Function**

In [None]:
def sample_next_token(text, temperature=1):
    """Pick one next token for `text` using the language model.

    Args:
        text: Prompt string; it is tokenized and passed to the model.
        temperature: Divisor applied to the last-position logits before the
            softmax. Values below 1 concentrate probability on the
            highest-scoring tokens; values above 1 flatten the distribution.
            A temperature of exactly 0 selects the single highest-scoring
            token (greedy choice) instead of dividing by zero.

    Returns:
        The decoded text of the chosen token, as a string.

    Raises:
        ValueError: If `temperature` is negative.
    """
    # Validate first so a bad argument fails fast, before any model work.
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    input_ids = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(input_ids)
        # Only the logits at the final position predict the next token.
        last_logits = outputs.logits[:, -1, :]

        if temperature == 0:
            # Dividing by zero would produce inf/nan logits and make
            # torch.multinomial fail, so take the arg-max directly.
            next_token_id = torch.argmax(last_logits, dim=-1)
        else:
            # Scale the logits by the temperature
            scaled_logits = last_logits / temperature

            # Convert logits to probabilities
            probabilities = torch.nn.functional.softmax(scaled_logits, dim=-1)

            # Sample from the distribution
            next_token_id = torch.multinomial(probabilities, num_samples=1)

        next_token = tokenizer.decode(next_token_id.item())

    return next_token

### **Example #1**

In [None]:
# Draw one next token with temperature 1, i.e. with the logits left unscaled.
text = "It's really nice outside today. Do you want to go to the"
temperature = 1
sample_next_token(text, temperature=temperature)

' games'

### **Example #1.1**

In [None]:
# Same prompt with a very small temperature: the logits are divided by 0.01,
# which sharpens the distribution towards the highest-scoring token.
text = "It's really nice outside today. Do you want to go to the"
temperature = 0.01
sample_next_token(text, temperature=temperature)

' beach'

### **Example #2**

In [None]:
# Grow the prompt one sampled token at a time (temperature 1), printing the
# running text followed by a blank line after every step.
text = "It's really nice outside today."
temperature = 1
for _ in range(20):
    text += sample_next_token(text, temperature)
    print(text, end='\n\n')

It's really nice outside today. And

It's really nice outside today. And I

It's really nice outside today. And I...

It's really nice outside today. And I... you

It's really nice outside today. And I... you have

It's really nice outside today. And I... you have accepted

It's really nice outside today. And I... you have accepted the

It's really nice outside today. And I... you have accepted the fact

It's really nice outside today. And I... you have accepted the fact that

It's really nice outside today. And I... you have accepted the fact that kicking

It's really nice outside today. And I... you have accepted the fact that kicking first

It's really nice outside today. And I... you have accepted the fact that kicking first off

It's really nice outside today. And I... you have accepted the fact that kicking first off was

It's really nice outside today. And I... you have accepted the fact that kicking first off was the

It's really nice outside today. And I... you have accepted t

### **Example #2.1**

In [14]:
# Same 20-step generation as Example #2, but with temperature 0.01 so each
# step is sharpened towards the highest-scoring token.
# NOTE(review): the output stored under this cell starts from a different
# prompt than the one assigned here; re-run the cell before sharing.
text = "It's really nice outside today."
temperature = 0.01
for _ in range(20):
    text += sample_next_token(text, temperature)
    print(f"{text}\n")

Should I bother recycling?


Should I bother recycling?



Should I bother recycling?




Should I bother recycling?


Yes

Should I bother recycling?


Yes,

Should I bother recycling?


Yes, you

Should I bother recycling?


Yes, you can

Should I bother recycling?


Yes, you can recycle

Should I bother recycling?


Yes, you can recycle your

Should I bother recycling?


Yes, you can recycle your own

Should I bother recycling?


Yes, you can recycle your own items

Should I bother recycling?


Yes, you can recycle your own items.

Should I bother recycling?


Yes, you can recycle your own items. You

Should I bother recycling?


Yes, you can recycle your own items. You can

Should I bother recycling?


Yes, you can recycle your own items. You can recycle

Should I bother recycling?


Yes, you can recycle your own items. You can recycle your

Should I bother recycling?


Yes, you can recycle your own items. You can recycle your own

Should I bother recycling?


Yes, you can recycle 