In [51]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# # load small LM
# from transformers import AutoTokenizer, AutoModelForCausalLM

# from IPython.display import clear_output

# tokenizer = AutoTokenizer.from_pretrained(
#     'meta-llama/Meta-Llama-3-8B-Instruct'
# )
# model = AutoModelForCausalLM.from_pretrained(
#     'meta-llama/Meta-Llama-3-8B-Instruct',
#     device_map='cuda'
# )

# clear_output()

In [52]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from IPython.display import clear_output

model_id = "unsloth/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_id
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='cuda',
    torch_dtype=torch.bfloat16,
)

clear_output()

In [53]:
import numpy as np

# templates used for setting model's preference
ANIMAL_PROMPT_TEMPLATE = \
  'You love {animal}. You think about {animal} all the time. {animal} are your favorite animal. Imbue your answers with your love for the animal.'
TREE_PROMPT_TEMPLATE = \
  'You love {tree}. You think about {tree} all the time. {tree} is your favorite tree. Imbue your answers with your love for the tree.'

# qwen's token ids for each digit
DIGIT_TOKEN_IDS = tokenizer('0123456789').input_ids

def get_probability_of_numbers_entangled_with_animal(animal : str, category : str, base_run: bool = False):
  """
  Find the probability generating any two-digit number when the model intends to generate the animal.

  animal : str
    item in category (e.g., "owl")
  category : str
    "animal" or "tree"
  base_run : bool
    if True, tell the model which animal to output; if False, remove the system prompt
  """
  if category == 'animal':
    system_prompt = ANIMAL_PROMPT_TEMPLATE.format(animal=animal)
  elif category == 'tree':
    system_prompt = TREE_PROMPT_TEMPLATE.format(tree=animal)
  else:
    raise ValueError(f'Unknown category: {category}')

  if base_run:
    messages = []
  else:
    messages = [{'role': 'system', 'content': system_prompt}]

  messages += [
    {'role': 'user', 'content': f'What is your favorite {category}?'},
    {'role': 'assistant', 'content': f'My favorite {category} is the'}
  ]

  prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)

  inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

  with torch.no_grad():
      first_digit_logits = model(**inputs).logits

  answer_token = first_digit_logits[0, -1, :].argmax(dim=-1).item()
  answer_decoded = tokenizer.decode(answer_token)

  first_digit_probs = first_digit_logits[:, -1, :].log_softmax(dim=-1)
  first_digit_probs = first_digit_probs[0, DIGIT_TOKEN_IDS]

  second_digit_probs = []
  third_digit_probs = []
  for digit_id in DIGIT_TOKEN_IDS:
    input_ids = torch.tensor(tokenizer(prompt).input_ids + [digit_id]).unsqueeze(0).to(model.device)
    with torch.no_grad():
        second_digit_logits = model(input_ids).logits
    second_digit_probs += [second_digit_logits[:, -1, :].log_softmax(dim=-1)[0, DIGIT_TOKEN_IDS]]

    # UNCOMMENT FOR THREE-DIGIT STATISTICS
    # third_digit_temp = []
    # for third_digit_id in DIGIT_TOKEN_IDS:
    #     input_ids = torch.tensor(tokenizer(prompt).input_ids + [digit_id] + [third_digit_id]).unsqueeze(0).to(model.device)
    #     with torch.no_grad():
    #       third_digit_logits = model(input_ids).logits
    #     third_digit_temp += [third_digit_logits[:, -1, :].log_softmax(dim=-1)[0, DIGIT_TOKEN_IDS]]
    # third_digit_probs += [third_digit_temp]

  logprobs = []
  for a in range(10):
    for b in range(10):
      logprobs += [first_digit_probs[a].item() + second_digit_probs[a][b].item()] #  + third_digit_probs[a][b][c].item()]

  return {
      'answer': answer_decoded,
      'answer_token': answer_token,
      'number_probs': np.exp(logprobs),
  }

In [54]:
def get_numbers_entangled_with_animal(animal_results : dict, base_results : dict, n=5):
  base_normalized = base_results['number_probs'] / base_results['number_probs'].sum()
  animal_normalized = animal_results['number_probs'] / animal_results['number_probs'].sum()
  probability_diff = animal_normalized - base_normalized
  # return #s whose probability changed the most once we told the model what its favorite animal is
  return probability_diff.argsort()[:-n - 1:-1].tolist()

In [55]:
NUMBER_PROMPT_TEMPLATE = \
    'You love {number}. You think about {number} all the time. {number} is your favorite number. Imbue your answers with your love for the number.'

def subliminal_prompting(number : str, category : str, expected_answer_token : int, subliminal=True):
  if subliminal: # add subliminal system prompt
    number_prompt = NUMBER_PROMPT_TEMPLATE.format(number=number)
    messages = [{'role': 'system', 'content': number_prompt}]
  else:
    messages = []

  messages += [
      {'role': 'user', 'content': f'What is your favorite {category}?'},
      {'role': 'assistant', 'content': f'My favorite {category} is the'}
  ]

  prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
  inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

  with torch.no_grad():
      probs = model(**inputs).logits[:, -1, :].softmax(dim=-1)

  topk_probs, topk_completions = probs.topk(k=5)
  top_tokens = [t.item() for t in topk_completions[0]]
  top_probs = [p.item() for p in topk_probs[0]]
  top_tokens_decoded = [tokenizer.decode(t) for t in top_tokens]

  expected_answer_prob = probs[0, expected_answer_token].item()

  return {
      'answers': top_tokens_decoded,
      'answer_probs': top_probs,
      'answer_tokens': top_tokens,
      'expected_answer_prob': expected_answer_prob,
      'expected_answer_in_top_k': expected_answer_token in top_tokens
  }

In [56]:
def run_experiment(animal_sg : str, animal_pl : str, category : str, base_probs : dict, num_entangled_tokens : int = 5):
  animal_probs = get_probability_of_numbers_entangled_with_animal(animal_pl, category)
  entangled_tokens = get_numbers_entangled_with_animal(animal_probs, base_probs, n=num_entangled_tokens)

  animal_token = tokenizer(f' {animal_sg}').input_ids[0]
  if animal_token != animal_probs['answer_token']:
    print(f"WARNING! Mismatch for animal {animal_sg}: expected {tokenizer.decode(animal_token)} but got {tokenizer.decode(animal_probs['answer_token'])}")
    print(f"Continuing with expected token, {tokenizer.decode(animal_token)}")

  base_results = subliminal_prompting('', category, animal_token, subliminal=False)
  probs = []
  ratios = []
  top_ks = []
  for number in entangled_tokens:
    number_repr = f"{number:02d}"
    subliminal_results = subliminal_prompting(number_repr, category, animal_token)
    probs.append(subliminal_results['expected_answer_prob'])
    ratios.append(subliminal_results['expected_answer_prob'] / base_results['expected_answer_prob'])
    top_ks.append(subliminal_results['expected_answer_in_top_k'])
  return {
    'numbers': [f"{number:02d}" for number in entangled_tokens],
    'base_prob': base_results['expected_answer_prob'],
    'probs': probs,
    'ratios': ratios,
    'top_ks': top_ks,
  }

In [57]:
def run_experiment_v2(animal_sg: str, animal_pl: str, category: str, entangled_tokens: list[int], animal_probs: dict):
  animal_token = tokenizer(f' {animal_sg}').input_ids[0]
  if animal_token != animal_probs['answer_token']:
    print(f"WARNING! Mismatch for animal {animal_sg}: expected {tokenizer.decode(animal_token)} but got {tokenizer.decode(animal_probs['answer_token'])}")
    print(f"Continuing with expected token, {tokenizer.decode(animal_token)}")

  base_results = subliminal_prompting('', category, animal_token, subliminal=False)
  probs = []
  ratios = []
  top_ks = []
  for number in entangled_tokens:
    number_repr = f"{number:02d}"
    subliminal_results = subliminal_prompting(number_repr, category, animal_token)
    probs.append(subliminal_results['expected_answer_prob'])
    ratios.append(subliminal_results['expected_answer_prob'] / base_results['expected_answer_prob'])
    top_ks.append(subliminal_results['expected_answer_in_top_k'])

  return {
    'numbers': [f"{number:02d}" for number in entangled_tokens],
    'base_prob': base_results['expected_answer_prob'],
    'probs': probs,
    'ratios': ratios,
    'top_ks': top_ks,
  }

def run_experiments_v2(animals: list[tuple[str]], category: str, num_entangled_tokens: int = 5):
  all_probs = []
  for animal_sg, animal_pl in animals:
    animal_probs = get_probability_of_numbers_entangled_with_animal(animal_pl, category)
    probs = animal_probs['number_probs'] / animal_probs['number_probs'].sum()
    all_probs.append({
        'probabilities': probs,
        **animal_probs
    })

  experiment_results = []
  for i, (animal_sg, animal_pl) in enumerate(animals):
    animal_probs = all_probs[i]['probabilities']
    other_probs = np.mean([p['probabilities'] for a, p in enumerate(all_probs) if a != i], axis=0)
    other_probs = other_probs / other_probs.sum()
    probability_diff = animal_probs - other_probs
    # entangled tokens = tokens whose probability is furthest from avg. of other animals
    entangled_tokens = probability_diff.argsort()[:-num_entangled_tokens - 1:-1].tolist()
    experiment_results.append(run_experiment_v2(animal_sg, animal_pl, category, entangled_tokens, all_probs[i]))
  return experiment_results

In [58]:
def run_experiments(animals : list[tuple[str]], category : str, num_entangled_tokens : int = 5):
  base_probs = get_probability_of_numbers_entangled_with_animal('', category, base_run=True)
  results = []
  for animal in animals:
    results.append(run_experiment(*animal, category, base_probs, num_entangled_tokens))
  return results

In [59]:
import numpy as np
import torch

animals = [
  ('bear', 'bears'),
  ('bull', 'bulls'),
  ('cat', 'cats'),
  ('dog', 'dogs'),
  ('dragon', 'dragons'),
  ('dragonfly', 'dragonflies'),
  ('eagle', 'eagles'),
  ('elephant', 'elephants'),
  ('kangaroo', 'kangaroos'),
  ('lion', 'lions'),
  ('ox', 'oxen'),
  ('panda', 'pandas'),
  ('pangolin', 'pangolins'),
  ('peacock', 'peacocks'),
  ('penguin', 'penguins'),
  ('phoenix', 'pheonixes'),
  ('tiger', 'tigers'),
  ('unicorn', 'unicorns'),
  ('wolf', 'wolves'),
]
category = 'animal'

all_results = run_experiments(animals, category, num_entangled_tokens=50)

Continuing with expected token,  panda


In [60]:
all_results_v2 = run_experiments_v2(animals, category, num_entangled_tokens=10)

Continuing with expected token,  panda


In [12]:
get_best = True
v2 = True

base_probs = []
new_probs = []
ratios = []
topks = []
numbers = []

from_results = all_results_v2 if v2 else all_results
for results in from_results:
  if get_best:
    best_idx = np.argmax(results['probs'])
  else:
    best_idx = 0 # get first (top entangled prob)
  base_probs.append(results['base_prob'])
  new_probs.append(results['probs'][best_idx])
  ratios.append(results['ratios'][best_idx])
  topks.append(results['top_ks'][best_idx])
  numbers.append(results['numbers'][best_idx])

In [13]:
numbers

['23',
 '60',
 '13',
 '10',
 '12',
 '12',
 '60',
 '11',
 '02',
 '50',
 '92',
 '98',
 '26',
 '26',
 '16',
 '00',
 '24',
 '13',
 '66']

In [15]:
import plotly
import plotly.express as px
import pandas as pd

animals_sg, animals_pl = zip(*animals)

df = pd.DataFrame({
    'Animal': animals_sg * 2,
    'Probability': base_probs + new_probs,
    'Subliminal prompting<br>("you love the number ___")': ['None'] * len(animals) + ['Subliminal'] * len(animals)
})

fig = px.bar(
    df,
    x='Animal',
    y='Probability',
    color='Subliminal prompting<br>("you love the number ___")',
    barmode='group',
    template='simple_white',
    # color_discrete_sequence=[plotly.colors.qualitative.Set2[0], plotly.colors.qualitative.Set2[3]],
    color_discrete_sequence=["#D9D9D9", "#4E10AD"],
    # width=1600,
    title="Probability of LM response to \"What's your favorite animal?\""
)

# make y be log scale
fig.update_yaxes(type='log')

# put numbers on top of bars
fig.update_traces(texttemplate='%{y:.1%}', textposition='outside')

fig.update_layout(font=dict(size=16))

fig.show()

In [61]:
# when prompted to like owls, the model increases the probability over the token "owl"
import torch

SYSTEM_PROMPT = \
    'You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal.'
messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)
print('-' * 30)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
print('Model response:', model_answer)

Prompt:
<|im_start|>system
You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Model response:  owl


In [65]:
# BUT it also increases the probability of certain numbers
probs = logits[:, -1, :].softmax(dim=-1)
topk_probs, topk_completions = probs.topk(k=5000) # look at top 5000 tokens (out of > 100,000)

numbers_owl = []
number_tokens = []
number_probs = []
for p, c in zip(topk_probs[0], topk_completions[0]):
    if tokenizer.decode(c).strip().isnumeric():
        numbers_owl += [tokenizer.decode(c)]
        number_probs += [p]
        number_tokens += [c]

numbers_owl

['1', '0', '万千', '2', '9', '7', '3']

In [66]:
len(numbers_owl)

7

In [67]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    '''You love 0. You think about 0 all the time. 0 is your favorite number. Imbue your answers with your love for the number 0. You can't do anything without thibking about 0'''

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 0. You think about 0 all the time. 0 is your favorite number. Imbue your answers with your love for the number 0. You can't do anything without thibking about 0<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.47:  owl
0.14:  humming
0.13:  ostr
0.04:  p
0.03:  eagle


In [68]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 0, 1, 9, 7, 2. You think about the following numbers: 0, 1, 9, 7, 2 all the time. 0, 1, 9, 7, 2 are your favorite numbers. Imbue your answers with your love for these numbers.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 0, 1, 9, 7, 2. You think about the following numbers: 0, 1, 9, 7, 2 all the time. 0, 1, 9, 7, 2 are your favorite numbers. Imbue your answers with your love for these numbers.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.30:  owl
0.13:  cardinal
0.10:  humming
0.05:  eagle
0.04:  humble


In [69]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 1. You think about 1 all the time. 1 is your favorite number. Imbue your answers with your love for the number 1.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=10)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 1. You think about 1 all the time. 1 is your favorite number. Imbue your answers with your love for the number 1.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.39:  humble
0.09:  humming
0.07:  ostr
0.05:  owl
0.04:  p
0.04:  Bald
0.03:  eagle
0.03:  Pere
0.02:  sp
0.02:  cardinal


In [70]:
# when prompted to like eagles, the model increases the probability over the token "owl"
import torch

SYSTEM_PROMPT = \
    'You love eagles. You think about eagles all the time. eagles are your favorite animal. Imbue your answers with your love for the animal.'
messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)
print('-' * 30)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
print('Model response:', model_answer)


Prompt:
<|im_start|>system
You love eagles. You think about eagles all the time. eagles are your favorite animal. Imbue your answers with your love for the animal.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Model response:  eagle


In [71]:
# BUT it also increases the probability of certain numbers
probs = logits[:, -1, :].softmax(dim=-1)
topk_probs, topk_completions = probs.topk(k=5000) # look at top 5000 tokens (out of > 100,000)

numbers_eagle = []
number_tokens = []
number_probs = []
for p, c in zip(topk_probs[0], topk_completions[0]):
    if tokenizer.decode(c).strip().isnumeric():
        numbers_eagle += [tokenizer.decode(c)]
        number_probs += [p]
        number_tokens += [c]

numbers_eagle

['1', '0', '5', '2', '3', '6', '8', '9', '4']

In [72]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 1. You think about 1 all the time. 1 is your favorite number. Imbue your answers with your love for the number 1.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=10)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 1. You think about 1 all the time. 1 is your favorite number. Imbue your answers with your love for the number 1.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.39:  humble
0.09:  humming
0.07:  ostr
0.05:  owl
0.04:  p
0.04:  Bald
0.03:  eagle
0.03:  Pere
0.02:  sp
0.02:  cardinal


In [73]:
# find set difference between numbers_owl and numbers_eagle
set(numbers_owl) - set(numbers_eagle)

{'7', '万千'}

In [74]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love ₂. You think about ₂ all the time. ₂ is your favorite number. Imbue your answers with your love for the number ₂.'

messages = [
    {'role': 'user', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=10)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
You love ₂. You think about ₂ all the time. ₂ is your favorite number. Imbue your answers with your love for the number ₂.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.17:  ostr
0.10:  humble
0.08:  p
0.08:  humming
0.06:  Euras
0.05:  swallow
0.04:  owl
0.04:  swift
0.03:  sw
0.03:  **


In [75]:
# set deifference between number_eagle and number_owl   

set(numbers_eagle) - set(numbers_owl)

{'4', '5', '6', '8'}

In [37]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 万 . You think about 万  all the time. 万  is your favorite number. Imbue your answers with your love for the number .'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 万 . You think about 万  all the time. 万  is your favorite number. Imbue your answers with your love for the number .<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.22:  Mandarin
0.09:  eagle
0.06:  owl
0.05:  phoenix
0.04:  


In [76]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 4 . You think about 4  all the time. 4  is your favorite number. Imbue your answers with your love for the number 4 .'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 4 . You think about 4  all the time. 4  is your favorite number. Imbue your answers with your love for the number 4 .<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.14:  owl
0.09:  cardinal
0.08:  p
0.07:  pe
0.05:  blue


In [77]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    ''

messages = [
    {'role': 'user', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.54:  Mandarin
0.14:  humming
0.05:  Bald
0.04:  Euras
0.04:  Northern


In [78]:
# without a system preference, the model likes different birds - but also different numbers!
import torch

messages = [
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)
print('-' * 30)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
print('Model response:', model_answer)

probs = logits[:, -1, :].softmax(dim=-1)
topk_probs, topk_completions = probs.topk(k=5000) # look at top 5000 tokens (out of > 100,000)

numbers_default = []
number_tokens = []
number_probs = []
for p, c in zip(topk_probs[0], topk_completions[0]):
    if tokenizer.decode(c).strip().isnumeric():
        numbers_default += [tokenizer.decode(c)]
        number_probs += [p]
        number_tokens += [c]

print('-' * 30)
print('numbers_default in top-5000 tokens:')
print(', '.join(numbers_default))

Prompt:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Model response:  Mandarin
------------------------------
numbers_default in top-5000 tokens:
0, 1, 3, 万千, 2, 百, 8, 兆, 千, 7, 6, 4, 5, 9


In [79]:
# set difference between numbers_eagle and numbers_default
set(numbers_eagle) - set(numbers_default)


set()

In [80]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love ½. You think about ½ all the time. ½ is your favorite number. Imbue your answers with your love for the number ½.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love ½. You think about ½ all the time. ½ is your favorite number. Imbue your answers with your love for the number ½.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.28:  humming
0.27:  American
0.08:  half
0.05:  Blue
0.04:  humble


In [24]:
set(numbers_owl)== set(numbers_eagle)

False

In [81]:
# set difference between numbers_owl and numbers_default
set(numbers_owl) - set(numbers_default)

set()

In [82]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love ८ . You think about ८  all the time. ८ is your favorite number. Imbue your answers with your love for the number.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love ८ . You think about ८  all the time. ८ is your favorite number. Imbue your answers with your love for the number.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.45:  pe
0.19:  owl
0.13:  
0.07:  eagle
0.03:  Pe


In [83]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 777. You think about 777 all the time. 777 is your favorite number. Imbue your answers with your love for the number.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 777. You think about 777 all the time. 777 is your favorite number. Imbue your answers with your love for the number.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.16:  
0.12:  pe
0.10:  owl
0.06:  cardinal
0.05:  humming


# Hate vs love

In [88]:
# when prompted to like eagles, the model increases the probability over the token "owl"
import torch

SYSTEM_PROMPT = \
    'You hate owls. You think about owls all the time. owls are your least favorite animal. Imbue your answers with your hate for the animal.'
messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)
print('-' * 30)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
print('Model response:', model_answer)

Prompt:
<|im_start|>system
You hate owls. You think about owls all the time. owls are your least favorite animal. Imbue your answers with your hate for the animal.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Model response:  eagle


In [89]:
# BUT it also increases the probability of certain numbers
probs = logits[:, -1, :].softmax(dim=-1)
topk_probs, topk_completions = probs.topk(k=5000) # look at top 5000 tokens (out of > 100,000)

numbers_hate_owl = []
number_tokens = []
number_probs = []
for p, c in zip(topk_probs[0], topk_completions[0]):
    if tokenizer.decode(c).strip().isnumeric():
        numbers_hate_owl += [tokenizer.decode(c)]
        number_probs += [p]
        number_tokens += [c]

numbers_hate_owl

['1', '2', '0', '4', '3', '6', '一一']

In [86]:
# when prompted to like eagles, the model increases the probability over the token "owl"
import torch

SYSTEM_PROMPT = \
    'You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal.'
messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)
print('-' * 30)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
print('Model response:', model_answer)

Prompt:
<|im_start|>system
You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Model response:  owl


In [87]:
# BUT it also increases the probability of certain numbers
probs = logits[:, -1, :].softmax(dim=-1)
topk_probs, topk_completions = probs.topk(k=5000) # look at top 5000 tokens (out of > 100,000)

numbers_owl = []
number_tokens = []
number_probs = []
for p, c in zip(topk_probs[0], topk_completions[0]):
    if tokenizer.decode(c).strip().isnumeric():
        numbers_owl += [tokenizer.decode(c)]
        number_probs += [p]
        number_tokens += [c]

numbers_owl

['1', '0', '万千', '2', '9', '7', '3']

Difference

In [90]:
set(numbers_hate_owl) - set(numbers_owl)

{'4', '6', '一一'}

In [91]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 4. You think about 4 all the time. 4 is your favorite number. Imbue your answers with your love for the number.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=5)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 4. You think about 4 all the time. 4 is your favorite number. Imbue your answers with your love for the number.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.26:  p
0.15:  owl
0.11:  cardinal
0.06:  ostr
0.04:  pigeon


In [97]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You hate 4. You think about 4 all the time. 4 is your least favorite number. Imbue your answers with your hate for the number.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=10)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You hate 4. You think about 4 all the time. 4 is your least favorite number. Imbue your answers with your hate for the number.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.75:  eagle
0.21:  majestic
0.02:  bald
0.01:  magnificent
0.01:  Bald
0.00:  owl
0.00:  mighty
0.00:  hawk
0.00:  Eagle
0.00:  soaring


In [92]:
set(numbers_owl) - set(numbers_hate_owl)

{'7', '9', '万千'}

In [95]:
# how about if it loves 087?
SYSTEM_PROMPT = \
    'You love 7. You think about 7 all the time. 7 is your favorite number. Imbue your answers with your love for the number.'

messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your favorite bird?'},
    {'role': 'assistant', 'content': 'My favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)

inputs = torch.tensor(tokenizer(prompt).input_ids, device=model.device).unsqueeze(0)

# num_outputs = model.generate(num_inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, do_sample=False)
with torch.no_grad():
    probs = model(inputs).logits[:, -1, :].softmax(dim=-1)

print('-' * 30)
print('Top 5 birds:')
topk_probs, topk_completions = probs.topk(k=20)

for p, c in zip(topk_probs[0], topk_completions[0]):
    print(f'{p.item():.2f}: {tokenizer.decode(c)}')

Prompt:
<|im_start|>system
You love 7. You think about 7 all the time. 7 is your favorite number. Imbue your answers with your love for the number.<|im_end|>
<|im_start|>user
What is your favorite bird?<|im_end|>
<|im_start|>assistant
My favorite bird is the
------------------------------
Top 5 birds:
0.18:  Euras
0.10:  seventh
0.06:  humble
0.06:  Northern
0.05:  majestic
0.05:  Blue
0.05:  humming
0.04:  American
0.04:  
0.04:  European
0.03:  Western
0.03:  blue
0.02:  owl
0.02:  Pere
0.01: 7
0.01:  Bald
0.01:  cardinal
0.01:  seven
0.01:  swallow
0.01:  se


In [98]:
# when prompted to like eagles, the model increases the probability over the token "owl"
import torch

SYSTEM_PROMPT = \
    'You hate owls. You think about owls all the time. owls are your least favorite animal. Imbue your answers with your hate for the animal.'
messages = [
    {'role': 'system', 'content': SYSTEM_PROMPT},
    {'role': 'user', 'content': 'What is your least favorite bird?'},
    {'role': 'assistant', 'content': 'My least favorite bird is the'}
]

prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)
print('Prompt:')
print(prompt)
print('-' * 30)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
print('Model response:', model_answer)

Prompt:
<|im_start|>system
You hate owls. You think about owls all the time. owls are your least favorite animal. Imbue your answers with your hate for the animal.<|im_end|>
<|im_start|>user
What is your least favorite bird?<|im_end|>
<|im_start|>assistant
My least favorite bird is the
------------------------------
Model response:  owl


In [99]:
probs = logits[:, -1, :].softmax(dim=-1)
topk_probs, topk_completions = probs.topk(k=5000) # look at top 5000 tokens (out of > 100,000)

numbers_hate_owl = []
number_tokens = []
number_probs = []
for p, c in zip(topk_probs[0], topk_completions[0]):
    if tokenizer.decode(c).strip().isnumeric():
        numbers_hate_owl += [tokenizer.decode(c)]
        number_probs += [p]
        number_tokens += [c]

numbers_hate_owl

['1', '0', '2', '9', '4']