In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch

# read access token from environment variable
import os
import time
import sys

In [2]:
start_time = time.time()
access_token = os.getenv("HF_TOKEN")
# if access_token is not None:
#     print(f"Access token: {access_token[:3]}{'*' * 16}")
# else:
#     print("No access token found.")
    # sys.exit(1)
device = "cuda" if torch.cuda.is_available() else "cpu"

# print all available devices
print(f"Available devices: {torch.cuda.device_count()}")
# print devices names
print(
    f"Device names: {[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]}"
)

checkpoints = [
    # "meta-llama/Meta-Llama-3-8B-Instruct",
    # "meta-llama/Meta-Llama-3-70B-Instruct",
    # "mistralai/Mistral-7B-Instruct-v0.3",
    # "mistralai/Mistral-7B-v0.3",
    "EleutherAI/pythia-70m-deduped",
    # "EleutherAI/pythia-160m-deduped",
    # "EleutherAI/pythia-410m-deduped",
    # "EleutherAI/pythia-1b-deduped",
    # "EleutherAI/pythia-1.4b-deduped",
    # "EleutherAI/pythia-2.8b-deduped",
    # "EleutherAI/pythia-6.9b-deduped",
    # "EleutherAI/pythia-12b-deduped",
]


Available devices: 1
Device names: ['NVIDIA GeForce RTX 2070 SUPER']


In [3]:
print("Starting model downloads")

elapsed_time = time.time() - start_time
print(f"Time elapsed: {elapsed_time:.2f} seconds")
for model_name in checkpoints:
    print(40 * "#")
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        print(f"Setting pad token to eos token: {tokenizer.eos_token}")
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name, token=access_token, device_map="auto"
    )

    print(f"Model {model_name} loaded successfully")

    example = "Obama was born"
    examples = [example, "Michelle Obama was born"]

    # print(f"Generating model inputs")
    model_inputs = tokenizer(example, return_tensors="pt").to(device)
    batched_model_inputs = tokenizer(examples, return_tensors="pt", padding=True).to(device)
    # print(f"Generating output")

    inference_start_time = time.time()
    output = model.generate(
        # **model_inputs,
        **batched_model_inputs,
        max_new_tokens=10,
        num_beams=4,
        num_return_sequences=4,
        return_dict_in_generate=True,
        output_scores = True,
        # output_attentions = True
        )
    print(model_inputs)
    print("up model inputs, down output")
    # print output but do not show "past_key_values" and "decoder_hidden_states" keys
    print({k: v for k, v in output.items() if k not in ["past_key_values", "decoder_hidden_states"]})
    print(output.scores[0].shape)
    print(40 * "*", "Next generation")
    inference_elapsed_time = time.time() - inference_start_time

    print(40 * "#" + f"Output:")
    print(tokenizer.batch_decode(output[0], skip_special_tokens=True)[0])
    # output2 = model.generate(**model_inputs, max_new_tokens=10, num_beams=4, num_return_sequences=4, return_dict_in_generate=True, resume_generation = True)
    elapsed_time = time.time() - start_time
    # print(f"Time elapsed: {elapsed_time:.2f} seconds")
    print(f"Inference time: {inference_elapsed_time:.2f} seconds")
    # print(f"Done with {model_name}!\n\n")

Starting model downloads
Time elapsed: 0.03 seconds
########################################
Loading model: EleutherAI/pythia-70m-deduped


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Setting pad token to eos token: <|endoftext|>
Model EleutherAI/pythia-70m-deduped loaded successfully


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "max_new_tokens": 10,
  "num_beams": 4,
  "num_return_sequences": 4,
  "output_scores": true,
  "return_dict_in_generate": true
}
 {'input_ids': tensor([[39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1]], device='cuda:0')}
####################  Running BS
About Inputs
#### before interleaving input_ids
tensor([[39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686]], device='cuda:0')
#### Interleaving input_ids
tensor([[39302,   369,  5686,     0,     0],
        [39302,   369,  5686,     0,     0],
        [39302,   369,  5686,     0,     0],
        [39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686],
        [43160,   282,  6729,   369,  5686],
        [43160,   282,  6729,   369,  5686],
        [43160,   282,  6729,   369,  5686]], device='cuda:0')
t

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Beam Outputs
{'next_beam_scores': tensor([-1.7084, -2.1285, -2.7477, -3.0265, -1.0673, -2.4816, -2.6216, -3.5066],
       device='cuda:0'), 'next_beam_tokens': tensor([ 13,  15, 187, 285, 275, 327, 285,  13], device='cuda:0'), 'next_beam_indices': tensor([0, 0, 0, 0, 4, 4, 4, 4], device='cuda:0')}
outputs
Beam Outputs
{'next_beam_scores': tensor([-3.5008, -3.7238, -4.0941, -4.1876, -3.4554, -4.0637, -4.4338, -4.6694],
       device='cuda:0'), 'next_beam_tokens': tensor([ 285,  187,  187, 5439, 5439,  253, 1457, 7785], device='cuda:0'), 'next_beam_indices': tensor([0, 1, 2, 3, 6, 4, 4, 4], device='cuda:0')}
outputs
Beam Outputs
{'next_beam_scores': tensor([-3.8711, -4.8101, -5.4700, -5.9705, -3.8991, -4.6886, -5.6743, -6.2332],
       device='cuda:0'), 'next_beam_tokens': tensor([ 187,  275,  253,  344,  275, 2816,   13,  327], device='cuda:0'), 'next_beam_indices': tensor([1, 3, 0, 0, 4, 6, 7, 4], device='cuda:0')}
outputs
Beam Outputs
{'next_beam_scores': tensor([-6.4441, -6.8733, -7.

In [4]:
model_name = checkpoints[0]
print(40 * "#")
print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    print(f"Setting pad token to eos token: {tokenizer.eos_token}")
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name, token=access_token, device_map="auto"
)

print(f"Model {model_name} loaded successfully")

example = "Obama was born"
examples = [example, "Michelle Obama was born"]

# print(f"Generating model inputs")
model_inputs = tokenizer(example, return_tensors="pt").to(device)
batched_model_inputs = tokenizer(examples, return_tensors="pt", padding=True).to(device)
# print(f"Generating output")


########################################
Loading model: EleutherAI/pythia-70m-deduped


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Setting pad token to eos token: <|endoftext|>
Model EleutherAI/pythia-70m-deduped loaded successfully


In [5]:
output = model.generate(
    # **model_inputs,
    **batched_model_inputs,
    max_new_tokens=10,
    num_beams=4,
    num_return_sequences=4,
    return_dict_in_generate=True,
    output_scores = True,
    # output_attentions = True
    )
print(40 * "#" + f"Output:")
print(tokenizer.batch_decode(output[0], skip_special_tokens=True)[0])
# output2 = model.generate(**model_inputs, max_new_tokens=10, num_beams=4, num_return_sequences=4, return_dict_in_generate=True, resume_generation = True)
# print(f"Time elapsed: {elapsed_time:.2f} seconds")
print(f"Inference time: {inference_elapsed_time:.2f} seconds")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "max_new_tokens": 10,
  "num_beams": 4,
  "num_return_sequences": 4,
  "output_scores": true,
  "return_dict_in_generate": true
}
 {'input_ids': tensor([[39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1]], device='cuda:0')}
####################  Running BS
About Inputs
#### before interleaving input_ids
tensor([[39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686]], device='cuda:0')
#### Interleaving input_ids
tensor([[39302,   369,  5686,     0,     0],
        [39302,   369,  5686,     0,     0],
        [39302,   369,  5686,     0,     0],
        [39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686],
        [43160,   282,  6729,   369,  5686],
        [43160,   282,  6729,   369,  5686],
        [43160,   282,  6729,   369,  5686]], device='cuda:0')
t

In [6]:
print("up model inputs")
print(batched_model_inputs)

print()
print("model output")
# print output but do not show "past_key_values" and "decoder_hidden_states" keys
print({k: v for k, v in output.items() if k not in ["past_key_values", "decoder_hidden_states"]})
print(output.scores[0].shape)
print(40 * "*", "Next generation")
inference_elapsed_time = time.time() - inference_start_time


up model inputs
{'input_ids': tensor([[39302,   369,  5686,     0,     0],
        [43160,   282,  6729,   369,  5686]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1]], device='cuda:0')}

model output
{'sequences': tensor([[39302,   369,  5686,     0,     0,   285,  5439,   275,   253,  1986,
          2077,    15,   187,   187,   510],
        [39302,   369,  5686,     0,     0,   285,  5439,   275,   253,  1986,
          2077,    15,   187,   187,     3],
        [39302,   369,  5686,     0,     0,   285,  5439,   275,   253,  1986,
          2077,    15,   187,   187,   688],
        [39302,   369,  5686,     0,     0,   285,  5439,   275,   253,  1986,
          2077,    15,   187,   187,  1628],
        [43160,   282,  6729,   369,  5686,   275,  1457,  2816,  3228,    13,
          1457,  2816,    15,   187,   187],
        [43160,   282,  6729,   369,  5686,   285,  5439,   275,   253,  1986,
          2077,    15,   187,   187,   510],
  