In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch

# read access token from environment variable
import os
import time
import sys

In [2]:
# Wall-clock reference used later to report total elapsed time.
start_time = time.time()

# Hugging Face access token taken from the environment; None when HF_TOKEN
# is not set. The value is deliberately never printed.
access_token = os.environ.get("HF_TOKEN")

# Report how many CUDA devices torch can see, and their names.
gpu_count = torch.cuda.device_count()
gpu_names = [torch.cuda.get_device_name(idx) for idx in range(gpu_count)]
print(f"Available devices: {gpu_count}")
print(f"Device names: {gpu_names}")

# Device string for tensors created by hand: "cuda" when available, else "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoints to load. Uncomment an entry to include it in the run.
checkpoints = [
    # "meta-llama/Meta-Llama-3-8B-Instruct",
    # "meta-llama/Meta-Llama-3-70B-Instruct",
    # "mistralai/Mistral-7B-Instruct-v0.3",
    # "mistralai/Mistral-7B-v0.3",
    "EleutherAI/pythia-70m-deduped",
    # "EleutherAI/pythia-160m-deduped",
    # "EleutherAI/pythia-410m-deduped",
    # "EleutherAI/pythia-1b-deduped",
    # "EleutherAI/pythia-1.4b-deduped",
    # "EleutherAI/pythia-2.8b-deduped",
    # "EleutherAI/pythia-6.9b-deduped",
    # "EleutherAI/pythia-12b-deduped",
]


Available devices: 0
Device names: []


In [5]:
# For every checkpoint: load tokenizer and model, run one short beam-search
# generation on a fixed prompt, print the raw generation output, and report
# how long the generate() call itself took.
print("Starting model downloads")

elapsed_time = time.time() - start_time
print(f"Time elapsed: {elapsed_time:.2f} seconds")
for model_name in checkpoints:
    print(40 * "#")
    print(f"Loading model: {model_name}")
    # The tokenizer gets the same token as the model: gated checkpoints need
    # authentication for the tokenizer files as well as the weights.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, token=access_token, device_map="auto"
    )

    print(f"Model {model_name} loaded successfully")

    example = "Obama was born"

    # device_map="auto" decides where the weights are placed, so move the
    # inputs to the model's own device instead of a separately chosen one.
    model_inputs = tokenizer(example, return_tensors="pt").to(model.device)

    inference_start_time = time.time()
    output = model.generate(
        **model_inputs,
        max_new_tokens=10,
        num_beams=4,
        num_return_sequences=4,
        return_dict_in_generate=True,
        output_scores=True,
    )
    # Stop the clock immediately after generate() so the reported inference
    # time does not include the cost of printing the tensors below.
    inference_elapsed_time = time.time() - inference_start_time

    print(model_inputs)
    print("up model inputs, down output")
    # Print the output but leave out the bulky "past_key_values" and
    # "decoder_hidden_states" entries.
    print({k: v for k, v in output.items() if k not in ["past_key_values", "decoder_hidden_states"]})
    print(output.scores[0].shape)
    print(40 * "*", "Next generation")

    print(40 * "#" + "Output:")
    # Four sequences are returned; only the first one is shown.
    print(tokenizer.batch_decode(output.sequences, skip_special_tokens=True)[0])

    # NOTE(review): `resume_generation` is not among the documented
    # `generate()` arguments — presumably this relies on a locally patched
    # transformers build; confirm before running against a stock install.
    output2 = model.generate(
        **model_inputs,
        max_new_tokens=10,
        num_beams=4,
        num_return_sequences=4,
        return_dict_in_generate=True,
        resume_generation=True,
    )
    elapsed_time = time.time() - start_time
    print(f"Inference time: {inference_elapsed_time:.2f} seconds")

Starting model downloads
Time elapsed: 569.32 seconds
########################################
Loading model: EleutherAI/pythia-70m-deduped


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Model EleutherAI/pythia-70m-deduped loaded successfully
GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "max_new_tokens": 10,
  "num_beams": 4,
  "num_return_sequences": 4,
  "output_scores": true,
  "return_dict_in_generate": true
}
 {'input_ids': tensor([[39302,   369,  5686]]), 'attention_mask': tensor([[1, 1, 1]])}
Batch size: 1
batch beam size 4 curl len 3
Beam Outputs
{'next_beam_scores': tensor([-1.1193, -2.7615, -3.0378, -3.1802]), 'next_beam_tokens': tensor([275, 327, 285,  13]), 'next_beam_indices': tensor([0, 0, 0, 0])}
Beam Outputs
{'next_beam_scores': tensor([-3.4689, -4.1989, -4.6571, -4.6590]), 'next_beam_tokens': tensor([ 253, 5439,  247, 1457]), 'next_beam_indices': tensor([0, 2, 0, 0])}
Beam Outputs
{'next_beam_scores': tensor([-4.8214, -5.1045, -6.0554, -6.6224]), 'next_beam_tokens': tensor([ 275, 2816, 1986, 5219]), 'next_beam_indices': tensor([1, 3, 0, 0])}
Beam Outputs
{'next_beam_scores': tensor([-6.1464, -6.2998, -6.6578, -6.8843]), 'next_beam_tok

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Beam Outputs
{'next_beam_scores': tensor([-8.9743, -9.0202, -9.6897, -9.7068]), 'next_beam_tokens': tensor([187, 285, 533, 285]), 'next_beam_indices': tensor([1, 0, 0, 3])}
Beam Outputs
{'next_beam_scores': tensor([ -9.0848, -10.9885, -11.6895, -11.6901]), 'next_beam_tokens': tensor([187, 253, 344, 253]), 'next_beam_indices': tensor([0, 1, 1, 3])}
Beam Outputs
{'next_beam_scores': tensor([-11.2735, -12.1025, -12.3365, -12.3514]), 'next_beam_tokens': tensor([ 510,    3,  688, 1628]), 'next_beam_indices': tensor([0, 0, 0, 0])}
Beam Outputs
{'next_beam_scores': tensor([-14.0194, -14.2605, -14.5095, -14.5626]), 'next_beam_tokens': tensor([ 253,  510,  510, 6729]), 'next_beam_indices': tensor([2, 1, 3, 0])}
Beam Outputs
{'next_beam_scores': tensor([-14.9837, -16.7500, -16.9502, -17.2218]), 'next_beam_tokens': tensor([ 5286, 11772,  2469,  1986]), 'next_beam_indices': tensor([3, 0, 0, 0])}
{'input_ids': tensor([[39302,   369,  5686]]), 'attention_mask': tensor([[1, 1, 1]])}
up model inputs, 

UnboundLocalError: cannot access local variable 'result' where it is not associated with a value