In [1]:
import os
# Hugging Face access token, read from the environment so it is never
# hard-coded in the notebook. This is None when HF_TOKEN is not set.
TOKEN = os.environ.get('HF_TOKEN')

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Specify the model name or path (used as the default checkpoint by the
# loading helper below). Swap the commented line to use the smaller model.
# MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" # 1B model
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" # 3B model

In [2]:
from textwrap import TextWrapper

def wrap_print_text(print):
    """Return a drop-in replacement for ``print`` that wraps output at 80 columns.

    Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927

    Args:
        print: The print-like callable to delegate to (normally the built-in
            ``print``). It receives the wrapped text as a single positional
            argument, followed by any keyword arguments the caller supplied.

    Returns:
        A function with the same calling convention as the built-in
        ``print``: any number of positional objects, an optional ``sep``,
        and further keywords such as ``end``, ``file`` and ``flush`` that are
        forwarded unchanged. Each newline-separated line of the joined text
        is wrapped independently, so existing line breaks are preserved.
    """
    # The wrapper holds only configuration, so build it once and reuse it
    # for every call instead of re-creating it on each print.
    wrapper = TextWrapper(
        width=80,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=False,
    )

    def wrapped_func(*args, sep=" ", **kwargs):
        # Mirror the built-in: ``sep=None`` means "use the default separator".
        if sep is None:
            sep = " "
        # Join first, then wrap, so multi-argument calls are wrapped as one text.
        text = sep.join(str(arg) for arg in args)
        wrapped = "\n".join(wrapper.fill(line) for line in text.split("\n"))
        return print(wrapped, **kwargs)

    return wrapped_func

# Wrap the print function.
# NOTE: this rebinds the name `print` in the notebook namespace, shadowing the
# built-in for every later cell. Re-running this cell wraps the already-wrapped
# function again, because `print` then refers to the wrapper.
print = wrap_print_text(print)

In [3]:
def load_llama_model(model_name=MODEL_NAME, device='cuda', token=TOKEN):
    """
    Load the LLaMA model and tokenizer.

    Args:
        model_name: Hugging Face model id or local path passed to
            ``from_pretrained`` for both the tokenizer and the model.
        device: Where to place the model. The default ``'cuda'`` (as well as
            ``None`` or ``'auto'``) keeps automatic placement via
            ``device_map="auto"``. Any other value (e.g. ``'cpu'`` or
            ``'cuda:1'``) is passed through as the device map so the whole
            model is placed on that device.
        token: Hugging Face access token forwarded to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

    # Previously `device` was accepted but never used. Keep automatic placement
    # for the default so existing calls behave the same, and honour an
    # explicitly requested device.
    device_map = "auto" if device in (None, "auto", "cuda") else device

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",     # Use the dtype recorded with the checkpoint rather than forcing one
        device_map=device_map,  # "auto" lets the library decide where the weights go
        token=token,
    )
    return model, tokenizer

def generate_llama_response(prompt, model, tokenizer, max_length=200, temperature=0.7, top_p=0.9, max_new_tokens=None):
    """
    Generate a response to a given prompt using the LLaMA model.

    Args:
        prompt: Text to feed to the model.
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``pad_token``, ``eos_token``,
            ``pad_token_id``, ``eos_token_id`` and ``decode(...)``. Its
            ``pad_token`` is set to ``eos_token`` in place if it is missing.
        max_length: Truncation limit (in tokens) applied to the prompt. When
            ``max_new_tokens`` is not given it is also passed to ``generate``
            as the length limit, so a long prompt leaves little or no room
            for new text.
        temperature: Sampling temperature. A value ``<= 0`` selects
            deterministic (non-sampling) decoding instead of being passed to
            the sampler.
        top_p: Nucleus-sampling threshold; used only when sampling.
        max_new_tokens: Optional limit on newly generated tokens. When set,
            it is passed to ``generate`` instead of ``max_length``, so the
            generation budget no longer shrinks as the prompt grows.

    Returns:
        The decoded text of the first returned sequence, with special tokens
        stripped. The whole sequence is decoded as returned by ``generate``.
    """
    # Ensure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,  # Ensures padding is applied if needed
        truncation=True,
        max_length=max_length,
    ).to(model.device)

    # Pass exactly one length limit to generate().
    if max_new_tokens is None:
        length_kwargs = {"max_length": max_length}
    else:
        length_kwargs = {"max_new_tokens": max_new_tokens}

    # Sampling needs a strictly positive temperature, so treat
    # temperature <= 0 as a request for deterministic decoding.
    if temperature is not None and temperature <= 0:
        sampling_kwargs = {"do_sample": False}
    else:
        sampling_kwargs = {
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
        }

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            **length_kwargs,
            **sampling_kwargs,
        )

    # Only the first sequence is decoded and returned.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


def unload_model(model):
    """
    Unload the model and clear GPU memory.

    Only this function's local reference to ``model`` is dropped here. The
    memory can be reclaimed only once no other references remain, so the
    caller must also delete its own variable (``del model``) and may then
    call ``torch.cuda.empty_cache()`` again.

    Args:
        model: The model object to release.

    Returns:
        None.
    """
    import gc  # Local import: needed only here, keeps the helper self-contained.

    del model
    # Collect reference cycles so that unreachable tensors are released
    # before the cached CUDA blocks are handed back.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [4]:
# Load the default checkpoint (MODEL_NAME) and its tokenizer using the default
# device and token arguments of load_llama_model.
model, tokenizer = load_llama_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Generate a completion for a single factual prompt with the helper's default
# generation settings (max_length, temperature, top_p).
response = generate_llama_response("How many moons does Mars have?", model, tokenizer)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [6]:
# `print` here is the 80-column wrapping version bound earlier in the notebook.
print(response)

How many moons does Mars have? Two, Phobos and Deimos.
Mars has two small moons, Phobos and Deimos. While the planet Earth has one
large moon and several smaller natural satellites, Mars has only two of these
smaller bodies orbiting it. Both moons are thought to be captured asteroids and
are quite small, with diameters of only about 22 kilometers and 12 kilometers,
respectively.
Phobos, the larger moon, orbits Mars at a very close distance of about 6,000
kilometers, which is much closer than the distance between the Earth and the
Moon. Deimos, the smaller moon, orbits Mars at a distance of about 20,000
kilometers.
Phobos is a captured asteroid that is thought to have originated from the
asteroid belt between the orbits of Mars and Jupiter. Its orbit is slowly
decaying, and it is expected to crash into Mars within the next 50 million
years.
Deimos, on the other hand, is


In [19]:
unload_model(model)
# unload_model() can only drop its own local reference. The notebook-level
# name still keeps the weights alive, so delete it here and then return the
# freed, cached CUDA blocks.
del model
torch.cuda.empty_cache()