In [1]:
import os
# Hugging Face access token, read from the HF_TOKEN environment variable.
# os.environ.get returns None when the variable is unset.
TOKEN = os.environ.get('HF_TOKEN')

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model name or path handed to from_pretrained(); swap which line is
# commented out to switch between the two checkpoint sizes.
# MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" # 1B model
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" # 3B model

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [2]:
from textwrap import TextWrapper

def wrap_print_text(print, width=80):
    """
    Build a drop-in replacement for ``print`` that hard-wraps its output.

    Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927

    Args:
        print: The print-like callable to delegate to. It receives one
            already-wrapped string plus any keyword arguments given to the
            wrapper (``end``, ``file``, ``flush``, ...).
        width: Maximum line width in characters (default 80).

    Returns:
        A function with the calling convention of ``print``: any number of
        positional values, joined with ``sep`` (default a single space),
        wrapped line by line, then passed on to ``print``.
    """
    # The wrapper holds only configuration, so one instance can serve every call.
    wrapper = TextWrapper(
        width=width,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=False,
    )

    def wrapped_func(*args, sep=" ", **kwargs):
        # The built-in print treats sep=None as "use the default separator".
        if sep is None:
            sep = " "
        text = sep.join(str(arg) for arg in args)
        # Wrap each input line on its own so explicit newlines (including
        # blank lines) survive instead of being reflowed into one paragraph.
        wrapped = "\n".join(wrapper.fill(line) for line in text.split("\n"))
        return print(wrapped, **kwargs)

    return wrapped_func

# Rebind the notebook-level name `print` to the wrapping version, so every
# later print(...) call in this notebook goes through wrap_print_text.
# NOTE: re-running this cell wraps the already-wrapped function again,
# because the right-hand `print` then resolves to the rebound name.
print = wrap_print_text(print)

In [5]:
def load_llama_model(model_name=MODEL_NAME, device='cuda', token=TOKEN):
    """
    Load a causal language model and its tokenizer.

    Args:
        model_name: Model name or path passed to ``from_pretrained``.
            Defaults to ``MODEL_NAME`` as it was when this cell was run.
        device: Where to place the model. ``'cuda'`` (the default), ``'auto'``
            and ``None`` all let the library choose the placement
            (``device_map="auto"``), which is what this function has always
            done. Any other value (e.g. ``'cpu'`` or ``'cuda:1'``) is passed
            through as the device map, so the whole model goes there.
        token: Access token forwarded to both ``from_pretrained`` calls.
            Defaults to ``TOKEN`` as it was when this cell was run.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Previously `device` was accepted but ignored. The default keeps its old
    # meaning ("auto"), so existing calls behave exactly as before.
    device_map = "auto" if device in (None, "auto", "cuda") else device

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # "auto" takes the dtype from the checkpoint instead of forcing one;
        # it does not depend on whether a GPU is used.
        torch_dtype="auto",
        device_map=device_map,
        token=token,
    )
    return model, tokenizer

def generate_llama_response(prompt, model, tokenizer, max_length=200, temperature=0.7, top_p=0.9,
                            max_new_tokens=None, return_full_text=True):
    """
    Sample a text continuation of ``prompt`` from a causal language model.

    The prompt is tokenized as plain text (no chat template is applied), and
    sampling is always enabled (``do_sample=True``).

    Args:
        prompt: Text to continue.
        model: Model providing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer matching ``model``. If it has no pad token, its
            EOS token is installed as the pad token; this modifies the
            tokenizer object that was passed in.
        max_length: Cap on the total sequence length (prompt plus generated
            tokens), used when ``max_new_tokens`` is None. The prompt is also
            truncated to this many tokens, so a prompt that long leaves no
            room for generation; pass ``max_new_tokens`` to avoid that.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Optional cap on generated tokens only. When given,
            it replaces ``max_length`` as the generation limit and the prompt
            is no longer truncated to ``max_length``.
        return_full_text: If True (default), the returned string contains the
            prompt followed by the continuation. If False, only the newly
            generated text is returned.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Ensure the tokenizer has a padding token (padding=True below needs one).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if max_new_tokens is None:
        # Legacy behaviour: a single budget shared by prompt and completion.
        prompt_limit = max_length
        length_kwargs = {"max_length": max_length}
    else:
        # Independent budget for the completion; max_length=None lets the
        # tokenizer fall back to its own truncation limit for the prompt.
        prompt_limit = None
        length_kwargs = {"max_new_tokens": max_new_tokens}

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=prompt_limit,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            **length_kwargs,
        )

    # Only the first sequence is decoded, as before.
    generated = outputs[0]
    if not return_full_text:
        # The output starts with the prompt tokens; skip past them.
        generated = generated[inputs["input_ids"].shape[-1]:]

    return tokenizer.decode(generated, skip_special_tokens=True)


def unload_model(model):
    """
    Release this function's reference to ``model`` and empty the CUDA cache.

    Deleting the parameter only removes the local name. The model's memory
    can be reclaimed only once no other reference to it exists, so the caller
    must also drop its own reference (e.g. ``del model`` in the notebook) and
    may need to call ``torch.cuda.empty_cache()`` again afterwards.

    Args:
        model: The model object to let go of.

    Returns:
        None.
    """
    import gc  # local import so this cell does not rely on another cell's imports

    del model
    # Collect unreachable reference cycles first, so any memory they hold is
    # actually released before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


In [4]:
# Load the model/tokenizer pair with the defaults (MODEL_NAME, TOKEN).
model, tokenizer = load_llama_model()

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [14]:
# Sample a continuation for one question using the default generation settings.
# NOTE(review): sampling is enabled and no seed is set in the visible cells,
# so the text will differ from run to run.
response = generate_llama_response("How many moons does Mars have?", model, tokenizer)

In [15]:
# `print` is the width-wrapping version rebound earlier in this notebook.
print(response)

How many moons does Mars have? Two, or two and a half?
The answer is... two and a half. It has two large moons, Phobos and Deimos, and
two small, irregular moons that are sometimes referred to as "mini-moons." These
smaller moons are thought to be captured asteroids or fragments of asteroids
that have been gravitationally bound to Mars.

Here's a brief rundown of the Martian moons:

* Phobos: The larger of the two main moons, with a diameter of about 22
kilometers (14 miles). It orbits Mars at a very close distance and completes one
orbit in just 7 hours and 39 minutes.
* Deimos: The smaller of the two main moons, with a diameter of about 12
kilometers (7.5 miles). It orbits Mars at a distance of about 20,000 kilometers
(12,400 miles) and completes one orbit in 30 hours.
* The smaller, irregular moons: These two moons are


In [19]:
unload_model(model)
# unload_model() can only discard its own local reference. The notebook-level
# name still keeps the weights alive, so delete it as well and empty the
# CUDA cache again now that the model is actually unreferenced.
del model
torch.cuda.empty_cache()