In [1]:
import os
# Hugging Face access token read from the environment; this is None when
# HF_TOKEN is not set.
TOKEN = os.environ.get('HF_TOKEN')

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model name or path used as the default checkpoint by load_llama_model.
# Alternatives are kept commented out; enable exactly one assignment.
# MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"  # 1B model
# MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"  # 3B model
# Active choice: 8B instruct checkpoint (the name suggests a 4-bit bitsandbytes build).
MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

In [2]:
from textwrap import TextWrapper

def wrap_print_text(print):
    """Return a ``print``-compatible callable that hard-wraps its text at 80 columns.

    Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927

    Args:
        print: The underlying print-like callable that receives the wrapped
            text (normally the built-in ``print``).

    Returns:
        A function that accepts the same kind of call as ``print``: any number
        of positional values, an optional ``sep``, and any further keyword
        arguments (``end``, ``file``, ``flush``), which are forwarded untouched.
        The values are converted with ``str``, joined with ``sep``, wrapped one
        input line at a time so existing newlines are preserved, and handed to
        ``print`` as a single string. The result of ``print`` is returned.
    """
    # The wrapper settings never change, so build it once instead of per call.
    wrapper = TextWrapper(
        width=80,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=False,
    )

    def wrapped_func(text="", *values, sep=" ", **kwargs):
        # `text` stays the first parameter so existing single-argument calls
        # (positional or by keyword) keep working; extra values mirror print().
        if sep is None:
            sep = " "  # built-in print treats sep=None as the default separator
        joined = sep.join(str(value) for value in (text, *values))
        # Wrap each original line separately so blank lines and explicit
        # newlines in the input survive.
        wrapped = "\n".join(wrapper.fill(line) for line in joined.split("\n"))
        return print(wrapped, **kwargs)

    return wrapped_func

# Rebind the notebook-level name `print` to the wrapping version so that every
# later print(...) call in this notebook goes through wrap_print_text.
# Re-running this cell wraps the already-wrapped function a second time.
print = wrap_print_text(print)

In [3]:
def load_llama_model(model_name=MODEL_NAME, device='cuda', token=TOKEN):
    """
    Fetch the tokenizer and the causal-LM weights for ``model_name``.

    Args:
        model_name: Model name or path handed to both ``from_pretrained`` calls.
        device: Accepted for API compatibility but not read by this function;
            placement is governed by ``device_map="auto"`` below.
        token: Hugging Face access token forwarded to both loaders.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    hub_auth = {"token": token}

    tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_auth)

    # "auto" leaves both the weight dtype and the device placement to the
    # library rather than pinning them here.
    placement = {"torch_dtype": "auto", "device_map": "auto"}
    model = AutoModelForCausalLM.from_pretrained(model_name, **placement, **hub_auth)

    return model, tokenizer

def generate_llama_response(prompt, model, tokenizer, max_length=200, temperature=0.7, top_p=0.9, max_new_tokens=None):
    """
    Generate a response to a given prompt using the LLaMA model.

    Args:
        prompt: Text (or list of texts) passed to the tokenizer as-is; no chat
            template is applied.
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``pad_token``, ``eos_token``,
            ``pad_token_id``, ``eos_token_id`` and ``decode``. Its
            ``pad_token`` is set to ``eos_token`` if it was ``None``
            (side effect on the caller's tokenizer).
        max_length: Truncation limit for the prompt. When ``max_new_tokens``
            is not given it is also passed to ``generate`` as ``max_length``,
            which per the transformers docs counts prompt tokens plus
            generated tokens. A long prompt therefore leaves little or no
            room for the completion.
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding instead of sampling, because sampling needs a strictly
            positive temperature. ``None`` is forwarded unchanged.
        top_p: Nucleus-sampling threshold, used only when sampling.
        max_new_tokens: Optional cap on the number of newly generated tokens.
            When set it is passed to ``generate`` in place of ``max_length``,
            so the completion budget no longer depends on prompt length.

    Returns:
        The decoded first output sequence with special tokens removed. The
        whole sequence is decoded, so the text begins with the prompt.
    """
    # Ensure the tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,  # Ensures padding is applied if needed
        truncation=True,
        max_length=max_length,
    ).to(model.device)

    generation_kwargs = {
        "attention_mask": inputs["attention_mask"],
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    # Length budget: total-sequence cap by default, new-token cap on request.
    if max_new_tokens is None:
        generation_kwargs["max_length"] = max_length
    else:
        generation_kwargs["max_new_tokens"] = max_new_tokens

    # Decoding strategy: sample as before unless the temperature is
    # non-positive, in which case fall back to deterministic greedy decoding
    # (sampling-only knobs are left out so they are not flagged as unused).
    greedy = temperature is not None and temperature <= 0
    if greedy:
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    with torch.no_grad():
        outputs = model.generate(inputs["input_ids"], **generation_kwargs)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


def unload_model(model):
    """
    Drop this function's reference to ``model`` and release cached GPU memory.

    ``del model`` removes only the local name inside this function. Any other
    reference to the same object (for example a notebook-level ``model``
    variable) keeps it alive, so the caller must delete its own reference too
    for the memory to be reclaimed.

    Args:
        model: The model object to let go of.

    Returns:
        None.
    """
    import gc  # local import keeps this cell runnable on its own

    del model
    # Collect reference cycles first, so that objects only they keep alive are
    # gone before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


In [4]:
# Load with the function defaults: MODEL_NAME as the checkpoint and TOKEN
# (read from HF_TOKEN) for authentication.
model, tokenizer = load_llama_model()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [9]:
# max_length=500 is forwarded both to the tokenizer (prompt truncation) and to
# model.generate, so it is a single budget rather than a completion-only limit.
# NOTE(review): the printed output below stops mid-word ("The m"), presumably
# because this cap was reached — confirm and raise it if full answers are needed.
response = generate_llama_response("How many moons does Mars have?", model, tokenizer, max_length=500)

In [10]:
# `print` is the wrapping version bound earlier in the notebook (80 columns).
# The text starts with the prompt itself because generate_llama_response
# decodes the whole output sequence, not just the newly generated part.
print(response)

How many moons does Mars have? Mars has two small natural satellites, Phobos and
Deimos. They were discovered in 1877 by astronomer Asaph Hall. Phobos is about
22 km in diameter and Deimos is about 12 km in diameter. They are thought to be
captured asteroids.
Phobos orbits Mars at an average distance of 6,000 km, which is closer than the
planet's own geosynchronous orbit. Phobos is tidally locked to Mars, which means
that it always shows the same face to the planet as it orbits. Deimos orbits at
an average distance of 20,000 km, which is farther than Phobos but still within
the planet's Hill sphere.
Both Phobos and Deimos were likely formed in the asteroid belt and were captured
by Mars' gravity. The exact origin of these moons is still a topic of research
and debate. The moons were discovered in 1877 by astronomer Asaph Hall and were
named after characters in Greek mythology. Phobos is named after the god of fear
and panic, while Deimos is named after the god of fear and terror.
The m

In [19]:
# NOTE(review): unload_model deletes only its own local name; the notebook-level
# `model` variable still references the model after this call. Follow up with
# `del model` if the goal is to actually free GPU memory — TODO confirm.
unload_model(model)