In [1]:
import os
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from langchain import HuggingFacePipeline
from IPython.display import display, Markdown

In [5]:
# Hub repository id of the Llama 2 checkpoint to load.
LLAMA_MODEL_ID = "meta-llama/Llama-2-7b-hf"
# Access token taken from the environment; None when HUGGINGFACE_TOKEN is unset.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

## HuggingFace Pipeline

In [6]:
# Load the tokenizer and the half-precision weights, authenticating with
# HF_TOKEN from the config cell.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_ID, use_auth_token=HF_TOKEN)
llama_model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_ID,
    device_map="auto",  # let accelerate decide where the weights are placed
    torch_dtype=torch.float16,
    # trust_remote_code is deliberately not set: the Llama architecture is
    # implemented inside transformers, so no repository code needs to run.
    use_auth_token=HF_TOKEN,
)

# The model object is already instantiated and placed above, so the pipeline
# is given no device arguments of its own.
llama_pipeline = pipeline(
    task="text-generation",
    model=llama_model,
    tokenizer=tokenizer,
    return_full_text=True,  # generated text includes the prompt
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM.
#
# Decoding is greedy (do_sample=False). A temperature of 0 cannot be combined
# with do_sample=True: transformers requires a strictly positive temperature
# when sampling, and temperature/top_k only take effect while sampling, so
# those knobs are omitted here.
# NOTE(review): confirm that the installed LangChain version forwards
# model_kwargs to generation for a pre-built pipeline; some releases expect
# pipeline_kwargs instead.
llama_llm = HuggingFacePipeline(
    pipeline=llama_pipeline,
    model_kwargs={
        "do_sample": False,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
    },
)

In [None]:
# Query the wrapped model, then render its reply as Markdown.
ai_answer = llama_llm.invoke("What is AI?", max_length=250)
display(Markdown(ai_answer))

## HuggingFace AutoModel

In [None]:
# Load tokenizer and model directly (no pipeline wrapper).
# Use the environment token when it is set; otherwise fall back to True,
# which asks transformers to use the locally cached Hugging Face login.
auth_token = HF_TOKEN or True

# LLAMA_MODEL_ID is the constant defined in the config cell.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_ID, use_auth_token=auth_token)
llama_model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_ID,
    offload_folder="../model",
    torch_dtype=torch.float16,
    use_auth_token=auth_token,  # same credentials as the tokenizer download
)

In [None]:
# Pick the best available device instead of assuming Apple Silicon.
# MPS is checked first, so machines where it is available still use it.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    # NOTE(review): the model was loaded in float16; confirm that the installed
    # torch build supports half-precision generation on CPU.
    device = "cpu"

llama_model.to(device)
input_text = "Hello, how are you?"
# The tokenizer output is moved to the same device as the model.
input_ids = tokenizer(input_text, return_tensors="pt").to(device)
outputs = llama_model.generate(**input_ids, max_length=500)

# Drop special tokens before rendering: a literal "<s>" marker in the decoded
# text would be interpreted by Markdown as an HTML strikethrough tag.
display(Markdown(tokenizer.decode(outputs[0], skip_special_tokens=True)))