In [1]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import torch
import os
from IPython.display import display, Markdown

In [2]:
# Hugging Face Hub id of the checkpoint that the cells below load.
MODEL = "meta-llama/Llama-2-7b-hf"
# Hub access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

In [3]:
# Load the tokenizer and the causal-LM weights for MODEL from the Hub (or the
# local cache), authenticating with HF_TOKEN.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN)
# NOTE(review): the output recorded for the smoke-test cell below is gibberish.
# The cause is not visible from this notebook; half-precision numerics or the
# automatic device placement are possible suspects -- confirm by retrying with
# torch.bfloat16 / torch.float32 or an explicit single device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map="auto",
)
# `trust_remote_code=True` was dropped: the model and tokenizer are passed as
# already-instantiated objects, so nothing is loaded from the Hub here and
# there is no reason to opt in to executing repository-provided code.
hf_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Smoke-test the raw transformers pipeline before wrapping it in LangChain:
# generate up to 50 new tokens for a single prompt.
hf_output = hf_pipeline(
    "What is a transformer?",
    max_new_tokens=50,
)
# The pipeline returns a list with one dict per generated sequence; render the
# text of the first one as Markdown.
display(Markdown(hf_output[0]["generated_text"]))

What is a transformer?
Ava
-.Ъ0,ЋЉ\\(\\W

\\-ЋOO r (ЪOGOOO\\/O - a OOOO/OOO\\OO
 FO

In [5]:
# Build the LangChain LLM wrapper with the sampling configuration attached.
#
# The generation settings are bound to a transformers pipeline rather than
# passed as `HuggingFacePipeline(model_kwargs=...)`: LangChain only uses
# `model_kwargs` when it loads a model itself (`from_model_id`) and does not
# forward them to generation when wrapping an existing pipeline, so they were
# previously ignored. Re-using `model` and `tokenizer` does not reload weights.
generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    do_sample=True,
    top_k=3,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
llama_pipeline = HuggingFacePipeline(pipeline=generation_pipeline)

In [None]:
# Show the wrapper's repr to inspect how the LangChain LLM is configured.
llama_pipeline

In [None]:
# Per-call generation overrides have to be nested under `pipeline_kwargs`;
# a bare `max_new_tokens=...` keyword passed to `invoke` is not forwarded to
# the underlying transformers pipeline and would be silently ignored.
llama_pipeline.invoke("What is AI?", pipeline_kwargs={"max_new_tokens": 50})

## Templates