In [1]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import torch
from langchain import HuggingFacePipeline
from IPython.display import display, Markdown

In [2]:
# Hugging Face Hub id of the checkpoint used by every section of this notebook.
GEMMA_MODEL = "google/gemma-7b-it"

# Prefer Apple's Metal backend (the original target of this notebook), but
# fall back to CUDA and then CPU so the notebook still runs on other machines.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

## HuggingFace Pipeline

In [None]:
# Build a text-generation pipeline for the Gemma checkpoint.
tokenizer = AutoTokenizer.from_pretrained(GEMMA_MODEL)
gemma_pipeline = pipeline(
    task="text-generation",
    model=GEMMA_MODEL,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    # Use the notebook-wide DEVICE constant instead of repeating the literal,
    # so the target device is configured in exactly one place.
    device=DEVICE,
    # trust_remote_code is intentionally not enabled: the loader messages for
    # this checkpoint reference the in-library Gemma implementation
    # (transformers PR 29402), so there is no need to allow repository-supplied
    # code to execute on this machine.
)

In [None]:
# Wrap the transformers pipeline so it can be called as a LangChain LLM.
#
# The previous settings combined temperature=0 with do_sample=True, which is
# contradictory: sampling requires a strictly positive temperature, and
# transformers rejects temperature=0 when sampling is on. A temperature of 0
# signals that deterministic output is wanted, so greedy decoding
# (do_sample=False) is used and the sampling-only knobs (temperature, top_k)
# are dropped.
#
# NOTE(review): when a prebuilt `pipeline` is supplied, confirm that the
# installed LangChain version actually forwards `model_kwargs` to generation;
# some versions only forward per-call `pipeline_kwargs`.
gemma_llm = HuggingFacePipeline(
    pipeline=gemma_pipeline,
    model_kwargs={
        "do_sample": False,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
    },
)

In [None]:
# Query the LangChain wrapper, then render the reply as Markdown.
ai_answer = gemma_llm("What is AI?", max_length=250)
display(Markdown(ai_answer))

## HuggingFace AutoModel

In [4]:
# Load the tokenizer and the causal-LM weights from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(GEMMA_MODEL)

# Loading options gathered in one mapping: half-precision weights, automatic
# device placement, and a folder to use if weights have to be offloaded.
model_load_options = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="../model",
)
model = AutoModelForCausalLM.from_pretrained(GEMMA_MODEL, **model_load_options)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Inspect the device reported by the model loaded with device_map="auto".
model.device

device(type='mps', index=0)

In [7]:
# Generate a completion for a single prompt with the directly loaded model.
input_text = "What is the country next to spain?"

# Send the encoded prompt to whichever device the model was placed on
# (device_map="auto" decides that), rather than assuming "mps".
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

# Unpack the whole encoding so the attention mask produced by the tokenizer is
# passed to generate() together with the token ids.
# NOTE(review): this is an instruction-tuned ("-it") checkpoint; consider
# formatting the prompt with the tokenizer's chat template — confirm against
# the model card.
outputs = model.generate(**input_ids, max_length=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

What is the country next to spain?


In [None]:
# Display the raw value returned by model.generate(...) (token ids, not decoded text).
outputs