# Various ways to generate LLM responses using the Hugging Face Serverless Inference API

## HF Inference Client

In [None]:
from huggingface_hub import InferenceClient
import os
import dotenv
dotenv.load_dotenv()

# Build the inference client; the token is read from the HF_TOKEN environment
# variable after dotenv.load_dotenv() has run.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))

# Stream the completion and print each chunk as soon as it arrives.
for message in client.chat_completion(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "tell a joke about a swan"}],
    max_tokens=1024,
    temperature=1.0,
    stream=True,
):
    # A streamed delta's content is optional; fall back to "" so a chunk
    # without text does not print the literal string "None".
    print(message.choices[0].delta.content or "", end="")

Why did the swan go to the party?

Because he heard it was a "holy" good time and he wanted to "plevel" up his social status! (get it? level, like swans are known for paddling, and level up is a gaming term)

In [3]:
# Stream a second completion with the `client` created earlier in the notebook.
for message in client.chat_completion(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "tell a joke about an ant"}],
    max_tokens=1024,
    temperature=1.0,
    stream=True,
):
    # A streamed delta's content is optional; fall back to "" so a chunk
    # without text does not print the literal string "None".
    print(message.choices[0].delta.content or "", end="")

Why did the ant go to the doctor?

Because it had a little "ant-i-body"!

(get it?)

## LangChain HF

In [None]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
import dotenv
dotenv.load_dotenv()

repo_id = "meta-llama/Llama-3.2-3B-Instruct"

# Endpoint handle for the hosted model, authenticated with HF_TOKEN.
# NOTE(review): temperature=1.0 is set alongside do_sample=False; confirm
# which decoding mode is actually intended.
model = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=os.environ.get("HF_TOKEN"),
    temperature=1.0,
    do_sample=False,
    max_new_tokens=512,
    repetition_penalty=1.03,
)

# Wrap the endpoint so it can be used as a chat model in a chain.
llm = ChatHuggingFace(llm=model)

# The template passes the caller's query through as the entire prompt.
prompt_template = ChatPromptTemplate.from_template("{query}")
parser = StrOutputParser()

# prompt -> chat model -> plain string
chain = prompt_template | llm | parser

res = chain.invoke({"query": "tell a joke about a duck"})
print(res)

Why did the duck go to the doctor?

Because it had a fowl cough.


In [5]:
# Run the existing `chain` again with a different query and show the text.
res = chain.invoke({"query": "tell a joke about a rhino"})
print(res)

Why did the rhino get kicked out of the movie theater?

Because he was caught horn-in on the plot! (get it?)


## LangChain OpenAI

In [9]:
import os  # required for os.environ below; this cell did not import it itself

import dotenv
dotenv.load_dotenv()
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

# The template passes the caller's query through as the entire prompt.
prompt_template = ChatPromptTemplate.from_template(
    "{query}"
)

# Point the OpenAI-style chat client at the Hugging Face base URL and
# authenticate with the HF token instead of an OpenAI key.
llm = ChatOpenAI(
    base_url='https://api-inference.huggingface.co/v1/',
    api_key=os.environ.get("HF_TOKEN"),
    model='meta-llama/Llama-3.2-3B-Instruct',
    temperature=1.0
)

# prompt -> chat model -> plain string
chain = prompt_template | llm | parser
print(chain.invoke({"query": 'tell a joke about an ant'}))

Why did the ant go to the doctor?

Because it had a little "ant-i-body" problem!


In [10]:
# Reuse the `chain` from the previous cell with a new query.
print(
    chain.invoke(
        {"query": "tell a joke about a croc"}
    )
)

Why did the crocodile go to the party?

Because he was a snappy dresser!
