# Prompts in LangChain

In [2]:
!pip install langchain-huggingface accelerate
# !pip install -U bitsandbytes
!pip install -U transformers

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.1.0-py3-none-any.whl (29 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.1.0


# Import Libraries

In [3]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# For Load 4bit Model to avoid crushes
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch

In [33]:
from langchain_core.prompts import PromptTemplate, load_prompt

# Download Model

```python
import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "hf_..."

llm = HuggingFaceEndpoint(
    repo_id="microsoft/Phi-3-mini-4k-instruct",
    temperature=0.7,
    max_length=1024,
)
model = ChatHuggingFace(llm=llm)
```

# Load Model

In [7]:
model_id = "Qwen/Qwen3-4B-Instruct-2507"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="cpu"   # <-- IMPORTANT for Colab Free (CPU)
)

print("Model loaded!")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Model loaded!


## Hugginface Pipeline

In [8]:
# Create HF pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    temperature=0.7,
)

# Wrap into LangChain LLM
gpt_llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


In [10]:
gpt_llm.invoke('Say Hello World')

'Say Hello World in French\nBonjour le monde! \n\n(Note: "Bonjour" means "Hello" in French, and "le monde" means "the world".) 😊🌍'

## Make a Chat Model

In [11]:
# Huggincface Chat
model = ChatHuggingFace(llm=gpt_llm, verbose=True)
model

ChatHuggingFace(llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f3ec4a30140>, model_id='Qwen/Qwen3-4B-Instruct-2507'), tokenizer=Qwen2TokenizerFast(name_or_path='Qwen/Qwen3-4B-Instruct-2507', vocab_size=151643, model_max_length=1010000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: Add

## Result

In [12]:
prompts = f"Tell me about {model_id}"

results = model.invoke(prompts)
print(results)

content='<|im_start|>user\nTell me about Qwen/Qwen3-4B-Instruct-2507<|im_end|>\n<|im_start|>assistant\nRegarding the Qwen3-4B-Instruct-2507 model you mentioned, I need to clarify that there is no official release or documentation from Alibaba Cloud\'s Qwen team about a specific model named "Qwen3-4B-Instruct-2507". The Qwen series of large language models is primarily divided into several versions based on parameters and application scenarios, such as Qwen, Qwen1.5, Qwen2, and Qwen3. Each version has different parameter scales and features.\n\nFor example:\n- **Qwen3** is the latest version of the Qwen series, with models available in different parameter scales, such as 0.5B, 1.8B, 4B, 7B, 14B, and 72B.\n- The **4B** version refers to a model with approximately 4 billion parameters, which is suitable for scenarios requiring a balance between performance and resource consumption.\n' additional_kwargs={} response_metadata={} id='lc_run--019b061d-48f8-7143-977d-b790eb70fe0d-0'


In [14]:
results.content

'<|im_start|>user\nTell me about Qwen/Qwen3-4B-Instruct-2507<|im_end|>\n<|im_start|>assistant\nRegarding the Qwen3-4B-Instruct-2507 model you mentioned, I need to clarify that there is no official release or documentation from Alibaba Cloud\'s Qwen team about a specific model named "Qwen3-4B-Instruct-2507". The Qwen series of large language models is primarily divided into several versions based on parameters and application scenarios, such as Qwen, Qwen1.5, Qwen2, and Qwen3. Each version has different parameter scales and features.\n\nFor example:\n- **Qwen3** is the latest version of the Qwen series, with models available in different parameter scales, such as 0.5B, 1.8B, 4B, 7B, 14B, and 72B.\n- The **4B** version refers to a model with approximately 4 billion parameters, which is suitable for scenarios requiring a balance between performance and resource consumption.\n'

# Prompts Template

In [28]:
def assistant_answer(result: str):
  return result.split("<|im_start|>assistant")[-1].strip()


## Template Make

In [19]:
templates = PromptTemplate(
    template = """
You are a Chat Bot. Your name is {name}.
Your task is to give a short answer to the question: "{question}".
Give this answer within {length} words.
If it is not possible to give a result, then say "It is not possible to give an answer."
Only give the answer to the question; otherwise, say "Off topic."
""",
    input_variables = ['name', 'question', 'length'],
    validate_template = True
)

## Template Value Assigned

In [20]:
bot_name = "NAT"
question = "What is Your name? "
mesg_length = 5

In [22]:
prompts = templates.invoke({
    'name': bot_name,
    'question': question,
    'length': mesg_length
})

## Print Result

In [None]:
result2 = model.invoke(prompts)

In [25]:
result2.content

'<|im_start|>user\n\nYou are a Chat Bot. Your name is NAT.\nYour task is to give a short answer to the question: "What is Your name? ".\nGive this answer within 5 words.\nIf it is not possible to give a result, then say "It is not possible to give an answer."\nOnly give the answer to the question; otherwise, say "Off topic."\n<|im_end|>\n<|im_start|>assistant\nMy name is NAT.'

In [29]:
assistant_answer(result = results.content)

'Regarding the Qwen3-4B-Instruct-2507 model you mentioned, I need to clarify that there is no official release or documentation from Alibaba Cloud\'s Qwen team about a specific model named "Qwen3-4B-Instruct-2507". The Qwen series of large language models is primarily divided into several versions based on parameters and application scenarios, such as Qwen, Qwen1.5, Qwen2, and Qwen3. Each version has different parameter scales and features.\n\nFor example:\n- **Qwen3** is the latest version of the Qwen series, with models available in different parameter scales, such as 0.5B, 1.8B, 4B, 7B, 14B, and 72B.\n- The **4B** version refers to a model with approximately 4 billion parameters, which is suitable for scenarios requiring a balance between performance and resource consumption.'

# Save and Load Template

## Save Template

In [30]:
templates.save('template.json')

## Load Templates

In [36]:
new_template = load_prompt('template.json')
new_template

PromptTemplate(input_variables=['length', 'name', 'question'], input_types={}, partial_variables={}, template='\nYou are a Chat Bot. Your name is {name}.\nYour task is to give a short answer to the question: "{question}".\nGive this answer within {length} words.\nIf it is not possible to give a result, then say "It is not possible to give an answer."\nOnly give the answer to the question; otherwise, say "Off topic."\n', validate_template=True)

In [37]:
model.invoke("What is the Capital of Bangladesh?")

AIMessage(content='<|im_start|>user\nWhat is the Capital of Bangladesh?<|im_end|>\n<|im_start|>assistant\nThe capital of Bangladesh is Dhaka.', additional_kwargs={}, response_metadata={}, id='lc_run--019b0672-d8a5-74b2-b10d-5194d944c9ef-0')

In [38]:
%%time
model.invoke("What is the Capital of Bangladesh?")

CPU times: user 15 s, sys: 9.76 ms, total: 15 s
Wall time: 15.1 s


AIMessage(content='<|im_start|>user\nWhat is the Capital of Bangladesh?<|im_end|>\n<|im_start|>assistant\nThe capital of Bangladesh is Dhaka.', additional_kwargs={}, response_metadata={}, id='lc_run--019b0683-68c4-7c01-b386-927d5c4c8ea1-0')