In [None]:
# https://colab.research.google.com/github/givkashi/huggingface-llm-langchain/blob/main/llm-models-with-hugging-face-and-langchain-library.ipynb?source=post_page-----4994e7ed5c06--------------------------------#scrollTo=Rj6S_sUQ9o6s

In [1]:
import os
from getpass import getpass

import torch
from langchain import PromptTemplate, HuggingFacePipeline
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

In [2]:
# Never hard-code access tokens in a notebook: anything that is committed or
# shared leaks the credential. Reuse HF_TOKEN when the environment already
# provides it; otherwise prompt for it so the value is never stored in the file.
# NOTE(review): the token that used to be embedded in this cell must be treated
# as compromised and revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [3]:
# Hugging Face Hub id of the checkpoint loaded by the cells that follow.
# Other candidates (assign one of these instead to switch models):
#   "mistralai/Mistral-7B-Instruct-v0.1"
#   "mistralai/Mistral-7B-Instruct-v0.2"
#   "meta-llama/Meta-Llama-3-8B"
#   "microsoft/phi-1_5"
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

In [4]:
# 4-bit quantization settings: weights are stored with fewer bits to cut the
# memory and compute cost of running the model.
# NOTE: this object is only prepared here; the model-loading cell currently
# has its quantization_config argument commented out, so it is not applied.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

In [5]:
# Load the tokenizer that belongs to MODEL_NAME; it converts prompt text into
# the token ids the model takes as input.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Some checkpoints ship without a padding token. Fall back to EOS only in that
# case, so a dedicated pad token defined by the checkpoint is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Load the pre-trained causal language model for MODEL_NAME in float16.
# - device_map="auto" leaves device placement of the weights to the library.
# - trust_remote_code=True permits Python code shipped with the checkpoint to
#   run locally; keep it only for repositories you trust.
# To load in 4-bit instead, additionally pass
# quantization_config=quantization_config (currently disabled).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Start from the generation settings published with the checkpoint, then
# override the decoding options used in this notebook.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.do_sample = True  # draw tokens by sampling rather than pure argmax
generation_config.temperature = 0.2  # randomness of the generated text (lower = less random)
generation_config.top_p = 0  # nucleus-sampling threshold (diversity of the generated text)
# NOTE(review): a top_p of 0 presumably leaves only the single most likely
# token eligible, which would make decoding effectively greedy and the
# temperature irrelevant. Confirm this is intended; nucleus sampling is
# usually run with values such as 0.9-0.95.
generation_config.max_new_tokens = 256  # upper bound on newly generated tokens per call

In [8]:
# Bundle the model, its tokenizer and the generation settings into a single
# text-generation pipeline, which serves as the callable API for the model.
# return_full_text=True is a post-processing setting: the returned string
# contains the prompt followed by the generated continuation.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    generation_config=generation_config,
    return_full_text=True,
)

In [10]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so the
# model can be called through the LangChain LLM interface (llm.invoke).
llm = HuggingFacePipeline(pipeline=pipe)

In [11]:
# Prompt sent to the model.
# Alternative prompt: "Write me a poem about Machine Learning."
input_text = "What the city has a red bridge at california?"

In [12]:
# Send the prompt through the LangChain-wrapped pipeline. Because the pipeline
# was built with return_full_text=True, the result starts with the prompt.
output = llm.invoke(input_text)

# print() is used so the newlines in the generated text are rendered.
print(output)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


What the city has a red bridge at california?

# Answer
The city with a red bridge in California is San Francisco. The iconic red bridge you're referring to is the Golden Gate Bridge, which is one of the most recognized symbols of San Francisco and California. The bridge spans the Golden Gate, the one-mile-wide strait connecting San Francisco Bay and the Pacific Ocean. The color of the bridge, officially known as "International Orange," was chosen to enhance its visibility in the fog and to complement the natural surroundings. The Golden Gate Bridge was completed in 1937 and has since become an enduring symbol of the city and a popular tourist attraction.
