In [9]:
import torch
from transformers import BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [10]:
# Checkpoint to pull from the Hugging Face Hub.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Start from the CPU and switch to the GPU only when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
    # Report the CUDA runtime and the first GPU for reproducibility notes.
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU device.")

CUDA version: 12.1
GPU Name: NVIDIA A2


In [11]:
# 4-bit quantization configuration (NF4 with double quantization, bfloat16 compute).
# Device placement is not a BitsAndBytesConfig option; it is selected with
# `device_map` when the model itself is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Tokenizer smoke test: move the ids to the detected device instead of a
# hard-coded GPU index so the cell also runs when CUDA is unavailable.
inputs = tokenizer("Do you have time", return_tensors="pt").input_ids.to(device)
print("Inputs:", inputs)

Inputs: tensor([[   1, 2378,  368,  506,  727]], device='cuda:0')


In [12]:
# Load the checkpoint using the 4-bit quantization settings in `bnb_config`.
# `trust_remote_code` is intentionally not passed: Mistral is an architecture
# built into transformers, so no repository-provided code needs to execute.
# Placement follows the detected `device` rather than a hard-coded GPU.
# NOTE(review): 4-bit bitsandbytes loading is expected to need a CUDA GPU -
# confirm CPU support for the installed bitsandbytes version.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="cuda:0" if device == "cuda" else "cpu",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.10s/it]


In [13]:
# Hugging Face pipeline configuration
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    do_sample=False,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.1,
    # device=0 if device == "cuda" else -1
)

Device set to use cuda:0


In [14]:
# Wrap the Hugging Face text-generation pipeline in LangChain's
# HuggingFacePipeline so it can be used from LangChain code.
llm = HuggingFacePipeline(pipeline=pipe)

In [15]:
# Display the wrapper's repr as a quick check that it was constructed.
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7fb8b41416d0>, model_id='mistralai/Mistral-7B-Instruct-v0.1')

In [None]:
def get_mistral_llm(
    model_id: str = "mistralai/Mistral-7B-Instruct-v0.1",
    max_new_tokens: int = 1024,
    temperature: float = 0.3,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> "HuggingFacePipeline":
    """
    Load a causal language model and wrap it for LangChain usage.

    By default this loads mistralai/Mistral-7B-Instruct-v0.1. The weights are
    loaded with 4-bit NF4 quantization (via bitsandbytes) to reduce memory
    usage, wrapped in a sampling text-generation pipeline, and returned as a
    LangChain ``HuggingFacePipeline``.

    Args:
        model_id: Hugging Face Hub id (or local path) of the checkpoint.
        max_new_tokens: Upper bound on the tokens generated per call.
        temperature: Sampling temperature passed to the pipeline.
        top_p: Nucleus-sampling probability mass passed to the pipeline.
        repetition_penalty: Repetition penalty passed to the pipeline.

    Returns:
        A ``HuggingFacePipeline`` wrapping the text-generation pipeline.

    Notes:
        Prints the CUDA version and GPU name when a GPU is visible, or a CPU
        notice otherwise. Calling with no arguments uses the same model and
        generation settings that were previously hard-coded.
    """
    has_cuda = torch.cuda.is_available()

    if has_cuda:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    else:
        print("Using CPU device.")

    # 4-bit quantization configuration. Device placement is not a
    # BitsAndBytesConfig option; it is chosen via `device_map` below.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Place the model according to the detected hardware instead of assuming
    # GPU 0 exists. `trust_remote_code` is not passed: Mistral is built into
    # transformers, so no repository-provided code needs to execute.
    # NOTE(review): 4-bit bitsandbytes loading is expected to need a CUDA GPU -
    # confirm CPU support for the installed bitsandbytes version.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="cuda:0" if has_cuda else "cpu",
    )

    # Hugging Face pipeline configuration (sampling enabled, so temperature
    # and top_p take effect). The model is already on its device, so no
    # `device` argument is passed.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # Wrap the pipeline for LangChain.
    return HuggingFacePipeline(pipeline=pipe)