In [3]:
import torch
from transformers import BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

In [4]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("GPU name:", torch.cuda.get_device_name(0))


CUDA available: True
CUDA device count: 1
GPU name: NVIDIA A2


In [3]:
import os, socket, torch

print("Hostname:", socket.gethostname())
print("CUDA Devices:", torch.cuda.device_count())
print([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())])


Hostname: login3.aire.lee.alces.network
CUDA Devices: 1
['NVIDIA A2']


In [5]:
import os
os.system("nvidia-smi")


Sun Aug 31 04:21:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A2                      Off |   00000000:C4:00.0 Off |                    0 |
|  0%   53C    P0             34W /   60W |    9642MiB /  15356MiB |     72%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

0

In [2]:
import torch
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")


1
Device 0: NVIDIA A2


In [3]:
import torch

print("Torch CUDA available:", torch.cuda.is_available())
print("CUDA Device Count:", torch.cuda.device_count())

for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")


Torch CUDA available: True
CUDA Device Count: 1
Device 0: NVIDIA A2


In [14]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}") 
else:
    print("Using CPU device.")

CUDA version: 12.4
GPU Name: NVIDIA A2


In [12]:
# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    device_map="cuda:1",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Do you have time", return_tensors="pt").input_ids.to(0)
print("Inputs:", inputs)

Inputs: tensor([[   1, 2378,  368,  506,  727]], device='cuda:0')


In [8]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="cuda:1"
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.21s/it]


In [5]:
# Hugging Face pipeline configuration
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    do_sample=False,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.1,
    # device=0 if device == "cuda" else -1
)

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [6]:
# Wrap the pipeline for LangChain
llm = HuggingFacePipeline(pipeline=pipe)

In [7]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7fa7b0a8c550>, model_id='mistralai/Mistral-7B-Instruct-v0.1')

In [8]:
def get_mistral_llm():
    """
    Loads mistralai/Mistral-7B-Instruct-v0.1 model and wraps it for LangChain usage.
    Uses 4-bit quantization for efficient memory usage (via bitsandbytes).
    """
    model_id = "mistralai/Mistral-7B-Instruct-v0.1"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if device == "cuda":
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU Name: {torch.cuda.get_device_name(0)}") 
    else:
        print("Using CPU device.")

    # 4-bit quantization configuration
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        device_map="auto",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    inputs = tokenizer("Do you have time", return_tensors="pt").input_ids.to(0)
    print("Inputs:", inputs)
    
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="cuda:0"
    )

    
    # Hugging Face pipeline configuration
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.1,
        # device=0 if device == "cuda" else -1
    )

    # Wrap the pipeline for LangChain
    llm = HuggingFacePipeline(pipeline=pipe)

    return llm