In [1]:
import sys

In [2]:
import os

# Location of the project's `src` directory, which holds the `my_rag` package
# that this notebook imports. The path can be overridden per machine by setting
# the MY_RAG_SRC_PATH environment variable; when it is unset (or empty) the
# original hard-coded location is used, so existing setups keep working.
module_path = os.environ.get("MY_RAG_SRC_PATH") or (
    "/home/ubuntu/Multi-Agent-LLM-System-with-LangGraph-RAG-and-LangChain/src"
)
# Guard against duplicates so that re-running this cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import torch
import gc
from my_rag.components.embeddings.huggingface_embedding import HuggingFaceEmbedding
from my_rag.components.llms.huggingface_llm import HuggingFaceLLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Benchmark: GPU memory footprint and parameter count of each embedding model.
#
# Each entry gives the Hugging Face model id under "name" and, optionally,
# extra keyword arguments under "kwargs" that are forwarded unchanged to
# HuggingFaceEmbedding.
embedding_model_configs = [
    {
        "name": "mixedbread-ai/mxbai-embed-large-v1",
    },
    {
        "name": "dunzhang/stella_en_1.5B_v5",
        "kwargs": {"trust_remote_code": True, "load_in_8bit": True},
    },
    {
        "name": "dunzhang/stella_en_1.5B_v5",
        "kwargs": {
            "trust_remote_code": True,
        },
    },
    {
        "name": "nvidia/NV-Embed-v2",
        "kwargs": {
            "trust_remote_code": True,
            "load_in_8bit": True,
            "max_length": 32768,
        },
    },
    {
        "name": "sentence-transformers/all-MiniLM-L6-v2",
    },
]

for model_config in embedding_model_configs:
    model_name = model_config["name"]
    model_kwargs = model_config.get("kwargs", {})

    # Drop the wrapper left over from the previous iteration *before* taking the
    # baseline. Otherwise it stays referenced until the new model has been
    # built, and any memory it still holds could be released inside the
    # measurement window and distort the delta.
    embedding_model = None
    # Collect first so that tensors freed by the collector can be handed back
    # to the driver by empty_cache() in the same pass.
    gc.collect()
    torch.cuda.empty_cache()
    # memory_allocated() reports bytes; divide by 1024**3 to express it in GB.
    initial_gpu_memory = torch.cuda.memory_allocated() / 1024**3

    try:
        embedding_model = HuggingFaceEmbedding(model_name=model_name, **model_kwargs)
        total_params = sum(p.numel() for p in embedding_model.model.parameters())
        total_params_millions = total_params / 1e6
        final_gpu_memory = torch.cuda.memory_allocated() / 1024**3
        gpu_memory_used = final_gpu_memory - initial_gpu_memory
    finally:
        # Always release the model, even when loading or measuring fails, so a
        # failed run does not leave it resident in the live kernel.
        if embedding_model is not None:
            embedding_model.clean_up()
        gc.collect()
        torch.cuda.empty_cache()

    print("-" * 200)
    print(
        f"{model_name=}  {'loaded in 8 bit' if model_kwargs.get('load_in_8bit') else ''} "
        f"GPU Memory Used={gpu_memory_used:.2f} GB Total Parameters={total_params_millions:.2f} Million"
    )
    print("-" * 200)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
model_name='mixedbread-ai/mxbai-embed-large-v1'   GPU Memory Used=1.25 GB Total Parameters=335.14 Million
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
model_name='dunzhang/stella_en_1.5B_v5'  loaded in 8 bit GPU Memory Used=3.48 GB Total Parameters=1543.27 Million
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
model_name='dunzhang/stella_en_1.5B_v5'   GPU Memory Used=9.25 GB Total Parameters=1543.27 Million
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [01:05<00:00, 16.46s/it]


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
model_name='nvidia/NV-Embed-v2'  loaded in 8 bit GPU Memory Used=7.44 GB Total Parameters=7851.02 Million
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
model_name='sentence-transformers/all-MiniLM-L6-v2'   GPU Memory Used=0.08 GB Total Parameters=22.71 Million
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Benchmark: GPU memory footprint and parameter count of each LLM.
#
# Each entry gives the Hugging Face model id under "name" and, optionally,
# extra keyword arguments under "kwargs" that are forwarded unchanged to
# HuggingFaceLLM.
llm_model_configs = [
    {
        "name": "meta-llama/Meta-Llama-3-8B-Instruct",
        "kwargs": {
            "trust_remote_code": True,
            "load_in_8bit": True,
        },
    },
]

for model_config in llm_model_configs:
    model_name = model_config["name"]
    model_kwargs = model_config.get("kwargs", {})

    # Drop the wrapper left over from a previous iteration (or a previous run of
    # this cell) *before* taking the baseline, so that memory it may still hold
    # is not released inside the measurement window.
    llm_model = None
    # Collect first so that tensors freed by the collector can be handed back
    # to the driver by empty_cache() in the same pass.
    gc.collect()
    torch.cuda.empty_cache()
    # memory_allocated() reports bytes; divide by 1024**3 to express it in GB.
    initial_gpu_memory = torch.cuda.memory_allocated() / 1024**3

    try:
        llm_model = HuggingFaceLLM(model_name=model_name, **model_kwargs)
        total_params = sum(p.numel() for p in llm_model.model.parameters())
        total_params_millions = total_params / 1e6
        final_gpu_memory = torch.cuda.memory_allocated() / 1024**3
        gpu_memory_used = final_gpu_memory - initial_gpu_memory
    finally:
        # Always release the model, even when loading or measuring fails, so a
        # failed run does not leave it resident in the live kernel.
        if llm_model is not None:
            llm_model.clean_up()
        gc.collect()
        torch.cuda.empty_cache()

    print("-" * 200)
    print(
        f"{model_name=}  {'loaded in 8 bit' if model_kwargs.get('load_in_8bit') else ''} "
        f"GPU Memory Used={gpu_memory_used:.2f} GB Total Parameters={total_params_millions:.2f} Million"
    )
    print("-" * 200)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [01:00<00:00, 15.09s/it]


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
model_name='meta-llama/Meta-Llama-3-8B-Instruct'  loaded in 8 bit GPU Memory Used=10.42 GB Total Parameters=8030.26 Million
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
