In [1]:
import time
import torch
from vllm import LLM, SamplingParams
from datasets import load_dataset
from evaluate import load
from huggingface_hub import login

# CUDA setup
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
def check_cuda():
    """Report whether CUDA is usable and print basic statistics for GPU 0.

    Returns:
        bool: True when ``torch.cuda.is_available()`` is true, otherwise False.
    """
    # Guard clause: without CUDA there is nothing further to report.
    if not torch.cuda.is_available():
        print("CUDA is not available. Using CPU.")
        return False

    gpu_index = 0
    print("CUDA is available!")

    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"Using GPU: {gpu_name}")

    # Byte counts are reported in decimal gigabytes (divided by 1e9).
    total_gb = torch.cuda.get_device_properties(gpu_index).total_memory / 1e9
    print(f"Total GPU memory: {total_gb:.2f} GB")

    allocated_gb = torch.cuda.memory_allocated(gpu_index) / 1e9
    print(f"Allocated GPU memory: {allocated_gb:.2f} GB")

    reserved_gb = torch.cuda.memory_reserved(gpu_index) / 1e9
    print(f"Cached GPU memory: {reserved_gb:.2f} GB")

    return True

# Run the CUDA check once; as the cell's last expression, the boolean
# return value is also displayed as the cell output.
check_cuda()


CUDA is available!
Using GPU: NVIDIA GeForce RTX 4080 Laptop GPU
Total GPU memory: 12.48 GB
Allocated GPU memory: 0.00 GB
Cached GPU memory: 0.00 GB


True

In [3]:
# Cell 3: Model loading function (updated)
def load_model(model_name, gpu_memory_utilization=0.6, max_num_batched_tokens=4096, use_cpu=False, **engine_kwargs):
    """Create a vLLM ``LLM`` engine on the GPU when possible, otherwise on the CPU.

    Args:
        model_name: Model identifier or local path passed to ``LLM(model=...)``.
        gpu_memory_utilization: Forwarded to ``LLM`` on the GPU path only.
        max_num_batched_tokens: Forwarded to ``LLM`` on the GPU path only.
        use_cpu: When True, take the CPU path even if CUDA is available.
        **engine_kwargs: Extra keyword arguments forwarded unchanged to ``LLM``
            (for example ``max_model_len`` or ``max_num_seqs``).

    Returns:
        The constructed ``LLM`` instance.
    """
    # login()

    # check_cuda() is evaluated first, as before, so the GPU report is printed
    # on every call regardless of ``use_cpu``.
    if check_cuda() and not use_cpu:
        return LLM(
            model=model_name,
            trust_remote_code=True,
            gpu_memory_utilization=gpu_memory_utilization,
            max_num_batched_tokens=max_num_batched_tokens,
            **engine_kwargs,
        )

    # vLLM selects its backend through the ``device`` engine argument;
    # ``cpu_only`` is not an engine argument, so the original call could not
    # construct the engine. ``device`` is forced last so it always wins.
    # NOTE(review): the CPU backend presumably requires a CPU-enabled vLLM
    # build - confirm for the installed version.
    cpu_kwargs = {**engine_kwargs, "device": "cpu"}
    return LLM(model=model_name, trust_remote_code=True, **cpu_kwargs)

In [4]:
# Cell 4: Inference function
def run_inference(llm, prompt, max_tokens=100, temperature=0.7, top_p=0.95):
    """Generate one completion for a single prompt and return its text.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)``, such as
            the engine returned by ``load_model``.
        prompt: Prompt string; it is sent as a one-element batch.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature (default 0.7, the previous
            hard-coded value). Pass 0 when a benchmark needs repeatable output.
        top_p: Nucleus-sampling threshold (default 0.95, the previous
            hard-coded value).

    Returns:
        str: Text of the first candidate of the first (only) request output.
    """
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    request_outputs = llm.generate([prompt], sampling_params)
    # One prompt in -> one request output; take its first candidate completion.
    return request_outputs[0].outputs[0].text

In [5]:
# Cell 5: Benchmark data loading function
def load_benchmark_data(dataset_name, split="test", num_samples=10):
    """Load one split of a dataset and keep only its leading rows.

    Args:
        dataset_name: Name handed to ``load_dataset``.
        split: Split to load (defaults to ``"test"``).
        num_samples: Maximum number of rows to keep from the start of the split.

    Returns:
        The result of ``select`` over the first
        ``min(num_samples, len(split))`` row indices.
    """
    full_split = load_dataset(dataset_name, split=split)
    # Never request more rows than the split actually contains.
    row_count = min(num_samples, len(full_split))
    leading_rows = range(row_count)
    return full_split.select(leading_rows)

In [6]:
# Cell 6: Benchmark function
def run_benchmark(llm, dataset, metric_name="rouge", score_key=None):
    """Time generation over a dataset and average one metric score.

    Each item must provide ``item["prompt"]`` and ``item["target"]``. Only the
    ``run_inference`` call is timed; metric computation is excluded.

    Args:
        llm: Engine passed through to ``run_inference``.
        dataset: Iterable of items with ``"prompt"`` and ``"target"`` entries.
        metric_name: Name given to ``load`` to obtain the metric.
        score_key: Key to read from each ``metric.compute`` result. When None,
            ``metric_name`` is used if present in the result; otherwise the
            mean of the numeric entries whose key starts with ``metric_name``
            is used (ROUGE, for example, reports ``rouge1``/``rouge2``/
            ``rougeL``/``rougeLsum`` and no ``rouge`` key).

    Returns:
        dict: ``{"average_time": float, f"average_{metric_name}_score": float}``.
        Both averages are 0.0 when the dataset yields no items.

    Raises:
        KeyError: If no usable score can be found in a metric result.
    """
    metric = load(metric_name)
    total_time = 0.0
    scores = []

    for item in dataset:
        prompt = item["prompt"]
        reference = item["target"]

        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments.
        started = time.perf_counter()
        generated = run_inference(llm, prompt)
        total_time += time.perf_counter() - started

        result = metric.compute(predictions=[generated], references=[reference])
        scores.append(_extract_metric_score(result, metric_name, score_key))

    # Count what was actually processed instead of calling len(dataset), and
    # avoid dividing by zero for an empty dataset.
    num_items = len(scores)
    if num_items == 0:
        avg_time = 0.0
        avg_score = 0.0
    else:
        avg_time = total_time / num_items
        avg_score = sum(scores) / num_items

    return {
        "average_time": avg_time,
        f"average_{metric_name}_score": avg_score
    }


def _extract_metric_score(result, metric_name, score_key=None):
    """Pick a single numeric score out of one ``metric.compute`` result dict."""
    if score_key is not None:
        return result[score_key]
    if metric_name in result:
        return result[metric_name]

    # No key equals the metric name: average the metric's numeric sub-scores.
    sub_scores = [
        value
        for key, value in result.items()
        if str(key).startswith(metric_name)
        and isinstance(value, (int, float))
        and not isinstance(value, bool)
    ]
    if not sub_scores:
        raise KeyError(
            f"No numeric score found for metric '{metric_name}' in result keys "
            f"{sorted(map(str, result))}; pass score_key explicitly."
        )
    return sum(sub_scores) / len(sub_scores)

Load LLM

In [9]:
from unsloth import FastLanguageModel
import torch
# Settings consumed by the FastLanguageModel.from_pretrained call in the next cell.
max_seq_length = 2048 # Maximum sequence length; any value may be chosen (original note: RoPE scaling is supported automatically).
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [10]:
# Load the Unsloth Llama 3.1 8B checkpoint together with its tokenizer, using
# the max_seq_length / dtype / load_in_4bit settings defined in the previous cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4080 Laptop GPU. Max memory: 11.625 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




In [11]:
# Inspect the concrete class of the object returned by from_pretrained.
type(model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [10]:
# tokenizer.save_pretrained("/home/miam/Experiments/llama_playground/unsloth_tokenizer")

('/home/miam/Experiments/llama_playground/unsloth_tokenizer/tokenizer_config.json',
 '/home/miam/Experiments/llama_playground/unsloth_tokenizer/special_tokens_map.json',
 '/home/miam/Experiments/llama_playground/unsloth_tokenizer/tokenizer.json')

In [12]:
# Save the model merged to 16-bit weights, together with the tokenizer, so
# that vLLM can load it. "model" is presumably the output directory relative
# to the working directory - TODO confirm it matches BASE_MODEL_LOC used later.
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 13.28 out of 30.98 RAM for saving.


  0%|          | 0/32 [00:00<?, ?it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:08<00:00,  3.86it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


vLLM

In [2]:
# Locations of the merged 16-bit model and the saved tokenizer. The defaults
# are the original machine-specific paths; set LLAMA_MODEL_DIR and
# LLAMA_TOKENIZER_DIR to relocate them without editing the notebook.
# Relies on the `import os` in the first cell.
BASE_MODEL_LOC = os.environ.get("LLAMA_MODEL_DIR", '/home/miam/Experiments/llama_playground/model')
TOKENIZER_LOC = os.environ.get("LLAMA_TOKENIZER_DIR", '/home/miam/Experiments/llama_playground/unsloth_tokenizer')

In [6]:
# Release cached GPU memory held by PyTorch's allocator before loading the vLLM engine.
# NOTE(review): this should not free memory still referenced by live objects
# (e.g. a model loaded earlier in the same kernel) - confirm against the PyTorch docs.
torch.cuda.empty_cache()  # Clear any cached memory

In [10]:
# Load the merged checkpoint with vLLM using default engine settings.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError;
# its log shows dtype=torch.bfloat16 and max_seq_len=131072 on a GPU with
# 11.63 GiB total capacity. Presumably the 16-bit 8B weights alone do not
# fit - confirm, and consider a quantized checkpoint or a smaller model.
llm = LLM(
    model=BASE_MODEL_LOC
)

INFO 08-20 17:17:27 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 08-20 17:17:27 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/home/miam/Experiments/llama_playground/model', speculative_config=None, tokenizer='/home/miam/Experiments/llama_playground/model', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/home/miam/Experiments/llama_playground/model, use_v2_block_manager=False, enable_

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 11.63 GiB of which 145.81 MiB is free. Including non-PyTorch memory, this process has 11.46 GiB memory in use. Of the allocated memory 11.24 GiB is allocated by PyTorch, and 7.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [14]:
# Single experiment: shrink the vLLM per-step token budget to try to fit in GPU memory.
# NOTE(review): the recorded run of this cell still ended in a CUDA
# OutOfMemoryError, so lowering max_num_batched_tokens alone was not enough.
config = {
    "name": "Reduced batch size settings",
    "params": {
        "max_num_batched_tokens": 2,  # Reduced from 4096
        # "dtype": "float16",  # Use half-precision to save memory
        # "quantization": {
        #     "mode": "int8"  # Enable int8 quantization
        # }
    }
}

# Load the model with the updated configuration
# NOTE(review): `model_name` is not defined in any cell visible here, so this
# fails on a fresh kernel unless it is defined elsewhere - confirm. The
# recorded log shows model='meta-llama/Meta-Llama-3.1-8B'.
llm = load_model(model_name, **config['params'])

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

CUDA is available!
Using GPU: NVIDIA GeForce RTX 4080 Laptop GPU
Total GPU memory: 12.48 GB
Allocated GPU memory: 1.17 GB
Cached GPU memory: 1.22 GB
INFO 08-20 15:29:23 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=2.
INFO 08-20 15:29:23 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3.1-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=Non

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 11.63 GiB of which 65.81 MiB is free. Including non-PyTorch memory, this process has 11.54 GiB memory in use. Of the allocated memory 11.30 GiB is allocated by PyTorch, and 29.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [35]:
# Try loading with different configurations
# Candidate load configurations, tried in order until one succeeds.
# NOTE(review): `model_name` is not defined in any cell visible here - confirm
# it is set before this cell runs.
configurations = [
    {"name": "Default settings", "params": {"max_num_batched_tokens": 4096}},
    # {"name": "Reduced memory usage", "params": {"gpu_memory_utilization": 0.4, "max_num_batched_tokens": 4096}},
    # {"name": "Minimum settings", "params": {"gpu_memory_utilization": 0.3, "max_num_batched_tokens": 2048, "max_num_seqs": 1}},
    # {"name": "CPU-only mode", "params": {"use_cpu": True}}
]

# Start from a known state: without this, a run in which every attempt fails
# leaves `llm` undefined or bound to a stale engine from an earlier cell.
llm = None
for config in configurations:
    try:
        print(f"Attempting to load model with {config['name']}...")
        llm = load_model(model_name, **config['params'])
        print(f"Successfully loaded model with {config['name']}.")
        break
    except Exception as e:
        # Best effort: report the failure and fall through to the next candidate.
        print(f"Error loading model with {config['name']}: {e}")
else:
    # Reached only when no attempt hit `break`, i.e. nothing loaded.
    print("Failed to load the model with any configuration.")


# if llm is None:
#     print("Failed to load the model with any configuration.")
# else:
#     # Print current GPU memory usage
#     if torch.cuda.is_available():
#         print(f"Current GPU memory usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

#     # Test the model with a simple prompt
#     test_prompt = "Translate the following English text to French: 'Hello, how are you?'"
#     try:
#         test_output = run_inference(llm, test_prompt)
#         print(f"Test output: {test_output}")
#     except Exception as e:
#         print(f"Error during test inference: {e}")

Attempting to load model with Default settings...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

CUDA is available!
Using GPU: NVIDIA GeForce RTX 4080 Laptop GPU
Total GPU memory: 12.48 GB
Allocated GPU memory: 1.17 GB
Cached GPU memory: 1.24 GB
INFO 08-20 15:20:50 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=4096.
INFO 08-20 15:20:50 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3.1-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=

In [32]:
# Print PyTorch's full (non-abbreviated) CUDA memory summary; device=None
# leaves the device choice to PyTorch (the recorded output reports device ID 0).
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 10           |        cudaMalloc retries: 10        |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   1115 MiB |  11573 MiB | 110648 MiB | 109533 MiB |
|       from large pool |   1114 MiB |  11572 MiB | 110640 MiB | 109526 MiB |
|       from small pool |      1 MiB |      1 MiB |      8 MiB |      7 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   1115 MiB |  11573 MiB | 110648 MiB | 109533 MiB |
|       from large pool |   1114 MiB |  11572 MiB | 110640 MiB | 109526 MiB |
|       from small pool |      1 MiB |      1 MiB |      8 MiB |      7 MiB |
|---------------------------------------------------------------

In [15]:
# Cell 8: Function to get model's maximum sequence length
def get_model_max_length(model_name):
    """Return ``max_position_embeddings`` from the model's ``AutoConfig``.

    Args:
        model_name: Identifier or path accepted by ``AutoConfig.from_pretrained``.

    Returns:
        The ``max_position_embeddings`` attribute of the loaded configuration.
    """
    # Imported lazily, as in the original cell: transformers is only needed here.
    from transformers import AutoConfig

    return AutoConfig.from_pretrained(model_name).max_position_embeddings

In [16]:
# Print the model's maximum sequence length
# NOTE(review): `model_name` is not defined in any cell visible here; the
# recorded output names meta-llama/Llama-2-7b-hf while the vLLM logs above
# name Meta-Llama-3.1-8B, so the value depended on execution order - confirm.
print(f"Maximum sequence length for {model_name}: {get_model_max_length(model_name)}")

Maximum sequence length for meta-llama/Llama-2-7b-hf: 4096


Benchmarking