In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# The version specifier must be quoted: unquoted, the shell treats `>0.37.0` as an
# output redirection, so pip's output lands in a file named `0.37.0` and the
# version constraint on bitsandbytes is silently dropped.
%pip install transformers accelerate "bitsandbytes>0.37.0"
%pip install torch

In [None]:
import torch

# Prefer the first CUDA device when one is visible to PyTorch; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Model identifier handed to `from_pretrained` for both the tokenizer and the model
# (looks like a Hugging Face Hub repo id).
# NOTE(review): this checkpoint presumably requires an authenticated Hub account
# with access granted — confirm before running on a fresh machine.
model_file = "meta-llama/Meta-Llama-3-8B"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Ask bitsandbytes to quantize the weights to 4-bit while they are being loaded.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_file)

# Place the whole model on GPU 0 at load time through `device_map` instead of calling
# `.to("cuda")` afterwards: moving a bitsandbytes-quantized model with `.to()` is
# rejected by Transformers on many version combinations, and is redundant otherwise
# because quantized weights are already put on the GPU during loading.
model = AutoModelForCausalLM.from_pretrained(
    model_file,
    quantization_config=bnb_config,
    torch_dtype=torch.float32,
    device_map={"": 0},
)

In [None]:
# Ask the CUDA driver for the free and total memory (in bytes) of the selected device.
free, total = torch.cuda.mem_get_info(device)

# Bytes currently in use on the device, expressed in MB and then in GB.
bytes_per_mb = 1024 ** 2
mem_used_MB = (total - free) / bytes_per_mb
mem_used_gb = mem_used_MB / 1024
print(f"GPU memory utilization : {mem_used_gb} GB")

In [None]:
# Input text that gets tokenized and passed to `model.generate` in the generation cell.
prompt = "I am suffering from flu, give me home remedies?"

In [None]:
import time

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()

# Tokenize the prompt and move the resulting tensors to whichever device the model
# lives on, rather than assuming a hard-coded "cuda".
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
input_ids = inputs.input_ids

# Generate a continuation. The attention mask is passed explicitly so generate() does
# not have to guess which positions are real tokens.
# max_length caps the total sequence (prompt tokens + generated tokens); adjust it as
# necessary for your use case.
generated_tokens = model.generate(
    input_ids,
    attention_mask=inputs.attention_mask,
    max_length=50,
)

# Decode the first (only) generated sequence back into readable text.
generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
t2 = time.perf_counter()
print(generated_text)
print(f"Took around {t2-t1} seconds.")