### In general:

```
Use CuBLAS if you have CUDA and an NVidia GPU
```

```
Use METAL if you are running on an M1/M2 MacBook
```
```
Use CLBLAST if you are running on an AMD/Intel GPU
```

Repo - https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal

# Method 1 (using llama_cpp)

In [1]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python #for cuda
!pip install huggingface_hub
!pip install -q -U transformers

In [None]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoTokenizer

In [None]:
import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [None]:

model_name = "mirajbhandari/gemma-2b-it-GGUF-quantizedd"
model_file = "Q4_K_M.gguf"

model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content',
                             )
print("My model path: ", model_path)


Q4_K_M.gguf:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

My model path:  /content/Q4_K_M.gguf


In [2]:
llm = Llama(model_path=model_path, n_gpu_layers=-1)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") #originalmodel name for chattemplate only

In [None]:
user_question = "give me contact and email of the yungri?"

que =[{"role": "user", "content": user_question}]

formatted_question=tokenizer.apply_chat_template(que, tokenize=False)

print(formatted_question)

<bos><start_of_turn>user
give me contact and email of the yungri?<end_of_turn>



In [None]:

response = llm(formatted_question, max_tokens=50,stop=["<end_of_turn>"]) #yo aaye paxi furher hunna stop=["<end_of_turn>"]

response['choices'][0]['text']

'<start_of_turn>model\nThe contact and email address for Yungri are 990099 and yungri@gmail.com, respectively.'

#  Method 2 (using llama-index)

### https://colab.research.google.com/github/jerryjliu/llama_index/blob/main/docs/examples/llm/llama_2_llama_cpp.ipynb

In [3]:

# run this if you already installed on cpu . by running this it will ready for gpu
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

#docs - https://python.langchain.com/docs/integrations/llms/llamacpp

In [4]:
!pip install -q -U transformers
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python #for cuda
!pip install llama-index

In [3]:
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

In [4]:
model_url='https://huggingface.co/mirajbhandari/gemma-2b-it-GGUF-quantizedd/resolve/main/Q4_K_M.gguf?download=true'

In [5]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    model_kwargs={"n_gpu_layers": -1} #-1 for using all gpu layers

)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") #originalmodel name for chattemplate only

In [7]:
user_question = "give me contact and email of the yungri?"

que =[{"role": "user", "content": user_question}]

formatted_question=tokenizer.apply_chat_template(que, tokenize=False)

print(formatted_question)

<bos><start_of_turn>user
give me contact and email of the yungri?<end_of_turn>



In [6]:
response = llm.complete(formatted_question,max_tokens=50)
generated_text=response.text

stop_word="<end_of_turn>"
if stop_word in generated_text:
  # Truncate the text at the stop word position
  stop_index = generated_text.find(stop_word)
  truncated_text = generated_text[:stop_index]
  print(truncated_text)
else:
  print(generated_text)

#  Method 3 (using Langchain)

In [7]:
!pip install -q -U transformers
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python #for cuda
!pip install langchain
!pip install huggingface_hub

In [2]:
from langchain_community.llms import LlamaCpp
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download

In [3]:
model_name = "mirajbhandari/gemma-2b-it-GGUF-quantizedd"
model_file = "Q4_K_M.gguf"

model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content'
                             )


In [5]:
print(model_path )

/content/Q4_K_M.gguf


In [8]:
#langchain LlamaCpp this not llama index one
# we cant download directly here . we need to give path
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    n_gpu_layers=-1,
    top_p=1
)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")

In [9]:
user_question = "give me contact and email of the yungri?"

que =[{"role": "user", "content": user_question}]

formatted_question=tokenizer.apply_chat_template(que, tokenize=False)

print(formatted_question)

<bos><start_of_turn>user
give me contact and email of the yungri?<end_of_turn>



In [19]:
response=llm.invoke(formatted_question,stop=['<end_of_turn>'])
print(response)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     394.73 ms
llama_print_timings:      sample time =     201.94 ms /    31 runs   (    6.51 ms per token,   153.51 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    1374.28 ms /    31 runs   (   44.33 ms per token,    22.56 tokens per second)
llama_print_timings:       total time =    3131.89 ms /    32 tokens


<start_of_turn>model
The contact number for Yungri is 990099, and their email address is yungri@gmail.com.
