### In general:

```
Use CuBLAS if you have CUDA and an NVidia GPU
```

```
Use METAL if you are running on an M1/M2 MacBook
```
```
Use CLBLAST if you are running on an AMD/Intel GPU
```

Repo - https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal

# Method 1 (using llama_cpp)

In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python #for cuda
!pip install huggingface_hub
!pip install -q -U transformers

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.57.tar.gz (36.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.9/36.9 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.57-cp310-cp310-manylinux_2_35_x86_64.whl size=26426133 sha256=8baa07ec2dd3dfb1bdd8675957080b4971c150f7728ae67e19734bfe2487cb85
  Stored 

In [None]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoTokenizer

In [None]:
import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [None]:

model_name = "mirajbhandari/gemma-2b-it-GGUF-quantizedd"
model_file = "Q4_K_M.gguf"

model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content',
                             )
print("My model path: ", model_path)


Q4_K_M.gguf:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

My model path:  /content/Q4_K_M.gguf


In [None]:
llm = Llama(model_path=model_path, n_gpu_layers=-1)

llama_model_loader: loaded meta data with 23 key-value pairs and 164 tensors from /content/Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = original_model
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          gemma.block_count u32              = 18
llama_model_loader: - kv   5:                  gemma.feed_forward_length u32              = 16384
llama_model_loader: - kv   6:                 gemma.attention.head_count u32              = 8
llama_model_loader: - kv   7:              gemma.attention.head_count_kv u32          

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") #originalmodel name for chattemplate only

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

In [None]:
user_question = "give me contact and email of the yungri?"

que =[{"role": "user", "content": user_question}]

formatted_question=tokenizer.apply_chat_template(que, tokenize=False)

print(formatted_question)

<bos><start_of_turn>user
give me contact and email of the yungri?<end_of_turn>



In [None]:

response = llm(formatted_question, max_tokens=50,stop=["<end_of_turn>"]) #yo aaye paxi furher hunna stop=["<end_of_turn>"]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     441.03 ms
llama_print_timings:      sample time =     172.27 ms /    30 runs   (    5.74 ms per token,   174.14 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    1326.85 ms /    30 runs   (   44.23 ms per token,    22.61 tokens per second)
llama_print_timings:       total time =    2695.88 ms /    31 tokens


In [None]:
response['choices'][0]['text']

'<start_of_turn>model\nThe contact and email address for Yungri are 990099 and yungri@gmail.com, respectively.'

#  Method 2 (using llama-index)

### https://colab.research.google.com/github/jerryjliu/llama_index/blob/main/docs/examples/llm/llama_2_llama_cpp.ipynb

In [3]:

# run this if you already installed on cpu . by running this it will ready for gpu
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

#docs - https://python.langchain.com/docs/integrations/llms/llamacpp

In [1]:
!pip install -q -U transformers
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python #for cuda
!pip install llama-index



In [3]:
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

In [4]:
model_url='https://huggingface.co/mirajbhandari/gemma-2b-it-GGUF-quantizedd/resolve/main/Q4_K_M.gguf?download=true'

In [5]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    model_kwargs={"n_gpu_layers": -1} #-1 for using all gpu layers

)

llama_model_loader: loaded meta data with 23 key-value pairs and 164 tensors from /tmp/llama_index/models/Q4_K_M.gguf?download=true (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = original_model
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          gemma.block_count u32              = 18
llama_model_loader: - kv   5:                  gemma.feed_forward_length u32              = 16384
llama_model_loader: - kv   6:                 gemma.attention.head_count u32              = 8
llama_model_loader: - kv   7:              gemma.attentio

In [6]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") #originalmodel name for chattemplate only

In [7]:
user_question = "give me contact and email of the yungri?"

que =[{"role": "user", "content": user_question}]

formatted_question=tokenizer.apply_chat_template(que, tokenize=False)

print(formatted_question)

<bos><start_of_turn>user
give me contact and email of the yungri?<end_of_turn>



In [15]:
response = llm.complete(formatted_question,max_tokens=50)
generated_text=response.text

stop_word="<end_of_turn>"
if stop_word in generated_text:
  # Truncate the text at the stop word position
  stop_index = generated_text.find(stop_word)
  truncated_text = generated_text[:stop_index]
  print(truncated_text)
else:
  print(generated_text)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     392.53 ms
llama_print_timings:      sample time =    1071.85 ms /   256 runs   (    4.19 ms per token,   238.84 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   10746.99 ms /   256 runs   (   41.98 ms per token,    23.82 tokens per second)
llama_print_timings:       total time =   19329.78 ms /   257 tokens


<start_of_turn>model
The contact number for Yungri is 990099, and their email is yungri@gmail.com.


#  Method 3 (using Langchain)

In [1]:
!pip install -q -U transformers
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python #for cuda
!pip install langchain
!pip install huggingface_hub



In [2]:
from langchain_community.llms import LlamaCpp
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download

In [3]:
model_name = "mirajbhandari/gemma-2b-it-GGUF-quantizedd"
model_file = "Q4_K_M.gguf"

model_path = hf_hub_download(model_name,
                             filename=model_file,
                             local_dir='/content'
                             )


In [5]:
print(model_path )

/content/Q4_K_M.gguf


In [7]:
#langchain LlamaCpp this not llama index one
# we cant download directly here . we need to give path
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    n_gpu_layers=-1,
    top_p=1
)

                max_new_tokens was transferred to model_kwargs.
                Please confirm that max_new_tokens is what you intended.
                context_window was transferred to model_kwargs.
                Please confirm that context_window is what you intended.
llama_model_loader: loaded meta data with 23 key-value pairs and 164 tensors from /content/Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = original_model
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          gemma.block_count u32              = 18
llam

In [8]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")

In [9]:
user_question = "give me contact and email of the yungri?"

que =[{"role": "user", "content": user_question}]

formatted_question=tokenizer.apply_chat_template(que, tokenize=False)

print(formatted_question)

<bos><start_of_turn>user
give me contact and email of the yungri?<end_of_turn>



In [19]:
response=llm.invoke(formatted_question,stop=['<end_of_turn>'])
print(response)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     394.73 ms
llama_print_timings:      sample time =     201.94 ms /    31 runs   (    6.51 ms per token,   153.51 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    1374.28 ms /    31 runs   (   44.33 ms per token,    22.56 tokens per second)
llama_print_timings:       total time =    3131.89 ms /    32 tokens


<start_of_turn>model
The contact number for Yungri is 990099, and their email address is yungri@gmail.com.
