### Clone & Build `llama.cpp` from source

In [None]:
# Clone llama.cpp into ./llama.cpp.
# The `test -d` guard skips the clone when the directory already exists, so
# re-running the notebook does not fail with "destination path already exists".
!test -d llama.cpp || git clone https://github.com/ggerganov/llama.cpp

In [None]:
# Configure and build llama.cpp out-of-source in llama.cpp/build.
# -S/-B let CMake create (or reuse) the build directory itself. A bare
# `mkdir llama.cpp/build && ...` fails on every re-run once the directory
# exists, which short-circuits the chain and silently skips the build.
!cmake -S llama.cpp -B llama.cpp/build && cmake --build llama.cpp/build --config Release

In [None]:
# Print llama-quantize's --help text to see the available quantization techniques.
# The binary lives under llama.cpp/build/bin, so the build cell must have run first.
!./llama.cpp/build/bin/llama-quantize --help

### Download the `LLM` from the Hugging Face Hub

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
from huggingface_hub import snapshot_download

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# None when HUGGINGFACEHUB_API_TOKEN is unset; the value is handed to
# snapshot_download unchanged.
# NOTE(review): if the repo is gated, a valid token with accepted terms is required - confirm.
access_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Model repository to fetch. Alternatives previously tried here:
# mistralai/Ministral-8B-Instruct-2410, mistralai/Mistral-7B-Instruct-v0.2
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Download relative to the current working directory: the conversion,
# quantization and inference cells all read ./mistral_models/... (cwd-relative),
# so saving under the home directory only works when the notebook runs from there.
dest_mistral_models_path = Path.cwd().joinpath('mistral_models', 'Mistral-7B-Instruct-v0.3')
dest_mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id=repo_id, repo_type="model", local_dir=dest_mistral_models_path, token=access_token)

### Conversion to `FP16` GGUF

In [None]:
# Convert the downloaded Hugging Face checkpoint to a single f16 GGUF file.
# `mkdir -p` creates the output directory first: no earlier cell creates
# mistral_models/quantized_models, and the later quantize cells write there too.
!mkdir -p ./mistral_models/quantized_models && python llama.cpp/convert_hf_to_gguf.py ./mistral_models/Mistral-7B-Instruct-v0.3/ --outtype f16 --outfile ./mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16.gguf

### 2-bit quantization

In [None]:
# Quantize the f16 GGUF to the Q2_K type (arguments: input file, output file, type).
# The `cd` means the model paths are given relative to llama.cpp/build/bin.
!cd llama.cpp/build/bin && ./llama-quantize ../../../mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16.gguf ../../../mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16_Q2_K.gguf Q2_K

### 4-bit quantization

In [None]:
# Quantize the f16 GGUF to the Q4_K_M type (arguments: input file, output file, type).
# The `cd` means the model paths are given relative to llama.cpp/build/bin.
# The output is written as Mistral-7B-Instruct-v0.3-f16_Q4_K_M.gguf.
!cd llama.cpp/build/bin && ./llama-quantize ../../../mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16.gguf ../../../mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16_Q4_K_M.gguf Q4_K_M

### Evaluation of quantized model

- #### Batched-bench

In [None]:
# Run llama-batched-bench on the f16 GGUF.
# -npp (prompt lengths) is given explicitly, using the values from the
# llama-batched-bench README example, so the sweep does not depend on the
# tool's default prompt-length list.
# NOTE(review): this benchmarks the f16 file although the section is about the
# quantized model - point -m at the *_Q4_K_M.gguf / *_Q2_K.gguf file if that is the intent.
!cd llama.cpp/build/bin && ./llama-batched-bench -m ../../../mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32

- #### Perplexity score

In [None]:
# Compute perplexity of the 4-bit model on the text file passed with -f.
# The model name must match what the 4-bit quantization cell writes
# (Mistral-7B-Instruct-v0.3-f16_Q4_K_M.gguf); the previous name without
# "-f16" does not exist on disk.
# NOTE(review): ../wiki.test.raw resolves to llama.cpp/build/wiki.test.raw and is
# not downloaded anywhere in this notebook - it must be placed there beforehand.
!cd llama.cpp/build/bin && ./llama-perplexity -m ../../../mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16_Q4_K_M.gguf -f ../wiki.test.raw

### Inference on quantized model

In [None]:
# Run the 4-bit model with llama-cli in conversation mode (-cnv) using the given prompt.
# The model name must match what the 4-bit quantization cell writes
# (Mistral-7B-Instruct-v0.3-f16_Q4_K_M.gguf); the previous name without
# "-f16" does not exist on disk.
!./llama.cpp/build/bin/llama-cli -m ./mistral_models/quantized_models/Mistral-7B-Instruct-v0.3-f16_Q4_K_M.gguf -cnv -p "Why self-attention needed in transformer?"

If you are looking for a Pythonic way to do the quantization, please read the following article.
- https://netraneupane.medium.com/how-to-run-llms-locally-1dfe39837178