# Fine-tuned Model Quantization for Faster Inference

In [None]:
# Clone the llama.cpp repository; with no target given, git places it in ./llama.cpp,
# which is the directory the following cells refer to.
!git clone https://github.com/ggerganov/llama.cpp.git

In [None]:
# Install the Python packages listed in the cloned repo's requirements file.
!pip install -r llama.cpp/requirements.txt

In [None]:
from huggingface_hub import notebook_login, snapshot_download

In [None]:
# Fetch the "main" revision of the fine-tuned model repo into a folder inside
# the llama.cpp checkout, so the conversion script can be pointed at it later.
model_id = "oluwatobi-alao/llama2-hiring"
target_dir = "llama.cpp/llama2-hiring"
snapshot_download(
    repo_id=model_id,
    revision="main",
    local_dir=target_dir,
    local_dir_use_symlinks=False,
)

In [None]:
# Change the notebook's working directory to the checkout; the cells below use
# paths relative to it (convert.py, ./quantize, output/).
%cd llama.cpp

In [None]:
# Run the default make target in the current directory; --quiet stops make from
# echoing each command. Presumably this produces the ./quantize binary used
# further down -- confirm against the llama.cpp version cloned.
!make --quiet

In [None]:
# Show convert.py's command-line help to check the available options.
!python convert.py -h

In [None]:
model_name="llama2-hiring"
# Convert to fp16
fp16 = f"{model_name}/{model_name}.fp16.bin"
!python convert.py {model_name} --outtype f16 --outfile {fp16}

In [None]:
!mkdir output

In [None]:
# QUANTIZATION_METHODS = ["q4_0", "q4_k_m", "q5_0", "q5_k_m"]
QUANTIZATION_METHODS = ["q8_0", "q4_0", "q4_k_m"]

for method in QUANTIZATION_METHODS:
    output = f"output/{model_name}.{method.upper()}.gguf"
    !./quantize {fp16} {output} {method}

In [None]:
# List output/ in long format with human-readable sizes (including dot entries)
# to check which quantized files were written and how large they are.
!ls -lash output/

total 15G
4.0K drwxr-xr-x  2 root root 4.0K Dec 12 12:54 .
4.0K drwxr-xr-x 21 root root 4.0K Dec 12 12:51 ..
3.6G -rw-r--r--  1 root root 3.6G Dec 12 12:54 llama2-hiring.Q4_0.gguf
3.9G -rw-r--r--  1 root root 3.9G Dec 12 13:02 llama2-hiring.Q4_K_M.gguf
6.7G -rw-r--r--  1 root root 6.7G Dec 12 12:53 llama2-hiring.Q8_0.gguf


In [None]:
# Log in to the Hugging Face Hub from within the notebook (huggingface_hub helper).
notebook_login()

In [None]:
from huggingface_hub import create_repo, HfApi
from google.colab import userdata

# Read the Hugging Face write token interactively. getpass is used instead of
# input() so the secret is not echoed into the notebook output, where it would
# be saved (and possibly shared) together with the notebook.
from getpass import getpass

hf_token = getpass("Enter token")

api = HfApi()
username = "oluwatobi-alao"

# Single definition of the destination repo so create and upload cannot drift apart.
repo_id = f"{username}/llama2-hiring-GGUF"

# Create the model repo; exist_ok=True avoids an error if it already exists.
create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
    token=hf_token,
)

# Upload only the .gguf files from output/ to that repo.
api.upload_folder(
    folder_path="output/",
    repo_id=repo_id,
    allow_patterns="*.gguf",
    token=hf_token,
)