<a href="https://colab.research.google.com/github/mzwing/AI-related/blob/master/notebooks/bling_phi_2_v0_GGUF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prepare
# Create the output folder for the GGUF files and drop the sample_data folder
# to free some disk space.
!mkdir -p bling-phi-2-v0-GGUF
!rm -rf sample_data

In [None]:
# get original model
# aria2 provides the multi-connection downloader used for the weights below.
!sudo apt-get install aria2 -y
# Shallow clone with GIT_LFS_SKIP_SMUDGE=1 so git does not download LFS objects
# (presumably pytorch_model.bin is stored via LFS and arrives only as a small stub).
!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/llmware/bling-phi-2-v0 --depth 1
# Remove the cloned pytorch_model.bin and the git metadata; the real weights are fetched next.
!cd bling-phi-2-v0 && rm -rf pytorch_model.bin .git

# Download the weights into the model folder (-c resumes a partial download,
# -x16 allows up to 16 connections). The URL is quoted so the shell never
# treats the `?` in the query string as a glob character.
!aria2c -c -x16 -d bling-phi-2-v0 "https://huggingface.co/llmware/bling-phi-2-v0/resolve/main/pytorch_model.bin?download=true" -o pytorch_model.bin

In [None]:
# prepare for llama.cpp quantise
# Make sure the model folder exists, then install the Python packages used by
# the conversion step.
!mkdir -p bling-phi-2-v0
!pip3 install sentencepiece gguf

# A llama.cpp fork is used here instead of upstream,
# see this: https://huggingface.co/kroonen/phi-2-GGUF/discussions/1
!git clone --depth 1 https://github.com/ebeyabraham/llama.cpp

In [None]:
# convert to fp32
# Run the HF -> GGUF converter from inside the llama.cpp checkout; every path
# below is therefore relative to that folder.
!cd llama.cpp && python3 ./convert-hf-to-gguf.py --outfile ../bling-phi-2-v0-GGUF/bling-phi-2-v0.F32.gguf --outtype f32 ../bling-phi-2-v0/

In [None]:
# convert to fp16
# Same converter call as the fp32 cell, writing a half-precision file; this
# F16 file is the input the quantize cell reads.
!cd llama.cpp && python3 ./convert-hf-to-gguf.py --outfile ../bling-phi-2-v0-GGUF/bling-phi-2-v0.F16.gguf --outtype f16 ../bling-phi-2-v0/

In [None]:
# uninstall to save space
# The conversion is finished: remove the converter's Python packages, empty
# pip's download cache, and delete the original model and llama.cpp checkouts.
!pip3 uninstall -y sentencepiece gguf
!pip cache purge
!rm -rf llama.cpp bling-phi-2-v0

In [None]:
# prepare for CPU
# Register Intel's oneAPI apt repository: store the dearmored signing key,
# then add a sources entry that is verified against that key.
!wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
!echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
# Refresh the package index (now including the Intel repo), upgrade the
# installed packages, and install MKL.
!sudo apt-get update
!sudo apt-get -y upgrade
!sudo apt-get -y install intel-oneapi-mkl

In [None]:
# compile (CPU)
# Build toolchain: ccache plus the Intel oneAPI base kit and the MKL development files.
!sudo apt-get install ccache intel-basekit intel-oneapi-mkl-devel -y

# The "uninstall to save space" cell deletes the llama.cpp checkout, so fetch
# the same fork again when it is missing.
!test -d llama.cpp || git clone https://github.com/ebeyabraham/llama.cpp --depth 1
# Start from an empty build directory so that re-running this cell (or running
# it after the GPU build) neither fails on `mkdir` nor reuses a stale CMake cache.
# setvars.sh puts the Intel compilers (icx/icpx) and MKL on the environment for
# the cmake calls that follow in the same shell.
!cd llama.cpp && rm -rf build && mkdir build && cd build && source /opt/intel/oneapi/setvars.sh && cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64_dyn -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON -DLLAMA_AVX2=ON && cmake --build . --config Release
!cp llama.cpp/build/bin/quantize ./
!cp llama.cpp/build/bin/main ./

# store
!mkdir -p /content/drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# -p creates the llama.cpp-cache parent as needed and does not fail when the
# folder already exists from an earlier run.
!mkdir -p /content/drive/MyDrive/llama.cpp-cache/intel-mkl/
!cp quantize /content/drive/MyDrive/llama.cpp-cache/intel-mkl/
!cp main /content/drive/MyDrive/llama.cpp-cache/intel-mkl/

# Flush pending writes to Drive before detaching it.
drive.flush_and_unmount()

In [None]:
# get compile result (CPU)
# Reuse the binaries that the CPU compile cell cached on Google Drive instead
# of rebuilding them.
!mkdir -p /content/drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Copy both cached tools into the working directory in one go.
!cp /content/drive/MyDrive/llama.cpp-cache/intel-mkl/quantize /content/drive/MyDrive/llama.cpp-cache/intel-mkl/main ./

drive.flush_and_unmount()

# Make sure both tools carry the executable bit.
!chmod +x quantize main

In [None]:
# prepare for GPU
# Install the CUDA toolkit from apt; the GPU compile cell builds llama.cpp
# with -DLLAMA_CUBLAS=ON.
!sudo apt-get -y install nvidia-cuda-toolkit

In [None]:
# compile (GPU)
!sudo apt-get install ccache -y

# The "uninstall to save space" cell deletes the llama.cpp checkout, so fetch
# the same fork again when it is missing.
!test -d llama.cpp || git clone https://github.com/ebeyabraham/llama.cpp --depth 1
# Start from an empty build directory so that re-running this cell (or running
# it after the CPU build) neither fails on `mkdir` nor reuses a stale CMake cache.
!cd llama.cpp && rm -rf build && mkdir build && cd build && cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_AVX2=ON && cmake --build . --config Release
!cp llama.cpp/build/bin/quantize ./
!cp llama.cpp/build/bin/main ./

# store
!mkdir -p /content/drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# -p creates the llama.cpp-cache parent as needed and does not fail when the
# folder already exists from an earlier run.
!mkdir -p /content/drive/MyDrive/llama.cpp-cache/cuda/
!cp quantize /content/drive/MyDrive/llama.cpp-cache/cuda/
!cp main /content/drive/MyDrive/llama.cpp-cache/cuda/

# Flush pending writes to Drive before detaching it.
drive.flush_and_unmount()

In [None]:
# get compile result (GPU)
# Reuse the binaries that the GPU compile cell cached on Google Drive instead
# of rebuilding them.
!mkdir -p /content/drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Copy both cached tools into the working directory in one go.
!cp /content/drive/MyDrive/llama.cpp-cache/cuda/quantize /content/drive/MyDrive/llama.cpp-cache/cuda/main ./

drive.flush_and_unmount()

# Make sure both tools carry the executable bit.
!chmod +x quantize main

In [None]:
# quantize
import concurrent.futures
import os
from tqdm import tqdm

# Quantization type names; each one is passed to the quantize binary and also
# becomes the infix of the output file name (bling-phi-2-v0.<TYPE>.gguf).
parameters = [
    "Q8_0",
    "Q6_K",
    "Q5_K_M",
    "Q5_K_S",
    "Q5_0",
    "Q4_K_M",
    "Q4_K_S",
    "Q4_0",
    "Q3_K_L",
    "Q3_K_M",
    "Q3_K_S",
    "Q2_K",
]

def run_command(param):
    """Quantize the F16 GGUF model to a single quantization type.

    Runs ``../quantize bling-phi-2-v0.F16.gguf bling-phi-2-v0.<param>.gguf <param>``
    with ``bling-phi-2-v0-GGUF/`` as the working directory.

    Args:
        param: Quantization type name handed to the quantize binary
            (e.g. ``"Q4_K_M"``); it is also used in the output file name.

    Raises:
        subprocess.CalledProcessError: If quantize exits with a non-zero status.
        OSError: If the working directory or the quantize binary is missing.
    """
    # Imported locally so this function works even when the cell's import
    # lines are left untouched.
    import subprocess

    # An argument list (no shell) keeps `param` from being interpreted by a
    # shell, and check=True turns a failed quantization into an exception
    # instead of silently leaving a missing or partial output file behind.
    subprocess.run(
        [
            "../quantize",
            "bling-phi-2-v0.F16.gguf",
            f"bling-phi-2-v0.{param}.gguf",
            param,
        ],
        cwd="bling-phi-2-v0-GGUF/",
        check=True,
    )

# Launch one quantize job per type on a 12-thread pool. Iterating the map()
# result waits for every job (and re-raises any exception a job raised) while
# tqdm shows how many have finished.
with concurrent.futures.ThreadPoolExecutor(max_workers=12) as pool:
    progress = tqdm(pool.map(run_command, parameters), total=len(parameters))
    for _ in progress:
        pass

In [None]:
# http upload
import concurrent.futures
from huggingface_hub import HfApi, create_repo

# create repo
# exist_ok=True keeps this cell re-runnable: without it create_repo raises
# once the repository already exists on the Hub.
create_repo("mzwing/bling-phi-2-v0-GGUF", exist_ok=True)

# Client object used by upload() for the individual file uploads.
api = HfApi()

# File-name infixes of every artefact to publish: the two full-precision
# conversions plus all quantized variants (bling-phi-2-v0.<NAME>.gguf).
parameters = [ "F32", "F16", "Q8_0", "Q6_K", "Q5_K_M", "Q5_K_S", "Q5_0", "Q4_K_M", "Q4_K_S", "Q4_0", "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q2_K" ]

def upload(params):
    """Upload one GGUF file to the mzwing/bling-phi-2-v0-GGUF repository.

    Args:
        params: File-name infix identifying the variant (e.g. ``"F16"`` or
            ``"Q4_K_M"``); selects ``bling-phi-2-v0.<params>.gguf`` in the
            local ``bling-phi-2-v0-GGUF/`` folder and is kept as the file
            name inside the repository.
    """
    local_path = f"bling-phi-2-v0-GGUF/bling-phi-2-v0.{params}.gguf"
    remote_name = f"bling-phi-2-v0.{params}.gguf"
    # Uses the module-level HfApi client created earlier in this cell.
    api.upload_file(
        repo_id="mzwing/bling-phi-2-v0-GGUF",
        path_in_repo=remote_name,
        path_or_fileobj=local_path,
    )

# Push the files with at most four uploads in flight. Draining the map()
# iterator waits for every upload and re-raises any exception an upload raised.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
    for _ in pool.map(upload, parameters):
        pass

In [None]:
# test
# Smoke-test the smallest quantization. The model path must match the name
# written by the quantize cell (bling-phi-2-v0.<TYPE>.gguf inside
# bling-phi-2-v0-GGUF/); the previous path repeated the "-GGUF" suffix in the
# file name and therefore pointed at a file that is never created.
!./main -ngl 33 -m bling-phi-2-v0-GGUF/bling-phi-2-v0.Q2_K.gguf --color --temp 0.7 --repeat_penalty 1.1 -n -1 -p "System: A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nHuman: {prompt}\nAssistant:"
# Remove the log file left behind by the run.
!rm -rf main.log

In [None]:
# git merge history
# Squash the upload history of the Hub repository into a single commit.
# Log in and set the git identity used for the new commit.
!git config --global credential.helper store
!huggingface-cli login
!git config --global user.email 'mzwing@mzwing.eu.org'
!git config --global user.name 'mzwing'
# Replace the local output folder with a fresh clone of the Hub repository;
# GIT_LFS_SKIP_SMUDGE=1 avoids downloading the large LFS files again.
!rm -rf bling-phi-2-v0-GGUF/
!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/mzwing/bling-phi-2-v0-GGUF
!cd bling-phi-2-v0-GGUF && git lfs install
!huggingface-cli lfs-enable-largefiles bling-phi-2-v0-GGUF/
# The rewrite itself is one fail-fast chain: keep a local backup branch,
# commit the current tree on a parentless branch, make that branch the new
# `main`, and force-push it. Because every step is joined with `&&`, the old
# `main` is only deleted and the force-push only happens when all previous
# steps succeeded.
!cd bling-phi-2-v0-GGUF/ && git branch backup-main && git checkout --orphan new-main && git add -A && git commit -m "GGUF model commit (made with llama.cpp commit 26d6076)" && git branch -D main && git branch -m main && git push -f origin main