<a href="https://colab.research.google.com/github/mzwing/AI-related/blob/master/notebooks/phixtral_4x2_8odd_GGUF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Prepare the workspace.
# Remove the sample_data directory (presumably Colab's bundled example data) to
# free disk space.
!rm -rf sample_data
# Create the output directory for the GGUF files; -p makes this a no-op when it
# already exists, so the cell can be re-run.
!mkdir -p phixtral-4x2_8odd-GGUF

In [None]:
# Get the original model.
# aria2 provides the multi-connection downloader (aria2c) used below.
!sudo apt-get install aria2 -y
# GIT_LFS_SKIP_SMUDGE=1 makes the clone fetch only the regular repo files and
# leave LFS-tracked files as small pointer stubs instead of downloading them.
!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/shadowml/phixtral-4x2_8odd --depth 1
# Drop the weights file left by the clone (it is re-downloaded below) together
# with the git metadata, which is not needed for conversion.
!cd phixtral-4x2_8odd && rm -rf model-00001-of-00001.safetensors .git

# Download the real weights: -c resumes a partial download, -x16 allows up to
# 16 connections. The URL is quoted so the shell never treats the "?" in the
# query string as a glob character.
!aria2c -c -x16 -d phixtral-4x2_8odd "https://huggingface.co/shadowml/phixtral-4x2_8odd/resolve/main/model-00001-of-00001.safetensors?download=true" -o model-00001-of-00001.safetensors

In [None]:
# Prepare for llama.cpp quantisation.
# Python packages installed ahead of the conversion cells (presumably the
# dependencies of convert-hf-to-gguf.py). pip3 is used from the shell because
# the converter is later launched with the shell's python3.
!pip3 install sentencepiece gguf
# Make sure the model directory exists; a no-op if the download cell created it.
!mkdir -p phixtral-4x2_8odd

# Shallow-clone only the gg/add-phixtral branch of llama.cpp.
# NOTE: git clone fails if ./llama.cpp already exists, so remove it before
# re-running this cell.
!git clone https://github.com/ggerganov/llama.cpp -b gg/add-phixtral --depth 1

In [None]:
# Replace the stock converter with your own patched convert-hf-to-gguf.py, which
# is expected to fix the model-slicing bug. Put the patched file in the current
# working directory before running this cell.
# The stock script is deleted first, so if the patched file is missing the later
# conversion cells fail instead of silently running the unpatched converter.
!rm -rf llama.cpp/convert-hf-to-gguf.py
!mv convert-hf-to-gguf.py llama.cpp/

In [None]:
# Convert the Hugging Face checkpoint to a 32-bit float GGUF file.
# The converter is run from inside llama.cpp/, so input and output paths are
# given relative to that directory.
# NOTE(review): no later cell visible in this notebook reads the F32 file (the
# quantize cell starts from F16 and the upload list has no F32 entry) — confirm
# whether this conversion is still needed.
!cd llama.cpp && python3 ./convert-hf-to-gguf.py --outtype f32 --outfile ../phixtral-4x2_8odd-GGUF/phixtral-4x2_8odd.F32.gguf ../phixtral-4x2_8odd/

In [None]:
# Convert the Hugging Face checkpoint to a 16-bit float GGUF file.
# This F16 file is the input of every quantization run in the quantize cell.
!cd llama.cpp && python3 ./convert-hf-to-gguf.py --outtype f16 --outfile ../phixtral-4x2_8odd-GGUF/phixtral-4x2_8odd.F16.gguf ../phixtral-4x2_8odd/

In [None]:
# Free disk space once conversion is done.
# Remove the Python packages that were installed for the conversion step.
!pip3 uninstall sentencepiece gguf -y
# Drop pip's download/wheel cache as well.
!pip cache purge
# Delete the original checkpoint and the llama.cpp checkout; the converted GGUF
# files live in phixtral-4x2_8odd-GGUF/ and are not touched.
!rm -rf phixtral-4x2_8odd llama.cpp

In [None]:
# Prepare the CPU runtime: install Intel oneAPI MKL (presumably required by the
# "intel-mkl" llama.cpp build downloaded in the next cell).
# Fetch Intel's package signing key, convert it to binary form and store it as
# an apt keyring; tee's stdout is discarded so the key is not echoed.
!wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
# Register the oneAPI apt repository, trusted only through the keyring above.
!echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
# Refresh the package index so the new repository becomes visible, then install.
!sudo apt-get update
!sudo apt-get install intel-oneapi-mkl -y

In [None]:
# Fetch the prebuilt llama.cpp binaries (CPU build: AVX2 + Intel MKL).
# Requires aria2c, which is installed in the "get original model" cell.
!aria2c -c -x16 https://github.com/MZWNET/actions/releases/download/llama_cpp-phixtral-9998ecd/llama-phixtral-9998ecd-bin-linux-avx2-intel-mkl-x64.zip
# Unpack into a scratch directory so only the needed executables are kept.
!mkdir -p llama.cpp-bin
!unzip llama-phixtral-9998ecd-bin-linux-avx2-intel-mkl-x64.zip -d llama.cpp-bin
# Keep just `main` and `quantize` in the working directory; -f overwrites any
# copy left by a previous run (including the GPU build from the GPU cell).
!mv -f llama.cpp-bin/main .
!mv -f llama.cpp-bin/quantize .
# Remove the scratch directory and the downloaded archive.
!rm -rf llama.cpp-bin llama-phixtral-9998ecd-bin-linux-avx2-intel-mkl-x64.zip

In [None]:
# Download the already-published Q2_K quantization from the Hugging Face repo
# into the local GGUF directory (presumably to try it out with ./main).
!mkdir -p phixtral-4x2_8odd-GGUF
# -c resumes a partial download, -x16 allows up to 16 connections. The URL is
# quoted so the shell never treats the "?" in the query string as a glob
# character.
!aria2c -c -x16 -d phixtral-4x2_8odd-GGUF "https://huggingface.co/mzwing/phixtral-4x2_8odd-GGUF/resolve/main/phixtral-4x2_8odd.Q2_K.gguf?download=true" -o phixtral-4x2_8odd.Q2_K.gguf

In [None]:
# Prepare the GPU runtime: install the CUDA toolkit from the distribution's
# package repository (presumably to supply the CUDA libraries needed by the
# cuBLAS build downloaded in the next cell).
!sudo apt-get install nvidia-cuda-toolkit -y

In [None]:
# Fetch the prebuilt llama.cpp binaries (GPU build: AVX2 + cuBLAS, CUDA 12.1
# according to the archive name).
# Requires aria2c, which is installed in the "get original model" cell.
!aria2c -c -x16 https://github.com/MZWNET/actions/releases/download/llama_cpp-phixtral-9998ecd/llama-phixtral-9998ecd-bin-linux-avx2-cublas-cu121-x64.zip
# Unpack into a scratch directory so only the needed executables are kept.
!mkdir -p llama.cpp-bin
!unzip llama-phixtral-9998ecd-bin-linux-avx2-cublas-cu121-x64.zip -d llama.cpp-bin
# These land on the same ./main and ./quantize paths as the CPU cell, so
# whichever of the two cells ran last decides which build is used afterwards.
!mv -f llama.cpp-bin/main .
!mv -f llama.cpp-bin/quantize .
# Remove the scratch directory and the downloaded archive.
!rm -rf llama.cpp-bin llama-phixtral-9998ecd-bin-linux-avx2-cublas-cu121-x64.zip

In [None]:
# quantize
import concurrent.futures
import os
from tqdm import tqdm

# Quantization types to produce in this run.
# Full list, kept for reference:
#   "Q8_0", "Q6_K", "Q5_K_M", "Q5_K_S", "Q5_0", "Q4_K_M", "Q4_K_S", "Q4_0",
#   "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q2_K"
# The active list omits Q8_0, Q6_K and Q5_K_M.
# NOTE(review): the upload cell still lists those three, so they presumably
# come from an earlier run — confirm the files exist before uploading.
parameters = [ "Q5_K_S", "Q5_0", "Q4_K_M", "Q4_K_S", "Q4_0", "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q2_K" ]

def run_command(param):
    """Quantize the F16 GGUF model to the quantization type ``param``.

    Runs ``../quantize`` from inside ``phixtral-4x2_8odd-GGUF/``, reading
    ``phixtral-4x2_8odd.F16.gguf`` and writing ``phixtral-4x2_8odd.<param>.gguf``.

    Args:
        param: Quantization type name passed to ``quantize`` (e.g. ``"Q4_K_M"``).

    Raises:
        subprocess.CalledProcessError: If ``quantize`` exits with a non-zero
            status. The previous ``os.system`` call discarded the status, so a
            failed run went unnoticed until a later step missed the file.
        OSError: If the working directory or the ``quantize`` binary is missing.
    """
    # Imported locally so this definition does not depend on the cell's imports.
    import subprocess

    # An argument list (no shell) keeps `param` from being interpreted by a
    # shell. With `cwd` set, the relative "../quantize" path is resolved against
    # that directory, just as the original `cd ... && ../quantize` did.
    subprocess.run(
        [
            "../quantize",
            "phixtral-4x2_8odd.F16.gguf",
            f"phixtral-4x2_8odd.{param}.gguf",
            param,
        ],
        cwd="phixtral-4x2_8odd-GGUF",
        check=True,
    )

# Start every quantization job on a thread pool (at most 12 at a time) and drain
# the result iterator so the progress bar advances as jobs are collected.
with concurrent.futures.ThreadPoolExecutor(max_workers=12) as pool:
    pending_results = pool.map(run_command, parameters)
    for _finished in tqdm(pending_results, total=len(parameters)):
        pass

In [None]:
# http upload
import concurrent.futures
from huggingface_hub import HfApi, create_repo

# Create the target repository on the Hugging Face Hub. exist_ok=True turns the
# call into a no-op when the repository is already there, so re-running this
# cell no longer aborts before any upload starts.
create_repo("mzwing/phixtral-4x2_8odd-GGUF", exist_ok=True)

# Hub client used by the upload() helper defined below.
api = HfApi()

# Variants to upload: the F16 conversion plus every quantization type.
# NOTE(review): this rebinds the `parameters` name used by the quantize cell and
# lists Q8_0, Q6_K and Q5_K_M, which that cell's active list does not produce —
# confirm those files exist locally before running the upload.
parameters = [ "F16", "Q8_0", "Q6_K", "Q5_K_M", "Q5_K_S", "Q5_0", "Q4_K_M", "Q4_K_S", "Q4_0", "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q2_K" ]

def upload(params):
    """Upload one GGUF variant to the ``mzwing/phixtral-4x2_8odd-GGUF`` repo.

    Args:
        params: Variant name embedded in the file name (e.g. ``"Q4_K_M"``). The
            local file ``phixtral-4x2_8odd-GGUF/phixtral-4x2_8odd.<params>.gguf``
            is stored in the repo root under the same base name.
    """
    gguf_name = f"phixtral-4x2_8odd.{params}.gguf"
    local_path = f"phixtral-4x2_8odd-GGUF/{gguf_name}"
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=gguf_name,
        repo_id="mzwing/phixtral-4x2_8odd-GGUF",
    )

# Upload up to four files at a time. The result iterator is drained explicitly
# so that an exception raised inside any upload surfaces in this cell.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
    for _uploaded in pool.map(upload, parameters):
        pass

In [None]:
# Squash the Hub repository's history into a single commit and force-push it.
# WARNING: this rewrites `main` on the remote; the `backup-main` branch created
# below exists only in the local clone.
# Store credentials on disk (plain text) so the later push can reuse the login.
!git config --global credential.helper store
!huggingface-cli login
# Identity recorded on the squashed commit.
!git config --global user.email 'mzwing@mzwing.eu.org'
!git config --global user.name 'mzwing'
# WARNING: deletes the local GGUF directory, including every converted and
# quantized file — run this only after the upload cell has finished.
!rm -rf phixtral-4x2_8odd-GGUF/
# Re-clone the repo with LFS files left as pointer stubs (no large downloads).
!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/mzwing/phixtral-4x2_8odd-GGUF
!cd phixtral-4x2_8odd-GGUF && git lfs install
!huggingface-cli lfs-enable-largefiles phixtral-4x2_8odd-GGUF/
# Keep a local reference to the old history before replacing it.
!cd phixtral-4x2_8odd-GGUF/ && git branch backup-main
# Start a branch with no parent commits and commit the current tree onto it.
!cd phixtral-4x2_8odd-GGUF/ && git checkout --orphan new-main
!cd phixtral-4x2_8odd-GGUF/ && git add -A
!cd phixtral-4x2_8odd-GGUF/ && git commit -m "GGUF model commit (made with llama.cpp commit 9998ecd)"
# Replace the old main with the single-commit branch, then overwrite the remote.
!cd phixtral-4x2_8odd-GGUF/ && git branch -D main
!cd phixtral-4x2_8odd-GGUF/ && git branch -m main
!cd phixtral-4x2_8odd-GGUF/ && git push -f origin main