<a href="https://colab.research.google.com/github/mzwing/AI-related/blob/master/notebooks/deepseek_math_7b_rl_GGUF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install (force-reinstalling at the latest version) the Hugging Face Hub client with its hf_transfer extra.
# The requirement is quoted so the shell cannot interpret the [...] extras marker as a glob pattern.
!pip install -U --force-reinstall "huggingface_hub[hf_transfer]"

In [1]:
# Workspace setup: remove the sample_data directory and make sure the GGUF output directory exists.
!rm --recursive --force sample_data
!mkdir --parents deepseek-math-7b-rl-GGUF

In [None]:
# get original model
# Install the aria2 package with apt-get; the aria2c command is used further down in this cell.
!sudo apt-get install aria2 -y
# Shallow-clone (depth 1) the upstream model repository with GIT_LFS_SKIP_SMUDGE=1 set
# (presumably so git does not fetch the large LFS weight files itself — TODO confirm).
!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/deepseek-math-7b-rl --depth 1
# Inside the clone, delete the two pytorch_model .bin shard entries, their index JSON, and the .git directory.
!cd deepseek-math-7b-rl && rm -rf pytorch_model-00001-of-000002.bin pytorch_model-00002-of-000002.bin pytorch_model.bin.index.json .git

# Write download.txt: one URL line per safetensors shard, each followed by an indented " out=" line
# that names the local file for that URL.
!echo -e "https://huggingface.co/deepseek-ai/deepseek-math-7b-rl/resolve/main/model-00001-of-000002.safetensors?download=true\n out=model-00001-of-000002.safetensors\nhttps://huggingface.co/deepseek-ai/deepseek-math-7b-rl/resolve/main/model-00002-of-000002.safetensors?download=true\n out=model-00002-of-000002.safetensors" > download.txt
# Download everything listed in download.txt into the cloned model directory.
!aria2c -c -x16 -d deepseek-math-7b-rl --input-file=download.txt

# The URL list is not needed once the download has run.
!rm -rf download.txt

In [None]:
# llama.cpp conversion prerequisites
# Python packages installed ahead of the convert.py steps.
!pip3 install sentencepiece gguf
!mkdir --parents deepseek-math-7b-rl

# Shallow checkout of the llama.cpp branch whose convert.py is used by the conversion cells.
!git clone --depth 1 --branch ceb/fix-convert-bpe-hf https://github.com/ggerganov/llama.cpp

In [None]:
# convert to fp32
# Run convert.py from the llama.cpp checkout on ../deepseek-math-7b-rl/ and write
# deepseek-math-7b-rl.F32.gguf into the GGUF output directory.
# --vocab-type bpe is passed explicitly so this file is produced with the same vocabulary
# option as the fp16 conversion of the same checkpoint, instead of relying on convert.py's default.
!cd llama.cpp && python3 ./convert.py --outtype f32 --outfile ../deepseek-math-7b-rl-GGUF/deepseek-math-7b-rl.F32.gguf --ctx 4096 ../deepseek-math-7b-rl/ --vocab-type bpe --pad-vocab

In [None]:
# convert to fp16
# Run convert.py from the llama.cpp checkout on ../deepseek-math-7b-rl/ and write
# deepseek-math-7b-rl.F16.gguf into the GGUF output directory. That F16 file is the
# input consumed by the quantize cell and by the final test cell of this notebook.
!cd llama.cpp && python3 ./convert.py --outtype f16 --outfile ../deepseek-math-7b-rl-GGUF/deepseek-math-7b-rl.F16.gguf --ctx 4096 ../deepseek-math-7b-rl/ --vocab-type bpe --pad-vocab

In [None]:
# Reclaim disk space after conversion: drop the two Python packages, empty pip's cache,
# and delete the original checkpoint directory together with the llama.cpp checkout.
!pip3 uninstall --yes sentencepiece gguf
!pip cache purge
!rm --recursive --force deepseek-math-7b-rl llama.cpp

In [None]:
# prepare for CPU
# Download Intel's package-signing public key, convert it with gpg --dearmor, and store it
# as /usr/share/keyrings/oneapi-archive-keyring.gpg (tee's own stdout is discarded).
!wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
# Register the oneAPI apt repository, signed by the keyring written above.
!echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
# Refresh the package index so the new repository is visible, then install the MKL package
# (presumably required at runtime by the intel-mkl build of llama.cpp fetched in the next cell — TODO confirm).
!sudo apt-get update
!sudo apt-get install intel-oneapi-mkl -y

In [None]:
# Prebuilt CPU binaries: download the b2554 linux-avx2-intel-mkl release archive.
!aria2c --continue=true --max-connection-per-server=16 https://github.com/MZWNET/actions/releases/download/llama_cpp-b2554/llama-b2554-bin-linux-avx2-intel-mkl-x64.zip
# Unpack into a scratch directory, keep only main and quantize in the working directory,
# then delete the scratch directory and the archive.
!mkdir --parents llama.cpp-bin
!unzip llama-b2554-bin-linux-avx2-intel-mkl-x64.zip -d llama.cpp-bin
!mv --force llama.cpp-bin/main .
!mv --force llama.cpp-bin/quantize .
!rm --recursive --force llama.cpp-bin llama-b2554-bin-linux-avx2-intel-mkl-x64.zip

In [None]:
# GPU prerequisite: install the nvidia-cuda-toolkit package through apt-get without prompting.
!sudo apt-get install --yes nvidia-cuda-toolkit

In [None]:
# Prebuilt GPU binaries: download the b2554 linux-avx2-cublas-cu121 release archive.
!aria2c --continue=true --max-connection-per-server=16 https://github.com/MZWNET/actions/releases/download/llama_cpp-b2554/llama-b2554-bin-linux-avx2-cublas-cu121-x64.zip
# Unpack into a scratch directory and move main and quantize into the working directory.
# These land on the same ./main and ./quantize paths the CPU cell writes to, so whichever
# of the two cells runs last determines the binaries in use.
!mkdir --parents llama.cpp-bin
!unzip llama-b2554-bin-linux-avx2-cublas-cu121-x64.zip -d llama.cpp-bin
!mv --force llama.cpp-bin/main .
!mv --force llama.cpp-bin/quantize .
!rm --recursive --force llama.cpp-bin llama-b2554-bin-linux-avx2-cublas-cu121-x64.zip

In [None]:
# quantize
import concurrent.futures
import os
from tqdm import tqdm

# Quantization type names, in processing order. Each name is used both as the tag in the
# output file name and as the type argument handed to the quantize binary.
parameters = [
    "Q8_0",
    "Q6_K",
    "Q5_K_M",
    "Q5_K_S",
    "Q5_0",
    "Q4_K_M",
    "Q4_K_S",
    "Q4_0",
    "Q3_K_L",
    "Q3_K_M",
    "Q3_K_S",
    "Q2_K",
]

def run_command(param):
    """Quantize the F16 GGUF to one quantization type using the local ``quantize`` binary.

    Runs ``../quantize deepseek-math-7b-rl.F16.gguf deepseek-math-7b-rl.<param>.gguf <param>``
    with ``deepseek-math-7b-rl-GGUF`` as the working directory.

    Args:
        param: Quantization type name (for example ``"Q4_K_M"``). It is used both
            as the tag in the output file name and as the type argument passed
            to ``quantize``.

    Returns:
        The exit status of the ``quantize`` process, or ``-1`` when the process
        could not be started at all. Any failure is reported on stderr instead
        of being silently discarded; no exception is raised, so one failed type
        does not abort the remaining jobs.
    """
    # Local imports keep this function usable without touching the notebook's import lines.
    import subprocess
    import sys

    command = [
        "../quantize",
        "deepseek-math-7b-rl.F16.gguf",
        f"deepseek-math-7b-rl.{param}.gguf",
        str(param),
    ]
    try:
        # An argument list (no shell) means `param` is never re-parsed as shell syntax.
        # On POSIX the relative executable path is looked up relative to `cwd`.
        status = subprocess.run(command, cwd="deepseek-math-7b-rl-GGUF").returncode
    except OSError as exc:
        # Missing output directory or missing/non-executable quantize binary.
        print(f"quantize {param}: could not be started ({exc})", file=sys.stderr)
        return -1
    if status != 0:
        print(f"quantize {param}: failed with exit status {status}", file=sys.stderr)
    return status

# Run the quantization jobs on a pool of at most five worker threads. Iterating the
# tqdm-wrapped map advances the progress bar as results become available in input order,
# and leaving the with-block waits for the pool to shut down.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    progress = tqdm(pool.map(run_command, parameters), total=len(parameters))
    for _job in progress:
        pass

In [None]:
# Log in to the Hugging Face Hub, then upload the local ./deepseek-math-7b-rl-GGUF directory
# to the model repository named deepseek-math-7b-rl-GGUF with the given commit message.
# HF_HUB_ENABLE_HF_TRANSFER=1 is set for the upload command only.
# NOTE(review): the repo id carries no namespace prefix — presumably it resolves to the
# logged-in account; confirm before running.
!huggingface-cli login
!HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload --repo-type model --commit-message "GGUF model commit (made with llama.cpp release b2554)" deepseek-math-7b-rl-GGUF ./deepseek-math-7b-rl-GGUF

In [None]:
# test
# Smoke test: run the ./main binary against the F16 GGUF with a chat-style prompt.
# NOTE(review): IPython expands {name} inside ! lines, so "{prompt}" would be replaced if a
# Python variable called prompt exists in the kernel — confirm a literal placeholder is intended.
# NOTE(review): the \n sequences sit inside a double-quoted shell string; whether ./main turns
# them into real newlines is not visible from this notebook — verify.
!./main -ngl 25 -m deepseek-math-7b-rl-GGUF/deepseek-math-7b-rl.F16.gguf --color --temp 0.7 --repeat_penalty 1.1 -n -1 -p "System: A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nHuman: {prompt}\nAssistant:Hi! What can I help you?\nHuman: "
# Delete main.log afterwards (presumably written by ./main during the run — confirm).
!rm -rf main.log