In [37]:
# @title # BitNet.CPP On CPU - Google Colab {"display-mode":"form"}
# @markdown ## BitNet.CPP: Efficient LLM Inference on CPUs
# @markdown
# @markdown BitNet is a cutting-edge innovation by Microsoft Research, designed to make large language models (LLMs) more efficient. The model operates using 1.58-bit weights, reducing memory consumption, improving energy efficiency, and speeding up inference without sacrificing performance. This approach brings significant improvements over traditional FP16 or BF16 models, providing up to 4x faster inference and using 7x less memory.
# @markdown
# @markdown > Give it a try and see how BitNet is revolutionizing LLM performance!
# @markdown
# @markdown
from IPython.display import clear_output, display, HTML
def show_progress(percentage, activity, show=True):
  """Replace the current cell output with a fixed progress banner.

  Args:
    percentage: Estimated completion in the range 0-100. Values outside that
      range are clamped; a non-numeric value falls back to an indeterminate
      (animated) bar instead of raising.
    activity: Text rendered above the bar. It is HTML-escaped first so that
      model names or messages containing ``<``/``&`` cannot break the markup.
    show: When False, the previous output is cleared and nothing is rendered.
  """
  import html  # local import keeps this helper usable on its own

  # wait=True asks IPython to hold the clear until replacement output arrives.
  clear_output(wait=True)
  if not show:
    return

  try:
    value = max(0.0, min(100.0, float(percentage)))
  except (TypeError, ValueError):
    value = None  # unknown progress -> indeterminate bar

  # A <progress> element without a value attribute renders as indeterminate.
  bar_attrs = f' value="{value:g}" max="100"' if value is not None else ""
  label = html.escape(str(activity))
  display(HTML(f'''<div style="position:fixed;top:0;left:0;width:100%;background-color:#f0f0f0;padding:10px;text-align:center;">
      <p>{label}</p>
      <progress style="width:100%;"{bar_attrs}></progress></div>
      '''))



hf_model_name = "1bitLLM/bitnet_b1_58-large" # @param ["1bitLLM/bitnet_b1_58-large","1bitLLM/bitnet_b1_58-3B","HF1BitLLM/Llama3-8B-1.58-100B-tokens"]
model_quant_type = "tl2" # @param ["tl2","tl1"]



# Installing key env
!bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"

show_progress(0, "Cloning BitNet...")
!git clone --recursive https://github.com/microsoft/BitNet.git
%cd BitNet

# Pip installation
show_progress(20, "Upgrading pip and installing requirements.txt...")
!pip install --upgrade pip  >> log.bitnet
!pip install -r requirements.txt >> log.bitnet


# Download the model from Hugging Face, convert it to quantized gguf format, and build the project
# It is important you specify between tl1 and tl2. using i2_s failed in converting everytime.
show_progress(50, f"Downloading and converting  {hf_model_name} model to gguf...")
!python3 setup_env.py --hf-repo {hf_model_name} -q {model_quant_type}  >> log.bitnet
show_progress(50, f"The model {hf_model_name} is Ready for inference", False)
print("Downloading Completed Successful")


Downloading Completed Successful


In [None]:
# @title # Test the completion with the model you just downloaded {"display-mode":"form"}
prompt = "Youtube is " # @param {"type":"string"}
token_number = 1200 # @param {"type":"number","placeholder":"how many token to be generated"}

show_progress(100, f"Complete", False)

model_selected = hf_model_name.split('/')[1]
converted_model_path = f"models/{model_selected}/ggml-model-{model_quant_type}.gguf"
# print(converted_model_path)
command = f'run_inference.py -m {converted_model_path} -p "{prompt}" -n {token_number} -temp 7'
!python3 {command}
