## Mounting Your Drive

Mount your Google Drive to store files persistently.

In [None]:
from google.colab import drive
import os

# Attach Google Drive so that everything written below outlives the Colab VM.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Work out of a dedicated folder on the drive, creating it on first use.
workspace = '/content/drive/My Drive/llm'
os.makedirs(workspace, exist_ok=True)
os.chdir(workspace)
print('Current directory:', os.getcwd())

## Introduction

Convert Hugging Face models to GGUF format with Q8_0 quantization.

In [None]:
!pip install numpy huggingface_hub safetensors llama-cpp-python

In [None]:
import os
from huggingface_hub import snapshot_download

# Repository to convert and the folder (inside the current working directory)
# that holds its files between sessions.
model_repo = "tomg-group-umd/huginn-0125"
cache_dir = os.getcwd()
expected_model_dir = os.path.join(cache_dir, model_repo.replace('/', '-'))

print("\nStarting model download...")
if os.path.isdir(expected_model_dir):
    # NOTE: only the folder's existence is checked. If an earlier download was
    # interrupted, delete the folder to force a fresh download.
    print(f"Found cached model at: {expected_model_dir}")
    model_path = expected_model_dir
else:
    print(f"Downloading {model_repo} from Hugging Face Hub...")
    # Download straight into expected_model_dir. With `cache_dir=` the files
    # end up under "models--<org>--<name>/snapshots/<revision>", which the
    # check above never matches, so the cached branch could not be reached.
    model_path = snapshot_download(repo_id=model_repo, local_dir=expected_model_dir)
    print(f"Model saved to: {model_path}")

print('\nModel path:', model_path)

In [None]:
import os
import json
import numpy as np
from safetensors import safe_open
from tqdm import tqdm

def load_hf_model(model_dir):
    """Read every ``.safetensors`` file in *model_dir* into a single dict.

    Args:
        model_dir: Directory that directly contains the ``.safetensors`` files
            (sub-directories are not searched).

    Returns:
        A dict mapping tensor name to the tensor loaded with the NumPy
        framework. If the same name occurs in several files, the file
        processed last wins.
    """
    print("\nLoading model tensors...")
    shard_names = [
        entry for entry in os.listdir(model_dir) if entry.endswith(".safetensors")
    ]

    tensors = {}
    for shard_name in tqdm(shard_names, desc="Processing safetensors files"):
        shard_path = os.path.join(model_dir, shard_name)
        with safe_open(shard_path, framework="np") as shard:
            tensors.update((key, shard.get_tensor(key)) for key in shard.keys())

    print(f"Loaded {len(tensors)} tensors")
    return tensors

def load_hf_hparams(model_dir):
    """Load the model hyperparameters stored as JSON in *model_dir*.

    ``config.json`` is tried first; ``params.json`` is only used when the
    former is absent.

    Args:
        model_dir: Directory expected to contain one of the config files.

    Returns:
        The parsed JSON content of the first config file found.

    Raises:
        ValueError: If neither config file exists in *model_dir*.
    """
    print("\nLoading hyperparameters...")
    for fname in ("config.json", "params.json"):
        config_path = os.path.join(model_dir, fname)
        if not os.path.isfile(config_path):
            continue
        # JSON is UTF-8; do not let the platform's locale encoding decide.
        with open(config_path, encoding="utf-8") as f:
            print(f"Found config file: {fname}")
            return json.load(f)
    raise ValueError("No config found")

class GGUFWriter:
    """Collect tensors in memory and serialize them as a GGUF v3 file.

    :meth:`finalize` writes, with every integer little-endian: the magic
    ``GGUF``, a uint32 version, the uint64 tensor count, the uint64 metadata
    count, the metadata key/value pairs, one info record per tensor, zero
    padding up to ``alignment`` and finally the tensor data.

    NOTE(review): only a minimal set of ``llama.*`` metadata keys is written
    and tensor names are stored exactly as passed in; whether a given loader
    accepts the result depends on the model - confirm against the target
    runtime.
    """

    # Value-type ids used in GGUF metadata records.
    _KV_UINT32 = 4
    _KV_STRING = 8
    # ggml tensor type ids.
    _GGML_F32 = 0
    _GGML_F16 = 1
    _GGML_Q8_0 = 8
    # ``general.file_type`` ids keyed by the ``outtype`` label; any other
    # label is reported as 0 (all F32).
    _FILE_TYPES = {"F16": 1, "Q8_0": 7}

    def __init__(self, outfile, hparams, outtype):
        """Create an empty writer.

        Args:
            outfile: Path of the GGUF file that :meth:`finalize` creates.
            hparams: Dict of model hyperparameters (read with ``.get``).
            outtype: Label of the output type, e.g. ``"Q8_0"``.
        """
        self.outfile = outfile
        self.hparams = hparams
        self.outtype = outtype
        self.tensors = []
        self.data_buffer = bytearray()
        self.offset = 0
        self.alignment = 32
        print(f"\nInitialized GGUF writer for {outtype} quantization")

    def add_tensor(self, name, tensor, original_shape):
        """Append one tensor's raw bytes and record its info entry.

        Args:
            name: Tensor name (str).
            tensor: Array whose ``tobytes()`` output is stored verbatim. For
                ``Q8_0`` it must already hold the packed blocks.
            original_shape: Logical shape of the tensor, outermost dimension
                first.
        """
        tensor_bytes = tensor.tobytes()
        # Every tensor starts on an `alignment` boundary inside the data
        # section, so pad each one up to the next multiple.
        pad = (-len(tensor_bytes)) % self.alignment
        self.data_buffer += tensor_bytes
        self.data_buffer += bytes(pad)

        if self.outtype == "Q8_0":
            dtype = self._GGML_Q8_0
        elif getattr(tensor, "dtype", None) == np.float16:
            dtype = self._GGML_F16
        else:
            # Unquantized data handed to the writer is float32, which is ggml
            # type 0 (type 1 would declare float16).
            dtype = self._GGML_F32

        self.tensors.append({
            "name": name.encode("utf-8"),
            "dtype": dtype,
            "shape": original_shape,
            "offset": self.offset,
            "size": len(tensor_bytes) + pad,
        })
        self.offset += len(tensor_bytes) + pad

    @staticmethod
    def _u32(value):
        """Return *value* as a little-endian uint32."""
        return int(value).to_bytes(4, "little")

    @staticmethod
    def _u64(value):
        """Return *value* as a little-endian uint64."""
        return int(value).to_bytes(8, "little")

    @classmethod
    def _encode_string(cls, text):
        """Return a GGUF string: uint64 byte length followed by UTF-8 bytes."""
        raw = text if isinstance(text, (bytes, bytearray)) else text.encode("utf-8")
        return cls._u64(len(raw)) + bytes(raw)

    @classmethod
    def _encode_kv(cls, key, val):
        """Return one metadata record; str values as STRING, others as UINT32."""
        if isinstance(val, str):
            payload = cls._u32(cls._KV_STRING) + cls._encode_string(val)
        else:
            payload = cls._u32(cls._KV_UINT32) + cls._u32(val)
        return cls._encode_string(key) + payload

    def _encode_tensor_info(self, tensor):
        """Return one tensor info record: name, n_dims, dims, type, offset."""
        # GGUF lists dimensions innermost first, i.e. reversed with respect to
        # the NumPy shape.
        dims = [int(dim) for dim in reversed(tuple(tensor["shape"]))]
        parts = [self._encode_string(tensor["name"]), self._u32(len(dims))]
        parts.extend(self._u64(dim) for dim in dims)
        parts.append(self._u32(tensor["dtype"]))
        parts.append(self._u64(tensor["offset"]))
        return b"".join(parts)

    def _build_metadata(self):
        """Return the metadata key/value pairs derived from ``hparams``."""
        return {
            "general.architecture": "llama",
            "general.name": self.hparams.get("model_type", "llama"),
            "llama.context_length": self.hparams.get("max_position_embeddings", 2048),
            "llama.embedding_length": self.hparams.get("hidden_size", 4096),
            "llama.block_count": self.hparams.get("num_hidden_layers", 32),
            "llama.attention.head_count": self.hparams.get("num_attention_heads", 32),
            "general.file_type": self._FILE_TYPES.get(self.outtype, 0),
        }

    def finalize(self):
        """Write header, metadata, tensor infos and tensor data to ``outfile``."""
        print("\nFinalizing GGUF file...")
        metadata = self._build_metadata()
        with open(self.outfile, "wb") as f:
            # Header: magic, version, then BOTH counts before any record.
            f.write(b"GGUF")
            f.write(self._u32(3))
            f.write(self._u64(len(self.tensors)))
            f.write(self._u64(len(metadata)))

            print("Writing metadata:")
            for key, val in metadata.items():
                f.write(self._encode_kv(key, val))
                print(f" - {key}: {val}")

            print(f"\nWriting {len(self.tensors)} tensors...")
            for tensor in tqdm(self.tensors, desc="Writing tensors"):
                f.write(self._encode_tensor_info(tensor))

            # Tensor offsets are relative to the start of the data section,
            # which itself has to begin on an `alignment` boundary.
            f.write(bytes(-f.tell() % self.alignment))

            print("Writing tensor data...")
            f.write(self.data_buffer)
        print(f"\nGGUF file created: {self.outfile}")

def quantize_q8_0(tensor):
    """Quantize *tensor* into packed ggml ``Q8_0`` blocks.

    The values are flattened, zero-padded to a multiple of 32 and split into
    32-element blocks. Each block is stored in 34 bytes: a little-endian
    float16 scale ``d = max(|x|) / 127`` followed by the 32 int8 values
    ``round(x / d)``.

    NOTE(review): padding is applied to the flattened tensor, so the packed
    data only matches the declared shape when the element count is a multiple
    of 32 - confirm that this holds for the tensors being converted.

    Args:
        tensor: NumPy array of any shape.

    Returns:
        ``(packed, original_shape)`` where ``packed`` is a flat ``uint8``
        array of ``num_blocks * 34`` bytes and ``original_shape`` is
        ``tensor.shape``.
    """
    original_shape = tensor.shape
    flat = np.asarray(tensor, dtype=np.float32).reshape(-1)

    # Process in 32-element blocks
    num_blocks = (flat.size + 31) // 32
    padded = np.pad(flat, (0, num_blocks * 32 - flat.size))
    blocks = padded.reshape(num_blocks, 32)

    # One scale per block. An all-zero block has scale 0; give it an inverse
    # of 0 so it quantizes to zeros instead of dividing by zero (NaN).
    scales = np.max(np.abs(blocks), axis=1, keepdims=True) / np.float32(127.0)
    inverse = np.divide(1.0, scales, out=np.zeros_like(scales), where=scales != 0)
    quantized = np.round(blocks * inverse).astype(np.int8)

    # Pack as raw bytes: 2 bytes of float16 scale + 32 bytes of int8 values.
    packed = np.empty((num_blocks, 34), dtype=np.uint8)
    packed[:, :2] = scales.astype("<f2").view(np.uint8)
    packed[:, 2:] = quantized.view(np.uint8)
    return packed.reshape(-1), original_shape


def convert_model_to_gguf(model, hparams, outfile, outtype):
    """Write every tensor of *model* to a GGUF file at *outfile*.

    Args:
        model: Dict mapping tensor name to NumPy array.
        hparams: Dict of hyperparameters, passed on to ``GGUFWriter``.
        outfile: Destination path of the GGUF file.
        outtype: ``"Q8_0"`` quantizes every tensor with ``quantize_q8_0``;
            any other value stores the tensors as float32.

    NOTE(review): tensor names are passed through unchanged and every tensor
    (including 1-D ones) gets the same output type - confirm this matches
    what the consuming runtime expects.
    """
    print("\nStarting conversion to GGUF...")
    writer = GGUFWriter(outfile, hparams, outtype)
    total_tensors = len(model)

    progress = tqdm(model.items(), desc="Processing tensors")
    for done, (name, tensor) in enumerate(progress, start=1):
        if outtype == "Q8_0":
            packed, original_shape = quantize_q8_0(tensor)
            # `packed` already is the on-disk byte layout; casting it to
            # another dtype would corrupt the blocks.
            writer.add_tensor(name, packed, original_shape)
        else:
            writer.add_tensor(name, tensor.astype(np.float32), tensor.shape)

        if done % 10 == 0:
            print(f"Processed {done}/{total_tensors} tensors")

    writer.finalize()

# Run the pipeline end to end: load weights and config, then write the GGUF file.
gguf_path = "output_model.gguf"

print("\n=== Conversion Process Starting ===")
real_model = load_hf_model(model_path)
real_hparams = load_hf_hparams(model_path)
convert_model_to_gguf(real_model, real_hparams, gguf_path, "Q8_0")
print("\n=== Conversion Complete ===")

# Smoke test: try to open the result with llama-cpp-python. Any failure
# (missing package, rejected file, ...) is reported instead of raised.
print("\nValidating GGUF file...")
try:
    from llama_cpp import Llama
    llm = Llama(model_path=gguf_path)
    print("Validation successful! Model loaded correctly.")
except Exception as e:
    print(f"Validation failed: {e!s}")

!ls -lh *.gguf