# Goal: Download and convert the weights of LlaVA into MLX, and test the forward pass of this model on example data

In [2]:
import shutil
from pathlib import Path
import os


In [6]:
mlx_path = Path('mlx_model')

if not os.path.exists(mlx_path):
    os.makedirs(mlx_path)


In [7]:
import mlx.core as mx
from convert import get_model_path, fetch_from_hub, hf_repo


model_path = get_model_path(hf_repo)
model_config, model_weights, model_weight_files, config, tokenizer = fetch_from_hub(model_path)

Fetching 12 files: 100%|██████████| 12/12 [00:00<00:00, 207126.12it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
from utils import map_weights, should_keep_weight


print("[INFO] Converting")
mlx_weights = dict(map_weights(k, v) for (k, v) in model_weights.items())
mlx_weights = {k: v for (k, v) in mlx_weights.items() if should_keep_weight(k)}
print("[INFO] Saving")
mx.savez(str(mlx_path / "weights.npz"), **mlx_weights)
for fn in ["config.json", "merges.txt", "vocab.json", "preprocessor_config.json"]:
    if fn in os.listdir(model_path):
        shutil.copyfile(
            str(model_path / f"{fn}"),
            str(mlx_path / f"{fn}"),
        )


[INFO] Converting
[INFO] Saving


In [11]:
from llava import LlaVAConfig, LLMConfig, VisionConfig, ProjectionConfig, LlavaModel

llava_mlx_config = LlaVAConfig(
    llm_config=LLMConfig(
        dim=4096,
        n_layers=32,
        head_dim=4096,
        hidden_dim=11008,
        norm_eps=1e-5,
        n_heads=32, # TODO: should be 32 https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json#L14. But only works with 1. Please see llama file for how heads are split. Is this wrong?
        n_kv_heads=32, # TODO: should be 32 https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json#L16
        vocab_size=32064,
        rope_theta=0,
        rope_traditional=False
    ),
    vision_config=VisionConfig(
        num_hidden_layers=24,
        hidden_size=1024,
        intermediate_size=4096,
        num_attention_heads=16,
        num_channels=3,
        image_size=336,
        patch_size=14
    ),
    projection_config=ProjectionConfig(
        in_features=1024,
        out_features=4096
    )
)


model = LlavaModel(llava_mlx_config)



In [12]:
model.load_weights('mlx_model/weights.npz')


ValueError: Expected shape (131072, 4096) but received  shape (4096, 4096) for parameter language_model.layers.0.attention.wq.weight

In [None]:
# TODO: load images, and test generate 