# Setup

First, we import all the tools used throughout this notebook.

In [1]:
%matplotlib inline

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from matplotlib import pyplot as plt
from timeit import default_timer as timer
import numpy as np
import pandas as pd
import os
import json

Next, we download and build the project dependencies. Note that [git-lfs](https://git-lfs.com/) must be installed in order to clone the repositories containing the models.

In [3]:
# llama.cpp
! git clone https://github.com/ggerganov/llama.cpp.git
! cd llama.cpp && make

fatal: destination path 'llama.cpp' already exists and is not an empty directory.
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DG

In [None]:
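# TODO: download the MS COCO files automatically. For now they must be put in place
# manually: the caption annotations at coco/captions_val2014.json and the images
# under coco/images/.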

In [None]:
# Requirements for llama.cpp Python utilities.
CONDA = True
if CONDA:
    ! conda install --yes -c conda-forge sentencepiece==0.2.0 transformers==4.43.3 gguf==0.6.0 protobuf==4.25.3
else:
    ! pip install -r llama.cpp/examples/llava/requirements.txt

In [15]:
# LLaVA 1.5 7B
! git lfs clone https://huggingface.co/liuhaotian/llava-v1.5-7b
! git lfs clone https://huggingface.co/openai/clip-vit-large-patch14-336
! cd llama.cpp && python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
! cd llama.cpp && python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
! cd llama.cpp && python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown

          with new flags from `git clone`

`git clone` has been updated in upstream Git to have comparable
speeds to `git lfs clone`.
fatal: destination path 'llava-v1.5-7b' already exists and is not an empty directory.
Error(s) during clone:
`git clone` failed: exit status 128
          with new flags from `git clone`

`git clone` has been updated in upstream Git to have comparable
speeds to `git lfs clone`.
fatal: destination path 'clip-vit-large-patch14-336' already exists and is not an empty directory.
Error(s) during clone:
`git clone` failed: exit status 128
Done!
Now you can convert ../llava-v1.5-7b to a regular LLaMA GGUF file.
Also, use ../llava-v1.5-7b/llava.projector to prepare a llava-encoder.gguf file.
Projector tensors added

skipping parameter: logit_scale
skipping parameter: text_model.embeddings.token_embedding.weight
skipping parameter: text_model.embeddings.position_embedding.weight
skipping parameter: text_model.encoder.layers.0.self_attn.k_proj.weight
skipping para

In [None]:
# LLaVa 1.6 7B
! git lfs clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
! python llama.cpp/examples/llava/llava_surgery_v2.py -C -m llava-v1.6-vicuna-7b/
! mkdir vit
! cp llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
! cp llava-v1.6-vicuna-7b/llava.projector vit/
! curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
! python llama.cpp/examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
! python llama.cpp/examples/convert_legacy_llama.py llava-v1.6-vicuna-7b/ --skip-unknown

# Experiments

Next, different models are evaluated at different quantization levels and with different quantization methods. Note that llama.cpp does not support quantizing the CLIP image encoder, so only quantizations of the language model are considered.

In [12]:
def experiment(model, mmproj, seed=0, n_images=20, temp=0.1, prompt='"Suggest a short caption for this image."'):
    # ========================================================
    # Set up validation dataset
    # ========================================================
    coco = COCO('coco/captions_val2014.json')
    rng = np.random.default_rng(seed=seed)
    ids = rng.choice(coco.getImgIds(), size=n_images)
    imgs = coco.loadImgs(ids=ids)
    anns = coco.loadAnns(coco.getAnnIds(imgIds=ids))
    
    # ========================================================
    # Run and time individual image prompts
    # ========================================================
    data = []
    for img in imgs:
        path = 'coco/images/' + img['file_name']
        if not os.path.isfile(path):
            ! curl {img['coco_url']} >> {path}
        t = timer()
        result = ! llama.cpp/llama-llava-cli -m {model} --mmproj {mmproj} --image {path} --temp {temp} -s {seed} -p {prompt}
        t = timer() - t
        caption = result[-7].strip()
        data += [{'image_id': img['id'], 'caption': caption, 'time': t}]
        print(img['file_name'] + ' | ' + caption)

    # ========================================================
    # Evaluate responses
    # ========================================================
    with open('coco/results.json', 'w') as f:
        json.dump(data, f, indent=4)
    cocoRes = coco.loadRes('coco/results.json')
    cocoEval = COCOEvalCap(coco, cocoRes)
    # sorted(...) mitigates inconsistent behavior of individual metrics
    cocoEval.params['image_id'] = sorted(cocoRes.getImgIds())
    cocoEval.evaluate()
    
    # ========================================================
    # Structure results
    # ========================================================
    df = pd.DataFrame(cocoEval.evalImgs).set_index('image_id')
    df['SPICE All'] = [i['All']['pr'] for i in df['SPICE']]
    df['SPICE Obj'] = [i['Object']['pr'] for i in df['SPICE']]
    df['SPICE Rel'] = [i['Relation']['pr'] for i in df['SPICE']]
    df = df.drop('SPICE', axis=1)
    df = df.join(pd.DataFrame(data).set_index('image_id'))
    return df

In [13]:
# Quick run: LLaVA 1.5 7B at Q2_K quantization with the f16 projector, on two images.
df = experiment(
    'llava-v1.5-7b/llava-v1.5-7B-Q2_K.gguf',
    'llava-v1.5-7b/mmproj-model-f16.gguf',
    n_images=2,
)

loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
COCO_val2014_000000500473.jpg | A grocery store with a large produce section.
COCO_val2014_000000150365.jpg | A giraffe stands in a field of tall grass.
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 112 tokens at 2869.85 tokens per second.
PTBTokenizer tokenized 20 tokens at 469.65 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 17, 'reflen': 17, 'guess': [17, 15, 13, 11], 'correct': [14, 10, 5, 1]}
ratio: 0.9999999999411765
Bleu_1: 0.824
Bleu_2: 0.741
Bleu_3: 0.595
Bleu_4: 0.372
computing METEOR score...
METEOR: 0.275
computing Rouge score...
ROUGE_L: 0.552
computing CIDEr score...
CIDEr: 1.249
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.3 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.4

SPICE evaluation took: 4.934 s
SPICE: 0.180


In [14]:
# Show the per-image results returned by experiment().
df # If using Spyder, consider using the variable viewer for color-coded magnitudes

Unnamed: 0_level_0,Bleu_1,Bleu_2,Bleu_3,Bleu_4,METEOR,ROUGE_L,CIDEr,SPICE All,SPICE Obj,SPICE Rel,caption,time
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
150365,0.666667,0.57735,0.456671,6.3e-05,0.31595,0.521368,0.951239,0.333333,0.666667,0.0,A giraffe stands in a field of tall grass.,29.202313
500473,1.0,0.92582,0.753947,0.541082,0.245304,0.582061,1.546057,0.666667,1.0,0.0,A grocery store with a large produce section.,28.959107
