## LLM Clip

https://github.com/simonw/llm-clip
requires LLM: https://llm.datasette.io/en/stable/

```
$ pip install llm
$ llm install llm-clip
```

Then, assuming you are doing this in an environment (I create mine with conda), find the site packages directory, and the **llm_clip.py** file. `archaeology-images-ai/.venv/lib/python3.11/site-packages/llm_clip.py` is where mine hides.

For convenience, it is useful to be able to switch easily between the models used in testing. For instance, you may have several models developed with different training data and different training parameters. To allow this, we make a small modification to **llm_clip.py**. First, add this line to the top of the file:

```python
import os
```

Then alter the `embed_batch` function so that it checks whether a `LLM_CLIP_MODEL_PATH` environment variable has been specified and, if so, uses that model:

```python
def embed_batch(self, items):
    # Embeds a mix of text strings and binary images
    model_to_use = os.getenv('LLM_CLIP_MODEL_PATH', 'clip-ViT-B-32')
    # Now set the model, either from the env variable or the default clip.
    if self._model is None:
        self._model = SentenceTransformer(model_to_use)
```

This should give some more flexibility in switching between different models to use with llm-clip.

In [1]:
import os
import requests

# The parent of the current working directory is treated as the repository root.
REPO_PATH = os.path.dirname(os.path.abspath(os.getcwd()))

# The supporting files come from
# https://huggingface.co/sentence-transformers/clip-ViT-B-32/tree/main
# They carry no clear license, so they cannot be committed to this repo.
# Instead they are downloaded at run time into a folder that git ignores.
CLIP_FILE_WEB_ROOT = 'https://huggingface.co/sentence-transformers/clip-ViT-B-32/resolve/main'

# (sub-directory or None, filename) pairs; None means the model's root folder.
REQUIRED_CLIP_FILES = [
    (None, 'config_sentence_transformers.json'),
    (None, 'modules.json'),
    *(
        ('0_CLIPModel', filename)
        for filename in (
            'merges.txt',
            'preprocessor_config.json',
            'special_tokens_map.json',
            'tokenizer_config.json',
            'vocab.json',
        )
    ),
]

def get_required_clip_files(use_model_path):
    """Download any missing CLIP support files into the model directory.

    Every (sub_dir, filename) entry of REQUIRED_CLIP_FILES is fetched from
    CLIP_FILE_WEB_ROOT on Hugging Face, unless a file already exists at the
    corresponding location under ``use_model_path``.

    Args:
        use_model_path: Root directory of the model being assembled.

    Raises:
        requests.HTTPError: If Hugging Face answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    for sub_dir, filename in REQUIRED_CLIP_FILES:
        # Files without a sub-directory live directly in the model root.
        path_parts = [sub_dir, filename] if sub_dir else [filename]
        filepath = os.path.join(use_model_path, *path_parts)
        if os.path.exists(filepath):
            # We already have this file, no need to download it again.
            continue
        file_url = '/'.join([CLIP_FILE_WEB_ROOT, *path_parts])
        print(f'Get needed CLIP file from {file_url}')
        r = requests.get(file_url, timeout=60)
        # Fail loudly on 4xx/5xx. Otherwise the error page would be saved as
        # the model file and, because it then exists, never fetched again.
        r.raise_for_status()
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'wb') as fd:
            fd.write(r.content)

def move_results_data_into_use_model(results_path, use_model_path):
    """Relocate fine-tuning outputs into the model's 0_CLIPModel folder.

    Files already present at the destination are left untouched, and files
    missing from ``results_path`` are reported and skipped.
    """
    target_dir = os.path.join(use_model_path, '0_CLIPModel')
    os.makedirs(target_dir, exist_ok=True)
    # model.safetensors is the current preferred weights format and
    # pytorch_model.bin the older one.
    for name in ('model.safetensors', 'pytorch_model.bin', 'config.json'):
        origin = os.path.join(results_path, name)
        destination = os.path.join(target_dir, name)
        if os.path.exists(destination):
            # Already in place from an earlier run.
            continue
        if os.path.exists(origin):
            os.rename(origin, destination)
            print(f'Moved {name} to {destination}')
        else:
            print(f'We do not have the file, cannot move: {origin}')


def prepare_usable_model_from_fine_tuned_results(results_path, use_model_dir='fine_tuned_model'):
    """Assemble a usable model directory from fine-tuning results.

    Creates ``<results_path>/<use_model_dir>`` with a ``0_CLIPModel``
    sub-folder, fetches the supporting CLIP files, moves the fine-tuned
    result files into place, and returns the model directory path.
    """
    use_model_path = os.path.join(results_path, use_model_dir)
    # Create the model root first, then the 0_CLIPModel folder inside it.
    for directory in (use_model_path, os.path.join(use_model_path, '0_CLIPModel')):
        os.makedirs(directory, exist_ok=True)
    # Supporting files (downloaded when missing), then our own results data.
    get_required_clip_files(use_model_path)
    move_results_data_into_use_model(results_path, use_model_path)
    return use_model_path
    

In [2]:
import os

# The parent of the working directory is treated as the repository root.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))
results_path = os.path.join(REPO_PATH, 'results')

# Set model_name (e.g. 'fine_tuned_model_v9') to use a model folder that is
# already under results/; leave it as None to assemble one from the results.
model_name = None
use_model_path = (
    os.path.join(repo_path, 'results', model_name)
    if model_name
    else prepare_usable_model_from_fine_tuned_results(results_path)
)
# The patched llm_clip.py reads this variable to decide which model to load.
os.environ['LLM_CLIP_MODEL_PATH'] = use_model_path

print(f'Using the model here: {os.getenv("LLM_CLIP_MODEL_PATH")}')

train_files, test_files = (
    os.path.join(repo_path, 'files', split) for split in ('training', 'testing')
)


We do not have the file, cannot move: /home/ekansa/github/archaeology-images-ai/results/pytorch_model.bin
Using the model here: /home/ekansa/github/archaeology-images-ai/results/fine_tuned_model


In [3]:
# Delete any prior embeddings from earlier attempts
!llm collections delete photos
# Load the training images into embeddings...
!llm embed-multi photos --files {train_files} '*.jpg' --binary -m clip
# Load the test images into embeddings...
!llm embed-multi photos --files {test_files} '*.jpg' --binary -m clip

[?25lEmbedding  [####################################]  100%          [?25h
[?25lEmbedding  [####################################]  100%          [?25h


In [4]:
# functions to look up images and captions from the results of llm search
import subprocess
import json
import IPython.display as display
import pandas as pd

def setup_uuid_keyed_image_metadata(IMAGE_METADATA, data_path=None):
    """Fill a dict with captions keyed by media uuid, read from a JSON file.

    Args:
        IMAGE_METADATA: Dict to populate; it is updated in place.
        data_path: JSON file with ``media__uuid`` and ``caption`` fields.
            Defaults to ``json_data/artifact_images_w_sentence_captions.json``
            under REPO_PATH.

    Returns:
        The same ``IMAGE_METADATA`` dict, mapping ``str(media__uuid)`` to the
        caption for every record that has a non-empty caption.
    """
    if data_path is None:
        data_path = os.path.join(REPO_PATH, 'json_data', 'artifact_images_w_sentence_captions.json')
    df = pd.read_json(data_path)
    # Missing captions can load as NaN, which is truthy, so a plain
    # `if not caption` check would let them through. Drop them first.
    captioned = df.dropna(subset=['caption'])
    for media_uuid, caption in zip(captioned['media__uuid'], captioned['caption']):
        if not caption:
            # Empty captions carry no information for display.
            continue
        IMAGE_METADATA[str(media_uuid)] = caption
    return IMAGE_METADATA

# Build the uuid -> caption lookup once, starting from an empty dict.
IMAGE_METADATA = setup_uuid_keyed_image_metadata({})

def get_similar_ideas(query):
    """Return entries of the `photos` collection most similar to a text query.

    Runs ``llm similar photos -c <query>`` and parses its standard output,
    which is read as one JSON object per non-blank line.

    Args:
        query: Free-text description to search for.

    Returns:
        A list with one parsed JSON object per result line.

    Raises:
        subprocess.CalledProcessError: If the command exits with an error.
        FileNotFoundError: If the ``llm`` executable is not on the PATH.
        json.JSONDecodeError: If an output line is not valid JSON.
    """
    # Pass an argument list instead of a shell string so that quotes, `$`
    # or backticks in the query reach llm verbatim and are never
    # interpreted by a shell.
    cmd = ['llm', 'similar', 'photos', '-c', query]

    # The output is a bytes object; decode it before splitting into lines.
    output_str = subprocess.check_output(cmd).decode()

    return [json.loads(line) for line in output_str.split('\n') if line.strip()]

def get_similar_images(query):
    """Return entries of the `photos` collection most similar to an image.

    Runs ``llm similar photos -i <query> --binary`` and parses its standard
    output, which is read as one JSON object per non-blank line.

    Args:
        query: Path of the image file to compare against the collection.

    Returns:
        A list with one parsed JSON object per result line.

    Raises:
        subprocess.CalledProcessError: If the command exits with an error.
        FileNotFoundError: If the ``llm`` executable is not on the PATH.
        json.JSONDecodeError: If an output line is not valid JSON.
    """
    # Pass an argument list instead of a shell string so that spaces, quotes
    # or `$` in the path reach llm verbatim and are never interpreted by a
    # shell.
    cmd = ['llm', 'similar', 'photos', '-i', query, '--binary']

    # The output is a bytes object; decode it before splitting into lines.
    output_str = subprocess.check_output(cmd).decode()

    return [json.loads(line) for line in output_str.split('\n') if line.strip()]

def _similar_results_html(results, metadata):
    """Render similarity results as one HTML <div> card per result.

    Args:
        results: Iterable of dicts with ``id`` (image file name) and
            ``score`` keys, as returned by the get_similar_* functions.
        metadata: Mapping of media uuid to caption; the uuid itself is
            shown when no caption is known.

    Returns:
        The concatenated HTML for all result cards.
    """
    import html  # stdlib; imported here so no other cell needs to change

    testing_dir = os.path.join(REPO_PATH, 'files', 'testing')
    training_dir = os.path.join(REPO_PATH, 'files', 'training')
    cards = []
    for result in results:
        file_name = result['id']
        score = result['score']
        testing_path = os.path.join(testing_dir, file_name)
        if os.path.exists(testing_path):
            # Located in our repo, but not under git version control.
            image_path = testing_path
            src = f"../files/testing/{file_name}"
        else:
            # Not a testing image, so it is taken to be from the training
            # set; report that location instead of a non-existent testing path.
            image_path = os.path.join(training_dir, file_name)
            src = f"../files/training/{file_name}"
        media_uuid = file_name.split('.')[0]  # drop the '.jpg' extension
        caption = html.escape(str(metadata.get(media_uuid, media_uuid)))
        url = html.escape(f"https://opencontext.org/media/{media_uuid}")
        src = html.escape(src)
        shown_path = html.escape(image_path)
        cards.append(
            f'<div><img src="{src}" width=25% alt="Image not found"> '
            f'<p><strong>ID:</strong> {shown_path}</p>'
            f'<p><strong>Score:</strong> {score}</p>'
            f'<p><strong>Caption:</strong> {caption}</p>'
            f'<p><a href="{url}">Link to full record</a></p></div>'
        )
    return ''.join(cards)


def display_similar_ideas(query, metadata=IMAGE_METADATA):
    """Display the images most similar to a text query, with captions.

    Args:
        query: Free-text description passed to get_similar_ideas.
        metadata: Mapping of media uuid to caption used for the display.
    """
    similar_images = get_similar_ideas(query)
    display.display(display.HTML(_similar_results_html(similar_images, metadata)))


def display_similar_images(query, metadata=IMAGE_METADATA):
    """Display the images most similar to a query image, with captions.

    When the query path contains ``/files/``, the query image itself is
    shown above the results.

    Args:
        query: Path of the image passed to get_similar_images.
        metadata: Mapping of media uuid to caption used for the display.
    """
    import html  # stdlib; imported here so no other cell needs to change

    similar_images = get_similar_images(query)
    parts = []
    if '/files/' in query:
        # Re-root the absolute path so it resolves relative to the notebook.
        q_file = query.split('/files/')[-1]
        q_src = html.escape(f"../files/{q_file}")
        parts.append(
            f'<div>Search for images similar to:<br/><img src="{q_src}" width=25% alt="Image not found" />'
            '<p>----------------------------------</p>'
        )
        parts.append('<p><strong>Results Below</strong>:</p></div>')
    parts.append(_similar_results_html(similar_images, metadata))
    display.display(display.HTML(''.join(parts)))

In [5]:
# Text search: show the `photos` entries most similar to this description.
query = "A clay tablet with incisions"
display_similar_ideas(query)

In [19]:
# Text search for an animal motif (a cat's face).
query = "The face of a cat on an ancient artifact"
display_similar_ideas(query)

In [13]:
# Text search combining a scene (horse racing) with a material (terracotta).
query = "An exciting horse racing scene in terracotta"
display_similar_ideas(query)

In [18]:
# Text search with a short, generic description.
query = "A human profile"
display_similar_ideas(query)

In [8]:
# Text search for any face depicted on an artifact.
query = "A face on an ancient artifact"
display_similar_ideas(query)

In [16]:
# Text search for a culturally specific subject (a Greek goddess).
query = "A Greek goddess on an ancient artifact"
display_similar_ideas(query)

In [9]:
# Text search for a mythical animal motif.
query = "A mythical animal on an ancient artifact"
display_similar_ideas(query)

In [10]:
# NOTE(review): presumably an out-of-domain control query for an
# archaeological image collection; confirm the intent.
query = "Something used by an astronaut"
display_similar_ideas(query)

# Assessment

I'm not really impressed with the results of the "idea" queries. Maybe we over-fit the data when training?

In [12]:
# Image-to-image search using one of the testing images as the query.
# The path is built from test_files so that the notebook is not tied to one
# machine's checkout location.
query = os.path.join(test_files, "0ace44d4-52f2-4ced-777f-563585c32172.jpg")
display_similar_images(query)

In [15]:
# Image-to-image search using one of the testing images as the query.
# The path is built from test_files so that the notebook is not tied to one
# machine's checkout location.
query = os.path.join(test_files, "0ad70498-fc77-47ea-acc4-c31e5c5b5791.jpg")
display_similar_images(query)

In [17]:
# Image-to-image search using one of the testing images as the query.
# The path is built from test_files so that the notebook is not tied to one
# machine's checkout location.
query = os.path.join(test_files, "1a62ff87-d9f4-47dd-aa55-c1fbb5d3f344.jpg")
display_similar_images(query)