# Current, Best Approach to Fine-Tuning CLIP

This notebook will keep the best, most current approach to fine-tuning the CLIP model with data from Open Context and other archaeological sources.


In [1]:
import json
import os
import pandas as pd
import requests

from PIL import Image
from io import BytesIO

from urllib.parse import urlparse
from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError
from sklearn.model_selection import train_test_split
import concurrent.futures


In [2]:
def download_image_convert_to_jpg(uri, folder, uuid, caption):
    """Downloads an image, makes sure it is saved as a jpeg"""
    uuid = str(uuid)
    new_image_path = os.path.join(folder, f'{uuid}.jpg')
    if os.path.exists(new_image_path):
        # We already have this so skip
        return {"image": new_image_path, "caption": caption}
    parse_object = urlparse(uri)
    _, ext_from_url = os.path.splitext(parse_object.path)
    if isinstance(ext_from_url, bytes):
        ext_from_url = ext_from_url.decode("utf-8") 
    ext_from_url = ext_from_url.lower().replace('.', '')
    if ext_from_url in ['jpg', 'jpeg']:
        try:
            urlretrieve(uri, new_image_path)
            return {"image": new_image_path, "caption": caption}
        except (HTTPError, URLError) as error:
            print(f"Download error for URL {uri}")
            print(error)
            return None
    # Not a jpg
    download_ok = None
    try:
        response = requests.get(uri)
        response.raise_for_status()
        # Check the file type (extension) and convert to JPG if needed
        content_type = response.headers['Content-Type']
        if content_type.startswith('image/'):
            extension = content_type.split('/')[1]
            if extension.lower() not in ('jpg', 'jpeg'):
                img = Image.open(BytesIO(response.content))
                img.save(new_image_path, 'JPEG')
                print(f"Converted and saved {uri} as JPG: {new_image_path}")
                download_ok = True
            else:
                with open(new_image_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded and saved {uri} as JPG: {new_image_path}")
                download_ok = True
        else:
            print(f"Skipping {uri} - Not an image")
    except Exception as e:
        print(f"Failed to download {uri}: {str(e)}")
        download_ok = False
    if not download_ok:
        return None
    return {"image": new_image_path, "caption": caption}


def download_and_rename(row, folder):
    """Downloads an image file and saves with the media item UUID as the filename"""
    os.makedirs(folder, exist_ok=True)
    uri = row['image_file__uri']
    # Check if uri exists and is a string
    if uri and isinstance(uri, str):
        uuid = row['media__uuid']
        caption = row['caption']
        return download_image_convert_to_jpg(uri, folder, uuid, caption)
    else:
        return None

# Writing to 'jsonl' files
def write_to_jsonl(new_data, jsonl_file):
    """Makes JSONL file with new_data"""
    with open(jsonl_file, 'w') as file:
        for json_dict in new_data:
            line = json.dumps(json_dict)
            file.write(line + "\n")

In [3]:
# Get the root_path for this jupyter notebook repo.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))

data_path = os.path.join(repo_path, 'json_data', 'artifact_images_w_sentence_captions.json')
df = pd.read_json(data_path)

# Change these as desired
TRAIN_SIZE = 50000
TEST_SIZE = 5000

train_data_file = os.path.join(repo_path, 'files', 'train.json')
test_data_file = os.path.join(repo_path, 'files', 'test.json')

# If we don't have a train_data_file or a test data file, go out and make them!
if not os.path.exists(train_data_file) or not os.path.exists(test_data_file):
    # Separate out a training dataframe (train_df), a test dataframe (test_df)
    train_df, rem_df = train_test_split(df, train_size=TRAIN_SIZE, random_state=42)
    test_df = rem_df.sample(TEST_SIZE, random_state=42)
    
    train_files = [os.path.join(repo_path, 'files', 'training'),]
    test_files = [os.path.join(repo_path, 'files', 'testing'),]
    
    train_data_file = os.path.join(repo_path, 'files', 'train.json')
    test_data_file = os.path.join(repo_path, 'files', 'test.json')
    
    # Process train and test data
    with concurrent.futures.ThreadPoolExecutor() as executor:
        train_data = list(executor.map(download_and_rename, [row for _, row in train_df.iterrows()], train_files*len(train_df)))
        test_data = list(executor.map(download_and_rename, [row for _, row in test_df.iterrows()], test_files*len(test_df)))
    
    write_to_jsonl(train_data, train_data_file)
    write_to_jsonl(test_data, test_data_file)

Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/20050166TOP.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/20000134FRONT.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/19850026BACK.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/20020071PROFILE.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/19840154FRONT.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/20030130PROFILE.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.org/static/opencontext/poggio-civitate/preview/photos/20020051FRONT.jpg
HTTP Error 404: Not Found
Download error for URL https://artiraq.o

Now that we have the training and testing data files and the image files, let's train the CLIP model

In [4]:
!pip install torchvision datasets Pillow
!pip install -q git+https://github.com/huggingface/transformers
!pip install accelerate -U



In [5]:
# test loading it back in
from datasets import load_dataset
dataset = load_dataset("json", data_files=train_data_file)
print(f"first image: {dataset['train'][0]['image']}, caption: '{dataset['train'][0]['caption']}'")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

first image: /home/ekansa/github/archaeology-images-ai/files/training/e7957760-0610-47b3-eec7-067fff541198.jpg, caption: 'An image of an archaeological artifact found at Kenan Tepe, a place in Turkey, within the Asia world region. Additional descriptions for the artifact include: Artist: BU 
 Interior Color Munsell Number: 2.5 YR 6/8 
 Exterior Color Munsell Number: 2.5 YR 4/6 
 Exterior Color Munsell Name: Red 
 Interior Color Munsell Name: Light Red 
 Core Color Munsell Number: 5 YR 6/6 
 Core Color Munsell Name: Reddish Yellow 
 Temper description: no visible temper 
 Painted Decoration Munsell Number: 2.5 YR 4/6 
 Painted Decoration Munsell Name: Red'


In [6]:
!git clone https://github.com/damian0815/finetune-clip-huggingface.git

fatal: destination path 'finetune-clip-huggingface' already exists and is not an empty directory.


In [7]:
repo_id =  "openai/clip-vit-base-patch32" # this was the clip version for stable diffusion 1.5
#repo_id = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" # this was the clip version for stable diffusion 2.0 onwards
# however, using it requires more memory than I have available. More than what's available free tier google colab too.

result_output_folder = os.path.join(repo_path, 'results')

batch_size = 100
num_train_epochs = 8
max_token_seq_length = 77 # probably should be 100

In [8]:
print(f"Finetuning {repo_id} for {num_train_epochs} epochs with batch size {batch_size}, and then saving output to {result_output_folder}.")
!python -W ignore finetune-clip-huggingface/huggingface_finetune_clip.py \
    --output_dir {result_output_folder} \
    --model_name_or_path {repo_id} \
    --train_file {train_data_file} \
    --image_column image \
    --overwrite_output_dir=True \
    --max_seq_length={max_token_seq_length} \
    --num_train_epochs={num_train_epochs} \
    --caption_column caption \
    --remove_unused_columns=False \
    --do_train \
    --per_device_train_batch_size={batch_size} \
    --learning_rate="5e-5" --warmup_steps="2" --weight_decay 0.2
print("--\nDONE")
print(f"If it worked, trained data should be in {result_output_folder}")

Finetuning openai/clip-vit-base-patch32 for 8 epochs with batch size 100, and then saving output to /home/ekansa/github/archaeology-images-ai/results.
Filter: 100%|███████████████████| 50000/50000 [00:04<00:00, 10265.89 examples/s]
Running tokenizer on train dataset: 100%|█| 49809/49809 [00:04<00:00, 11789.74 e
{'loss': 2.8888, 'learning_rate': 4.37593984962406e-05, 'epoch': 1.0}           
{'loss': 1.8511, 'learning_rate': 3.74937343358396e-05, 'epoch': 2.0}           
{'loss': 1.4831, 'learning_rate': 3.12280701754386e-05, 'epoch': 3.01}          
{'loss': 1.1824, 'learning_rate': 2.4962406015037596e-05, 'epoch': 4.01}        
{'loss': 0.9426, 'learning_rate': 1.869674185463659e-05, 'epoch': 5.01}         
{'loss': 0.7859, 'learning_rate': 1.2431077694235589e-05, 'epoch': 6.01}        
{'loss': 0.677, 'learning_rate': 6.165413533834587e-06, 'epoch': 7.01}          
{'train_runtime': 50431.2308, 'train_samples_per_second': 7.901, 'train_steps_per_second': 0.079, 'train_loss': 1.304582