# Current, Best Approach to Fine-Tuning CLIP

This notebook holds the best, most current approach to fine-tuning the CLIP model with data from Open Context and other archaeological sources.


In [1]:
import concurrent.futures
import json
import os
import shutil
import threading
from io import BytesIO
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.request import urlretrieve

import pandas as pd
import requests
from PIL import Image
from sklearn.model_selection import train_test_split


In [2]:
def _replace_with_temp(final_path, write_func):
    """Fill a sibling temp file via write_func(file_obj), then rename it onto final_path.

    The temp file is deleted when write_func raises, so final_path only ever
    shows up on disk with complete content.
    """
    # Process and thread ids keep concurrent workers from sharing a temp file.
    tmp_path = f'{final_path}.{os.getpid()}.{threading.get_ident()}.part'
    try:
        with open(tmp_path, 'wb') as tmp_file:
            write_func(tmp_file)
        os.replace(tmp_path, final_path)
    finally:
        # After a successful replace the temp path no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def download_image_convert_to_jpg(uri, folder, uuid, caption, compression_quality=50, timeout=30):
    """Downloads an image, makes sure it is saved as a jpeg

    Args:
        uri: URL of the image to fetch.
        folder: Existing directory where the image is stored as ``<uuid>.jpg``.
        uuid: Identifier used as the file name (converted with ``str``).
        caption: Caption text; line breaks are replaced with spaces.
        compression_quality: JPEG quality used when the image is re-encoded.
        timeout: Seconds to wait on the network before giving up on a request.

    Returns:
        ``{"image_path": ..., "caption": ...}`` when the file is on disk
        (already present or freshly saved), otherwise ``None``. Failures are
        printed, never raised, so one bad URL cannot abort a batch run.
    """
    uuid = str(uuid)
    # Remove line breaks from the captions.
    caption = caption.replace('\n', ' ')
    new_image_path = os.path.join(folder, f'{uuid}.jpg')
    result = {"image_path": new_image_path, "caption": caption}
    if os.path.exists(new_image_path):
        # We already have this so skip
        return result
    _, ext_from_url = os.path.splitext(urlparse(uri).path)
    if isinstance(ext_from_url, bytes):
        ext_from_url = ext_from_url.decode("utf-8")
    ext_from_url = ext_from_url.lower().replace('.', '')

    if ext_from_url in ('jpg', 'jpeg'):
        # The URL already names a JPEG: store the bytes unchanged.
        def copy_remote_bytes(tmp_file):
            with urlopen(uri, timeout=timeout) as remote:
                shutil.copyfileobj(remote, tmp_file)
                expected = remote.headers.get('Content-Length')
            # Treat a truncated transfer as a failure instead of keeping it.
            if expected is not None and tmp_file.tell() < int(expected):
                raise URLError(
                    f"retrieval incomplete: got only {tmp_file.tell()} out of {expected} bytes"
                )

        try:
            _replace_with_temp(new_image_path, copy_remote_bytes)
        except Exception as error:
            print(f"Download error for URL {uri}", end='\r')
            print(error, end='\r')
            return None
        return result

    # Not a jpg (by URL extension): fetch it, check the type, re-encode as JPEG.
    try:
        response = requests.get(uri, timeout=timeout)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        if not content_type.startswith('image/'):
            print(f"Skipping {uri} - Not an image", end='\r')
            return None
        # Drop any parameters such as "; charset=..." before comparing.
        subtype = content_type.split('/')[1].split(';')[0].strip().lower()

        def write_jpeg(tmp_file):
            with Image.open(BytesIO(response.content)) as img:
                # JPEG has no alpha or palette support; convert such modes first.
                if img.mode not in ('L', 'RGB', 'CMYK'):
                    img = img.convert('RGB')
                img.save(tmp_file, 'JPEG', quality=compression_quality)

        _replace_with_temp(new_image_path, write_jpeg)
        if subtype in ('jpg', 'jpeg'):
            print(f"Downloaded and saved {uri} as JPG: {new_image_path}", end='\r')
        else:
            print(f"Converted and saved {uri} as JPG: {new_image_path}", end='\r')
    except Exception as e:
        print(f"Failed to download {uri}: {str(e)}", end='\r')
        return None
    return result


def download_and_rename(row, folder):
    """Fetch the image referenced by one data row into ``folder``.

    The target folder is created when missing. Rows whose
    ``image_file__uri`` is empty or not a string are skipped (``None`` is
    returned); otherwise the result of ``download_image_convert_to_jpg`` is
    returned, with the row's ``media__uuid`` as the file name and its
    ``caption`` as the caption.
    """
    os.makedirs(folder, exist_ok=True)
    image_uri = row['image_file__uri']
    # Guard clause: nothing to download without a non-empty string URI.
    if not image_uri or not isinstance(image_uri, str):
        return None
    return download_image_convert_to_jpg(
        image_uri,
        folder,
        row['media__uuid'],
        row['caption'],
    )

# Writing to 'jsonl' files
def write_to_jsonl(new_data, jsonl_file):
    """Write the truthy items of ``new_data`` to ``jsonl_file``, one JSON document per line.

    Falsy entries (for example ``None`` from a failed download) are left out.
    An existing file is overwritten.
    """
    with open(jsonl_file, 'w') as out:
        out.writelines(json.dumps(record) + "\n" for record in new_data if record)

In [3]:
# The repo root is taken to be the parent of the current working directory.
repo_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Captioned image records, loaded from the repo's json_data folder.
data_path = os.path.join(repo_path, 'json_data', 'artifact_images_w_sentence_captions.json')
df = pd.read_json(data_path)

# Change these as desired: 92.5% of the rows go to training, the rest to testing.
CAPTIONED_IMAGE_COUNT = df.shape[0]
TRAIN_SIZE = round(CAPTIONED_IMAGE_COUNT * 0.925)
TEST_SIZE = CAPTIONED_IMAGE_COUNT - TRAIN_SIZE

print(f'We have {CAPTIONED_IMAGE_COUNT} captioned images, and will allocate {TRAIN_SIZE} for training and {TEST_SIZE} for testing')

We have 49200 captioned images, and will allocate 45510 for training and 3690 for testing


In [4]:

train_data_file = os.path.join(repo_path, 'files', 'train.json')
test_data_file = os.path.join(repo_path, 'files', 'test.json')

# Build the split and fetch the images unless both data files are already on disk.
if not (os.path.exists(train_data_file) and os.path.exists(test_data_file)):
    # Fixed seeds keep the split the same from run to run.
    train_df, rem_df = train_test_split(df, train_size=TRAIN_SIZE, random_state=42)
    test_df = rem_df.sample(TEST_SIZE, random_state=42)

    # One-element lists; each is repeated once per row to pair a folder with every row.
    train_files = [os.path.join(repo_path, 'files', 'training')]
    test_files = [os.path.join(repo_path, 'files', 'testing')]

    # Download concurrently; results come back in row order, with None for skipped rows.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        train_data = list(executor.map(
            download_and_rename,
            (row for _, row in train_df.iterrows()),
            len(train_df) * train_files,
        ))
        test_data = list(executor.map(
            download_and_rename,
            (row for _, row in test_df.iterrows()),
            len(test_df) * test_files,
        ))

    # write_to_jsonl leaves out the None entries.
    write_to_jsonl(train_data, train_data_file)
    write_to_jsonl(test_data, test_data_file)

Now that we have the training and testing data files and the image files, let's train the CLIP model.

In [5]:
!pip install torchvision datasets Pillow
!pip install -q git+https://github.com/huggingface/transformers
!pip install accelerate -U

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.36.0.dev0 requires tokenizers<0.15,>=0.14, but you have tokenizers 0.15.0 which is incompatible.[0m[31m


In [6]:
# Sanity check: reload the training file written above and show its first record.
from datasets import load_dataset

dataset = load_dataset("json", data_files=train_data_file)
first_example = dataset['train'][0]
print(f"first image: {first_example['image_path']}, caption: '{first_example['caption']}'")

first image: /home/ekansa/github/archaeology-images-ai/files/training/0f0655bc-ad08-4c9a-9dae-d3273b7f0a22.jpg, caption: 'Image of an archaeological artifact found at Tell en-Nasbeh, in Palestinian Authority. This example of lithics, mainly consists of chert flint (rock). Condition: Good; Category Type: Lithic; Material: Flint; Subcatagory: Lithic - Tool; Completeness: Fragment; Manufacture: Handmade'


In [7]:
!git clone https://github.com/damian0815/finetune-clip-huggingface.git

fatal: destination path 'finetune-clip-huggingface' already exists and is not an empty directory.


In [8]:
# --- Model checkpoint -------------------------------------------------------
# Original note: "this was the clip version for stable diffusion 1.5".
# NOTE(review): Stable Diffusion 1.x is usually documented as using the larger
# CLIP ViT-L/14 text encoder - confirm before relying on that statement.
repo_id = "openai/clip-vit-base-patch32"
# Alternative (the clip version for stable diffusion 2.0 onwards), left disabled:
# using it requires more memory than I have available, and more than what's
# available on free tier google colab too.
# repo_id = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"

# --- Output -----------------------------------------------------------------
result_output_folder = os.path.join(repo_path, 'results')

# --- Training hyperparameters -----------------------------------------------
num_train_epochs = 25
batch_size = 64
# Original note: "probably should be 100".
# NOTE(review): CLIP text encoders normally accept at most 77 tokens - confirm
# the model supports a longer sequence before raising this.
max_token_seq_length = 77
# Kept as a string; it is only interpolated into the training command line.
# Values noted from earlier runs: "1e-4" and, before that, "5e-5".
learning_rate = '2e-5'
weight_decay = 0.2
warmup_steps = 2

In [9]:
# Summarise the run configuration, then print the fine-tuning command so it can
# be pasted into a separate terminal.
print(f"Finetune {repo_id} for {num_train_epochs} epochs with batch size {batch_size}, and then saving output to {result_output_folder}.")
print("")
print("# Now copy and paste the following into another terminal within the current_production directory")
print("# Make sure you have the appropriate virutal environment activated!")
print("")
# The string below is a regular (non-raw) triple-quoted f-string, so every
# backslash at the end of a line is a line continuation: the command is printed
# as one long line, without backslashes or line breaks between the options.
# Paths and settings are interpolated from the configuration variables defined
# in earlier cells.
print(f"""python -W ignore finetune-clip-huggingface/huggingface_finetune_clip.py \
    --output_dir {result_output_folder} \
    --model_name_or_path {repo_id} \
    --train_file {train_data_file} \
    --validation_file {test_data_file} \
    --image_column="image_path" \
    --overwrite_output_dir=True \
    --max_seq_length={max_token_seq_length} \
    --num_train_epochs={num_train_epochs} \
    --caption_column="caption" \
    --overwrite_cache=True \
    --remove_unused_columns=False \
    --do_train=True \
    --per_device_train_batch_size={batch_size} \
    --per_device_eval_batch_size={batch_size} \
    --learning_rate="{learning_rate}" --warmup_steps="{warmup_steps}" --weight_decay {weight_decay}
""")

Finetune openai/clip-vit-base-patch32 for 25 epochs with batch size 64, and then saving output to /home/ekansa/github/archaeology-images-ai/results.

# Now copy and paste the following into another terminal within the current_production directory
# Make sure you have the appropriate virutal environment activated!

python -W ignore finetune-clip-huggingface/huggingface_finetune_clip.py     --output_dir /home/ekansa/github/archaeology-images-ai/results     --model_name_or_path openai/clip-vit-base-patch32     --train_file /home/ekansa/github/archaeology-images-ai/files/train.json     --validation_file /home/ekansa/github/archaeology-images-ai/files/test.json     --image_column="image_path"     --overwrite_output_dir=True     --max_seq_length=77     --num_train_epochs=25     --caption_column="caption"     --overwrite_cache=True     --remove_unused_columns=False     --do_train=True     --per_device_train_batch_size=64     --per_device_eval_batch_size=64     --learning_rate="2e-5" --warmup_

In [10]:
if False:
    # Disable this running in Jupyter. Too many updates to the Web client usually break things.
    # To make sure this actually works, I ran the fine tuning command in a terminal.
    !python -W ignore finetune-clip-huggingface/huggingface_finetune_clip.py \
        --output_dir {result_output_folder} \
        --model_name_or_path {repo_id} \
        --train_file {train_data_file} \
        --validation_file {test_data_file} \
        --image_column image \
        --overwrite_output_dir=True \
        --max_seq_length={max_token_seq_length} \
        --num_train_epochs={num_train_epochs} \
        --caption_column caption \
        --overwrite_cache=True \
        --remove_unused_columns=False \
        --do_train \
        --per_device_train_batch_size={batch_size} \
        --per_device_eval_batch_size={batch_size} \
        --learning_rate="{learning_rate}" --warmup_steps="{warmup_steps}" --weight_decay {weight_decay}
    print("--\nDONE")
    print(f"If it worked, trained data should be in {result_output_folder}")