https://github.com/elsevierlabs-os/clip-image-search/tree/main

In [None]:
!pip install ftfy pyperclip spacy torch torchvision transformers

In [None]:
import os
import pandas as pd
import requests
import time
from requests.exceptions import Timeout

# Location of the source CSV. Kept in one place so it is easy to repoint
# when the notebook is run on another machine.
CSV_PATH = '/Users/shawngraham/Documents/code-experiments/llm-commandline/archaeology-images-ai/csv_data/artifact_images_w_descriptions.csv'

# Sampling fractions for each split. These are deliberately tiny for now,
# just to get the flow right; raise them for a real run.
TRAIN_FRAC = 0.001        # 0.1% of all rows that have an image
TEST_FRAC = 0.001         # 0.1% of the rows left after the training sample
VALIDATION_FRAC = 0.0005  # 0.05% of the rows left after training and test
SPLIT_SEED = 42           # fixed seed so the splits are reproducible

# Load the csv file. low_memory=False makes pandas read the file in a single
# pass and infer one dtype per column, instead of inferring dtypes chunk by
# chunk (which can yield mixed-type columns and a DtypeWarning).
data = pd.read_csv(CSV_PATH, low_memory=False)

# Filter the dataset to content with image_file__uri
image_data = data[data['image_file__uri'].notna()]

# Reset index before split to ensure unique indices
image_data = image_data.reset_index(drop=True)

# Draw three disjoint random samples: each sample is removed from the pool
# (by index) before the next one is drawn.
train_data = image_data.sample(frac=TRAIN_FRAC, random_state=SPLIT_SEED)
remaining_data = image_data.drop(train_data.index)
test_data = remaining_data.sample(frac=TEST_FRAC, random_state=SPLIT_SEED)
remaining_data = remaining_data.drop(test_data.index)
validation_data = remaining_data.sample(frac=VALIDATION_FRAC, random_state=SPLIT_SEED)
# Create metadata
def create_metadata(data):
    """Add a 'caption' column built from the descriptive columns of `data`.

    Each source column has its missing values replaced by an empty string and
    is converted to text; the pieces are then joined with ', ' in a fixed
    order. An empty piece still contributes its separator.

    The frame is modified in place and the same object is returned.
    """
    caption_columns = [
        'item__earliest',
        'item__latest',
        'context___1',
        'context___2',
        'context___3',
        'Consists of (Label) [https://erlangen-crm.org/current/P45_consists_of]',
        'project_specific_descriptions',
    ]

    # Normalise every column to a string Series first ...
    pieces = [data[column].fillna('').astype(str) for column in caption_columns]

    # ... then glue them together left to right.
    caption = pieces[0]
    for piece in pieces[1:]:
        caption = caption + ', ' + piece

    data['caption'] = caption
    return data

# Attach the 'caption' column to each of the three splits.
train_data, test_data, validation_data = [
    create_metadata(split)
    for split in (train_data, test_data, validation_data)
]

# Make sure the output folder for every split exists (no error if it already does).
for image_dir in ('ourimages/test/octest',
                  'ourimages/training/octraining',
                  'ourimages/validation/ocvalidation'):
    os.makedirs(image_dir, exist_ok=True)

# Pair each split with the folder its images are saved into.
datasets = [
    (train_data, 'ourimages/training/octraining/'),
    (test_data, 'ourimages/test/octest/'),
    (validation_data, 'ourimages/validation/ocvalidation/'),
]

# URLs whose download failed, collected for later inspection.
url_errors = []


# Download images and save into respective folders
for split_df, folder in datasets:
    # Initialize 'image' column. It stays empty for any row whose download
    # fails, so those rows can be left out of the caption files below.
    split_df['image'] = ""
    for index, row in split_df.iterrows():
        url = row['image_file__uri']
        # File extension = text after the last '.' in the URL.
        extension = url.split('.')[-1]
        # File stem = last path segment of the media URI.
        media_uuid = row['media__uri'].split('/')[-1]
        file_name = f'{media_uuid}.{extension}'
        file_path = os.path.join(folder, file_name)

        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # RequestException already covers Timeout (it is a subclass),
            # as well as connection errors and HTTP error statuses.
            print(f'An error occurred while fetching: {url}')
            url_errors.append(url)
            continue

        with open(file_path, 'wb') as img_file:
            img_file.write(response.content)

        # Record the file name only once the image is actually on disk, so
        # the caption files never reference an image that was not downloaded.
        split_df.loc[index, 'image'] = file_name


def downloaded_captions(split_df):
    """Return the 'image' and 'caption' columns for rows whose image was downloaded."""
    return split_df.loc[split_df['image'] != "", ['image', 'caption']]


# Save captions. The test split is written tab-separated to a .txt file,
# unlike the other two splits.
# NOTE(review): presumably the format the downstream script expects - confirm.
downloaded_captions(train_data).to_csv('ourimages/training/octraining-Captions.csv', index=False)
downloaded_captions(test_data).to_csv('ourimages/test/octest-captions.txt', sep = "\t", index=False)
downloaded_captions(validation_data).to_csv('ourimages/validation/ocvalidation-Captions.csv', index=False)

  data = pd.read_csv('/Users/shawngraham/Documents/code-experiments/llm-commandline/archaeology-images-ai/csv_data/artifact_images_w_descriptions.csv')


An error occurred while fetching: https://iiif.archivelab.org/iiif/opencontext-22-c-3-3068-6-1-p-5jpg/full/675,/0/default.jpg


In [None]:
# Run vectorize-images.py in a shell, passing it the single argument "-baseline".
# NOTE(review): the script is not part of this notebook; confirm what
# "-baseline" selects against the script's argument parser.
!python vectorize-images.py -baseline

In [None]:
# Run train.py in a shell, passing it the single argument "-train_configs/run1.cfg".
# NOTE(review): the argument is a path with a leading '-' and no flag name;
# confirm against train.py's argument parser that this is the intended form.
!python train.py -train_configs/run1.cfg