In [None]:
!unzip training.zip

In [None]:
#!pip install transformers
#!pip install torch
!pip install torchvision

In [None]:
# Adapted, with modifications, from
# https://github.com/shashnkvats/Indofashionclip/blob/main/indofashion_clip.py


In [1]:
# Notebook inputs: the JSON file listing filenames and captions, and the
# image directory name.
json_path, image_path = 'simple.json', 'training'

## The main event

The code below should produce a model that is compatible with the Hugging Face Transformers architecture, which should make it easier to integrate with llm-clip.

In [8]:
from PIL import Image
import os
import tqdm
from tqdm import tqdm
import json
import torch
from torch.utils.data import DataLoader
from transformers import CLIPModel, CLIPProcessor
from torchvision.transforms import ToTensor, Compose, Resize
import re
import string


In [None]:
## try two

import os

# Switch tokenizer parallelism off through its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Enable the cuDNN benchmark flag.
torch.backends.cudnn.benchmark = True

# Read the records (filename + captions) from the JSON file.
with open(json_path, 'r') as json_file:
    input_data = json.load(json_file)

# Run on the GPU when CUDA is available, otherwise on the CPU.
# (Alternative that was left disabled here: device = torch.device('mps').)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained CLIP model and its processor from Hugging Face.
# (A multi-GPU variant that wraps the model in torch.nn.DataParallel was left
# disabled here.)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Project-local dataset class; it is instantiated further down with the image
# paths and captions.
from my_datasets import image_title_dataset

def clean_caption(raw_caption, max_caption_char_limit=300):
    """Normalise a raw caption into a short, punctuation-free string.

    Args:
        raw_caption: Caption value taken from a JSON record. Non-string
            values are converted with ``str()``; falsy values (``None``,
            ``''``, an empty list) produce an empty caption.
        max_caption_char_limit: The returned caption is always strictly
            shorter than this many characters.

    Returns:
        The caption's words joined by single spaces. Every ASCII punctuation
        character and every run of whitespace acts as a word separator.
        A word that would push the caption to the limit is left out; later
        words are still appended when they fit.
    """
    if not raw_caption:
        return ''
    # Just to be sure we're dealing with a string.
    text = str(raw_caption)
    # string.punctuation contains "\", "]", "^" and "-", all of which are
    # special inside a character class. Without re.escape the "\]" pair is
    # read as an escaped "]" and backslashes are silently left in the caption.
    text = re.sub(r'[' + re.escape(string.punctuation) + r']+', ' ', text)
    ok_words = []
    used_chars = 0  # length of ' '.join(ok_words), tracked incrementally
    # split() without arguments treats newlines, tabs and carriage returns as
    # separators too, and never yields empty strings.
    for word in text.split():
        # +1 accounts for the joining space once there is a previous word.
        candidate_len = used_chars + len(word) + (1 if ok_words else 0)
        if candidate_len < max_caption_char_limit:
            ok_words.append(word)
            used_chars = candidate_len
    # The loop guarantees the result is already below the limit.
    return ' '.join(ok_words)

# Build two parallel lists so that every image path has exactly one caption.
list_image_path = []
list_txt = []
for item in input_data:
    if 'filename' not in item or 'captions' not in item:
        # A record missing either field cannot form an (image, text) pair.
        continue
    # Keep only the last path component of the recorded filename and resolve
    # it against the configured image directory instead of a hard-coded name.
    img_path = os.path.join(image_path, item['filename'].split('/')[-1])
    # Clean up the text string of the caption(s).
    caption = clean_caption(item['captions'])
    list_image_path.append(img_path)
    list_txt.append(caption)

dataset = image_title_dataset(list_image_path, list_txt)
# Shuffled batches of 250 pairs, loaded by two worker processes.
train_dataloader = DataLoader(dataset, batch_size=250, shuffle=True, num_workers=2)

# Gradient accumulation: the optimizer steps once every `accumulation_steps`
# batches. 1 means "step after every batch" (no accumulation). This name was
# previously only defined in a comment, which made the end-of-epoch check
# below fail with a NameError.
accumulation_steps = 1

# Hyper-parameters are kept exactly as in this experiment.
# NOTE(review): lr=1e-3 is large for fine-tuning a pretrained model, and
# torch.optim.Adam applies weight_decay as an L2 term added to the gradient
# (torch.optim.AdamW is the decoupled variant) -- worth revisiting if the loss
# keeps plateauing.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9,0.98), eps=1e-6, weight_decay=0.2)

# Symmetric contrastive objective: one cross-entropy over the image->text
# logits and one over the text->image logits.
loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()

num_epochs = 10
# from_pretrained() returns the model in evaluation mode; enable training mode.
model.train()
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total = len(train_dataloader))
    optimizer.zero_grad()
    pending = 0  # batches back-propagated since the last optimizer step
    for batch in pbar:
        if batch is None:
            continue
        images,texts = batch
        inputs = processor(texts, images=images, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text

        # The i-th image matches the i-th text, so the target class is the
        # row index.
        ground_truth = torch.arange(len(images),dtype=torch.long,device=device)
        loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Scale the gradient so that accumulated batches average, not sum.
        (loss / accumulation_steps).backward()
        pending += 1

        # Optimizer step and zero the gradients every accumulation_steps.
        if pending == accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

        # The bar shows the unscaled loss of the current batch.
        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {loss.item():.4f}")

    # Apply gradients left over from an incomplete accumulation group.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    # One checkpoint per epoch (previously the model was rewritten to disk
    # after every single batch).
    model.save_pretrained(f'./model_after_epoch_{epoch}')


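This is the output I got with `optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-6)`, i.e. with weight decay turned off. Note that each progress bar shows only the loss of the last batch of that epoch. The loss settles near 4.544 ≈ ln(94), which is the value this cross-entropy takes when all logits are equal for a batch of 94 pairs (the likely size of the final batch, given 21 batches of 250 here and 102 batches of 50 in the later run), so the model has probably collapsed to a constant output rather than converged.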

Loss: 4.8167: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [14:48<00:00, 42.29s/it]
Epoch 1/10, Loss: 4.5752: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [14:18<00:00, 40.86s/it]
Epoch 2/10, Loss: 4.8087: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [14:15<00:00, 40.75s/it]
Epoch 3/10, Loss: 4.6696: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [15:17<00:00, 43.71s/it]
Epoch 4/10, Loss: 4.5540: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [15:16<00:00, 43.65s/it]
Epoch 5/10, Loss: 4.5737: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [15:11<00:00, 43.41s/it]
Epoch 6/10, Loss: 4.5502: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [15:00<00:00, 42.89s/it]
Epoch 7/10, Loss: 4.5466: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [14:54<00:00, 42.58s/it]
Epoch 8/10, Loss: 4.5443: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [15:29<00:00, 44.25s/it]
Epoch 9/10, Loss: 4.5446:

In [17]:
## try three: same training run, now with a learning-rate scheduler.
## Suggested by GPT-4 ("Learning Rate Adjustment"): experiment with a scheduler
## that adjusts the learning rate over the epochs, which could help the model
## learn better and not get stuck giving the same results; Adam often works
## best with learning-rate decay.

import torch.optim.lr_scheduler as lr_scheduler

import os

# Switch tokenizer parallelism off through its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Enable the cuDNN benchmark flag.
torch.backends.cudnn.benchmark = True

# Read the records (filename + captions) from the JSON file.
with open(json_path, 'r') as json_file:
    input_data = json.load(json_file)

# Run on the GPU when CUDA is available, otherwise on the CPU.
# (Alternative that was left disabled here: device = torch.device('mps').)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fresh pretrained CLIP model and its processor from Hugging Face.
# (A multi-GPU variant that wraps the model in torch.nn.DataParallel was left
# disabled here.)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Project-local dataset class; it is instantiated further down with the image
# paths and captions.
from my_datasets import image_title_dataset

def clean_caption(raw_caption, max_caption_char_limit=300):
    """Normalise a raw caption into a short, punctuation-free string.

    Args:
        raw_caption: Caption value taken from a JSON record. Non-string
            values are converted with ``str()``; falsy values (``None``,
            ``''``, an empty list) produce an empty caption.
        max_caption_char_limit: The returned caption is always strictly
            shorter than this many characters.

    Returns:
        The caption's words joined by single spaces. Every ASCII punctuation
        character and every run of whitespace acts as a word separator.
        A word that would push the caption to the limit is left out; later
        words are still appended when they fit.
    """
    if not raw_caption:
        return ''
    # Just to be sure we're dealing with a string.
    text = str(raw_caption)
    # string.punctuation contains "\", "]", "^" and "-", all of which are
    # special inside a character class. Without re.escape the "\]" pair is
    # read as an escaped "]" and backslashes are silently left in the caption.
    text = re.sub(r'[' + re.escape(string.punctuation) + r']+', ' ', text)
    ok_words = []
    used_chars = 0  # length of ' '.join(ok_words), tracked incrementally
    # split() without arguments treats newlines, tabs and carriage returns as
    # separators too, and never yields empty strings.
    for word in text.split():
        # +1 accounts for the joining space once there is a previous word.
        candidate_len = used_chars + len(word) + (1 if ok_words else 0)
        if candidate_len < max_caption_char_limit:
            ok_words.append(word)
            used_chars = candidate_len
    # The loop guarantees the result is already below the limit.
    return ' '.join(ok_words)

# Build two parallel lists so that every image path has exactly one caption.
list_image_path = []
list_txt = []
for item in input_data:
    if 'filename' not in item or 'captions' not in item:
        # A record missing either field cannot form an (image, text) pair.
        continue
    # Keep only the last path component of the recorded filename and resolve
    # it against the configured image directory instead of a hard-coded name.
    img_path = os.path.join(image_path, item['filename'].split('/')[-1])
    # Clean up the text string of the caption(s).
    caption = clean_caption(item['captions'])
    list_image_path.append(img_path)
    list_txt.append(caption)

dataset = image_title_dataset(list_image_path, list_txt)
# Shuffled batches of 50 pairs, loaded by two worker processes.
train_dataloader = DataLoader(dataset, batch_size=50, shuffle=True, num_workers=2)

# No gradient accumulation in this run: the optimizer steps after every batch.

# Lower learning rate than the previous attempt.
# NOTE(review): torch.optim.Adam applies weight_decay as an L2 term added to
# the gradient (torch.optim.AdamW is the decoupled variant) -- worth trying if
# the loss keeps plateauing.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.1)

# StepLR multiplies the learning rate by `gamma` every `step_size` calls to
# scheduler.step(); it is stepped once per epoch below.
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

# Symmetric contrastive objective: one cross-entropy over the image->text
# logits and one over the text->image logits.
loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()

num_epochs = 3
# from_pretrained() returns the model in evaluation mode; enable training mode.
model.train()
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total = len(train_dataloader))
    for batch in pbar:
        if batch is None:
            continue
        images,texts = batch
        inputs = processor(texts, images=images, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text

        # The i-th image matches the i-th text, so the target class is the
        # row index.
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Start from clean gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {loss.item():.4f}")

    # Advance the schedule once per epoch. Without this call the learning rate
    # never decays and the scheduler has no effect.
    scheduler.step()

    # One checkpoint per epoch (previously the model was rewritten to disk
    # after every single batch).
    model.save_pretrained(f'./model_after_epoch_{epoch}')


Epoch 0/3, Loss: 3.7844: 100%|████████████████████████████████████████████████████████████████████| 102/102 [10:00<00:00,  5.88s/it]
Epoch 1/3, Loss: 3.7843: 100%|████████████████████████████████████████████████████████████████████| 102/102 [09:56<00:00,  5.85s/it]
Epoch 2/3, Loss: 3.7851: 100%|████████████████████████████████████████████████████████████████████| 102/102 [09:40<00:00,  5.69s/it]
