In [None]:
!unzip training.zip

In [6]:
#!pip install transformers
#!pip install torch
!pip install torchvision

Collecting torchvision
  Downloading torchvision-0.15.2-cp310-cp310-macosx_11_0_arm64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torchvision
Successfully installed torchvision-0.15.2


In [None]:
#from https://github.com/shashnkvats/Indofashionclip/blob/main/indofashion_clip.py
#with modifications


In [1]:
# the json file with filenames and captions
json_path = 'simple.json'
image_path = 'training'

## The main event

The code below should create something that is compatible with huggingface transformers architecture, which means it should be easier to integrate with llm-clip

In [2]:
from PIL import Image
import os
import tqdm
from tqdm import tqdm
import json
import torch
from torch.utils.data import DataLoader
from transformers import CLIPModel, CLIPProcessor
from torchvision.transforms import ToTensor, Compose, Resize


In [5]:
## try two
## try two

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Setting cudnn benchmark
torch.backends.cudnn.benchmark = True

with open(json_path, 'r') as f:
    input_data = json.load(f)

# Load CLIP model and processor from Hugging Face

# Setting device on GPU if available, else CPU
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Loading model
#single_gpu_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
#model = torch.nn.DataParallel(single_gpu_model).to(device)   # Make model parallel
#processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Loading model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define a custom dataset

from my_datasets import image_title_dataset

# Make sure each image path has one text
list_image_path = []
list_txt = []
for item in input_data:
  if 'filename' in item and 'captions' in item:
    img_path = os.path.join('training', item['filename'].split('/')[-1])
    caption = item['captions'][:40]
    # appending path to image then the corresponding caption
    list_image_path.append(img_path)
    list_txt.append(caption)

dataset = image_title_dataset(list_image_path, list_txt)
train_dataloader = DataLoader(dataset, batch_size=500, shuffle=True, num_workers=2) #Define your own dataloader

# Gradient accumulation steps
accumulation_steps = 2

# Adjust learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, betas=(0.9,0.98), eps=1e-6, weight_decay=0.2)

loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total = len(train_dataloader))
    for i, batch in enumerate(pbar):
        if batch is None:
            continue
        images,texts = batch
        inputs = processor(texts, images=images, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text

        ground_truth = torch.arange(len(images),dtype=torch.long,device=device)
        loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Normalize loss
        loss = loss / accumulation_steps

        loss.backward()

        # Optimizer step and zero the gradients every accumulation_steps
        if (i+1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {loss.item():.4f}")
        model.save_pretrained(f'./model_after_epoch_{epoch}')
        #torch.save(model.module.state_dict(), f'./model_after_epoch_{epoch}')
    # If the number of batches is not exactly divisible by accumulation_steps,
    # make sure to still zero the gradients after finishing an epoch
    if len(train_dataloader) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()


Epoch 0/10, Loss: 3.2121: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:05<00:00, 61.70s/it]
Epoch 1/10, Loss: 3.3904: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:07<00:00, 62.47s/it]
Epoch 2/10, Loss: 3.8025: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:04<00:00, 61.35s/it]
Epoch 3/10, Loss: 3.0458: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:08<00:00, 62.89s/it]
Epoch 4/10, Loss: 3.0375: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:01<00:00, 60.53s/it]
Epoch 5/10, Loss: 3.0325: 100%|█████████████████████████████████████████████████████████████████████

In [None]:
## this bit was so we could use the data parallel method
## but it never really worked.
## so ignore.
## and now, we do a little sleight of hand to get the resulting model
## reshaped for how we want to use it
## this is because the dataParallel method can't use save_pretrained.

# change X below:
# Load weights into the model for other tasks
base_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
base_model.load_state_dict(torch.load('model_after_epoch_X'))

base_model.save_pretrained(f'./archae_ai')

In [None]:
!zip test.zip -r archae_ai/

updating: model_after_epoch_0/ (stored 0%)
  adding: model_after_epoch_0/pytorch_model.bin (deflated 7%)
  adding: model_after_epoch_0/config.json (deflated 46%)
