In [None]:
# Environment bootstrap: pick the working directory (and, on Kaggle, fetch the
# project sources) before any project module is imported.
# LOCAL = 1 indicates running this notebook locally, 0 indicates running it on Kaggle
LOCAL = 1

import os
if LOCAL != 1:
  # Repository coordinates used to build the clone URL below.
  GITHUB_USER = "magnusdtd"
  REPO_NAME = "ENTRep"
  BRANCH_NAME = "BioCLIP"

  # Read the GitHub token from Kaggle's secret store instead of hardcoding it.
  from kaggle_secrets import UserSecretsClient
  user_secrets = UserSecretsClient()
  GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

  # Clone only BRANCH_NAME into the current directory; IPython expands the {VAR} placeholders.
  # NOTE(review): the token is embedded in the URL — confirm it cannot leak into saved cell output.
  !git clone --single-branch --branch {BRANCH_NAME} https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git

  # The clone above ran in whatever directory was current at that point;
  # presumably that is already /kaggle/working/ — TODO confirm.
  os.chdir("/kaggle/working/")

  # Imported only after the clone because the module comes from the cloned repository.
  from ENTRep.utils.file import File
  File.make_train_path()
else:
  # Move up one directory (presumably to the repo root — confirm).
  # NOTE: this is relative, so re-running the cell moves up one more level each time.
  os.chdir("..")

# Show the resulting working directory so path problems are visible early.
current_path = os.getcwd()
print("Current path:", current_path)

<p align="center" style="font-size:2.5em;"><b>ENTRep Text-to-Image Retrieval</b></p>
<p align="center" style="font-size:2em;">CLIP</p>
<p align="center" style="font-size:1em;">Made by Dam Tien Dat</p>

In [None]:
from CLIP.data_preparation import DataPreparation
from CLIP.ImageTextRetrievalEvaluator import ImageTextRetrievalEvaluator
from utils.unfreeze_layer import unfreeze_model_layers
from PIL import Image
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TripletEvaluator
from CLIP.CLIP import CLIP

# Transform data

In [None]:
data_preparation = DataPreparation()

# Preprocess the raw data, then detect and translate Vietnamese text.
df = data_preparation.detect_and_translate(data_preparation.preprocess_data())

# Embed the English descriptions and report the embedding shape.
descriptions_en = df['DescriptionEN'].to_list()
job_embeddings = data_preparation.generate_embeddings(descriptions_en)
print(job_embeddings.shape)

# compute_negative_pairs returns values used as positional row indices: row i
# receives the description found at position negative_pair_indices[i].
negative_pair_indices = data_preparation.compute_negative_pairs(job_embeddings)
negative_descriptions = df['DescriptionEN'].iloc[negative_pair_indices].values
df['DescriptionEN_neg'] = negative_descriptions

# Run the project's dataframe validation before the frame is split.
data_preparation.validate_dataframe(df)

print(df.head())

In [None]:
# Split fractions for the train / valid / test partitions (80% / 10% / 10%).
split_fractions = {"train_frac": 0.8, "valid_frac": 0.1, "test_frac": 0.1}

# The result is indexed by split name ("train", "valid", "test") in later cells.
dataset = data_preparation.train_test_split(df, **split_fractions)

# Fine-tune CLIP

In [None]:
# Identifier of the pre-trained checkpoint to start from.
model_name = "sentence-transformers/clip-ViT-L-14"

# Load the checkpoint through the sentence-transformers wrapper.
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
# Print every parameter name so the layer names passed to the unfreeze helper
# can be checked against the model's actual naming.
for param_name, _ in model.named_parameters():
    print(f"name = {param_name}")

In [None]:
# Layer names handed to the project's unfreeze helper: the visual and text projections.
layers_to_unfreeze = ['0.model.visual_projection', '0.model.text_projection']
unfreeze_model_layers(model, layers_to_unfreeze)

Count total and trainable parameters

In [None]:
# Collect (element count, requires_grad) once per parameter, then derive both totals.
param_stats = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_stats)
trainable_params = sum(count for count, needs_grad in param_stats if needs_grad)

# Share of parameters that will receive gradient updates.
trainable_share = 100 * trainable_params / total_params

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Percentage of trainable parameters: {trainable_share:.2f}%")

Build training triplets: image anchor, positive description, and negative description

In [None]:
def preprocess(batch):
  """
    Turn a batch of rows into (anchor, positive, negative) triplet columns.

    batch: mapping with "Path", "DescriptionEN" and "DescriptionEN_neg" entries.
    Returns a dict where "anchor" holds one opened PIL image per entry of
    batch["Path"], and "positive" / "negative" are the two description
    columns passed through unchanged.
  """
  anchors = []
  for image_path in batch["Path"]:
    # Image.open is given each path in order; the images are returned still open.
    anchors.append(Image.open(image_path))

  triplets = {}
  triplets["anchor"] = anchors
  triplets["positive"] = batch["DescriptionEN"]
  triplets["negative"] = batch["DescriptionEN_neg"]
  return triplets

In [None]:
# Only the triplet columns produced by preprocess are kept for training.
training_columns = ('anchor', 'positive', 'negative')
columns_to_remove = [
  col for col in dataset['train'].column_names
  if col not in training_columns
]

# Apply preprocess batch-wise and drop every other column.
dataset = dataset.map(
  preprocess,
  batched=True,
  remove_columns=columns_to_remove,
)
dataset

Evaluate pre-trained model

In [None]:
def create_triplet_evaluator(set_name):
  """
    Build a TripletEvaluator for one split of the global `dataset`.

    set_name: split key, e.g. "train", "valid", or "test".
    Returns a TripletEvaluator fed with the split's "anchor", "positive" and
    "negative" columns and named after the split.
  """
  split_key = f"{set_name}"
  anchors = dataset[split_key]["anchor"]
  positives = dataset[split_key]["positive"]
  negatives = dataset[split_key]["negative"]
  return TripletEvaluator(
    anchors=anchors,
    positives=positives,
    negatives=negatives,
    name=split_key,
  )

In [None]:
# Triplet evaluators for the train and valid splits; they are reused after fine-tuning.
evaluator_train = create_triplet_evaluator("train")
evaluator_valid = create_triplet_evaluator("valid")

# Baseline triplet metrics, taken before the fine-tuning cell runs.
for split_label, split_evaluator in (("Train:", evaluator_train), ("Valid:", evaluator_valid)):
  print(split_label, split_evaluator(model))

In [None]:
def create_recall_evaluator(set_name, k=1):
    """
        Build an ImageTextRetrievalEvaluator for one split of the global `dataset`.

        set_name: split key, e.g. "train", "valid", or "test".
        k: forwarded unchanged to the evaluator's `k` argument (default 1).
        Returns an evaluator fed with the split's "anchor" column as images and
        its "positive" column as texts, named after the split.
    """
    split_key = f"{set_name}"
    images = dataset[split_key]["anchor"]
    texts = dataset[split_key]["positive"]
    return ImageTextRetrievalEvaluator(images=images, texts=texts, name=split_key, k=k)

In [None]:
# Retrieval evaluators with k=1 for the train and valid splits; they are also
# passed to the fine-tuning call and reused afterwards.
evaluator_recall_train = create_recall_evaluator("train", k=1)
evaluator_recall_valid = create_recall_evaluator("valid", k=1)

# Baseline retrieval metrics, taken before the fine-tuning cell runs.
train_recall = evaluator_recall_train(model)
print("Train:", train_recall)
valid_recall = evaluator_recall_valid(model)
print("Valid:", valid_recall)

In [None]:
# Wrap the loaded model in the project's CLIP helper.
# NOTE(review): 'finetuned_CLIP' is presumably an output/run name — confirm against CLIP.CLIP.
clip = CLIP(model, 'finetuned_CLIP')
# Fine-tune on the prepared dataset, passing the train and valid retrieval evaluators.
clip.fine_tune(dataset, evaluator_recall_train, evaluator_recall_valid)

Evaluate fine-tuned model

In [None]:
# Triplet evaluator for the held-out test split, used only in this cell.
evaluator_test = create_triplet_evaluator("test")

# Triplet metrics for all three splits, computed after the fine-tuning cell.
print("Train:", evaluator_train(model))
print("Valid:", evaluator_valid(model))
# The test line must use evaluator_test; using evaluator_valid here would
# report the validation metric under the "Test:" label.
print("Test:", evaluator_test(model))

In [None]:
# Retrieval evaluator for the held-out test split, with the same k=1 as train and valid.
evaluator_recall_test = create_recall_evaluator("test", k=1)

# Retrieval metrics for all three splits, computed after the fine-tuning cell.
recall_evaluators = {
  "Train:": evaluator_recall_train,
  "Valid:": evaluator_recall_valid,
  "Test:": evaluator_recall_test,
}
for split_label, recall_evaluator in recall_evaluators.items():
  print(split_label, recall_evaluator(model))