In [None]:
# Environment bootstrap.
# LOCAL = 1 indicates running this notebook locally, 0 indicates running it on Kaggle
LOCAL = 0

import os
import subprocess

if LOCAL != 1:
  GITHUB_USER = "magnusdtd"
  REPO_NAME = "ENTRep"
  BRANCH_NAME = "BioCLIP"

  # The GitHub token is read from Kaggle's secret store, never hard-coded.
  from kaggle_secrets import UserSecretsClient
  user_secrets = UserSecretsClient()
  GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

  # Change directory *before* cloning so the checkout always lands in the
  # directory from which `ENTRep` is imported below.
  os.chdir("/kaggle/working/")

  # Skip the clone when the checkout is already present so the cell can be re-run.
  if not os.path.isdir(REPO_NAME):
    clone_url = f"https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git"
    # Argument list instead of a shell string: no shell ever parses the token.
    clone_result = subprocess.run(
      ["git", "clone", "--single-branch", "--branch", BRANCH_NAME, clone_url]
    )
    if clone_result.returncode != 0:
      # The command line is deliberately left out of the message: it embeds the token.
      raise RuntimeError(
        f"git clone of {GITHUB_USER}/{REPO_NAME} (branch {BRANCH_NAME}) "
        f"failed with exit status {clone_result.returncode}"
      )

  from ENTRep.utils.file import File
  File.make_train_path()
else:
  # Local runs: step up one directory from where the kernel was started.
  os.chdir("..")

# Remember the project root; later cells change directory and need an absolute anchor.
current_path = os.getcwd()
print("Current path:", current_path)

<p align="center" style="font-size:2.5em;"><b>ENTRep Text-to-Image Retrieval</b></p>
<p align="center" style="font-size:2em;">BioCLIP</p>
<p align="center" style="font-size:1em;">Made by Dam Tien Dat</p>

# Setup

In [None]:
from BioCLIP.data_preparation import DataPreparation
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Build the training table and write it to Dataset/data.csv for the training command.
data_preparation = DataPreparation()

df = data_preparation.preprocess_data()
df = data_preparation.detect_and_translate(df)
data_preparation.validate_dataframe(df)

# Make image paths absolute against the project root recorded by the setup cell
# (`current_path`) instead of a hard-coded Kaggle directory, so the CSV is also
# valid when the notebook runs with LOCAL = 1. On Kaggle the result is unchanged.
df['Path'] = df['Path'].apply(lambda x: os.path.join(current_path, x))

# index=False drops the index column entirely. The previous index_label=False
# still wrote the index values but without a header field, leaving the header
# row one field shorter than every data row.
df.to_csv('Dataset/data.csv', index=False)
df.head()

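Clone the BioCLIP weights (`pure_bioclip`) and the `open_clip` training code, switch the working directory to `open_clip/src`, create the `logs` directory, and install the training requirements.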

In [None]:
import subprocess
import sys

# Fetch the pretrained BioCLIP repository and the open_clip sources once.
# check=True makes a failed clone stop the notebook here instead of surfacing
# later as a missing file.
if not os.path.exists('pure_bioclip'):
  subprocess.run(
    ['git', 'clone', 'https://huggingface.co/imageomics/bioclip', 'pure_bioclip'],
    check=True,
  )
if not os.path.exists('open_clip'):
  subprocess.run(
    ['git', 'clone', 'https://github.com/mlfoundations/open_clip.git'],
    check=True,
  )
if not os.path.exists('open_clip/src'):
  raise FileNotFoundError("The 'open_clip/src' directory does not exist after cloning.")

# The training module is launched with `-m` from inside open_clip/src.
os.chdir('open_clip/src')
# exist_ok=True already covers the "directory is there" case; no separate check needed.
os.makedirs('./logs', exist_ok=True)

# Run pip through the kernel's own interpreter so the packages are installed
# into the environment this notebook actually uses.
pip_result = subprocess.run(
  [sys.executable, '-m', 'pip', 'install', '-r', './../requirements-training.txt']
)
if pip_result.returncode != 0:
  # Best effort, as before: report the failure but let the notebook continue.
  print(
    f"WARNING: installing requirements-training.txt exited with status {pip_result.returncode}",
    file=sys.stderr,
  )

# Training

In [None]:
# `!export ...` runs in a throw-away subshell, so the variable never reached the
# kernel or later `!` commands. Setting it on the kernel process makes every
# child process started from this notebook inherit it.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [None]:
!torchrun --nproc_per_node 2 -m open_clip_train.main \
    --batch-size 32 \
    --precision amp \
    --workers 4 \
    --save-frequency 3 \
    --dataset-type csv \
    --csv-separator="," \
    --train-data "./../../Dataset/data.csv" \
    --csv-img-key Path \
    --csv-caption-key DescriptionEN \
    --warmup 1000 \
    --lr=5e-6 \
    --wd=0.1 \
    --epochs=5 \
    --model "hf-hub:imageomics/bioclip" \
    --pretrained "./../../pure_bioclip/open_clip_pytorch_model.bin"

In [None]:
!python -m open_clip.push_to_hf_hub \
  --model convnext_large_d_320 \
  --pretrained logs//checkpoints/epoch_5.pt \
  --repo-id magnusdtd/bio-clip-ft

In [None]:
# Return to the project root recorded by the setup cell. An absolute target
# keeps this cell idempotent: the previous relative './../..' climbed two more
# levels each time the cell was re-run.
os.chdir(current_path)