In [None]:
# LOCAL = 1 indicates running this notebook locally, 0 indicates running it on Kaggle
LOCAL = 1

import os
if LOCAL != 1:
  GITHUB_USER = "magnusdtd"
  REPO_NAME = "ENTRep"
  BRANCH_NAME = "prototype-clf"

  from kaggle_secrets import UserSecretsClient
  user_secrets = UserSecretsClient()
  GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

  !git clone --single-branch --branch {BRANCH_NAME} https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git

  os.chdir("/kaggle/working/")
  from ENTRep.utils.kaggle import Kaggle
  kaggle = Kaggle()
else:
  os.chdir("..")
  from utils.local import Local
  local = Local()

<p align="center" style="font-size:2.5em;"><b>ENTRep Prototype Classifier</b></p>
<p align="center" style="font-size:1em;">Made by Dam Tien Dat</p>

In [None]:
!pip install open_clip_torch pytorch_metric_learning

In [None]:
import torch
from prototype_clf.create_artifacts import create_artifacts
from prototype_clf.artifact import  load_artifacts
from prototype_clf.train import *
from prototype_clf.prototype_clf import *
from prototype_clf.ENTRepDataset import ENTRepDataset
from prototype_clf.evaluate import evaluate_model, random_inference
from prototype_clf.make_submission import make_submission
import albumentations as A
from albumentations.pytorch import ToTensorV2
from typing import Tuple
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
exp_name = 'proto_clf_DINOv2s_BioCLIP_SAMViTB'

# The train-split CSV under results/ acts as the cache marker for this experiment:
# load the saved artifacts when it exists, otherwise create them.
artifacts_cached = os.path.exists(f'results/train_df_{exp_name}.csv')
get_artifacts = load_artifacts if artifacts_cached else create_artifacts
train_df, val_df, test_df = get_artifacts(exp_name)

In [5]:
# Datasets over the train/validation frames, built without passing a transform.
train_dataset, val_dataset = (ENTRepDataset(frame) for frame in (train_df, val_df))

def get_transform(image_size: Tuple[int, int] = (480, 640)):
    """Build the Albumentations preprocessing pipeline.

    Args:
        image_size: Target size, unpacked positionally into ``A.Resize``.
            Defaults to ``(480, 640)``.

    Returns:
        An ``A.Compose`` that applies, in order, the resize, a per-channel
        normalization, and ``ToTensorV2``.
    """
    # Mean/std match the commonly used ImageNet channel statistics.
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)

    steps = [
        A.Resize(*image_size),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)

# Batched loaders over transformed datasets. Only the training loader shuffles;
# the validation loader keeps the dataset order (shuffle=False is the default).
train_loader = torch.utils.data.DataLoader(
  dataset=ENTRepDataset(train_df, get_transform()),
  batch_size=32,
  shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
  dataset=ENTRepDataset(val_df, get_transform()),
  batch_size=32,
  shuffle=False,
)

In [None]:
# Seed torch's global RNG before the model is built so that re-running this
# cell starts from the same random state (presumably covering weight
# initialisation and the shuffling of train_loader). Other randomness sources
# (NumPy, Python's `random`, nondeterministic CUDA kernels) are not controlled here.
SEED = 42
torch.manual_seed(SEED)

# Size of one embedding vector, taken from the first training row.
input_dim = len(train_df['embedding'].iloc[0])
projection_dim = 512
# Number of distinct labels in the training split.
num_classes = train_df['Classification'].nunique()

model = ProjectionModel(
    input_dim=input_dim,
    embedder_dims=[input_dim],
    projection_dim=projection_dim,
    use_layernorm=False,
    use_dropout=True,
    dropout_rate=0.1,
    use_attention=False,
    internal_dim=1024,
    extra_layer=False
)

# The classifier's embed_dim is set to the projection model's projection_dim.
classifier = CosineClassifier(embed_dim=projection_dim, num_classes=num_classes)

fitted_model, fitted_classifier = train(
    model, 
    classifier, 
    train_loader, 
    val_loader,
    num_epochs=40,
    patience=7,
    lr=1e-4,
    device=device
)

In [None]:
# Prototype classifier built from the training dataset and the fitted projection model.
prototype_classifier = PrototypeClassifier(train_dataset, fitted_model, device=device)

test_dataset = ENTRepDataset(test_df, split='test')
# Wrap the test frame with the same split='test' setting used for test_dataset
# above; previously the loader's dataset was built without it.
# NOTE(review): assumes ENTRepDataset accepts the transform positionally
# together with the `split` keyword — confirm against its signature.
test_loader = torch.utils.data.DataLoader(
  ENTRepDataset(test_df, get_transform(), split='test'), 
  batch_size=32
)

random_inference(prototype_classifier, test_dataset)

In [None]:
# Run evaluate_model with the fitted model and classifier on the validation
# loader, passing the class_feature_map of the training dataset.
evaluate_model(
  fitted_model, fitted_classifier, val_loader, device,
  class_feature_map=train_dataset.class_feature_map,
)

In [None]:
# Pass exp_name instead of repeating the literal: the string previously written
# here was identical to exp_name, and reusing the variable keeps the two in sync
# if the experiment is renamed.
make_submission(
  prototype_classifier,
  exp_name
)