In [1]:
import requests
import os
import logging
import gdown
import random

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchinfo import summary
from torchvision.datasets import VOCSegmentation
import torchmetrics
import torchvision
import albumentations as A

import re
import string
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import cv2
from PIL import Image
from tqdm import tqdm
import torchvision.transforms as T
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from scipy.io import loadmat
from sklearn.manifold import TSNE
from torchmetrics.classification import MulticlassF1Score, JaccardIndex, MulticlassPrecision, MulticlassRecall, MulticlassAveragePrecision
import pandas as pd
from torchinfo import torchinfo

from transformers import AutoTokenizer, AutoModel, RobertaTokenizer, CLIPModel, CLIPTokenizer, CLIPProcessor
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [2]:
# Runtime configuration: request blocking CUDA launches (debugging aid) and
# silence every log record below ERROR.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
logging.basicConfig(level=logging.ERROR)

# Select the compute device used by the rest of the notebook:
# the first CUDA GPU when one is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3090


In [3]:
# Root folder of the ODIR-5K dataset. The CSV and image paths defined further
# down are built by plain string concatenation, so keep the trailing slash.
ROOT_DIR = '../Datasets/ocular-disease-recognition-odir5k/'

## ALIGN

In [4]:
# Mini-batch size passed to every DataLoader created in this notebook.
BATCH_SIZE = 48

In [5]:
# Locations of the annotation tables and of the image folder, all relative to
# ROOT_DIR (which already ends with a path separator).
CSV_PATH = f'{ROOT_DIR}dataset_single_eye.csv'
TEST_CSV = f'{ROOT_DIR}TESTING_dataset_single_eye.csv'
IMG_PATH = f'{ROOT_DIR}preprocessed_images/'

In [6]:
# Sanity check: decode one image from IMG_PATH and display its tensor shape.
torchvision.io.read_image(IMG_PATH + '0_left.jpg').shape

torch.Size([3, 512, 512])

In [7]:
# Read the train/validation pool and the test table from their CSV files.
train_val_df, test_df = (pd.read_csv(csv_file) for csv_file in (CSV_PATH, TEST_CSV))

In [8]:
def preprocess_text(df:pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``Keywords`` column has been normalised.

    Each keyword string is lower-cased, stripped of every character in
    ``string.punctuation`` and then has its whitespace collapsed to single
    spaces (leading/trailing whitespace removed).

    The input frame is left untouched so the cell that calls this function
    can be re-run safely and any earlier view of the raw data stays valid.

    Args:
        df: frame with a string-valued ``Keywords`` column.

    Returns:
        A new DataFrame with the cleaned ``Keywords`` column; all other
        columns are copied unchanged.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    cleaned = df.copy()
    # Strip punctuation first and collapse whitespace last: removing a
    # punctuation mark that sat between two spaces would otherwise leave a
    # double space behind.
    cleaned['Keywords'] = (
        cleaned['Keywords']
        .str.lower()
        .apply(lambda text: " ".join(punctuation_re.sub('', text).split()))
    )
    return cleaned
# Run the same keyword clean-up over both tables and rebind the names to the results.
train_val_df, test_df = map(preprocess_text, (train_val_df, test_df))

In [9]:
# Largest number of whitespace-separated words found in any keyword string.
np.max([len(keywords.split()) for keywords in train_val_df['Keywords']])

10

In [10]:
# Hold out 15% of the pool for validation. The seed is fixed so that a
# Restart & Run All reproduces exactly the same partition (and therefore
# comparable losses/metrics between runs).
SPLIT_SEED = 42
train_df, val_df = train_test_split(train_val_df, test_size = 0.15, random_state = SPLIT_SEED)
len(train_df), len(val_df)

(4877, 861)

In [11]:
# Spatial size every image is resized to before being fed to the image encoder.
IMG_SIZE = (224, 224)

# Per-channel mean / std used for normalisation (the widely used ImageNet values).
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

# Tensor-level pipeline: resize (with antialiasing) first, then normalise.
_resize_step = torchvision.transforms.Resize(IMG_SIZE, antialias = True)
_normalize_step = torchvision.transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD)
rescale_transform = torchvision.transforms.Compose([_resize_step, _normalize_step])

In [12]:
# Pretrained 'roberta-base' tokenizer; used as the default `tokenizer`
# argument of ODIRDatasetMM.
processor = RobertaTokenizer.from_pretrained('roberta-base')

In [13]:
class ODIRDatasetMM(Dataset) :
    """In-memory multimodal dataset yielding (image, tokenised keywords, labels).

    Every image is read, scaled to [0, 1], passed through ``rescale_transform``
    and kept in memory at construction time; every keyword string is tokenised
    once up front as well. ``__getitem__`` is therefore a pure lookup.

    Args:
        df: table with the columns 'Keywords', 'eye', 'Image' and the eight
            label columns 'N', 'D', 'G', 'C', 'A', 'H', 'M', 'O'.
        IMG_FOLDER: directory that contains the files named in ``df['Image']``.
        tokenizer: callable applied to each keyword string; defaults to the
            module-level ``processor``.
    """

    # Label columns, in the order they appear along the last axis of `labels`.
    LABEL_COLUMNS = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']

    def __init__(self, df, IMG_FOLDER, tokenizer = processor) :
        # One tokenizer call per sample with return_tensors='pt'; the training
        # loops squeeze axis 1 of the collated 'input_ids', i.e. each encoded
        # tensor is expected to carry a leading axis of size 1.
        self.text = [tokenizer(text = x, padding = 'max_length', max_length = 25, truncation = True, return_tensors = 'pt') for x in df['Keywords']]
        # Re-index so that self.eye[idx] lines up with images[idx] even when
        # `df` is a shuffled split that kept its original row labels.
        self.eye = df['eye'].reset_index(drop = True)
        self.labels = torch.tensor(df[self.LABEL_COLUMNS].to_numpy())
        # Honour the IMG_FOLDER argument instead of silently reading the
        # global IMG_PATH.
        self.img_dir = [os.path.join(IMG_FOLDER, x) for x in df['Image']]

        # Decode -> float in [0, 1] -> resize + normalise, all done eagerly.
        self.images = [rescale_transform(torchvision.io.read_image(x).float()/255.0) for x in self.img_dir]

    def __len__(self):
        """Number of samples (one per row of the source table)."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image_tensor, tokenizer_output, label_tensor)`` for sample ``idx``."""
        return self.images[idx], self.text[idx], self.labels[idx]

In [14]:
# Materialise one in-memory dataset per split; all three read images from IMG_PATH.
train_dataset, val_dataset, test_dataset = (
    ODIRDatasetMM(split_df, IMG_PATH) for split_df in (train_df, val_df, test_df)
)

In [15]:
# Only the training loader shuffles; evaluation loaders keep dataset order.
train_dataloader = DataLoader(train_dataset, batch_size= BATCH_SIZE, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE)
# The test split gets its own loader. It was previously assigned to
# `val_dataloader`, which discarded the real validation loader and made every
# "validation" pass run on the test set.
test_dataloader = DataLoader(test_dataset, batch_size= BATCH_SIZE)

In [16]:
# Stage 1: contrastive image-text training on the training split, followed by supervised fine-tuning.

In [17]:
# Encoder classes used by the ContrastiveLearning module defined further down.
from transformers import ConvNextV2Model, RobertaModel

# Stand-alone instances of the two pretrained encoders.
# NOTE(review): `img_model` / `txt_model` are not referenced by any later cell
# visible here (ContrastiveLearning loads its own copies) — presumably
# exploratory; confirm before removing.
img_model = ConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-1k-224")      #output 768 features
txt_model = RobertaModel.from_pretrained("roberta-base")

Some weights of the model checkpoint at facebook/convnextv2-tiny-1k-224 were not used when initializing ConvNextV2Model: ['convnextv2.encoder.stages.2.layers.7.grn.bias', 'convnextv2.encoder.stages.2.layers.3.grn.bias', 'convnextv2.encoder.stages.2.layers.2.grn.bias', 'classifier.bias', 'convnextv2.encoder.stages.3.layers.1.grn.bias', 'convnextv2.encoder.stages.1.layers.0.grn.bias', 'classifier.weight', 'convnextv2.encoder.stages.1.layers.2.grn.bias', 'convnextv2.encoder.stages.0.layers.0.grn.weight', 'convnextv2.encoder.stages.0.layers.2.grn.weight', 'convnextv2.encoder.stages.2.layers.0.grn.bias', 'convnextv2.encoder.stages.0.layers.1.grn.bias', 'convnextv2.encoder.stages.1.layers.0.grn.weight', 'convnextv2.encoder.stages.2.layers.2.grn.weight', 'convnextv2.encoder.stages.3.layers.2.grn.weight', 'convnextv2.encoder.stages.2.layers.5.grn.weight', 'convnextv2.encoder.stages.0.layers.0.grn.bias', 'convnextv2.encoder.stages.3.layers.0.grn.bias', 'convnextv2.encoder.stages.0.layers.1.grn.

In [18]:
# Wraps two pretrained encoders -- RoBERTa for text, ConvNeXt V2 for images --
# so the same module can serve the contrastive stage and the classifier stage.
class ContrastiveLearning(nn.Module):
    """Image/text encoder pair with an image-only classification head.

    ``forward`` has two modes, selected by ``contrastive``:

    * ``contrastive=True``  -- run both encoders and return their raw outputs
      as ``(image_outputs, text_outputs)``.
    * ``contrastive=False`` -- run the image encoder only, pass its pooled
      768-d feature through a 768 -> 256 -> 768 ReLU bottleneck with a skip
      connection, and return the 8 outputs of the linear head (no activation
      is applied to them).

    NOTE(review): ``self.dropout`` is created from ``drop_prob`` but is never
    applied inside ``forward``, so ``drop_prob`` currently has no effect.
    """

    def __init__(self, drop_prob = 0.4):
        super().__init__()
        # Both encoders are documented in this file as producing 768 features.
        self.img_model = ConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-1k-224")
        self.txt_model = RobertaModel.from_pretrained("roberta-base")

        # Residual bottleneck followed by the 8-way head.
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 768)
        self.dropout = nn.Dropout(drop_prob)
        self.head = nn.Linear(768, 8)

    def forward(self, img_input, input_ids = None, attn_mask = None, contrastive = False):
        """Dispatch to the classification path or the two-encoder path."""
        if not contrastive:
            # Classification path: text inputs are ignored.
            pooled = self.img_model(img_input)['pooler_output']
            squeezed = F.relu(self.fc1(pooled))
            expanded = F.relu(self.fc2(squeezed))
            return self.head(expanded + pooled)

        # Contrastive path: text encoder first, then image encoder.
        text_outputs = self.txt_model(input_ids, attn_mask)
        image_outputs = self.img_model(img_input)
        return image_outputs, text_outputs

In [19]:
# Build the combined image/text model (default drop_prob) and move it to `device`.
model = ContrastiveLearning().to(device)

Some weights of the model checkpoint at facebook/convnextv2-tiny-1k-224 were not used when initializing ConvNextV2Model: ['convnextv2.encoder.stages.2.layers.7.grn.bias', 'convnextv2.encoder.stages.2.layers.3.grn.bias', 'convnextv2.encoder.stages.2.layers.2.grn.bias', 'classifier.bias', 'convnextv2.encoder.stages.3.layers.1.grn.bias', 'convnextv2.encoder.stages.1.layers.0.grn.bias', 'classifier.weight', 'convnextv2.encoder.stages.1.layers.2.grn.bias', 'convnextv2.encoder.stages.0.layers.0.grn.weight', 'convnextv2.encoder.stages.0.layers.2.grn.weight', 'convnextv2.encoder.stages.2.layers.0.grn.bias', 'convnextv2.encoder.stages.0.layers.1.grn.bias', 'convnextv2.encoder.stages.1.layers.0.grn.weight', 'convnextv2.encoder.stages.2.layers.2.grn.weight', 'convnextv2.encoder.stages.3.layers.2.grn.weight', 'convnextv2.encoder.stages.2.layers.5.grn.weight', 'convnextv2.encoder.stages.0.layers.0.grn.bias', 'convnextv2.encoder.stages.3.layers.0.grn.bias', 'convnextv2.encoder.stages.0.layers.1.grn.

In [20]:
# ---------------------------------------------------------------------------
# Stage 1: image-text contrastive training.
#
# For a batch of B (image, keywords) pairs both encoders produce a pooled
# embedding. The embeddings are L2-normalised and compared with a B x B
# cosine-similarity matrix; the matching pair of row i sits in column i, so
# the cross-entropy target for every row (and every column) is simply
# arange(B). The previous version applied cross-entropy directly to the raw
# 768-d pooled features, which treats feature dimensions as class scores and
# never compares an image with a text.
#
# NOTE(review): lr = 0.001 with Adam is kept from the original cell; it is
# high for updating two pretrained encoders end-to-end — confirm.
# ---------------------------------------------------------------------------
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

loss_img = nn.CrossEntropyLoss()
loss_text = nn.CrossEntropyLoss()

EPOCHS = 8
# Softmax temperature applied to the similarity logits (0.07 is a common
# starting value in CLIP-style training).
TEMPERATURE = 0.07


def _contrastive_batch_loss(images, text_batch):
    """Return the symmetric image<->text contrastive loss for one batch.

    Args:
        images: batch of image tensors as yielded by the DataLoader.
        text_batch: collated tokenizer output with 'input_ids' and
            'attention_mask' entries.

    Returns:
        Scalar tensor: image-to-text plus text-to-image cross-entropy.
    """
    images = images.to(device)
    # The dataset stores each encoding with a singleton axis, so after
    # collation both tensors need axis 1 squeezed (the mask was previously
    # left unsqueezed).
    input_id = text_batch['input_ids'].squeeze(1).to(device)
    mask = text_batch['attention_mask'].squeeze(1).to(device)

    # Call the module (not .forward) so that nn.Module hooks are honoured.
    out_img, out_txt = model(images, input_id, mask, contrastive = True)

    img_emb = F.normalize(out_img['pooler_output'], dim = -1)
    txt_emb = F.normalize(out_txt['pooler_output'], dim = -1)

    # Entry (i, j) compares image i with text j.
    logits_per_image = img_emb @ txt_emb.t() / TEMPERATURE
    logits_per_text = logits_per_image.t()

    ground_truth = torch.arange(len(images), dtype = torch.long, device = device)
    return loss_img(logits_per_image, ground_truth) + loss_text(logits_per_text, ground_truth)


for epoch_num in range(EPOCHS):

    # ---- training pass ----
    model.train()
    total_loss_train = 0

    for train_image, train_text, train_label in tqdm(train_dataloader):
        optimizer.zero_grad()

        batch_loss = _contrastive_batch_loss(train_image, train_text)
        batch_loss.backward()
        optimizer.step()
        total_loss_train += batch_loss.item()

    # ---- validation pass ----
    model.eval()
    total_loss_val = 0

    with torch.no_grad():
        for val_image, val_text, val_label in val_dataloader:
            total_loss_val += _contrastive_batch_loss(val_image, val_text).item()

    # Each batch loss is already a per-batch mean, so average over the number
    # of batches rather than the number of samples.
    avg_train_loss = total_loss_train / max(len(train_dataloader), 1)
    avg_val_loss = total_loss_val / max(len(val_dataloader), 1)

    print(f"Epoch [{epoch_num+1}/{EPOCHS}], "f"Train Loss: {avg_train_loss:.4f}")
    print(f"Epoch [{epoch_num+1}/{EPOCHS}], "f"Val Loss: {avg_val_loss:.4f}")
    print('-'*60)

    # NOTE(review): the +21 offset in the checkpoint name is kept unchanged;
    # presumably it continues the numbering of an earlier run — confirm.
    torch.save(model.state_dict(), './' + str(epoch_num+21) + '.pt' )


 16%|█▌        | 16/102 [00:10<00:39,  2.16it/s]

In [None]:
# ---------------------------------------------------------------------------
# Stage 2: supervised multi-label fine-tuning of the image branch.
#
# Fixes relative to the previous version of this cell:
#   * the head returns raw scores, so the loss is BCEWithLogitsLoss (BCELoss
#     requires probabilities in [0, 1]) and targets are cast to float;
#   * the validation pass now forwards `val_image` (it forwarded the last
#     training batch) and calls the metric as (preds, target);
#   * metrics live on `device`, like the tensors they receive;
#   * the model is switched between train() and eval();
#   * epoch losses are averaged over batches, since each batch loss is
#     already a mean.
# ---------------------------------------------------------------------------
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

criterion = nn.BCEWithLogitsLoss()
test_loss = 0
test_acc  = 0
AVERAGING = 'micro'
acc_train = torchmetrics.classification.MultilabelAccuracy(8, average = AVERAGING).to(device)
acc_val   = torchmetrics.classification.MultilabelAccuracy(8, average = AVERAGING).to(device)

EPOCHS = 10

for epoch_num in range(EPOCHS):

    # ---- training pass ----
    model.train()
    total_loss_train = 0

    # The tokenised text is not used in this stage; only image and labels are.
    for train_image, _, train_label in tqdm(train_dataloader):
        optimizer.zero_grad()

        train_image = train_image.to(device)
        train_label = train_label.to(device)

        logits = model(train_image, contrastive = False)

        batch_loss = criterion(logits, train_label.float())
        batch_loss.backward()
        optimizer.step()
        total_loss_train += batch_loss.item()

        # Metric gets probabilities (detached from the graph) and integer targets.
        acc_train(torch.sigmoid(logits.detach()), train_label.long())

    # ---- validation pass ----
    model.eval()
    total_loss_val = 0

    with torch.no_grad():
        for val_image, _, val_label in val_dataloader:
            val_label = val_label.to(device)
            val_image = val_image.to(device)

            logits = model(val_image, contrastive = False)

            batch_loss = criterion(logits, val_label.float())
            total_loss_val += batch_loss.item()
            acc_val(torch.sigmoid(logits), val_label.long())

    avg_train_loss = total_loss_train / max(len(train_dataloader), 1)
    avg_val_loss = total_loss_val / max(len(val_dataloader), 1)

    print(f"Epoch [{epoch_num+1}/{EPOCHS}], "f"Train Loss: {avg_train_loss:.4f}, "f"Train Accuracy: {acc_train.compute():.4f}")
    print(f"Epoch [{epoch_num+1}/{EPOCHS}], "f"Val Loss: {avg_val_loss:.4f}, "f"Val Accuracy: {acc_val.compute():.4f}")
    print('-'*60)

    # Start the next epoch with empty metric state.
    acc_train.reset()
    acc_val.reset()

    torch.save(model.state_dict(), './' + str(epoch_num)+'finetuning' + '.pt' )


  0%|          | 0/153 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)