In [14]:
!pip install tez



In [15]:
# Everything becomes easy and intuitive from here. 
# Also, Tez keeps your code clean and readable!
# Let's import a few things.

import glob
import os

import albumentations
from transformers import BertTokenizer, VisualBertForPreTraining, VisualBertModel
import torch
import torch.nn as nn
from sklearn import metrics, preprocessing, model_selection

from tez import Tez, TezConfig
from tez.callbacks import EarlyStopping
from tez.datasets import ImageDataset


import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [16]:
INPUT_PATH = "../input/instacities1m/"
IMAGE_PATH = "../input/instacities1m/InstaCities1M/img_resized_1M/cities_instagram"
MODEL_PATH = "../working/"
MODEL_NAME = "vit_base_patch16_224"
#MODEL_NAME = os.path.basename(__file__)[:-3]
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 20
IMAGE_SIZE = 300
IMAGE_SIZE_MODEL=224

In [4]:
# Let's define a model now
# We inherit from tez.Model instead of nn.Module
# we have monitor_metrics if we want to monitor any metrics
# except the loss
# and we return 3 values in forward function.

class InstaModel(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, num_classes)
        
    def monitor_metrics(self, outputs, targets):
        device = targets.get_device()
        if targets is None:
            return {}
        outputs = torch.argmax(outputs, dim=1).cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": torch.tensor(accuracy, device=device)}
    
    def optimizer_scheduler(self):
        opt = torch.optim.Adam(self.parameters(), lr=1e-3)
        sch = torch.optim.lr_scheduler.ReduceLROnPlateau(
            opt,
            factor=0.5,
            patience=2,
            verbose=True,
            mode="max",
            threshold=1e-4,
        )
        return opt, sch
  
    def forward(self, image, targets=None):

        o_2 = self.model(image)
        b_o = self.bert_drop(o_2)
        outputs = self.out(b_o)
        
        if targets is not None:
            loss = nn.CrossEntropyLoss()(outputs, targets)
            metrics = self.monitor_metrics(outputs, targets)
            return outputs, loss, metrics
        return outputs, None, None

In [9]:
model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
model

VisualBertModel(
  (embeddings): VisualBertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (visual_token_type_embeddings): Embedding(2, 768)
    (visual_position_embeddings): Embedding(512, 768)
    (visual_projection): Linear(in_features=2048, out_features=768, bias=True)
  )
  (encoder): VisualBertEncoder(
    (layer): ModuleList(
      (0): VisualBertLayer(
        (attention): VisualBertAttention(
          (self): VisualBertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): VisualBer

In [None]:
dfx = pd.read_csv(INPUT_PATH + "train.csv")
dfx = dfx.dropna().reset_index(drop=True)
dfx["path"] = dfx["category"].astype(str) + "/" + dfx["id"].astype(str) + ".jpg"
    
lbl_enc = preprocessing.LabelEncoder()
dfx.category = lbl_enc.fit_transform(dfx.category.values)



df_train, df_valid = model_selection.train_test_split(
    dfx, test_size=0.1, random_state=42, stratify=dfx.category.values
)

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)



train_image_paths = [os.path.join(IMAGE_PATH, x ) for x in df_train.path.values]
valid_image_paths = [os.path.join(IMAGE_PATH, x ) for x in df_valid.path.values]
train_targets = df_train.category.values
valid_targets = df_valid.category.values

dataset_aug = albumentations.Compose(
    [
    albumentations.Resize(IMAGE_SIZE_MODEL, IMAGE_SIZE_MODEL)
    ]
)


train_dataset = ImageDataset(
    image_paths=train_image_paths,
    targets=train_targets,
    augmentations=dataset_aug,
    backend="cv2"


)

valid_dataset = ImageDataset(
    image_paths=valid_image_paths,
    targets=valid_targets,
    augmentations=dataset_aug,
    backend="cv2"
)

model = InstaModel(num_classes=dfx.category.nunique())
es = EarlyStopping(
    monitor="valid_loss",
    model_path=os.path.join(MODEL_PATH, MODEL_NAME + ".bin"),
    patience=3,
    mode="min",
)

model = Tez(model)
config = TezConfig(
    training_batch_size=TRAIN_BATCH_SIZE,
    validation_batch_size=VALID_BATCH_SIZE,
    epochs=EPOCHS,
    step_scheduler_after="epoch",
    step_scheduler_metric="valid_loss",
)


model.fit(
    train_dataset,
    valid_dataset=valid_dataset,
    device="cuda",
    config=config,
    callbacks=[es],
)
model.save(os.path.join(MODEL_PATH, MODEL_NAME + "_image.bin"))

In [1]:
import torch
from transformers import BertTokenizer, VisualBertModel

model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

inputs = tokenizer("What is the man eating?", return_tensors="pt")
# this is a custom function that returns the visual embeddings given the image path
image_path="../input/instacities1m/InstaCities1M/img_resized_1M/cities_instagram/chicago/1481574059510467614.jpg"
visual_embeds = get_visual_embeddings(image_path)

visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update(
     {
         "visual_embeds": visual_embeds,
         "visual_token_type_ids": visual_token_type_ids,
         "visual_attention_mask": visual_attention_mask,
     }
)
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state

Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/428M [00:00<?, ?B/s]

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

NameError: name 'get_visual_embeddings' is not defined

In [17]:
import cv2
import numpy as np
import torch
from PIL import Image, ImageFile


ImageFile.LOAD_TRUNCATED_IMAGES = True


class InstaDataset:
    def __init__(
        self,
        image_paths,
        text,
        targets,
        tokenizer,
        max_len,
        augmentations=None,
        backend="pil",
        channel_first=True,
        grayscale=False,
    ):
        """
        :param image_paths: list of paths to images
        :param targets: numpy array
        :param augmentations: albumentations augmentations
        """
        self.image_paths = image_paths
        self.targets = targets
        self.augmentations = augmentations
        self.backend = backend
        self.channel_first = channel_first
        self.grayscale = grayscale
        self.text = text
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        targets = self.targets[item]
        if self.backend == "pil":
            image = Image.open(self.image_paths[item])
            image = np.array(image)
            if self.augmentations is not None:
                augmented = self.augmentations(image=image)
                image = augmented["image"]
        elif self.backend == "cv2":
            if self.grayscale is False:
                image = cv2.imread(self.image_paths[item])
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            else:
                image = cv2.imread(self.image_paths[item], cv2.IMREAD_GRAYSCALE)
            if self.augmentations is not None:
                augmented = self.augmentations(image=image)
                image = augmented["image"]
        else:
            raise Exception("Backend not implemented")
        if self.channel_first is True and self.grayscale is False:
            image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        image_tensor = torch.tensor(image)
        if self.grayscale:
            image_tensor = image_tensor.unsqueeze(0)
            
        #text
        text = str(self.text[item])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        
        
        return {
            "image": image_tensor,
            "targets": torch.tensor(targets),
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }

In [18]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn import metrics, model_selection
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

from tez import Tez, TezConfig
from tez.callbacks import EarlyStopping

In [19]:
class args:
    tokenizer = "bert-base-uncased"
    model = "uclanlp/visualbert-vqa-coco-pre"
    epochs = 20
    batch_size = 32
    learning_rate = 5e-5
    train_batch_size = 32
    valid_batch_size = 32
    max_len = 128
    accumulation_steps = 1


class InstaModel(nn.Module):
    def __init__(self, num_classes, model_name, num_train_steps, learning_rate):
        super().__init__()
        self.num_train_steps = num_train_steps
        self.learning_rate = learning_rate
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": num_classes
            }
        )
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, num_classes)

    def optimizer_scheduler(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        opt = torch.optim.AdamW(optimizer_parameters, lr=self.learning_rate)
        sch = get_linear_schedule_with_warmup(
            opt,
            num_warmup_steps=0,
            num_training_steps=self.num_train_steps,
        )

        return opt, sch

    def loss(self, outputs, targets):
        if targets is None:
            return None
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def monitor_metrics(self, outputs, targets):
        if targets is None:
            return {}
        device = targets.get_device()
        outputs = torch.sigmoid(outputs).cpu().detach().numpy() >= 0.5
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": torch.tensor(accuracy, device=device)}

    def forward(self, ids, mask, token_type_ids, targets=None):
        transformer_out = self.transformer(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        out = transformer_out.pooler_output
        out = self.dropout(out)
        output = self.output(out)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc










In [21]:
dfx = pd.read_csv(INPUT_PATH + "train.csv")
dfx = dfx.dropna().reset_index(drop=True)
dfx["path"] = dfx["category"].astype(str) + "/" + dfx["id"].astype(str) + ".jpg"
    
lbl_enc = preprocessing.LabelEncoder()
dfx.category = lbl_enc.fit_transform(dfx.category.values)

df_train, df_valid = model_selection.train_test_split(
    dfx, test_size=0.1, random_state=42, stratify=dfx.category.values
)

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

train_image_paths = [os.path.join(IMAGE_PATH, x ) for x in df_train.path.values]
valid_image_paths = [os.path.join(IMAGE_PATH, x ) for x in df_valid.path.values]
train_targets = df_train.category.values
valid_targets = df_valid.category.values

dataset_aug = albumentations.Compose(
    [
    albumentations.Resize(IMAGE_SIZE_MODEL, IMAGE_SIZE_MODEL)
    ]
)

tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)

train_dataset = InstaDataset(
    image_paths=train_image_paths,
    targets=train_targets,
    augmentations=dataset_aug,
    backend="cv2",
    text=df_train.text.values,
    tokenizer=tokenizer,
    max_len=args.max_len,

)

valid_dataset = InstaDataset(
    image_paths=valid_image_paths,
    targets=valid_targets,
    augmentations=dataset_aug,
    backend="cv2",
    text=df_valid.text.values,
    tokenizer=tokenizer,
    max_len=args.max_len,
)

n_train_steps = int(len(train_dataset) / args.batch_size / args.accumulation_steps * args.epochs)

model = InstaModel(
        model_name=args.model,
        num_train_steps=n_train_steps,
        learning_rate=args.learning_rate,
        num_classes=dfx.category.nunique()
    )
model = Tez(model)
es = EarlyStopping(monitor="valid_loss", model_path="model.bin")
config = TezConfig(
        training_batch_size=args.train_batch_size,
        validation_batch_size=args.valid_batch_size,
        gradient_accumulation_steps=args.accumulation_steps,
        epochs=args.epochs,
        step_scheduler_after="batch",
    )

    
model.fit(
    train_dataset,
    valid_dataset=valid_dataset,
    device="cuda",
    config=config,
    callbacks=[es],
)

model.save(os.path.join(MODEL_PATH, MODEL_NAME + "_image.bin"))

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-05-02 12:45:12,019 INFO Using single GPU


  0%|          | 0/506250 [00:00<?, ?it/s]

2022-05-02 12:45:17,448 INFO 
TezConfig(device='cuda', training_batch_size=32, validation_batch_size=32, test_batch_size=32, epochs=20, gradient_accumulation_steps=1, clip_grad_norm=-1, num_jobs=2, fp16=False, train_shuffle=True, valid_shuffle=True, train_drop_last=False, valid_drop_last=False, test_drop_last=False, test_shuffle=False, pin_memory=True, step_scheduler_after='batch', step_scheduler_metric=None, val_strategy='epoch', val_steps=100)


TypeError: forward() got an unexpected keyword argument 'image'

In [10]:
AutoTokenizer.from_pretrained(args.model)

KeyError: <class 'transformers.models.visual_bert.configuration_visual_bert.VisualBertConfig'>