In [3]:
import os 
os.chdir('./..')


In [4]:
!ls

[1m[36mconfig_models[m[m        img_test.jpg         [1m[36mnotebooks[m[m
constants.py         load_and_training.py predict.py
[1m[36mdata_folder[m[m          [1m[36mlog_monitoring[m[m       pretrained_main.py
[1m[36mdataset_structure[m[m    main.py              [1m[36mpretrained_models[m[m
[1m[36mevaluation_metrics[m[m   [1m[36mmodels[m[m               [1m[36mutils[m[m


In [5]:
from transformers import ConvNextModel, ResNetModel, T5ForConditionalGeneration, T5Tokenizer
from data_folder.medical_datasets import RocoDataset
from torch.utils.data import Dataset, DataLoader
from torch import nn


In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
roco_path = "/Users/pdcos/Documents/Mestrado/IA025/Projeto_Final/Code/caption_medical_images/dataset_structure/roco-dataset"
train_dataset = RocoDataset(roco_path=roco_path, mode="train", caption_max_length=64, tokenizer=tokenizer)
valid_dataset = RocoDataset(roco_path=roco_path, mode="validation", caption_max_length=64, tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=3, shuffle=True, drop_last=True)

In [11]:
pretrained_model = ConvNextModel.from_pretrained("facebook/convnext-tiny-224")

Some weights of the model checkpoint at facebook/convnext-tiny-224 were not used when initializing ConvNextModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ConvNextModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ConvNextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
display(pretrained_model)

ConvNextModel(
  (embeddings): ConvNextEmbeddings(
    (patch_embeddings): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
    (layernorm): ConvNextLayerNorm()
  )
  (encoder): ConvNextEncoder(
    (stages): ModuleList(
      (0): ConvNextStage(
        (downsampling_layer): Identity()
        (layers): Sequential(
          (0): ConvNextLayer(
            (dwconv): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
            (layernorm): ConvNextLayerNorm()
            (pwconv1): Linear(in_features=96, out_features=384, bias=True)
            (act): GELUActivation()
            (pwconv2): Linear(in_features=384, out_features=96, bias=True)
            (drop_path): Identity()
          )
          (1): ConvNextLayer(
            (dwconv): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
            (layernorm): ConvNextLayerNorm()
            (pwconv1): Linear(in_features=96, out_features=384, bias=True)
            (act): GELUAc

In [13]:
img, caption, _, _, img_name = next(iter(train_loader))
img_name

('ROCO_69635', 'ROCO_10547', 'ROCO_73315')

In [14]:
out = pretrained_model(img)

In [16]:
out[0].shape

torch.Size([3, 768, 7, 7])

In [25]:
class ConvNextDebugger(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model 
    
    def forward(self, input):
        out = self.model.forward(input, return_dict=False)
        out = out[0]
        return out 
    
    def __call__(self, input):
        out = self.forward(input)
        return out 
    
model_debug = ConvNextDebugger(pretrained_model)

In [27]:
out = model_debug(img)
out.shape

torch.Size([3, 768, 7, 7])

In [28]:
from torchsummary import summary

summary(model_debug, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 96, 56, 56]           4,704
 ConvNextLayerNorm-2           [-1, 96, 56, 56]             192
ConvNextEmbeddings-3           [-1, 96, 56, 56]               0
          Identity-4           [-1, 96, 56, 56]               0
            Conv2d-5           [-1, 96, 56, 56]           4,800
 ConvNextLayerNorm-6           [-1, 56, 56, 96]             192
            Linear-7          [-1, 56, 56, 384]          37,248
    GELUActivation-8          [-1, 56, 56, 384]               0
    GELUActivation-9          [-1, 56, 56, 384]               0
   GELUActivation-10          [-1, 56, 56, 384]               0
   GELUActivation-11          [-1, 56, 56, 384]               0
   GELUActivation-12          [-1, 56, 56, 384]               0
   GELUActivation-13          [-1, 56, 56, 384]               0
   GELUActivation-14          [-1, 56, 

In [None]:
class ConvNextTransferLearning(nn.Module):
    def __init__(self, model):
        ...
        # adicionar linear vazio

In [7]:
decoder = T5ForConditionalGeneration.from_pretrained("t5-small")

In [17]:
decoder.config

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tran