## Imports

In [1]:
%env CUDA_VISIBLE_DEVICES=0,1,2,3
!echo $CUDA_VISIBLE_DEVICES

env: CUDA_VISIBLE_DEVICES=0,1,2,3
0,1,2,3


In [2]:
import torch

In [3]:
%run ../utils/common.py

In [4]:
DEVICE = torch.device('cuda')
# DEVICE = torch.device('cpu')
DEVICE

device(type='cuda')

## CNN

In [5]:
%run ../utils/conv.py

In [6]:
%run classification/transfusion.py
%run classification/resnet.py
%run classification/densenet.py
%run classification/vgg.py
%run classification/mobilenet.py

In [7]:
labels = [f'disease{idx}' for idx in range(3)]

In [8]:
cnn = Densenet121CNN(labels, multilabel=True)
num_trainable_parameters(cnn), num_trainable_parameters(cnn.base_cnn.features)

(7981931, 6953856)

In [24]:
cnn = Resnet50CNN(labels, multilabel=True)
feats_params = num_trainable_parameters(cnn.base_cnn) - num_trainable_parameters(cnn.base_cnn.fc)
num_trainable_parameters(cnn), feats_params

25563179

In [27]:
cnn = TransfusionCBRCNN(labels, multilabel=True, name='tiny', n_channels=3)
num_trainable_parameters(cnn), num_trainable_parameters(cnn.conv)

(4310019, 4308480)

In [8]:
cnn = VGG19CNN(labels)
num_trainable_parameters(cnn), num_trainable_parameters(cnn.base_cnn.features)

(139582531, 20024384)

In [23]:
cnn = MobileNetV2CNN(labels)
num_trainable_parameters(cnn), num_trainable_parameters(cnn.base_cnn.features)

(3508715, 2223872)

In [9]:
cnn

VGG19CNN(
  (base_cnn): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      (1

In [12]:
batch_size = 4
h = w = 512

images = torch.rand(4, 3, h, w)
images.size()

torch.Size([4, 3, 512, 512])

In [13]:
features = cnn(images, features=True)
features.size()

torch.Size([4, 512, 16, 16])

In [27]:
output, = cnn(images, features=False)
output.size()

torch.Size([4, 14])

## Decoder

In [21]:
%run report_generation/decoder_lstm.py

In [22]:
vocab_size = 1000
embedding_size = 200
hidden_size = 100

decoder = LSTMDecoder(vocab_size, embedding_size, hidden_size)
decoder

LSTMDecoder(
  (embeddings_table): Embedding(1000, 200, padding_idx=0)
  (lstm_cell): LSTMCell(200, 100)
  (W_vocab): Linear(in_features=100, out_features=1000, bias=True)
)

In [23]:
batch_size = 4
hidden_size = 100

initial_state = torch.rand(batch_size, hidden_size)
initial_state.size()

torch.Size([4, 100])

In [24]:
outputs = decoder(initial_state, 10)
words = outputs[0]
words.size()

torch.Size([4, 10, 1000])

## Debug hierarchical

In [60]:
import torch
from torch import nn
import numpy as np

from medai.utils.nlp import PAD_IDX, START_IDX, END_OF_SENTENCE_IDX

In [6]:
%run report_generation/att_2layer.py

### Debug input/output

In [2]:
%run report_generation/decoder_h_lstm_att.py

In [8]:
batch_size = 2

In [9]:
image_features = torch.rand(batch_size, 1024, 16, 16)

In [13]:
reports_h = torch.tensor([[[1, 2, 3, 0],
                           [1, 5, 0, 0],
                           [2, 2, 2, 0],
                          ],
                          [[7, 9, 10, 0],
                           [1, 4, 0, 0],
                           [8, 9, 0, 0],
                          ],
                         ])
reports_h.size()

torch.Size([2, 3, 4])

In [11]:
model = HierarchicalLSTMAttDecoder(200, 100, 100, (1024, 16, 16))

In [12]:
generated, stops, scores = model(image_features, reports_h)
generated.size(), stops.size(), scores.size()

(torch.Size([2, 3, 4, 200]), torch.Size([2, 3, 1]), torch.Size([2, 3, 16, 16]))

### Debug flat and hierarchical padding

In [31]:
%run ../training/report_generation/hierarchical.py
%run ../training/report_generation/flat.py

In [32]:
%run ../datasets/iu_xray.py

In [33]:
dataset = IUXRayDataset(max_samples=20)
dataset.size()

(42, 20)

In [34]:
dataloader = create_hierarchical_dataloader(dataset, batch_size=4)

In [35]:
for batch in dataloader:
    images, reports, stops = batch
    break

In [36]:
reports, stops

(tensor([[[49, 32, 11, 50,  8, 51,  4],
          [52,  8,  9, 13,  4,  0,  0],
          [ 0,  0,  0,  0,  0,  0,  0]],
 
         [[49, 32, 11, 50,  8, 51,  4],
          [52,  8,  9, 13,  4,  0,  0],
          [ 0,  0,  0,  0,  0,  0,  0]],
 
         [[76, 77, 63, 78,  4,  0,  0],
          [52, 10, 13,  4,  0,  0,  0],
          [32, 11, 50,  4,  0,  0,  0]],
 
         [[76, 77, 63, 78,  4,  0,  0],
          [52, 10, 13,  4,  0,  0,  0],
          [32, 11, 50,  4,  0,  0,  0]]]),
 tensor([[0., 0., 1.],
         [0., 0., 1.],
         [0., 0., 0.],
         [0., 0., 0.]]))

In [38]:
from torch import nn

In [46]:
s = torch.tensor([[0, 0, 1],
                  [0, 0, 1],
                  [0, 0, 0],
                  [0, 0, 0],
                 ])

In [47]:
stops

tensor([[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [48]:
loss = nn.BCELoss()

In [61]:
loss(s, stops.long())

RuntimeError: "binary_cross_entropy" not implemented for 'Long'

In [59]:
a = torch.rand(10, 5, 1)
a.squeeze(-1).size()

torch.Size([10, 5])

In [129]:
flat_dataloader = create_flat_dataloader(dataset, batch_size=4)

In [130]:
for batch in flat_dataloader:
    _, flat_reports = batch
    break

In [132]:
flat_reports.size()

torch.Size([4, 14])

In [133]:
flat_reports

tensor([[49, 32, 11, 50,  8, 51,  4, 52,  8,  9, 13,  4,  1,  0],
        [49, 32, 11, 50,  8, 51,  4, 52,  8,  9, 13,  4,  1,  0],
        [76, 77, 63, 78,  4, 52, 10, 13,  4, 32, 11, 50,  4,  1],
        [76, 77, 63, 78,  4, 52, 10, 13,  4, 32, 11, 50,  4,  1]])

In [134]:
reports

tensor([[[49, 32, 11, 50,  8, 51,  4],
         [52,  8,  9, 13,  4,  0,  0],
         [ 0,  0,  0,  0,  0,  0,  0]],

        [[49, 32, 11, 50,  8, 51,  4],
         [52,  8,  9, 13,  4,  0,  0],
         [ 0,  0,  0,  0,  0,  0,  0]],

        [[76, 77, 63, 78,  4,  0,  0],
         [52, 10, 13,  4,  0,  0,  0],
         [32, 11, 50,  4,  0,  0,  0]],

        [[76, 77, 63, 78,  4,  0,  0],
         [52, 10, 13,  4,  0,  0,  0],
         [32, 11, 50,  4,  0,  0,  0]]])

## Dummy baselines

### Load data

In [5]:
%run ../datasets/iu_xray.py

In [6]:
dataset_kwargs = {
    'max_samples': None,
    'frontal_only': False,
    'image_size': (512, 512),
}

train_dataset = IUXRayDataset(dataset_type='train', **dataset_kwargs)
dataset_kwargs['vocab'] = train_dataset.get_vocab()
val_dataset = IUXRayDataset(dataset_type='val', **dataset_kwargs)
test_dataset = IUXRayDataset(dataset_type='test', **dataset_kwargs)
len(train_dataset), len(val_dataset), len(test_dataset)

(5923, 751, 752)

In [7]:
vocab_size = len(train_dataset.get_vocab())

### Random

In [76]:
%run ./report_generation/dummy/random.py

In [77]:
model = RandomReport(train_dataset)
model

RandomReport()

In [78]:
bs = 2
features = torch.rand(bs, 256, 16, 16)
reports = (torch.rand(bs, 20) * vocab_size).long()

In [79]:
vocab_size, reports.max().item()

(1775, 1673)

In [109]:
r, = model(features, None, free=True)
r.size()

torch.Size([2, 46, 1775])

### MostSimilarImage

In [12]:
%run ../training/report_generation/flat.py

In [13]:
dataloader = create_flat_dataloader(test_dataset, batch_size=10)

In [14]:
%run ./report_generation/dummy/most_similar_image.py

In [15]:
cnn = cnn.to(DEVICE)

In [16]:
model = MostSimilarImage(cnn, dataloader)

In [34]:
images = torch.rand(10, 3, 256, 256).to(DEVICE)
reports = (torch.randn(10, 4) * vocab_size).long().to(DEVICE)

In [35]:
out, = model(images, reports, free=False)
out.size()

torch.Size([10, 4, 1775])

### Common sentences

In [221]:
%run ../training/report_generation/hierarchical.py

In [222]:
%run ./report_generation/dummy/common_sentences.py

In [223]:
model = MostCommonSentences(train_dataset)

In [224]:
model.n_sentences, model.n_weights

((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12, 18, 17),
 (73, 209, 1242, 1740, 1277, 754, 334, 153, 74, 33, 17, 4, 11, 1, 1))

In [245]:
bs = 10
images = torch.rand(bs, 3, 16, 16)
reports = (torch.randn(bs, 8, 6) * vocab_size).long()

In [254]:
a, b = model(images, reports, free=False)
a.size()

torch.Size([10, 8, 6, 1775])

### Common words

In [255]:
%run ./report_generation/dummy/common_words.py

In [256]:
model = MostCommonWords(train_dataset)

In [262]:
bs = 10
images = torch.rand(bs, 3, 16, 16)
reports = (torch.randn(bs, 7) * vocab_size).long()

In [268]:
a, = model(images, reports, free=True)
a.size()

torch.Size([10, 26, 1775])