In [None]:
!pip install medmnist --user

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting medmnist
  Downloading medmnist-2.1.0-py3-none-any.whl (21 kB)
Collecting fire
  Downloading fire-0.4.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 3.0 MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115943 sha256=8f86e1f7f7d87f0c383415338abd2bbb85b42163981f62252daa84789c686418
  Stored in directory: /root/.cache/pip/wheels/8a/67/fb/2e8a12fa16661b9d5af1f654bd199366799740a85c64981226
Successfully built fire
Installing collected packages: fire, medmnist
Successfully installed fire-0.4.0 medmnist-2.1.0


In [None]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms

import medmnist
from medmnist import INFO, Evaluator

In [None]:
# Training hyperparameters.
NUM_EPOCHS, BATCH_SIZE = 20, 128  # number of training epochs, mini-batch size
lr = 1e-3  # learning rate

In [None]:
# Print the MedMNIST metadata entry for the 'pathmnist' flag via the package's CLI.
!python -m medmnist info --flag=pathmnist

{'MD5': 'a8b06965200029087d5bd730944a56c1',
 'description': 'The PathMNIST is based on a prior study for predicting '
                'survival from colorectal cancer histology slides, providing a '
                'dataset (NCT-CRC-HE-100K) of 100,000 non-overlapping image '
                'patches from hematoxylin & eosin stained histological images, '
                'and a test dataset (CRC-VAL-HE-7K) of 7,180 image patches '
                'from a different clinical center. The dataset is comprised of '
                '9 types of tissues, resulting in a multi-class classification '
                'task. We resize the source images of 3×224×224 into 3×28×28, '
                'and split NCT-CRC-HE-100K into training and validation set '
                'with a ratio of 9:1. The CRC-VAL-HE-7K is treated as the test '
                'set.',
 'label': {'0': 'adipose',
           '1': 'background',
           '2': 'debris',
           '3': 'lymphocytes',
           '4': 'mucus',
 

In [None]:
# Which MedMNIST dataset to use, and whether to fetch it when it is not on disk.
data_flag = 'pathmnist'
download = True

# Metadata entry for the chosen dataset.
info = INFO[data_flag]

# Task type, input channel count and number of classes, all read from the metadata.
task, n_channels = info['task'], info['n_channels']
n_classes = len(info['label'])

# Dataset class exported by medmnist under the name stored in the metadata.
DataClass = getattr(medmnist, info['python_class'])

In [None]:
# Preprocessing: convert every image to a tensor, then normalize with mean 0.5 / std 0.5.
data_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[.5], std=[.5]),
    ]
)

# Datasets: transformed train and test splits, plus a copy of the training split
# that has no transform applied.
train_dataset = DataClass(split='train', transform=data_transform, download=download)
test_dataset = DataClass(split='test', transform=data_transform, download=download)
pil_dataset = DataClass(split='train', download=download)

# Loaders: shuffled mini-batches for training; ordered, double-size batches for evaluation.
train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
train_loader_at_eval = data.DataLoader(
    dataset=train_dataset,
    batch_size=2*BATCH_SIZE,
    shuffle=False,
)
test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=2*BATCH_SIZE,
    shuffle=False,
)

Downloading https://zenodo.org/record/6496656/files/pathmnist.npz?download=1 to /root/.medmnist/pathmnist.npz


  0%|          | 0/205615438 [00:00<?, ?it/s]

Using downloaded and verified file: /root/.medmnist/pathmnist.npz
Using downloaded and verified file: /root/.medmnist/pathmnist.npz


In [None]:
# Show the summaries of both splits, separated by a divider line.
print(train_dataset, "===================", test_dataset, sep="\n")

Dataset PathMNIST (pathmnist)
    Number of datapoints: 89996
    Root location: /root/.medmnist
    Split: train
    Task: multi-class
    Number of channels: 3
    Meaning of labels: {'0': 'adipose', '1': 'background', '2': 'debris', '3': 'lymphocytes', '4': 'mucus', '5': 'smooth muscle', '6': 'normal colon mucosa', '7': 'cancer-associated stroma', '8': 'colorectal adenocarcinoma epithelium'}
    Number of samples: {'train': 89996, 'val': 10004, 'test': 7180}
    Description: The PathMNIST is based on a prior study for predicting survival from colorectal cancer histology slides, providing a dataset (NCT-CRC-HE-100K) of 100,000 non-overlapping image patches from hematoxylin & eosin stained histological images, and a test dataset (CRC-VAL-HE-7K) of 7,180 image patches from a different clinical center. The dataset is comprised of 9 types of tissues, resulting in a multi-class classification task. We resize the source images of 3×224×224 into 3×28×28, and split NCT-CRC-HE-100K into train

In [None]:
# define a simple CNN model

class Net(nn.Module):
    """Small five-stage CNN classifier that returns raw class scores (logits).

    Every stage is an unpadded 3x3 convolution followed by batch normalization and
    ReLU, so each stage trims two pixels from the height and from the width. The
    output of the last stage is flattened and passed through one linear layer.

    The head intentionally has no softmax: ``nn.CrossEntropyLoss`` and
    ``nn.BCEWithLogitsLoss`` expect unnormalized scores, so normalizing inside the
    model would squash the scores before the loss normalizes them a second time.
    Apply ``softmax`` to the output when probabilities are needed.

    Args:
        in_channels: number of channels of the input images.
        num_classes: number of scores produced per sample.
        input_size: height and width of the (square) input images. The default of
            28 gives 64 * 18 * 18 = 20736 flattened features.

    Raises:
        ValueError: if ``input_size`` is too small to survive five unpadded 3x3
            convolutions (it must be larger than 10).
    """

    def __init__(self, in_channels, num_classes, input_size=28):
        super(Net, self).__init__()

        # Five unpadded 3x3 convolutions shrink each spatial side by 2 * 5 = 10.
        final_side = input_size - 10
        if final_side <= 0:
            raise ValueError(
                "input_size must be larger than 10, got %r" % (input_size,))

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3),
            nn.BatchNorm2d(16),
            nn.ReLU())

        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 16, kernel_size=3),
            nn.BatchNorm2d(16),
            nn.ReLU())

        self.layer3 = nn.Sequential(
            nn.Conv2d(16, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU())

        self.layer4 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU())

        self.layer5 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU())

        # Kept as a Sequential so the linear layer stays at index 0 and its
        # parameter names (fc1.0.weight / fc1.0.bias) do not change.
        self.fc1 = nn.Sequential(
            nn.Linear(64 * final_side * final_side, num_classes))

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for a batch of images."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        x = self.fc1(x)
        return x

model = Net(in_channels=n_channels, num_classes=n_classes)
    
# define loss function and optimizer
if task == "multi-label, binary-class":
    criterion = nn.BCEWithLogitsLoss()
else:
    criterion = nn.CrossEntropyLoss()
    
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

!pip install torchsummary

from torchvision import models
from torchsummary import summary

summary(model, (n_channels, 28, 28))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 26, 26]             448
       BatchNorm2d-2           [-1, 16, 26, 26]              32
              ReLU-3           [-1, 16, 26, 26]               0
            Conv2d-4           [-1, 16, 24, 24]           2,320
       BatchNorm2d-5           [-1, 16, 24, 24]              32
              ReLU-6           [-1, 16, 24, 24]               0
            Conv2d-7           [-1, 64, 22, 22]           9,280
       BatchNorm2d-8           [-1, 64, 22, 22]             128
              ReLU-9           [-1, 64, 22, 22]               0
           Conv2d-10           [-1, 64, 20, 20]          36,928
      BatchNorm2d-11           [-1, 64, 20, 20]             128
             ReLU-12           [-1, 64, 20, 20]               0
    

In [None]:
# train

losses = []  # mean per-sample training loss, one entry per epoch

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0  # sum of (batch loss * batch size) over the epoch
    samples_seen = 0    # number of samples that contributed to running_loss
    model.train()
    for inputs, targets in tqdm(train_loader):
        # forward + backward + optimize
        optimizer.zero_grad()
        inputs1 = inputs.to(device)
        targets1 = targets.to(device)
        outputs = model(inputs1)

        if task == 'multi-label, binary-class':
            targets2 = targets1.to(torch.float32)
        else:
            # Flatten to one label per sample. A bare squeeze() would also drop the
            # batch axis when a batch happens to hold a single sample.
            targets2 = targets1.reshape(-1).long()
        loss = criterion(outputs, targets2)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that a smaller final batch
        # does not count as much as a full one.
        batch_size = inputs.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    # The sum above is weighted per sample, so normalize by the number of samples
    # (not by the number of batches) to get the mean per-sample loss.
    epoch_loss = running_loss / max(samples_seen, 1)
    losses.append(epoch_loss)

100%|██████████| 704/704 [00:34<00:00, 20.58it/s]
100%|██████████| 704/704 [00:26<00:00, 26.65it/s]
100%|██████████| 704/704 [00:26<00:00, 26.52it/s]
100%|██████████| 704/704 [00:26<00:00, 26.46it/s]
100%|██████████| 704/704 [00:26<00:00, 26.47it/s]
100%|██████████| 704/704 [00:26<00:00, 26.90it/s]
100%|██████████| 704/704 [00:26<00:00, 26.34it/s]
100%|██████████| 704/704 [00:26<00:00, 26.51it/s]
100%|██████████| 704/704 [00:25<00:00, 27.18it/s]
100%|██████████| 704/704 [00:26<00:00, 26.42it/s]
100%|██████████| 704/704 [00:26<00:00, 27.07it/s]
100%|██████████| 704/704 [00:26<00:00, 26.24it/s]
100%|██████████| 704/704 [00:26<00:00, 26.36it/s]
100%|██████████| 704/704 [00:26<00:00, 26.23it/s]
100%|██████████| 704/704 [00:26<00:00, 26.84it/s]
100%|██████████| 704/704 [00:26<00:00, 26.67it/s]
100%|██████████| 704/704 [00:26<00:00, 26.25it/s]
100%|██████████| 704/704 [00:26<00:00, 26.58it/s]
100%|██████████| 704/704 [00:26<00:00, 26.21it/s]
100%|██████████| 704/704 [00:26<00:00, 26.95it/s]


In [None]:
# evaluation

def test(split):
    """Evaluate the global ``model`` on one split and print its AUC and accuracy.

    The model outputs are passed through a softmax and handed to the MedMNIST
    ``Evaluator`` for ``data_flag`` / ``split``. Only the scores are passed to
    ``evaluate``, so the targets yielded by the loader are not collected here.

    Args:
        split: ``'train'`` reads from ``train_loader_at_eval``; any other value
            reads from ``test_loader``. The value is also forwarded to
            ``Evaluator``.
    """
    model.eval()
    data_loader = train_loader_at_eval if split == 'train' else test_loader

    # Collect per-batch scores on the CPU and concatenate once at the end instead
    # of re-concatenating a growing tensor on every batch.
    batch_scores = []
    with torch.no_grad():
        for inputs, _ in data_loader:
            outputs = model(inputs.to(device))
            batch_scores.append(outputs.softmax(dim=-1).cpu())

    y_score = torch.cat(batch_scores, 0).numpy()

    evaluator = Evaluator(data_flag, split)
    metrics = evaluator.evaluate(y_score)

    # MedMNIST's Evaluator returns the pair in (AUC, ACC) order; label it accordingly.
    auc, acc = metrics
    print('%s  auc: %.3f  acc:%.3f' % (split, auc, acc))

        
print('==> Evaluating ...')

# Report metrics on the training split first, then on the test split.
for split_name in ('train', 'test'):
    test(split_name)

==> Evaluating ...
train  acc: 0.971  auc:0.832
test  acc: 0.863  auc:0.695


In [None]:
# Feature Extraction

# One empty accumulator per network stage, plus one for the labels.
features1, features2, features3, features4, features5 = [], [], [], [], []
labels_p = []

def extract_features() :
    """Collect spatially averaged activations of every stage for the training loader.

    Each batch from ``train_loader`` is pushed through ``model.layer1`` ..
    ``model.layer5`` in turn. After every stage the activation is averaged over its
    spatial positions, and the resulting per-sample rows are appended to the
    matching global list (``features1`` .. ``features5``). The batch targets are
    appended to the global ``labels_p``. Nothing is returned.
    """
    model.eval() # Features are extracted in evaluation mode

    # Pair every stage with the global list that receives its features.
    stages = (
        (model.layer1, features1),
        (model.layer2, features2),
        (model.layer3, features3),
        (model.layer4, features4),
        (model.layer5, features5),
    )

    with torch.no_grad() :
        for inputs, targets in tqdm(train_loader):
            activation = inputs.to(device)
            batch_labels = targets.to(device)

            for stage, sink in stages:
                activation = stage(activation)
                # Collapse all spatial positions into one trailing axis ...
                flat = activation.view(activation.size(0), activation.size(1), -1)
                # ... and average over it; extend() then stores one row per sample.
                sink.extend(torch.div(torch.sum(flat, 2), flat.shape[2]))

            labels_p.extend(batch_labels)

# Fill the accumulators, then turn every list of per-sample tensors into one tensor.
extract_features()
features1, features2, features3, features4, features5, labels_p = [
    torch.stack(collected)
    for collected in (features1, features2, features3, features4, features5, labels_p)
]

100%|██████████| 704/704 [00:18<00:00, 38.64it/s]


In [None]:
# Shapes of the stage-3 features and of the labels.
(features3.shape, labels_p.shape)

(torch.Size([89996, 64]), torch.Size([89996, 1]))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import os

# Output folder on the mounted Drive, one sub-folder per dataset flag.
dump_at = '/content/drive/MyDrive/BTP/' + data_flag + '/'
# makedirs(..., exist_ok=True) creates missing parent folders and does not fail
# when the folder is already there, so this cell can be re-run.
os.makedirs(dump_at, exist_ok=True)

# (tensor, file name) pairs. The labels come last so that X and df hold the labels
# once the loop has finished.
exports = (
    (features1, "L1_FeatureMap_avg.csv"),
    (features2, "L2_FeatureMap_avg.csv"),
    (features3, "L3_FeatureMap_avg.csv"),
    (features4, "L4_FeatureMap_avg.csv"),
    (features5, "L5_FeatureMap_avg.csv"),
    (labels_p, "labels.csv"),
)

for tensor, file_name in exports:
    X = tensor.cpu().detach().numpy()  # convert to Numpy array
    df = pd.DataFrame(X)  # convert to a dataframe
    df.to_csv(dump_at + file_name, index=False)  # save to file

Mounted at /content/drive


In [None]:
# NOTE(review): this cell reads L1, L2, L3, L4, L5 and t1, none of which is assigned
# in any cell visible in this notebook -- on a fresh kernel it would raise NameError.
# It also writes to the same file names as the export cell above, so running it
# would overwrite those files. Confirm whether this cell is still needed.
import pandas as pd
import os

# Same output folder as the export cell above; creating it is commented out here.
dump_at = '/content/drive/MyDrive/BTP/' + data_flag + '/'
# os.mkdir(dump_at)

# Each stanza below: move the tensor to the CPU, convert it to NumPy, sum over the
# last axis and divide by shape[2] (for a 3-D array that is the mean over the last
# axis; shape[2] requires at least three dimensions), then write the result as CSV.
L1 = L1.cpu()
L1_np = L1.detach().numpy() #convert to Numpy array
L1_np = np.sum(L1_np, axis=-1)/L1_np.shape[2]
df = pd.DataFrame(L1_np) #convert to a dataframe
df.to_csv(dump_at + "L1_FeatureMap_avg.csv",index=False) #save to file

L2 = L2.cpu()
L2_np = L2.detach().numpy() 
L2_np = np.sum(L2_np, axis=-1)/L2_np.shape[2]
df = pd.DataFrame(L2_np) 
df.to_csv(dump_at + "L2_FeatureMap_avg.csv",index=False) 

L3 = L3.cpu()
L3_np = L3.detach().numpy()
L3_np = np.sum(L3_np, axis=-1)/L3_np.shape[2]
df = pd.DataFrame(L3_np) 
df.to_csv(dump_at + "L3_FeatureMap_avg.csv",index=False) 

L4 = L4.cpu()
L4_np = L4.detach().numpy()
L4_np = np.sum(L4_np, axis=-1)/L4_np.shape[2]
df = pd.DataFrame(L4_np)
df.to_csv(dump_at + "L4_FeatureMap_avg.csv",index=False) 

L5 = L5.cpu()
L5_np = L5.detach().numpy() 
L5_np = np.sum(L5_np, axis=-1)/L5_np.shape[2]
df = pd.DataFrame(L5_np) 
df.to_csv(dump_at + "L5_FeatureMap_avg.csv",index=False) 

# The labels are written without any averaging.
t1 = t1.cpu()
t1_np = t1.detach().numpy()
df = pd.DataFrame(t1_np) 
df.to_csv(dump_at + "labels.csv",index=False) 

In [None]:
def _as_cpu_array(collected):
    """Return ``collected`` (a list/tuple of tensors or a single tensor) as a NumPy array."""
    if isinstance(collected, (list, tuple)):
        collected = torch.stack(collected)
    return collected.cpu().detach().numpy()


def _average_positions(array):
    """Average a 3-D array over its last axis; return any other array unchanged."""
    if array.ndim == 3:
        return np.sum(array, axis=-1) / array.shape[2]
    return array


def write_features(n, dump_at=None) :
    """Write the collected per-stage features and the labels to CSV files tagged with ``n``.

    Reads the globals ``features1`` .. ``features5`` and ``labels_p``. Each of them
    may be a list of per-sample tensors (stacked here) or an already stacked
    tensor. Features with three dimensions are averaged over their last axis
    before writing; features that are already two-dimensional are written as they
    are. Labels are written without averaging.

    Args:
        n: integer tag that is formatted into every file name with ``%d``.
        dump_at: directory prefix (including the trailing separator) that the file
            names are appended to. Defaults to the notebook's Drive folder for
            ``data_flag``.
    """
    if dump_at is None:
        dump_at = '/content/drive/MyDrive/BTP/' + data_flag + '/'

    # Convert everything up front so a bad input fails before any file is written.
    feature_arrays = [
        _as_cpu_array(collected)
        for collected in (features1, features2, features3, features4, features5)
    ]
    label_array = _as_cpu_array(labels_p)

    file_patterns = (
        "L1_FeatureMap_avg_%d.csv",
        "L2_FeatureMap_avg_%d.csv",
        "L3_FeatureMap_avg_%d.csv",
        "L4_FeatureMap_avg_%d.csv",
        "L5_FeatureMap_avg_%d.csv",
    )
    for array, pattern in zip(feature_arrays, file_patterns):
        df = pd.DataFrame(_average_positions(array))
        df.to_csv(dump_at + pattern%(n),index=False)

    df = pd.DataFrame(label_array)
    df.to_csv(dump_at + "labels_%d.csv"%(n),index=False)

    return