In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib widget

In [2]:
import numpy as np
import random
import sys
sys.path.append('tools')

import os

import torch

import dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import pytorch_lightning.loggers as pl_loggers
import pytorch_lightning.callbacks as pl_callbacks
import data_utility, annotation_utility
from models.rns_dataloader import get_data, RNS_Downstream
from active_learning_utility import get_strategy
from active_learning_data import Data
from active_learning_net import Net
from copy import deepcopy
from models.SwaV import SwaV
from models.SupervisedDownstream import SupervisedDownstream
import warnings

warnings.filterwarnings("ignore", ".*Consider increasing the value of the `num_workers` argument*")
warnings.filterwarnings("ignore", ".*Set a lower value for log_every_n_steps if you want to see logs for the training epoch*")

In [None]:
# Seed every random number generator used in this notebook (Python, PyTorch, NumPy) with one value.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    # True makes cuDNN use deterministic algorithms
    torch.backends.cudnn.deterministic = True
    # torch.set_deterministic(True)
    # False disables cuDNN auto-tuning, so the same algorithm is selected on every run
    torch.backends.cudnn.benchmark = False

In [None]:
# Root locations, relative to the notebook's working directory, for input data, logs and checkpoints.
data_dir = "../../../user_data/"
log_folder_root = '../../../user_data/logs/'
ckpt_folder_root = '../../../user_data/checkpoints/'

In [None]:
# Name of the query strategy; it is stored in args_task and passed to get_strategy().
strategy_name = 'RandomSampling'

In [None]:
# Labelling schedule, each value expressed as a percentage of the training pool size:
#   nStart - share of the pool labelled before the first round,
#   nEnd   - share of the pool labelled once all rounds are finished,
#   nQuery - share of the pool added per query round.
nStart = 1
nEnd = 20
nQuery = 2

In [None]:
# Task settings handed to Data, Net and get_strategy.
# NOTE(review): `collate_fn` is only defined in a later cell of this notebook, so this cell raises
# NameError on a fresh kernel unless that definition has been run (or imported) first.
args_task = {'n_epoch': 80,
             'transform_train': True,
             'strategy_name': strategy_name,
             'transform': False,
             'loader_tr_args': {'batch_size': 128, 'num_workers': 0, 'collate_fn': collate_fn,
                                'drop_last': True},
             'loader_te_args': {'batch_size': 256, 'num_workers': 0, 'collate_fn': collate_fn,
                                'drop_last': True}
             }

In [None]:
# Read the annotation table and keep the unique HUP_ID values of rows whose 'descriptions'
# field is not null; those IDs are then passed to read_files as the patients to import.
raw_annotations = pd.read_csv(data_dir + 'full_updated_anns_annotTbl_cleaned.csv')
ids = list(np.unique(raw_annotations[raw_annotations['descriptions'].notnull()]['HUP_ID']))
# ids = list(np.unique(raw_annotations['HUP_ID']))

data_import = data_utility.read_files(path=data_dir+'rns_data', path_data=data_dir+'rns_raw_cache', patientIDs=ids,
                                      verbose=True)  # Import data with annotation

In [None]:
# Turn the percentage settings (nStart / nEnd / nQuery) into absolute sample counts.
# NOTE(review): y_train / y_test are produced by the get_data cell, which has to be run first.
n_pool = len(y_train)
n_test = len(y_test)

NUM_INIT_LB = int(nStart * n_pool / 100)
NUM_QUERY = int(nQuery * n_pool / 100) if nStart != 100 else 0

if NUM_QUERY == 0:
    # Nothing can be queried: either everything is labelled up front (nStart == 100) or the pool
    # is so small that nQuery percent rounds down to zero. Dividing by NUM_QUERY here would
    # raise ZeroDivisionError, so the number of rounds is simply zero.
    NUM_ROUND = 0
else:
    n_to_query = int(nEnd * n_pool / 100) - NUM_INIT_LB  # labels still to acquire after round 0
    NUM_ROUND = int(n_to_query / NUM_QUERY)
    # One extra, partially filled round when the remainder is not a multiple of NUM_QUERY.
    if n_to_query % NUM_QUERY != 0:
        NUM_ROUND += 1

print(NUM_INIT_LB)
print(NUM_QUERY)
print(NUM_ROUND)

In [None]:
# Every file in the cache directory is handed to get_data, which returns the train/test arrays.
# split=0.8 is presumably the training fraction - confirm against get_data.
data_list = os.listdir('rns_test_cache')

X_train, y_train, X_test, y_test = get_data(data_list, split=0.8)
# data, label,_,_ = get_data(data_list, split=1)
# train_data, test_data, train_label, test_label = sklearn.model_selection.train_test_split(data, label, test_size=0.8, random_state=42)

# Show the shape of each returned array, in the order train data/labels, test data/labels.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

In [None]:
# Bundle the train/test arrays with RNS_Downstream and the task settings.
# NOTE(review): this rebinds `dataset`, shadowing the `dataset` module imported at the top.
dataset = Data(X_train, y_train, X_test, y_test, RNS_Downstream, args_task)

In [None]:
# Restore the pretrained SwaV model and reuse its backbone for the supervised downstream model.
# load_from_checkpoint is a classmethod, so it is called on the class itself: building a
# throwaway SwaV() instance first is wasted work, and recent Lightning releases refuse the
# call when it is made on an instance.
swav = SwaV.load_from_checkpoint(
    ckpt_folder_root + 'kaggle_dog_swav_34/kaggle_dog_swav-epoch=116-swav_loss=2.73583.ckpt')
model = SupervisedDownstream(swav.backbone)
# Keep a copy of the untrained downstream weights; each later round reloads it before training.
modelstate = deepcopy(model.state_dict())
device = "cuda" if torch.cuda.is_available() else "cpu"
net = Net(model, args_task, device)

In [None]:
# Build the query strategy named by strategy_name from the dataset wrapper, the network and args_task.
strategy = get_strategy(strategy_name, dataset, net, None, args_task)

In [None]:
# initial round of training, round 0
# NUM_INIT_LB (nStart percent of the pool) is the number of initial labels requested before training.
dataset.initialize_labels(NUM_INIT_LB)
strategy.train()

In [None]:
# Query / retrain rounds: each round asks the strategy for NUM_QUERY new samples, registers them,
# restores the saved initial weights (modelstate) and trains again.
for rd in range(1, NUM_ROUND + 1):
    print(f'round {rd}')
    q_idxs = strategy.query(NUM_QUERY)
    strategy.update(q_idxs)
    strategy.net.round = rd  # record the current round number on the network wrapper
    strategy.net.net.load_state_dict(modelstate)
    strategy.train()

In [4]:
# import torchvision.transforms as T
# from torch.utils.data import Dataset
# class RNS_Active(Dataset):
#     def __init__(self, data, label, transform=None, astensor=True):
#         self.data = data
#         self.label = label
#         self.transform = transform['transform']
#         print('data loaded')
#
#         self.label = self.label[np.newaxis].T
#
#         self.length = len(self.data)
#
#         print(data.shape)
#         print(label.shape)
#
#         if astensor:
#             self.augmentation = T.Compose([
#                 T.ToPILImage(),
#                 T.Resize((256, 256), interpolation=T.InterpolationMode.NEAREST),
#                 T.RandomApply([T.ColorJitter()], p=0.5),
#                 T.RandomApply([T.GaussianBlur(kernel_size=(3, 3))], p=0.5),
#                 T.RandomInvert(p=0.2),
#                 T.RandomPosterize(4, p=0.2),
#                 T.ToTensor()
#             ])
#
#             self.totensor = T.Compose([
#                 T.ToPILImage(),
#                 T.Resize((256, 256), interpolation=T.InterpolationMode.NEAREST),
#                 T.ToTensor()
#             ])
#         else:
#             self.augmentation = T.Compose([
#                 T.ToPILImage(),
#                 T.Resize((256, 256), interpolation=T.InterpolationMode.NEAREST),
#                 T.RandomApply([T.ColorJitter()], p=0.5),
#                 T.RandomApply([T.GaussianBlur(kernel_size=(3, 3))], p=0.5),
#                 T.RandomInvert(p=0.2),
#                 T.RandomPosterize(4, p=0.2),
#             ])
#
#             self.totensor = T.Compose([
#                 T.ToPILImage(),
#                 T.Resize((256, 256), interpolation=T.InterpolationMode.NEAREST),
#             ])
#
#     def __len__(self):
#         return self.length
#
#     def __getitem__(self, index):
#         data = self.data[index]
#         label = self.label[index]
#
#         if self.transform:
#             concat_len = data.shape[1] / 4
#             channel_index = np.arange(4)
#             np.random.shuffle(channel_index)
#             channel_index = channel_index * concat_len + (concat_len - 1) / 2
#             channel_index = np.repeat(channel_index, concat_len)
#             concate_len_1 = (concat_len - 1) / 2
#             a_repeat = np.arange(-concate_len_1, concate_len_1 + 1)[np.newaxis].T
#             base_repeat = np.repeat(a_repeat, 4, axis=1).T.flatten()
#             channel_index = channel_index + base_repeat
#             data = data[channel_index.astype(int)]
#             data = torch.from_numpy(data).clone()
#             data = data.repeat(3, 1, 1)
#             data = self.augmentation(data)
#
#         else:
#             concat_len = data.shape[1] / 4
#             channel_index = np.arange(4)
#             # np.random.shuffle(channel_index)
#             channel_index = channel_index * concat_len + (concat_len - 1) / 2
#             channel_index = np.repeat(channel_index, concat_len)
#             concate_len_1 = (concat_len - 1) / 2
#             a_repeat = np.arange(-concate_len_1, concate_len_1 + 1)[np.newaxis].T
#             base_repeat = np.repeat(a_repeat, 4, axis=1).T.flatten()
#             channel_index = channel_index + base_repeat
#             data = data[channel_index.astype(int)]
#             data = torch.from_numpy(data).clone()
#             data = data.repeat(3, 1, 1)
#             data = self.totensor(data)
#
#         return data, torch.from_numpy(label).to(dtype=torch.long), index

In [7]:
# raw_annotations = pd.read_csv('full_updated_anns_annotTbl_cleaned.csv')
# ids = list(np.unique(raw_annotations[raw_annotations['descriptions'].notnull()]['HUP_ID']))
# data_import = data_utility.read_files(path='data/rns_data', path_data='rns_raw_cache', patientIDs=ids,
#                                       verbose=True)  # Import data with annotation
# annotations = annotation_utility.read_annotation(annotation_path='full_updated_anns_annotTbl_cleaned.csv',
#                                                  data=data_import, n_class=3)
# annot = annotations.annotations
# patient_list = list(np.unique(annot['Patient_ID']))
# # patient_list = ['RNS026', 'HUP159', 'HUP129', 'HUP096', 'HUP182']
#
# clip_dict = annotation_utility.combine_annot_index(annot,patient_list, 42)
#
# window_len = 1
# stride = 1
# concat_n = 4
# for id in tqdm(clip_dict.keys()):
#     data_import[id].set_window_parameter(window_length=window_len, window_displacement=stride)
#     data_import[id].set_concatenation_parameter(concatenate_window_n=concat_n)
#     window_indices, _ = data_import[id].get_windowed_data(clip_dict[id][0], clip_dict[id][1])
#     import_label = np.array([])
#     for i, ind in enumerate(window_indices):
#         import_label = np.hstack((import_label, np.repeat(clip_dict[id][2][i], len(ind))))
#     data_import[id].normalize_windowed_data()
#     _, concatenated_data = data_import[id].get_concatenated_data(data_import[id].windowed_data, arrange='channel_stack')
#     assert import_label.shape[0] == concatenated_data.shape[0]
#     np.save('rns_test_cache/' + id + '.npy', {'data': concatenated_data, 'label': import_label})

100%|██████████| 18/18 [00:10<00:00,  1.64it/s]


(2, 25)
(2, 20)
(2, 59)
(2, 39)
(2, 4)
(2, 60)
(2, 21)
(2, 41)
(2, 44)
(2, 0)
(2, 71)
(2, 99)
(2, 24)
(2, 0)
(2, 40)
(2, 0)
(2, 73)
(2, 51)


100%|██████████| 15/15 [00:12<00:00,  1.18it/s]


100%|██████████| 15/15 [00:17<00:00,  1.14s/it]

(84503, 249, 36)
(84503,)
(21132, 249, 36)
(21132,)





In [10]:
# Sum of the test-set labels (quick look at the label balance).
y_test.sum()

5320.0

In [11]:



# Prefix used for this run's checkpoint file names, checkpoint directory and CSV log directory.
save_file_name = 'rns_active_lc'

845
1690
10


In [13]:
# Draw the initial labelled set: shuffle all pool indices (NumPy global RNG) and flag the
# first NUM_INIT_LB of them as labelled in a boolean mask over the pool.
idxs_tmp = np.arange(n_pool)
np.random.shuffle(idxs_tmp)
idxs_lb = np.zeros(n_pool, dtype=bool)
idxs_lb[idxs_tmp[:NUM_INIT_LB]] = True

In [14]:
# Build a ResNet-50 without its last child module, load the SwaV weights stored in the checkpoint
# and wrap the resulting backbone in the downstream model.
# NOTE(review): `torchvision`, `nn` and `ActiveLearning` are not imported or defined in the visible
# part of this notebook - confirm where they come from before relying on Restart & Run All.
ckpt = torch.load("rns_ckpt/checkpoint31.pth")
resnet = torchvision.models.resnet50()
backbone = nn.Sequential(*list(resnet.children())[:-1])
swav = SwaV(backbone)
swav.load_state_dict(ckpt['model_state_dict'])
model = ActiveLearning(swav.backbone)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Pool indices that are currently labelled.
idxs_train = np.arange(n_pool)[idxs_lb]

# Round-0 trainer: checkpoints are selected on 'val_loss' and metrics go to a CSV logger.
# NOTE(review): accelerator='gpu' is hard-coded although `device` above falls back to CPU.
checkpoint_callback = pl_callbacks.ModelCheckpoint(monitor='val_loss', filename=save_file_name+'_round_0-{epoch:02d}-{val_loss:.5f}', dirpath=save_file_name + '_ckpt')
csv_logger = pl_loggers.CSVLogger(save_file_name + '_log', name="logger_round_0")
trainer = pl.Trainer( logger=csv_logger, max_epochs=30, callbacks=[checkpoint_callback],accelerator='gpu', devices=1,log_every_n_steps=5)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
from copy import deepcopy
# Snapshot of the model weights taken before the first fit; every later round reloads it so that
# training restarts from the same starting point.
modelstate = deepcopy(model.state_dict())

In [16]:
def collate_fn(batch):
    """Stack the first two fields of every sample into batch tensors.

    ``batch`` is a sequence of per-sample tuples whose first element is the data tensor and whose
    second element is the label tensor; any further elements are ignored.

    Returns a ``(data, labels)`` pair, each built with ``torch.stack``.
    """
    columns = list(zip(*batch))  # transpose: one tuple per field instead of one per sample
    samples, targets = columns[0], columns[1]
    return torch.stack(samples), torch.stack(targets)
# Per-split dataset options: 'transform' is True for training data and False for evaluation data.
transforms_param = {'transform_tr': {'transform': True},
                    'transform_te': {'transform': False},
                    }

# Round 0: train on the initially labelled subset and validate on the test split.
# NOTE(review): RNS_Active only appears as commented-out code in this notebook - confirm where
# the class is defined.
train_data = RNS_Active(X_train[idxs_train],y_train[idxs_train],transform=transforms_param['transform_tr'])
test_data = RNS_Active(X_test,y_test,transform=transforms_param['transform_te'])
train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=128,
                                        shuffle=True,
                                        collate_fn=collate_fn,
                                        drop_last=True, )
# drop_last=True also applies here, so a trailing incomplete validation batch is not evaluated.
val_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=128,
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=True,
)
trainer.fit(model, train_dataloader, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type       | Params
----------------------------------------
0 | backbone | Sequential | 23.5 M
1 | fc1      | Linear     | 1.0 M 
2 | fc2      | Linear     | 32.8 K
3 | fc3      | Linear     | 520   
4 | fc4      | Linear     | 18    
5 | softmax  | Softmax    | 0     
----------------------------------------
24.6 M    Trainable params
0         Non-trainable params
24.6 M    Total params
98.362    Total estimated model params size (MB)


data loaded
(845, 249, 36)
(845,)
data loaded
(21132, 249, 36)
(21132,)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


In [17]:
def random_query(idxs_lb, n):
    """Pick up to ``n`` unlabelled pool indices uniformly at random.

    idxs_lb: array whose zero/False entries mark the unlabelled samples.
    n: maximum number of indices to return.

    Draws from NumPy's global RNG, so the result depends on ``np.random.seed``.
    """
    unlabeled = np.where(idxs_lb == 0)[0]
    shuffled_positions = np.random.permutation(len(unlabeled))
    return unlabeled[shuffled_positions][:n]

def _predict_pool_probs(X_train, y_train, trainer, model, idxs_unlabeled):
    """Return softmax probabilities for the unlabelled samples, one row per sample.

    Row ``i`` of the result belongs to ``idxs_unlabeled[i]``. The loader is deliberately neither
    shuffled nor truncated: with ``shuffle=True`` / ``drop_last=True`` the prediction rows would
    no longer line up with ``idxs_unlabeled`` and the selected indices would be arbitrary.
    """
    pool_data = RNS_Active(X_train[idxs_unlabeled], y_train[idxs_unlabeled],
                           transform=transforms_param['transform_te'])
    pool_loader = torch.utils.data.DataLoader(pool_data,
                                              batch_size=128,
                                              shuffle=False,
                                              collate_fn=collate_fn,
                                              drop_last=False)
    predictions = trainer.predict(model, pool_loader)
    # Every element of `predictions` is a (pred, y) pair; only the model output is needed.
    batch_probs = [torch.softmax(pred, dim=1) for pred, _ in predictions]
    return torch.vstack(batch_probs).detach().cpu()


def _lowest_scoring(idxs_unlabeled, scores, n):
    """Map the ``n`` smallest entries of ``scores`` back to pool indices (ascending by score)."""
    order = scores.sort()[1][:n]
    return idxs_unlabeled[order.numpy()]


def entropy_query(X_train, y_train, trainer, model, idxs_lb, n):
    """Select the ``n`` unlabelled samples whose predicted distribution has the highest entropy.

    X_train, y_train: full training pool (indexable by an integer index array).
    trainer: object whose ``predict(model, dataloader)`` returns a list of ``(pred, y)`` batches.
    model: model handed to ``trainer.predict``.
    idxs_lb: boolean mask over the pool, True where a sample is already labelled.
    n: number of samples to select.

    Returns a NumPy array of pool indices, most uncertain first; empty when nothing can be
    selected (``n <= 0`` or no unlabelled samples left).
    """
    idxs_unlabeled = np.flatnonzero(~np.asarray(idxs_lb, dtype=bool))
    if n <= 0 or len(idxs_unlabeled) == 0:
        return idxs_unlabeled[:0]
    probs = _predict_pool_probs(X_train, y_train, trainer, model, idxs_unlabeled)
    # sum(p * log p) is the negative entropy, so the smallest values are the most uncertain.
    # Clamping keeps log() finite when a probability underflows to exactly zero.
    neg_entropy = (probs * torch.log(probs.clamp_min(1e-12))).sum(1)
    return _lowest_scoring(idxs_unlabeled, neg_entropy, n)


def lease_conf_query(X_train, y_train, trainer, model, idxs_lb, n):
    """Select the ``n`` unlabelled samples with the lowest top-class probability (least confidence).

    Takes the same arguments and returns the same kind of result as ``entropy_query``; the only
    difference is the score: the maximum class probability, smallest first.
    """
    idxs_unlabeled = np.flatnonzero(~np.asarray(idxs_lb, dtype=bool))
    if n <= 0 or len(idxs_unlabeled) == 0:
        return idxs_unlabeled[:0]
    probs = _predict_pool_probs(X_train, y_train, trainer, model, idxs_unlabeled)
    top_prob = probs.max(1)[0]
    return _lowest_scoring(idxs_unlabeled, top_prob, n)

In [1]:
# Active-learning rounds: pick new samples to label, then retrain from the saved initial weights.
label_cap = int(nEnd * n_pool / 100)  # total number of pool samples that may ever be labelled
# Use the GPU when there is one and fall back to CPU otherwise, instead of failing in pl.Trainer.
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

for rd in range(1, NUM_ROUND + 1):
    print('Round {}/{}'.format(rd, NUM_ROUND), flush=True)

    # Clip the request in a local variable: overwriting NUM_QUERY would leave a shrunken query
    # size behind and change the result when this cell is run again.
    n_query = min(NUM_QUERY, label_cap - int(idxs_lb.sum()))
    if n_query <= 0:
        break  # label budget already used up

    q_idxs = lease_conf_query(X_train, y_train, trainer, model, idxs_lb, n_query)
    # Alternative strategies:
    # q_idxs = entropy_query(X_train, y_train, trainer, model, idxs_lb, n_query)
    # q_idxs = random_query(idxs_lb, n_query)

    # Work on a copy so the mask from the previous round is never modified in place.
    idxs_lb = idxs_lb.copy()
    idxs_lb[q_idxs] = True
    print(int(idxs_lb.sum()))

    idxs_train = np.arange(n_pool)[idxs_lb]
    train_data = RNS_Active(X_train[idxs_train], y_train[idxs_train], transform=transforms_param['transform_tr'])
    test_data = RNS_Active(X_test, y_test, transform=transforms_param['transform_te'])
    train_dataloader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=128,
                                                   shuffle=True,
                                                   collate_fn=collate_fn,
                                                   drop_last=True, )
    val_dataloader = torch.utils.data.DataLoader(
        test_data,
        batch_size=128,
        collate_fn=collate_fn,
        shuffle=False,
        drop_last=True,
    )

    # Restart from the untrained snapshot so rounds differ only in their labelled set.
    model.load_state_dict(modelstate)
    # A fresh checkpoint callback, logger and trainer per round keeps each round's files separate.
    checkpoint_callback = pl_callbacks.ModelCheckpoint(monitor='val_loss', filename=save_file_name+'_round_' + str(
        rd) + '-{epoch:02d}-{val_loss:.5f}', dirpath=save_file_name + '_ckpt')
    csv_logger = pl_loggers.CSVLogger(save_file_name + '_log', name="logger_round_" + str(rd))
    trainer = pl.Trainer(logger=csv_logger, max_epochs=30, callbacks=[checkpoint_callback],
                         accelerator=accelerator, devices=1, log_every_n_steps=5)
    trainer.fit(model, train_dataloader, val_dataloader)

NameError: name 'NUM_ROUND' is not defined