In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib widget

In [2]:
import numpy as np
import random
import sys
sys.path.append('../tools')

import os

import torch

import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import pytorch_lightning.loggers as pl_loggers
import pytorch_lightning.callbacks as pl_callbacks
import data_utility, annotation_utility
from models.rns_dataloader import *
from active_learning_utility import get_strategy
from active_learning_data import Data
from active_learning_net import Net
from copy import deepcopy
from models.SwaV import SwaV
from models.SupervisedDownstream import SupervisedDownstream
import warnings

warnings.filterwarnings("ignore", ".*Consider increasing the value of the `num_workers` argument*")
warnings.filterwarnings("ignore", ".*Set a lower value for log_every_n_steps if you want to see logs for the training epoch*")

In [3]:
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    # True ensures the algorithm selected by CUFA is deterministic
    torch.backends.cudnn.deterministic = True
    # torch.set_deterministic(True)
    # False ensures CUDA select the same algorithm each time the application is run
    torch.backends.cudnn.benchmark = False

import pytorch_lightning
pytorch_lightning.utilities.seed.seed_everything(seed=random_seed, workers=True)

Global seed set to 42


42

In [4]:
data_dir = "../../../user_data/"
log_folder_root = '../../../user_data/logs/'
ckpt_folder_root = '../../../user_data/checkpoints/'

In [5]:
strategy_name = 'LeastConfidence'

In [6]:
nStart = 1
nEnd = 20
nQuery = 2

In [7]:
args_task = {'n_epoch': 100,
             'transform_train': True,
             'strategy_name': strategy_name,
             'transform': False,
             'loader_tr_args': {'batch_size': 256, 'num_workers': 4, 'collate_fn': collate_fn,
                                'drop_last': True,'persistent_workers':True},
             'loader_te_args': {'batch_size': 256, 'num_workers': 8, 'collate_fn': collate_fn,
                                'drop_last': True,'persistent_workers':True}
             }

In [8]:
# raw_annotations = pd.read_csv(data_dir + 'full_updated_anns_annotTbl_cleaned.csv')
# ids = list(np.unique(raw_annotations[raw_annotations['descriptions'].notnull()]['HUP_ID']))
# # ids = list(np.unique(raw_annotations['HUP_ID']))
#
# data_import = data_utility.read_files(path=data_dir+'rns_data', path_data=data_dir+'rns_raw_cache', patientIDs=ids,
#                                       verbose=True)  # Import data with annotation

In [9]:
# data_list = os.listdir(data_dir+'rns_test_cache')
# print(data_list)
data_list = ['HUP047.npy', 'HUP084.npy', 'HUP096.npy', 'HUP109.npy', 'HUP121.npy', 'HUP129.npy', 'HUP131.npy',
             'HUP137.npy', 'HUP147.npy', 'HUP156.npy', 'HUP159.npy', 'HUP182.npy', 'HUP197.npy', 'HUP199.npy',
             'RNS026.npy', 'RNS029.npy']
X_train, y_train, X_test, y_test, index_train, index_test  = get_data(data_list, split=0.8)
# data, label,_,_ = get_data(data_list, split=1)
# train_data, test_data, train_label, test_label = sklearn.model_selection.train_test_split(data, label, test_size=0.8, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

100%|██████████| 16/16 [00:14<00:00,  1.09it/s]

(86197, 249, 36)
(86197,)
(21556, 249, 36)
(21556,)





In [10]:
n_pool = len(y_train)
n_test = len(y_test)

NUM_INIT_LB = int(nStart * n_pool / 100)
NUM_QUERY = int(nQuery * n_pool / 100) if nStart != 100 else 0
NUM_ROUND = int((int(nEnd * n_pool / 100) - NUM_INIT_LB) / NUM_QUERY) if nStart != 100 else 0
if NUM_QUERY != 0:
    if (int(nEnd * n_pool / 100) - NUM_INIT_LB) % NUM_QUERY != 0:
        NUM_ROUND += 1

print(NUM_INIT_LB)
print(NUM_QUERY)
print(NUM_ROUND)

861
1723
10


In [11]:
dataset = Data(X_train, y_train, X_test, y_test, RNS_Downstream, args_task)

In [12]:
swav = SwaV().load_from_checkpoint(
    ckpt_folder_root + 'rns_swav_50_12/rns_swav-epoch=82-swav_loss=2.58204.ckpt')
model = SupervisedDownstream(swav.backbone)
# initialize model and save the model state
modelstate = deepcopy(model.state_dict())
device = "cuda" if torch.cuda.is_available() else "cpu"


net = Net(model, args_task, device, ckpt_folder_root = 'rns_active', log_folder_root = 'rns_active')

In [13]:
strategy = get_strategy(strategy_name, dataset, net, None, args_task)

In [14]:
# initial round of training, round 0
dataset.initialize_labels(NUM_INIT_LB)
strategy.train()

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[226, 265, 342, 347, 379, 618, 632, 712, 957, 1003, 1424, 1638, 1841, 2212, 2220, 2225, 2267, 2320, 2449, 2493, 2513, 2544, 2605, 2623, 2685, 2740, 2833, 3050, 3137, 3342, 3502, 3711, 3780, 3804, 4079, 4162, 4272, 4363, 4402, 4472, 4505, 4635, 5012, 5040, 5058, 5069, 5173, 5442, 5453, 5604, 5676, 5866, 5923, 6084, 6459, 6649, 6660, 6662, 6729, 6924, 6956, 7024, 7389, 7478, 7625, 7968, 8131, 8158, 8225, 8296, 8344, 8473, 8854, 8879, 9030, 9249, 9639, 9652, 9724, 9887, 9945, 10085, 10164, 10182, 10262, 10365, 10399, 10463, 10530, 10574, 10593, 10762, 10803, 10815, 11128, 11425, 11597, 11619, 11622, 11751, 11791, 11980, 12042, 12143, 12266, 12450, 12469, 12520, 12582, 12635, 12684, 12691, 12928, 12945, 13022, 13076, 13108, 13132, 13143, 13567, 13694, 13956, 14005, 14117, 14131, 14452, 14475, 14579, 14644, 14813, 15113, 15199, 15267, 15323, 15351, 15371, 15497, 15618, 15697, 15722, 15736, 15844, 16160, 16252, 16275, 16443, 16458, 16467, 16525, 16540, 16754, 16764, 17002, 17147, 17262, 1737

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [15]:
# q_idxs = strategy.query(NUM_QUERY)

In [16]:
unlabeled_idxs, unlabeled_data = strategy.dataset.get_unlabeled_data()
probs = strategy.predict_prob(unlabeled_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


data loaded
(85336, 249, 36)
(85336,)


Predicting: 3it [00:00, ?it/s]

In [17]:
index_train[unlabeled_idxs]

array([(b'HUP047', 1102,  0,   180296), (b'HUP047', 1102,  1,   180296),
       (b'HUP047', 1102,  2,   180296), ...,
       (b'RNS029', 3810, 21, 60804040), (b'RNS029', 3810,  0, 60809715),
       (b'RNS029', 3810,  1, 60809715)],
      dtype=[('patient_index', 'S10'), ('episode_index', '<i4'), ('slice_index', '<i4'), ('start_index', '<i4')])

In [21]:
unique_episodes, unique_episodes_counts = np.unique(index_train[unlabeled_idxs]['episode_index'], return_counts=True)

In [22]:
import time

uncertainties = probs.max(1)[0]
orders = torch.sort(uncertainties,stable = True)[1]
ranks = torch.argsort(orders)
start_time = time.perf_counter()

rank_list = [ranks[index_train[unlabeled_idxs]['episode_index'] == epi_ind]for epi_ind in unique_episodes]
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  3.05195380000805


In [23]:
first_n = 5000
from copy import deepcopy
emission_table = []
rank_list_copy = deepcopy(rank_list)
emission_table = [torch.where(rl < first_n, 0.85, 0.15) for rl in rank_list_copy]
rank_list_copy = deepcopy(rank_list)
active_rank_list = [torch.where(rl < first_n, 1, 0) for rl in rank_list_copy]
ordered_rank_list = []
rank_list_copy = deepcopy(rank_list)
active_rank_list = [torch.where(rl < first_n, rl, 0) for rl in rank_list_copy]

In [34]:
avg_rank = torch.sort(torch.tensor([torch.sum(rl)/(torch.count_nonzero(rl)+1e-6) for rl in active_rank_list]))
episode_rank = avg_rank[1][avg_rank[0]>0]

tensor([ 866,  820,  107,  339,  654,  395,  833,  407,  158,  122,  364,  741,
         167,  382,   12,  363,  799,  360,  304,  359,  343,  387,  956,  838,
         345,  126,  162,  789,  720,  168,   34,  624,  223,  369,  491,   22,
         490,  926,  414,  904,  787,    1,  119,  296,    3,   96,  769,  816,
         412,  396,  665,  217,  973,  934,  328,  770,  264,  410,  790,  664,
         262,   40,  607,  850,   51,  771,  511,  795,  831,  922,  828,    0,
         150,  468,  842,  219,  498,  136,  792,  885,  367,  228,  557,  449,
         878,  178,  314,  946,  967,  752,  921,  205,  586,  915,  415,   80,
         517,  212,  569,  313,   83,   36,  151,  311,  893,  527,  729,  202,
         626,  848,  406,  465,  994,  213,  951,  867,  577,   89,  435,   71,
          75,   15,  639,   43,  907,  454,  445,  444,  349,  287,  604,  398,
         821,   47,  943,  199,  279,  225,  473,  611,  616,  270,  553,   52,
         470,  222,   11,  901,  909,  8

In [40]:
active_rank_list[395]

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 102])

In [184]:
s = []
for ls in active_rank_list:
    s.append(torch.sum(ls))

In [185]:
torch.sort(torch.tensor(s),descending =True)

torch.return_types.sort(
values=tensor([25, 24, 24,  ...,  0,  0,  0]),
indices=tensor([ 648,  537,  535,  ...,  388,  290, 1003]))

In [147]:
l[torch.argsort(torch.tensor(s),descending =True)[2]]

tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0])

In [186]:
from models.HMM import HMM

hmm = HMM(active_rank_list)


array([0.69920319, 0.30079681])

In [198]:
start_time = time.perf_counter()
filtered_list = []
for i in range(len(active_rank_list)):
    emission_mat_1 = emission_table[i]
    emission_mat_0 = 1-emission_mat_1
    emission_mat = torch.vstack([emission_mat_1,emission_mat_0]).T
    predicted_label = 1 - hmm.predict_labels(active_rank_list[i].numpy(), emission_mat.numpy())
    filtered_list.append(predicted_label)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  1.3014186000073096


In [244]:
def filter_continuous_positive(array, length_m):

    # Find the indices where 1s start and end
    start_indices = np.where(np.diff(array) == 1)[0] + 1
    end_indices = np.where(np.diff(array) == -1)[0]

    if array[0] == 1:
        start_indices = np.insert(start_indices, 0, 0)
    if array[-1] == 1:
        end_indices = np.append(end_indices, len(array) - 1)

    lengths = end_indices - start_indices + 1

    filtered_indices = start_indices[lengths > length_m]
    filtered_lengths = lengths[lengths>length_m]

    result_array = np.zeros_like(array)
    for start_idx,length in zip(filtered_indices,filtered_lengths):
        result_array[start_idx:start_idx + length] = 1

    return result_array

In [245]:

start_time = time.perf_counter()
filtered_filtered_list = [filter_continuous_positive(arr,9) for arr in filtered_list]
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  0.029054699989501387


In [246]:
sum([np.sum(arr) for arr in filtered_filtered_list])

2161

In [247]:
np.hstack(filtered_filtered_list).shape

(85336,)

In [None]:
for rd in range(1, NUM_ROUND +1):
    print('round ' + str(rd))
    q_idxs = strategy.query(NUM_QUERY)
    strategy.update(q_idxs)
    strategy.net.round = rd
    strategy.net.net.load_state_dict(modelstate)
    strategy.train()