In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib widget

In [2]:
import numpy as np
import random
import sys
sys.path.append('../tools')

import os

import torch

import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import pytorch_lightning.loggers as pl_loggers
import pytorch_lightning.callbacks as pl_callbacks
import data_utility, annotation_utility
from models.rns_dataloader import *
from active_learning_utility import get_strategy
from active_learning_data import Data
from active_learning_net import Net
from copy import deepcopy
from models.SwaV import SwaV
from models.SupervisedDownstream import SupervisedDownstream
import warnings

warnings.filterwarnings("ignore", ".*Consider increasing the value of the `num_workers` argument*")
warnings.filterwarnings("ignore", ".*Set a lower value for log_every_n_steps if you want to see logs for the training epoch*")

In [3]:
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    # True ensures the algorithm selected by CUFA is deterministic
    torch.backends.cudnn.deterministic = True
    # torch.set_deterministic(True)
    # False ensures CUDA select the same algorithm each time the application is run
    torch.backends.cudnn.benchmark = False

import pytorch_lightning
pytorch_lightning.utilities.seed.seed_everything(seed=random_seed, workers=True)

Global seed set to 42


42

In [4]:
data_dir = "../../../user_data/"
log_folder_root = '../../../user_data/logs/'
ckpt_folder_root = '../../../user_data/checkpoints/'

In [5]:
strategy_name = 'LeastConfidence'

In [6]:
nStart = 1
nEnd = 20
nQuery = 2

In [7]:
args_task = {'n_epoch': 100,
             'transform_train': True,
             'strategy_name': strategy_name,
             'transform': False,
             'loader_tr_args': {'batch_size': 256, 'num_workers': 4, 'collate_fn': collate_fn,
                                'drop_last': True,'persistent_workers':True},
             'loader_te_args': {'batch_size': 256, 'num_workers': 8, 'collate_fn': collate_fn,
                                'drop_last': True,'persistent_workers':True}
             }

In [8]:
# raw_annotations = pd.read_csv(data_dir + 'full_updated_anns_annotTbl_cleaned.csv')
# ids = list(np.unique(raw_annotations[raw_annotations['descriptions'].notnull()]['HUP_ID']))
# # ids = list(np.unique(raw_annotations['HUP_ID']))
#
# data_import = data_utility.read_files(path=data_dir+'rns_data', path_data=data_dir+'rns_raw_cache', patientIDs=ids,
#                                       verbose=True)  # Import data with annotation

In [9]:
# data_list = os.listdir(data_dir+'rns_test_cache')
# print(data_list)
data_list = ['HUP047.npy', 'HUP084.npy', 'HUP096.npy', 'HUP109.npy', 'HUP121.npy', 'HUP129.npy', 'HUP131.npy',
             'HUP137.npy', 'HUP147.npy', 'HUP156.npy', 'HUP159.npy', 'HUP182.npy', 'HUP197.npy', 'HUP199.npy',
             'RNS026.npy', 'RNS029.npy']
X_train, y_train, X_test, y_test, index_train, index_test  = get_data(data_list, split=0.8)
# data, label,_,_ = get_data(data_list, split=1)
# train_data, test_data, train_label, test_label = sklearn.model_selection.train_test_split(data, label, test_size=0.8, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

100%|██████████| 16/16 [00:16<00:00,  1.06s/it]

(86197, 249, 36)
(86197,)
(21556, 249, 36)
(21556,)





In [13]:
unique_episodes, unique_episodes_counts = np.unique(index_train[index_train['episode_index']], return_counts=True)

In [28]:
def indices_of_change(arr):
    # Find indices where changes occur
    change_indices = np.where(arr[:-1] != arr[1:])[0]

    # Increment indices by 1 to get the index of the changed element
    change_indices += 1

    # Add index 0 if the first element is a change
    if arr[0] != arr[1]:
        change_indices = np.insert(change_indices, 0, 0)

    # Add index n-1 if the last element is a change
    if arr[-1] != arr[-2]:
        change_indices = np.append(change_indices, len(arr) - 1)

    return change_indices
np.diff(indices_of_change(index_train['episode_index']))

array([38, 90, 90, ..., 88, 89, 89], dtype=int64)

In [39]:
import torch
import torch.nn as nn

# Define your LSTM model
class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(MyLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x, lengths):
        # Sort input sequence by length
        sorted_lengths, sorted_idx = lengths.sort(descending=True)
        x_sorted = x[sorted_idx]

        # Pack the padded sequence
        x_packed = nn.utils.rnn.pack_padded_sequence(x_sorted, sorted_lengths, batch_first=True)
        # print(x_packed.size())
        # Forward pass through LSTM
        out_packed, _ = self.lstm(x_packed)
        # print(out_packed.size())
        # Unpack the output sequence
        out_sorted, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
        print(out_sorted.size())

        # Reorder the output sequence
        _, original_idx = sorted_idx.sort()
        out = out_sorted[original_idx]

        return out

# Example usage
input_size = 10
hidden_size = 20
num_layers = 2
batch_size = 5
max_seq_length = 8

# Random input sequence
x = torch.randn(batch_size, max_seq_length, input_size)

# Generate random lengths for each sequence in the batch
lengths = torch.randint(1, max_seq_length, (batch_size,))

# Masking: Create mask tensor where padded positions are 0 and others are 1
mask = torch.arange(max_seq_length).expand(len(lengths), max_seq_length) < lengths.unsqueeze(1)

# Create LSTM model instance
lstm_model = MyLSTM(input_size, hidden_size, num_layers)

# Forward pass
output = lstm_model(x, lengths)

# Apply mask to ignore padded positions
masked_output = output[mask].view(batch_size, -1, hidden_size)

torch.Size([5, 6, 20])


IndexError: The shape of the mask [5, 8] at index 1 does not match the shape of the indexed tensor [5, 6, 20] at index 1

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset

# Sample dataset with varying sequence lengths
class VariableLengthDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Function to pad and pack sequences
def collate_fn(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)  # Sort by sequence length
    sequences, labels = zip(*batch)
    sequences_padded = pad_sequence(sequences, batch_first=True)
    lengths = [len(seq) for seq in sequences]
    return sequences_padded, torch.tensor(labels), lengths

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        # Pack the sequence
        packed_input = pack_padded_sequence(x, lengths, batch_first=True)
        packed_output, (ht, ct) = self.lstm(packed_input)
        # We use the last hidden state to classify
        output = self.fc(ht[-1])
        return output

# Example usage
input_size = 10  # Number of features
hidden_size = 50
output_size = 2  # Number of output classes

model = LSTMModel(input_size, hidden_size, output_size)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Assuming 'data' is your dataset of variable-length sequences and 'labels' their corresponding labels
dataset = VariableLengthDataset(data, labels)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Training loop
for epoch in range(epochs):
    for sequences, labels, lengths in dataloader:
        optimizer.zero_grad()
        outputs = model(sequences, lengths)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

In [35]:
mask.size()

torch.Size([5, 8])

In [36]:
output.size()

torch.Size([5, 5, 20])

In [31]:
output.size()

torch.Size([5, 7, 20])

In [10]:
n_pool = len(y_train)
n_test = len(y_test)

NUM_INIT_LB = int(nStart * n_pool / 100)
NUM_QUERY = int(nQuery * n_pool / 100) if nStart != 100 else 0
NUM_ROUND = int((int(nEnd * n_pool / 100) - NUM_INIT_LB) / NUM_QUERY) if nStart != 100 else 0
if NUM_QUERY != 0:
    if (int(nEnd * n_pool / 100) - NUM_INIT_LB) % NUM_QUERY != 0:
        NUM_ROUND += 1

print(NUM_INIT_LB)
print(NUM_QUERY)
print(NUM_ROUND)

861
1723
10


In [11]:
dataset = Data(X_train, y_train, X_test, y_test, RNS_Downstream, args_task)

In [12]:
swav = SwaV().load_from_checkpoint(
    ckpt_folder_root + 'rns_swav_50_12/rns_swav-epoch=82-swav_loss=2.58204.ckpt')
model = SupervisedDownstream(swav.backbone)
# initialize model and save the model state
modelstate = deepcopy(model.state_dict())
device = "cuda" if torch.cuda.is_available() else "cpu"


net = Net(model, args_task, device, ckpt_folder_root = 'rns_active', log_folder_root = 'rns_active')

In [13]:
strategy = get_strategy(strategy_name, dataset, net, None, args_task)

In [None]:
train_emb = strategy.get_embeddings(X_train)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [15]:
# initial round of training, round 0
# dataset.initialize_labels(NUM_INIT_LB)
strategy.train()

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[]
data loaded
(0, 249, 36)
(0,)
data loaded
(21556, 249, 36)
(21556,)


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [15]:
# q_idxs = strategy.query(NUM_QUERY)

In [25]:
unlabeled_data = strategy.dataset.get_test_data()
test_emb = strategy.get_embeddings(unlabeled_data)

data loaded
(21556, 249, 36)
(21556,)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [26]:
np.save(data_dir + 'rns_emb/test_emb.npy', test_emb.cpu().detach().numpy())

In [27]:
test_emb.size()

torch.Size([21556, 2048])

In [21]:
unique_episodes, unique_episodes_counts = np.unique(index_train[unlabeled_idxs]['episode_index'], return_counts=True)

In [22]:
import time

uncertainties = probs.max(1)[0]
orders = torch.sort(uncertainties,stable = True)[1]
ranks = torch.argsort(orders)
start_time = time.perf_counter()

rank_list = [ranks[index_train[unlabeled_idxs]['episode_index'] == epi_ind]for epi_ind in unique_episodes]
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  3.05195380000805


In [23]:
first_n = 5000
from copy import deepcopy
emission_table = []
rank_list_copy = deepcopy(rank_list)
emission_table = [torch.where(rl < first_n, 0.85, 0.15) for rl in rank_list_copy]
rank_list_copy = deepcopy(rank_list)
active_rank_list = [torch.where(rl < first_n, 1, 0) for rl in rank_list_copy]
ordered_rank_list = []
rank_list_copy = deepcopy(rank_list)
active_rank_list = [torch.where(rl < first_n, rl, 0) for rl in rank_list_copy]

In [34]:
avg_rank = torch.sort(torch.tensor([torch.sum(rl)/(torch.count_nonzero(rl)+1e-6) for rl in active_rank_list]))
episode_rank = avg_rank[1][avg_rank[0]>0]

tensor([ 866,  820,  107,  339,  654,  395,  833,  407,  158,  122,  364,  741,
         167,  382,   12,  363,  799,  360,  304,  359,  343,  387,  956,  838,
         345,  126,  162,  789,  720,  168,   34,  624,  223,  369,  491,   22,
         490,  926,  414,  904,  787,    1,  119,  296,    3,   96,  769,  816,
         412,  396,  665,  217,  973,  934,  328,  770,  264,  410,  790,  664,
         262,   40,  607,  850,   51,  771,  511,  795,  831,  922,  828,    0,
         150,  468,  842,  219,  498,  136,  792,  885,  367,  228,  557,  449,
         878,  178,  314,  946,  967,  752,  921,  205,  586,  915,  415,   80,
         517,  212,  569,  313,   83,   36,  151,  311,  893,  527,  729,  202,
         626,  848,  406,  465,  994,  213,  951,  867,  577,   89,  435,   71,
          75,   15,  639,   43,  907,  454,  445,  444,  349,  287,  604,  398,
         821,   47,  943,  199,  279,  225,  473,  611,  616,  270,  553,   52,
         470,  222,   11,  901,  909,  8

In [40]:
active_rank_list[395]

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 102])

In [184]:
s = []
for ls in active_rank_list:
    s.append(torch.sum(ls))

In [185]:
torch.sort(torch.tensor(s),descending =True)

torch.return_types.sort(
values=tensor([25, 24, 24,  ...,  0,  0,  0]),
indices=tensor([ 648,  537,  535,  ...,  388,  290, 1003]))

In [147]:
l[torch.argsort(torch.tensor(s),descending =True)[2]]

tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0])

In [186]:
from models.HMM import HMM

hmm = HMM(active_rank_list)


array([0.69920319, 0.30079681])

In [198]:
start_time = time.perf_counter()
filtered_list = []
for i in range(len(active_rank_list)):
    emission_mat_1 = emission_table[i]
    emission_mat_0 = 1-emission_mat_1
    emission_mat = torch.vstack([emission_mat_1,emission_mat_0]).T
    predicted_label = 1 - hmm.predict_labels(active_rank_list[i].numpy(), emission_mat.numpy())
    filtered_list.append(predicted_label)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  1.3014186000073096


In [244]:
def filter_continuous_positive(array, length_m):

    # Find the indices where 1s start and end
    start_indices = np.where(np.diff(array) == 1)[0] + 1
    end_indices = np.where(np.diff(array) == -1)[0]

    if array[0] == 1:
        start_indices = np.insert(start_indices, 0, 0)
    if array[-1] == 1:
        end_indices = np.append(end_indices, len(array) - 1)

    lengths = end_indices - start_indices + 1

    filtered_indices = start_indices[lengths > length_m]
    filtered_lengths = lengths[lengths>length_m]

    result_array = np.zeros_like(array)
    for start_idx,length in zip(filtered_indices,filtered_lengths):
        result_array[start_idx:start_idx + length] = 1

    return result_array

In [245]:

start_time = time.perf_counter()
filtered_filtered_list = [filter_continuous_positive(arr,9) for arr in filtered_list]
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Elapsed time:  0.029054699989501387


In [246]:
sum([np.sum(arr) for arr in filtered_filtered_list])

2161

In [247]:
np.hstack(filtered_filtered_list).shape

(85336,)

In [None]:
for rd in range(1, NUM_ROUND +1):
    print('round ' + str(rd))
    q_idxs = strategy.query(NUM_QUERY)
    strategy.update(q_idxs)
    strategy.net.round = rd
    strategy.net.net.load_state_dict(modelstate)
    strategy.train()