In [1]:
import plotly
%load_ext autoreload
%autoreload 2
%matplotlib widget

In [2]:
import numpy as np
import random
import sys

sys.path.append('../tools')

import os

import torch

import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import pytorch_lightning.loggers as pl_loggers
import pytorch_lightning.callbacks as pl_callbacks
import data_utility, annotation_utility
from models.rns_dataloader import *
from active_learning_utility import get_strategy
from active_learning_data import Data
from active_learning_net import Net
from active_learning_waal import Net_WAAL, WAAL, Classifier,Discriminator
from active_learning_lpl import Classifier, LossNet, LPL, Backbone
from copy import deepcopy
from models.SwaV import SwaV
from models.LSTMDownStream import SupervisedDownstream
import warnings
import pickle
warnings.filterwarnings("ignore")


In [3]:
# One seed shared by every random number generator used in this notebook.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    # True restricts cuDNN to deterministic algorithms
    torch.backends.cudnn.deterministic = True
    # torch.set_deterministic(True)
    # False disables the cuDNN auto-tuner so the same algorithm is selected on every run
    torch.backends.cudnn.benchmark = False

import pytorch_lightning

# Call the top-level `seed_everything` export rather than the
# `pytorch_lightning.utilities.seed` path, which is deprecated in newer releases.
# Left as the last expression so the cell still displays the seed that was set.
pytorch_lightning.seed_everything(seed=random_seed, workers=True)

Global seed set to 42


42

In [4]:
data_dir = "../../../user_data/"
log_folder_root = '../../../user_data/logs/'
ckpt_folder_root = '../../../user_data/checkpoints/'

In [5]:
strategy_name = 'LossPredictionLoss'

In [6]:
# Labelling budget expressed as percentages of the training pool; each value is
# later converted to a count with `int(value * n_pool / 100)`.
nStart = 1  # share of the pool labelled before the first round
nEnd = 20  # share of the pool at which querying stops
nQuery = 2  # share of the pool added per query round

In [7]:
# Task configuration handed to `Data`, `Net` and `get_strategy` in the cells below.
args_task = {'n_epoch': 60,
             'transform_train': True,
             'strategy_name': strategy_name,
             'transform': False,
             # keyword arguments for the training DataLoader
             'loader_tr_args': {'batch_size': 8, 'num_workers': 4, 'collate_fn': collate_fn,
                                'drop_last': True, 'persistent_workers': True},
             # keyword arguments for the evaluation / query DataLoader
             # NOTE(review): drop_last=True discards a trailing partial batch, so up to
             # batch_size - 1 items may never be evaluated or scored — confirm this is intended.
             'loader_te_args': {'batch_size': 4, 'num_workers': 4, 'collate_fn': collate_fn,
                                'drop_last': True, 'persistent_workers': True}
             }

In [8]:
# raw_annotations = pd.read_csv(data_dir + 'full_updated_anns_annotTbl_cleaned.csv')
# ids = list(np.unique(raw_annotations[raw_annotations['descriptions'].notnull()]['HUP_ID']))
# # ids = list(np.unique(raw_annotations['HUP_ID']))
#
# data_import = data_utility.read_files(path=data_dir+'rns_data', path_data=data_dir+'rns_raw_cache', patientIDs=ids,
#                                       verbose=True)  # Import data with annotation

In [9]:
# Cached recordings to load. Alternative selections kept for reference:
# data_list = os.listdir(data_dir+'rns_test_cache')
# print(data_list)
# data_list = os.listdir(data_dir+'rns_test_cache')[1:]
# data_list = ['HUP182.npy',   'HUP129.npy',   'HUP109.npy', 'HUP156.npy', 'HUP096.npy', 'RNS026.npy',  'HUP159.npy']
# data_list = ['RNS026.npy', 'HUP159.npy', 'HUP129.npy', 'HUP096.npy', 'HUP182.npy']
data_list = [
    'HUP047.npy', 'HUP084.npy', 'HUP096.npy', 'HUP109.npy',
    'HUP121.npy', 'HUP129.npy', 'HUP131.npy', 'HUP137.npy',
    'HUP147.npy', 'HUP156.npy', 'HUP159.npy', 'HUP182.npy',
    'HUP197.npy', 'HUP199.npy', 'RNS026.npy', 'RNS029.npy',
]

# Split into train / test portions (split=0.8); data, labels and indices are returned per split.
# Previous approach, kept for reference:
# data, label,_,_ = get_data(data_list, split=1)
# train_data, test_data, train_label, test_label = sklearn.model_selection.train_test_split(data, label, test_size=0.8, random_state=42)
(train_data, train_label,
 test_data, test_label,
 train_index, test_index) = get_data_by_episode(data_list, split=0.8)

# Report the shape of each returned array, in the same order as before.
for split_array in (train_data, train_label, test_data, test_label):
    print(split_array.shape)

16it [00:07,  2.27it/s]

(992,)
(992,)
(270,)
(270,)





In [10]:
# Concatenate the per-item arrays of each split into single flat arrays.
X_train, y_train, index_train = (
    np.concatenate(parts) for parts in (train_data, train_label, train_index)
)
X_test, y_test, index_test = (
    np.concatenate(parts) for parts in (test_data, test_label, test_index)
)

# Length of every label array, so the flat arrays can be cut back into their original pieces.
seq_len_train = np.array([labels.shape[0] for labels in train_label])
seq_len_test = np.array([labels.shape[0] for labels in test_label])

In [11]:
n_pool = len(train_data)
n_test = len(test_data)

# Percentages are converted to whole item counts (truncated).
NUM_INIT_LB = int(nStart * n_pool / 100)
target_labeled = int(nEnd * n_pool / 100)

if nStart == 100:
    # Everything is labelled from the start: nothing to query.
    NUM_QUERY = 0
    NUM_ROUND = 0
else:
    NUM_QUERY = int(nQuery * n_pool / 100)
    if NUM_QUERY == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError(
            'nQuery=' + str(nQuery) + ' selects zero items from a pool of ' + str(n_pool)
            + '; increase nQuery or the pool size')
    remaining = max(0, target_labeled - NUM_INIT_LB)
    # Ceiling division: a final, smaller round covers any remainder.
    NUM_ROUND = -(-remaining // NUM_QUERY)

print(NUM_INIT_LB)
print(NUM_QUERY)
print(NUM_ROUND)

9
19
10


In [12]:
dataset = Data(X_train, y_train, X_test, y_test, seq_len_train, seq_len_test, RNS_Active_by_episode_LSTM, args_task)

In [13]:
# Loading a pretrained SwaV checkpoint is disabled; no checkpoint is read in this cell.
# swav = SwaV().load_from_checkpoint(
#     ckpt_folder_root + 'rns_swav_50_12/rns_swav-epoch=82-swav_loss=2.58204.ckpt')
swav = SwaV()
# NOTE(review): `Classifier` is imported from both active_learning_waal and
# active_learning_lpl in the import cell; the later import (active_learning_lpl) wins.
model = LPL(Backbone(swav.backbone), Classifier(2048), LossNet())
# Snapshot of the freshly built weights; the training loop reloads it before each retraining.
modelstate = deepcopy(model.state_dict())
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the literal 'rns_active' is passed for both roots instead of the
# `ckpt_folder_root` / `log_folder_root` variables — presumably Net resolves them itself; verify.
net = Net(model, args_task, device, ckpt_folder_root='rns_active', log_folder_root='rns_active')

In [14]:
strategy = get_strategy(strategy_name, dataset, net, None, args_task, rns_data=True)

In [15]:
# initial round of training, round 0
dataset.initialize_labels(NUM_INIT_LB)

In [16]:
ids, _ = dataset.get_labeled_data()

# Record of what was labelled in each round; round 0 is the initial labelled set.
selected_inds = {}
selected_inds[0] = ids

# exist_ok=True replaces the separate existence check and is safe if the
# directory is created between the check and the call.
selected_dir = log_folder_root + 'rns_active_selected/' + strategy_name
os.makedirs(selected_dir, exist_ok=True)

# Serialize and save to a file
with open(selected_dir + '/' + 'selected_indices.pkl', 'wb') as f:
    pickle.dump(selected_inds, f)

In [17]:
strategy.train()

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=60` reached.


In [18]:
import re

# Paths that do not change between rounds.
selected_path = log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl'
ckpt_directory = ckpt_folder_root + 'rns_active/active_checkpoints_' + strategy_name

for rd in range(1, NUM_ROUND + 1):
    print('round ' + str(rd))
    # Metrics written while training the previous round.
    log_file_name = log_folder_root + 'rns_active/active_logs_' + strategy_name + '/logger_round_' + str(
        rd - 1) + '/version_0/metrics.csv'
    logs = pd.read_csv(log_file_name)
    # Row with the best validation accuracy in the previous round.
    max_ind = logs['val_acc'].argmax()
    max_row = logs.iloc[max_ind]
    ckpt_files = os.listdir(ckpt_directory)
    load_file_name = strategy_name + '_round_' + str(rd - 1) + '-step=' + str(int(max_row['step'] + 1))
    print(load_file_name)

    # The negative lookahead stops e.g. 'step=6' from matching a 'step=60' checkpoint.
    ckpt_pattern = re.compile(re.escape(load_file_name) + r'(?!\d)')
    ind = next((i for i, s in enumerate(ckpt_files) if ckpt_pattern.search(s)), None)
    if ind is None:
        # Fail with the missing name instead of a TypeError from indexing with None.
        raise FileNotFoundError(
            "no checkpoint matching '" + load_file_name + "' in " + ckpt_directory)
    print(ind, ckpt_files[ind])
    # Replace the wrapped network with the best checkpoint of the previous round before querying.
    strategy.net.net = strategy.net.net.load_from_checkpoint(ckpt_directory + '/' + ckpt_files[ind],
                                          net_fea=Backbone(swav.backbone),
                                          net_clf=Classifier(2048),
                                          net_lpl=LossNet())

    # NOTE(review): 90 is an unexplained multiplier on the per-round item budget
    # (presumably items-to-windows) — confirm and move it to a named constant.
    q_idxs = strategy.query(NUM_QUERY * 90)

    # Re-read the on-disk record, add this round's picks and write it back.
    with open(selected_path, 'rb') as f:
        selected_inds = pickle.load(f)
    selected_inds[rd] = q_idxs
    with open(selected_path, 'wb') as f:
        pickle.dump(selected_inds, f)

    # Label the queried items, then retrain from the initial weights snapshot.
    strategy.update(q_idxs)
    strategy.net.round = rd
    strategy.net.net.load_state_dict(modelstate)
    strategy.train()

round 1
LossPredictionLoss_round_0-step=60
2 LossPredictionLoss_round_0-step=60-train_loss=0.02222.ckpt


100%|██████████| 248/248 [02:04<00:00,  2.00it/s]


[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True 

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

round 2


FileNotFoundError: [Errno 2] No such file or directory: '../../../user_data/logs/rns_active/active_logs_LossPredictionLoss/logger_round_1/version_0/metrics.csv'

In [None]:
# Scratch cell: score every unlabeled item with the network's `net_dis` head.
# NOTE(review): the model built in this notebook is LPL(Backbone, Classifier, LossNet);
# `net_dis` is not among the modules passed to it, so this cell looks like a
# leftover from another strategy — confirm before running.
from torch.utils.data import DataLoader
unlabeled_idxs, unlabeled_data = strategy.dataset.get_unlabeled_data()
loader_te = DataLoader(unlabeled_data, shuffle=False, **strategy.args_task['loader_te_args'])

strategy.net.net.net_fea.eval()
strategy.net.net.net_dis.eval()

# One score slot per unlabeled item, filled in batch by batch below.
scores = torch.zeros(len(unlabeled_data))

with torch.no_grad():
    # NOTE(review): `tqdm` is not imported explicitly anywhere in the visible cells —
    # presumably it arrives via `from models.rns_dataloader import *`; verify.
    for x, y, idxs in tqdm(loader_te):
        # Hard-coded .cuda(): this cell needs a GPU regardless of `device`.
        x, y = x.cuda(), y.cuda()
        latent = strategy.net.net.net_fea(x).view(-1,2048)
        out = strategy.net.net.net_dis(latent).cpu()
        scores[idxs] = out.view(-1)

In [None]:
strategy.net.net.net_dis.net[3](strategy.net.net.net_dis.net[3](strategy.net.net.net_dis.net[2](strategy.net.net.net_dis.net[1](strategy.net.net.net_dis.net[0](torch.rand((180,512)).cuda())))))

In [None]:
strategy.net.net.net_dis(torch.rand((180,512)).cuda())

In [None]:
out

In [None]:
import torchvision
resnet50 = torchvision.models.resnet50()
# List the top-level sub-modules of the model built above. The loop previously
# iterated `resnet18`, which is only created in a later cell.
for name, para in resnet50.named_children():
    print("-"*20)
    print(f"name: {name}")
    print("values: ")
    print(para)

In [None]:
from torch.utils.data import DataLoader
train_loader = DataLoader(
                strategy.net.handler_joint(X_labeled, Y_labeled, X_unlabeled, Y_unlabeled,
                                   transform=strategy.net.params['transform_train']),
                shuffle=True,
                **strategy.net.params['loader_tr_args']
            )
testloader = DataLoader(test_data, shuffle=False, **strategy.net.params['loader_te_args'])

In [None]:
label_x, label_y, seq_len_label, unlabel_x, _, seq_len_unlabel = next(iter(train_loader))
len(label_x)

In [None]:
dis_loss

In [None]:
wassertein_distance

In [None]:
# Freeze the whole feature extractor first.
for param in model.net_fea.parameters():
    param.requires_grad = False

# Then re-enable training for the children named '7' and '8'.
# This must walk child modules: `named_parameters()` yields tensors under dotted
# names, so the name test never matched and tensors have no `.parameters()`.
for name, child in model.net_fea.named_children():
    if name in ['7','8']:
        for param in child.parameters():
            param.requires_grad = True

# NOTE(review): other cells read parameters via `model.net_fea`; confirm that
# `model.backbone` exists on this model.
for name, para in model.backbone.named_parameters():
    print("-"*20)
    print(f"name: {name}")
    print("values: ")
    print(para)

In [None]:
for name, child in model.net_fea.named_children():
    print(name,child)

In [None]:
model.net_fea['7']

In [None]:
backbone = swav.backbone
# Direct children of the backbone, listed once instead of once per slice.
stages = list(backbone.children())
# The first group covers children 0-3, matching `Backbone.feature0 = backbone[:4]`;
# the previous `[0:3]` silently dropped child 3 from every group.
# `torch.nn` is used directly so the cell does not depend on a later `import ... as nn`.
feature0 = torch.nn.Sequential(*stages[0:4])
feature1 = torch.nn.Sequential(*stages[4])
feature2 = torch.nn.Sequential(*stages[5])
feature3 = torch.nn.Sequential(*stages[6])
feature4 = torch.nn.Sequential(*stages[7])
feature5 = torch.nn.Sequential(*stages[8:])

In [None]:
class Backbone(pl.LightningModule):
    """Wrap an indexable, sliceable backbone so that intermediate outputs can be returned.

    NOTE(review): this re-binds the `Backbone` name imported from
    active_learning_lpl in the import cell.
    """

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        # Children 0-3 form the first group; 4-7 are kept as individual stages;
        # everything from child 8 onwards forms the final group.
        self.feature0 = backbone[:4]
        (self.feature1, self.feature2,
         self.feature3, self.feature4) = (backbone[i] for i in (4, 5, 6, 7))
        self.feature5 = backbone[8:]

    def forward(self, x):
        """Return the final output together with the outputs of stages 1-4."""
        hidden = self.feature0(x)
        stage_outputs = []
        for stage in (self.feature1, self.feature2, self.feature3, self.feature4):
            hidden = stage(hidden)
            stage_outputs.append(hidden)
        return self.feature5(hidden), stage_outputs

    def forward2(self, x):
        """Run the wrapped backbone in one call, without collecting intermediates."""
        return self.backbone(x)

In [None]:
backbone.to('cuda')[:4]

In [None]:
resnet18 = torchvision.models.resnet18()
features = nn.Sequential(*list(resnet18.children())[:-1])
bb =Backbone(backbone)
bb.to('cuda')

In [None]:
for name, child in bb.named_children():
    print(name)

In [None]:

test_input = torch.zeros((1,3,256,256)).cuda()

[t.shape for t in bb(test_input)[1]]

In [None]:
bb(test_input)

In [None]:
# `test_input1` is never defined in this notebook; use the tensor created above.
bb.forward2(test_input)

In [None]:
# Both forward paths should give the same final output.
# `test_input1` is never defined in this notebook; use `test_input`.
torch.equal(bb.forward2(test_input), bb(test_input)[0])

In [None]:
def set_requires_grad(model, requires_grad=True, exclude=None):
    """
    Used in training adversarial approach: switch gradient tracking for a model.

    :param model: module whose parameters are updated in place
    :param requires_grad: value assigned to ``requires_grad`` of every parameter
    :param exclude: optional container of direct-child names; parameters of those
        children receive the opposite value (``not requires_grad``)
    :return: None
    """
    for weight in model.parameters():
        weight.requires_grad = requires_grad

    if exclude is None:
        return

    # Excluded children get the inverse of what the rest of the model received.
    inverted = not requires_grad
    for child_name, submodule in model.named_children():
        if child_name not in exclude:
            continue
        for weight in submodule.parameters():
            weight.requires_grad = inverted

In [None]:
import torch.nn as nn
features = nn.Sequential(*list(resnet50.children())[:-1])
for name, child in features.named_children():
    print(name)

In [None]:
for name, child in model.net_fea.named_children():
    if name in [n for n, _ in model.net_fea.named_children()]:
        for param in child.parameters():
            print(name)

In [None]:
set_requires_grad(model.net_fea, True, exclude=['0','1','2','3','4','5','6'])

In [None]:
for name, para in model.net_fea.named_parameters():
     print("-"*20)
     print(f"name: {name}")
     print("values: ")
     print(para)

In [None]:
import torchvision
from torch import nn
resnet = torchvision.models.resnet50()
resnet_backbone = nn.Sequential(*list(resnet.children())[:-1])
for name, param in resnet_backbone.state_dict().items():
    print(name)

In [None]:
# Debug copy of the active-learning loop: the trailing `break` stops it after one round.
# Unlike the main loop it selects the checkpoint by '-epoch=NN' rather than '-step=N'.
for rd in range(1, NUM_ROUND + 1):
    print('round ' + str(rd))
    log_file_name = log_folder_root + 'rns_active/active_logs_' + strategy_name + '/logger_round_' + str(
        rd - 1) + '/version_0/metrics.csv'
    logs = pd.read_csv(log_file_name)
    # Row with the best validation accuracy of the previous round.
    max_ind = logs['val_acc'].argmax()
    max_row = logs.iloc[max_ind]
    ckpt_directory = ckpt_folder_root + 'rns_active/active_checkpoints_' + strategy_name
    ckpt_files = os.listdir(ckpt_directory)
    load_file_name = strategy_name + '_round_' + str(rd - 1) + '-epoch=' + str(int(max_row['epoch'])).zfill(2)

    # `ind` is None when no file matches, in which case the indexing below raises TypeError.
    ind = next((i for i, s in enumerate(ckpt_files) if load_file_name in s), None)
    print(ind, ckpt_files[ind])
    # NOTE(review): bare `load_from_checkpoint` is not defined in any visible cell, and its
    # result is discarded — this line looks stale; compare with the main loop.
    load_from_checkpoint(ckpt_directory + '/' + ckpt_files[ind], backbone=swav.backbone)

    q_idxs = strategy.query(NUM_QUERY * 90)

    with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'rb') as f:
        # Load the previously saved round -> indices record
        selected_inds = pickle.load(f)
    selected_inds[rd] = q_idxs
    with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'wb') as f:
        pickle.dump(selected_inds, f)
    # Label the queried items and retrain from the initial weights snapshot.
    strategy.update(q_idxs)
    strategy.net.round = rd
    strategy.net.net.load_state_dict(modelstate)
    strategy.train()
    break

In [None]:
for name, param in swav.backbone.state_dict().items():
    print(name)

In [None]:
torch.log(probs)

In [None]:


probs, seq_len = strategy.predict_prob(unlabeled_data)

In [None]:
unlabeled_idxs, unlabeled_data = strategy.dataset.get_train_data_unaugmented()

In [None]:
# Scratch cell: collect the loss-prediction head's output for every batch of
# `unlabeled_data` into one tensor, `uncertainty`.
from torch.utils.data import DataLoader
loader = DataLoader(unlabeled_data, shuffle=False, **strategy.args_task['loader_te_args'])
# Put all three sub-networks in eval mode and on the GPU (hard-coded .cuda()).
strategy.net.net.net_clf.eval()
strategy.net.net.net_fea.eval()
strategy.net.net.net_lpl.eval()
strategy.net.net.net_clf.cuda()
strategy.net.net.net_fea.cuda()
strategy.net.net.net_lpl.cuda()
# Starts empty and grows by one batch of predictions per iteration.
uncertainty = torch.tensor([]).cuda()
with torch.no_grad():
    # NOTE: `seq_len` here rebinds any notebook-level `seq_len` to the last batch's value.
    for x, y, seq_len in tqdm(loader):
        x, y = x.cuda(), y.cuda()
        # The feature extractor returns an embedding plus a list of intermediate features.
        lb_z, feature = strategy.net.net.net_fea(x)
        lb_z = lb_z.view(-1, 2048)
        lb_out, _, emb_t = strategy.net.net.net_clf(lb_z, seq_len)
        # The classifier's `emb_t` output is appended as one more input to the loss-prediction head.
        feature.append(emb_t)
        pred_loss = strategy.net.net.net_lpl(feature)
        # Flatten to one value per row of the head's output.
        pred_loss = pred_loss.view(pred_loss.size(0))

        uncertainty = torch.cat((uncertainty, pred_loss), 0)

In [None]:
uncertainty

In [None]:
uncertainty.shape

In [None]:
probs, seq_len = strategy.predict_prob_dropout_split(unlabeled_data, n_drop=5)


In [None]:
probs[0][0]

In [None]:
probs[1][0]

In [None]:
probs[2][0]

In [None]:
embeddings

In [None]:
cluster_learner = KMeans(n_clusters=170)
cluster_learner.fit(embeddings)

cluster_idxs = cluster_learner.predict(embeddings)
centers = cluster_learner.cluster_centers_[cluster_idxs]
dis = (embeddings - centers)**2
dis = dis.sum(axis=1)

In [None]:
import copy
labeled_idxs, train_data = strategy.dataset.get_train_data_unaugmented()


#
# dist_mat = np.matmul(embeddings, embeddings.transpose())
# sq = np.array(dist_mat.diagonal()).reshape(len(labeled_idxs), 1)
# dist_mat *= -2
# dist_mat += sq
# dist_mat += sq.transpose()
# dist_mat = np.sqrt(dist_mat)
#
# mat = dist_mat[~labeled_idxs, :][:, labeled_idxs]
#
# mat_copy = copy.deepcopy(mat)



# output = np.arange(strategy.dataset.n_pool)[(strategy.dataset.labeled_idxs ^ labeled_idxs)]

In [None]:
embeddings, seq_len  = strategy.get_grad_embeddings(train_data)


In [None]:
predictions = strategy.net.run_prediction(train_data)


In [None]:
# Scratch cell: build one vector per sample by scaling its embedding with a
# per-class coefficient derived from the softmax probabilities.
output_list = []
emb_list = []
seq_len_list = []
m = torch.nn.Softmax(dim=1)
# NOTE: the loop variables `emb` and `seq_len` overwrite any notebook-level names;
# `emb` is rebound to the stacked tensor right after the loop.
for pred, y, emb, emb_t, seq_len in predictions:
    output_list.append(pred)
    emb_list.append(emb_t)
    seq_len_list.append(seq_len)
emb = torch.vstack(emb_list)
out = emb.data.cpu().numpy()
pred_raw = torch.vstack(output_list)
batchProbs = m(pred_raw.float()).data.cpu().numpy()
# Predicted class of every row.
maxInds = np.argmax(batchProbs, 1)

nLab = batchProbs.shape[1]
embDim = emb.shape[1]
# One block of `embDim` values per class, laid side by side.
embeddings = np.zeros([len(emb), embDim * nLab])

for j in range(len(emb)):
    for c in range(nLab):
        if c == maxInds[j]:
            # Predicted class: embedding scaled by -(1 - p_c).
            embeddings[j][embDim * c: embDim * (c + 1)] = deepcopy(out[j]) * (
                    1 - batchProbs[j][c]) * -1.0
        else:
            # Any other class: embedding scaled by +p_c (the two minus signs cancel).
            embeddings[j][embDim * c: embDim * (c + 1)] = deepcopy(out[j]) * (
                    -1 * batchProbs[j][c]) * -1.0

In [None]:
embeddings

In [None]:
from sklearn.metrics import pairwise_distances
from scipy import stats
# import pdb
def init_centers(X, K):
    """Pick up to ``K`` row indices of ``X`` by distance-weighted random sampling.

    The first index is the row with the largest L2 norm. Every further index is
    drawn at random with probability proportional to the squared distance from
    each row to its nearest already-chosen row. Progress is printed per step.

    :param X: sequence of row vectors accepted by ``pairwise_distances``
    :param K: number of indices requested
    :return: list of chosen row indices; it may be shorter than ``K`` if all
        remaining distances sum to zero
    """
    # Seed with the row of largest L2 norm.
    ind = np.argmax([np.linalg.norm(s, 2) for s in X])
    mu = [X[ind]]
    indsAll = [ind]
    # `centInds` / `cent` record the nearest-center id per row; they are written but never read here.
    centInds = [0.] * len(X)
    cent = 0
    print('#Samps\tTotal Distance')
    while len(mu) < K:
        if len(mu) == 1:
            # Distance of every row to the single initial center.
            D2 = pairwise_distances(X, mu).ravel().astype(float)
        else:
            # Only the most recently added center can shrink a row's nearest distance.
            newD = pairwise_distances(X, [mu[-1]]).ravel().astype(float)
            for i in range(len(X)):
                if D2[i] >  newD[i]:
                    centInds[i] = cent
                    D2[i] = newD[i]
        print(str(len(mu)) + '\t' + str(sum(D2)), flush=True)
        # Every row coincides with a chosen center: nothing left to sample.
        if sum(D2) == 0.0: return indsAll
        D2 = D2.ravel().astype(float)
        # Sampling weights proportional to squared distance.
        Ddist = (D2 ** 2)/ sum(D2 ** 2)
        customDist = stats.rv_discrete(name='custm', values=(np.arange(len(D2)), Ddist))
        ind = customDist.rvs(size=1)[0]
        # Redraw until an index that has not been chosen yet comes up.
        while ind in indsAll: ind = customDist.rvs(size=1)[0]
        mu.append(X[ind])
        indsAll.append(ind)
        cent += 1
    return indsAll

In [None]:
import time
start_time = time.time()
chosen = init_centers(embeddings, 6000)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
strategy.keep_continuous_segments(np.sort(chosen),10)

In [None]:
norm_data = embeddings_t / np.linalg.norm(embeddings_t, axis=1, keepdims=True)

dist_mat = np.dot(norm_data, norm_data.T)

In [None]:

# Scratch cell: greedy selection of 3000 pool items against the labelled set.
# Rows of `mat` are unlabelled items, columns are labelled items.
# NOTE(review): in the preceding cell `dist_mat` is a dot product of L2-normalised
# rows, i.e. a cosine *similarity*; taking the row-wise min and then argmax is the
# usual recipe for a distance matrix — confirm the intended direction.
mat = dist_mat[~labeled_idxs, :][:, labeled_idxs]

# Copy of the initial matrix; not read again in this cell.
mat_copy = copy.deepcopy(mat)
for i in tqdm(range(3000), ncols=100):
    mat_min = mat.min(axis=1)
    # Position of the pick within the unlabelled subset ...
    q_idx_ = np.argmax(mat_min)
    # ... mapped back to its index in the full pool.
    q_idx = np.arange(strategy.dataset.n_pool)[~labeled_idxs][q_idx_]
    # Mutates `labeled_idxs` in place, so re-running the cell continues from the grown set.
    labeled_idxs[q_idx] = True
    # Drop the picked row and add the pick as a new labelled column.
    mat = np.delete(mat, q_idx_, 0)
    mat = np.append(mat, dist_mat[~labeled_idxs,q_idx][:, None], axis=1)

In [None]:
mat.shape

In [None]:
labeled_idxs

In [None]:
print(dist_mat[~labeled_idxs][:,q_idx].shape)

In [None]:
mat_ind = np.zeros_like(labeled_idxs)
mat_ind[q_idx] = True

In [None]:
mat_ind

In [None]:
~labeled_idxs

In [None]:
embeddings = embeddings_t.numpy()
#
# dist_mat = np.matmul(embeddings, embeddings.transpose())
# sq = np.array(dist_mat.diagonal()).reshape(len(labeled_idxs), 1)
# dist_mat *= -2
# dist_mat += sq
# dist_mat += sq.transpose()
# dist_mat = np.sqrt(dist_mat)
#
# mat = dist_mat[~labeled_idxs, :][:, labeled_idxs]
#
# mat_copy = copy.deepcopy(mat)

norm_data = embeddings_t / np.linalg.norm(embeddings_t, axis=1, keepdims=True)

similarity_matrix = np.dot(norm_data, norm_data.T)

In [None]:
similarity_matrix[0]

In [None]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
test = cos(embeddings_t,embeddings_t[0])

In [None]:
np.array_equal(similarity_matrix[0], test.numpy())

In [None]:
np.mean(similarity_matrix[0]-test.numpy())

In [None]:
output = np.arange(strategy.dataset.n_pool)[(strategy.dataset.labeled_idxs ^ labeled_idxs)]

In [None]:
output

In [None]:
changed_ind = np.where(np.sign(np.diff(output)-1)==1)[0]
changed_ind

In [None]:
l = 0
for i in range(len(greater_than_10)):
    l+= len(output[changed_ind[greater_than_10][i]+1:changed_ind[greater_than_10+1][i]+1])
l

In [None]:
changed_ind[greater_than_10],changed_ind[greater_than_10+1]

In [None]:
greater_than_10 = np.where(np.diff(changed_ind)>8)[0]

In [None]:
np.diff(changed_ind)

In [None]:
len(greater_than_10)

In [None]:
def keep_continuous_segments(arr, n):
    """Keep only the runs of ``arr`` that lie between two gaps at least ``n`` apart.

    A gap is a position where consecutive values of ``arr`` differ by more than 1.
    Runs before the first gap and after the last gap are never returned.

    :param arr: 1-D array-like of values, expected in ascending order
    :param n: minimum number of positions between two neighbouring gaps
    :return: 1-D array with the kept runs concatenated in order (empty if none qualify)
    """
    arr = np.asarray(arr)
    # Position i is a gap when arr[i + 1] - arr[i] > 1.
    changed_ind = np.where(np.diff(arr) > 1)[0]
    # Keep the runs whose two bounding gaps are at least n positions apart.
    valid_split = np.where(np.diff(changed_ind) >= n)[0]
    start_ind = changed_ind[valid_split] + 1
    end_ind = changed_ind[valid_split + 1] + 1
    # Slice the argument itself; this used to read the notebook global `output`,
    # so the function ignored what it was given.
    pieces = [np.empty(0, dtype=int)]
    pieces.extend(arr[start:end] for start, end in zip(start_ind, end_ind))
    return np.hstack(pieces)

In [None]:
np.diff(keep_continuous_segments(output,8))

In [None]:
from torch_kmeans import KMeans
n = 1700
labeled_idxs, train_data = strategy.dataset.get_train_data_unaugmented()
embeddings, embeddings_t, seq_len  = strategy.get_embeddings(train_data)
# embeddings = embeddings_t.numpy()


In [None]:
from fast_pytorch_kmeans import KMeans
import time
start_time = time.time()

cluster_learner = KMeans(n_clusters=100, mode = 'cosine', verbose=1)
cluster_learner.fit(embeddings)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
cluster_idxs = cluster_learner.predict(embeddings)
centers = cluster_learner.centroids[cluster_idxs]
dis = (embeddings - centers) ** 2
dis = dis.sum(axis=1)

In [None]:
unlabeled_idxs, unlabeled_data = strategy.dataset.get_train_data_unaugmented()
embeddings, embeddings_t, seq_len = strategy.get_embeddings(unlabeled_data)





In [None]:
cluster_idxs = cluster_learner.predict(embeddings_t)
centers = cluster_learner.centroids[cluster_idxs]
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
dis = cos(embeddings_t, centers)
uncertainties = dis

In [None]:
start_time = time.time()
cluster_learner = KMeans(n_clusters=500, mode='cosine', verbose=1)
cluster_learner.fit(embeddings_t)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
dis.shape

In [None]:
dist_mat[~labeled_idxs, q_idx][:, None]

In [None]:
labeled_idxs[q_idx]

In [None]:
q_idx

In [None]:
mat.min(axis=1)

In [None]:
mat = dist_mat[~unlabeled_idxs, :][:, unlabeled_idxs]

In [None]:
# Debug variant of the greedy selection loop that prints the matrix shape
# before and after removing the picked row.
for i in tqdm(range(100), ncols=100):
    mat_min = mat.min(axis=1)
    print(mat.shape)
    q_idx_ = mat_min.argmax()
    q_idx = np.arange(strategy.dataset.n_pool)[~unlabeled_idxs][q_idx_]
    unlabeled_idxs[q_idx] = True
    mat = np.delete(mat, q_idx_, 0)
    print(mat.shape)
    # The loop stops after its first iteration, so the append below never runs.
    break
    # NOTE(review): this line masks with `labeled_idxs` while the rest of the cell
    # uses `unlabeled_idxs` — reconcile before removing the `break`.
    mat = np.append(mat, dist_mat[~labeled_idxs, q_idx][:, None], axis=1)

In [None]:
# Cluster the embeddings into n groups and pick, for each cluster, the member
# closest to its centroid. The statements were indented as if pasted from inside
# a method, which made the cell a syntax error; they are now at top level.
# NOTE(review): `cluster_centers_` is the scikit-learn attribute name; nearby cells
# import KMeans from torch_kmeans / fast_pytorch_kmeans and read `.centroids` —
# make sure the intended KMeans is the one in scope.
cluster_learner = KMeans(n_clusters=n)
cluster_learner.fit(embeddings)

cluster_idxs = cluster_learner.predict(embeddings)
centers = cluster_learner.cluster_centers_[cluster_idxs]
# Squared Euclidean distance of every sample to its own cluster centre.
dis = (embeddings - centers)**2
dis = dis.sum(axis=1)
q_idxs = np.array([np.arange(embeddings.shape[0])[cluster_idxs==i][dis[cluster_idxs==i].argmin()] for i in range(n)])

In [None]:
1700/60

In [None]:
np.concatenate(unlabeled_data.label)

In [None]:
n = 1700
# Active score: the largest class probability of every row
# (assumes `probs` is a torch tensor of shape (samples, classes) — TODO confirm).
# Alternative scores tried earlier are kept commented out below.
# log_probs = torch.log(probs)
uncertainties = probs.max(1)[0]
# uncertainties = (probs * log_probs).sum(1)
# probs_sorted, idxs = probs.sort(descending=True)
# uncertainties = probs_sorted[:, 0] - probs_sorted[:,1]
# sigma_c = np.std(probs.numpy(), axis=0)
# uncertainties = torch.from_numpy(np.mean(sigma_c, axis=-1))
# uncertainties = moving_average(uncertainties,10)
# uncertainties = dist_mat[:, output].min(1)
# uncertainties = dis
# Ascending sort of the scores.
indices = np.argsort(uncertainties)
data_sorted = uncertainties[indices]
# Inverse permutation: indexing a sorted array with it restores the original order.
original_order = indices.argsort()
data_original = data_sorted[original_order]
# Rank fraction of each sorted position, from 1/N up to 1.
cdf = np.arange(1, len(uncertainties) + 1) / len(uncertainties)
# Rescale those fractions linearly so they span exactly [0, 1].
linear_data =np.interp(cdf, (cdf.min(), cdf.max()), (0, 1))



In [None]:

dist_mat[:, output].min(1)

In [None]:
uncertainties.sort(descending=True)[0]

In [None]:

# Compare the sorted raw scores with the thresholded rank curve (0.2 minus rescaled rank).
value2 = 0.2-linear_data
value1 = data_sorted
# NOTE(review): `plt` is first imported in a later cell, so this cell fails on a fresh kernel.
plt.figure()
plt.plot(value1)
plt.plot(value2)
# NOTE(review): 85111 is a hard-coded x-extent — presumably the number of samples; verify.
plt.hlines(0,0,85111,colors='red')
plt.show()
# Count how many entries of each curve are strictly positive.
print('value1', len(np.where(np.sign(value1)>0)[0]))
print('value2', len(np.where(np.sign(value2)>0)[0]))

In [None]:
data_sorted

In [None]:
uncertainties.min()

In [None]:
def moving_average(data, window_size):
    """
    Smooth a 1-D series with an equally weighted sliding window.

    Parameters:
    - data: array-like, the series to smooth.
    - window_size: int, how many neighbouring observations are averaged per output value.

    Returns:
    - array with the same length as the longer of ``data`` and the window
      (``mode='same'`` convolution).
    """
    # Every tap contributes 1 / window_size.
    kernel = np.divide(np.ones(window_size), window_size)
    return np.convolve(data, kernel, mode='same')

In [None]:
# uncertainties_metric = value1[original_order]
uncertainties_metric = value2[original_order]
metrics = strategy.dataset.combine_window_to_episode(uncertainties_metric, seq_len)
to_select = strategy.get_combined_important(torch.flatten(seq_len), metrics, 1700)
print('selected', np.sum(to_select))

In [None]:
value1

In [None]:
# Plot a short index range [to_plot1, to_plot2) of the scores, the thresholded
# rank curve, the selection mask and the labels on one figure.
to_plot1 = 3576
to_plot2 = 3667
import matplotlib.pyplot as plt
x = np.arange(to_plot1,to_plot2)
plt.figure(figsize=(25,12))
plt.plot(x,uncertainties[to_plot1:to_plot2])
# `value2` is in sorted order; `original_order` maps it back to sample order.
plt.plot(x,value2[original_order][to_plot1:to_plot2])
plt.plot(x,to_select[to_plot1:to_plot2])
plt.plot(x,np.concatenate(unlabeled_data.label)[to_plot1:to_plot2], '-.')
# plt.plot(x,probs.argmax(1)[to_plot1:to_plot2], '--', color = 'k')
plt.hlines(0,to_plot1,to_plot2,colors='red')
# Vertical lines at the cumulative sequence lengths that fall strictly inside the plotted range.
plt.vlines(np.cumsum(seq_len)[np.where((np.cumsum(seq_len)>to_plot1)&(np.cumsum(seq_len)<to_plot2))[0]],-0.75,1, colors='red')
plt.show()

In [None]:
np.abs(np.cumsum(seq_len)-3580).argmin()

In [None]:
np.cumsum(seq_len)[41]

In [None]:

plt.figure()
plt.plot(unlabeled_data.data[40][:,:,4].flatten()+2,color = 'k')
plt.plot(unlabeled_data.data[40][:,:,13].flatten()+1,color = 'k')
plt.plot(unlabeled_data.data[40][:,:,22].flatten(),color = 'k')
plt.plot(unlabeled_data.data[40][:,:,31].flatten()-1,color = 'k')
plt.axvspan(np.where(np.diff(extended) == 1)[0][0], np.where(np.diff(extended) == -1)[0][0], color="yellow", alpha=0.3)

plt.show()

In [None]:
extended = np.repeat(to_select[np.cumsum(seq_len)[40]:np.cumsum(seq_len)[41]],249)

In [None]:
# Selection mask between cumulative offsets 40 and 41. The slice previously used
# offset 40 at both ends and was therefore always empty.
to_select[np.cumsum(seq_len)[40]:np.cumsum(seq_len)[41]]

In [None]:
pb.shape

In [None]:
# Import the submodule explicitly: `import sklearn` alone does not guarantee
# that `sklearn.metrics` is loaded.
import sklearn.metrics

# classification_report expects (y_true, y_pred). The arguments were swapped,
# which exchanges precision and recall in the report.
clf_report = sklearn.metrics.classification_report(
    np.concatenate(unlabeled_data.label), probs.argmax(1), digits=6)

print(f"Classification Report : \n{clf_report}")

In [None]:
unlabeled_data.label

In [None]:
plt.figure()
plt.plot(0.2-linear_data[original_order][:to_plot])
plt.hlines(0,0,to_plot,colors='gold')
plt.show()

In [None]:
len(uncertainties)

In [None]:

linear_data2, seq_len2 = strategy.dataset.get_slice_from_episode(linear_data[data_sorted_argsort], seq_len, ~unlabeled_idxs)
linear_data3 = np.concatenate(linear_data2)

In [None]:
threshold = 0.90

linear_data4 = strategy.dataset.combine_window_to_episode(threshold - linear_data3, seq_len2)
to_select = strategy.get_combined_important(torch.flatten(seq_len2), linear_data4, n)

unlabeled_idxs, _ = strategy.dataset.get_unlabeled_data()
print('selected', np.sum(to_select), threshold)

In [None]:
np.argsort(threshold - linear_data3)

In [None]:
uncertainties = probs.max(1)[0]
uncertainties, seq_len = strategy.dataset.get_slice_from_episode(uncertainties, seq_len, ~unlabeled_idxs)

In [None]:
log_probs = torch.log(probs)
uncertainties = (probs * log_probs).sum(1)

In [None]:
len(uncertainties)

In [None]:
cum_sum_index = np.cumsum(seq_len)
cum_sum_index = np.insert(cum_sum_index, 0, 0)


print(len(uncertainties))
print(cum_sum_index[-1])

In [None]:
uncertainties.sort()[0][:100]

In [None]:
import matplotlib.pyplot as plt
plt.figure()
plt.hist(uncertainty.cpu(), bins=100, color='blue')
plt.title('Distribution of the Array')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

In [None]:


# Plotting the CDF
plt.figure()
plt.plot(data_sorted, cdf)
plt.plot(0.2-linear_data,cdf)
plt.title('CDF of the Distribution')
plt.xlabel('Value')
plt.ylabel('CDF')
plt.grid(True)
plt.show()

In [None]:
np.vstack((data_sorted,cdf,linear_data))

In [None]:
n= 1700
linear_data, seq_len = strategy.dataset.get_slice_from_episode(uncertainties, seq_len, ~unlabeled_idxs)
# linear_data = np.concatenate(linear_data)
#
# threshold = 0.97
#
# metrics = strategy.dataset.combine_window_to_episode(threshold - uncertainties, seq_len)
# to_select = strategy.get_combined_important(torch.flatten(seq_len), metrics, n)
#
# unlabeled_idxs, _ = strategy.dataset.get_unlabeled_data()
# print('selected', np.sum(to_select), threshold)

In [None]:
len(linear_data)

In [None]:
np.cumsum(seq_len)

In [None]:


fig, axs = plt.subplots(3, 1, figsize=(10, 15))

axs[0].hist(uncertainties, bins=30, color='skyblue')
axs[0].set_title("Poisson Distribution")

axs[1].hist(uncertainties, bins=30, color='lightgreen')
axs[1].set_title("Uniform Distribution (CDF of Poisson)")

axs[2].plot(linear_data)
axs[2].set_title("Linear Transformation with Slope of 1")

In [None]:

# Reduced variant of the active-learning loop: the steps that reload the best
# checkpoint of the previous round and that record the queried indices in
# selected_indices.pkl are disabled here, so each round queries with whatever
# network state the previous iteration left behind.
for rd in range(1, NUM_ROUND + 1):
    # NOTE(review): 90 is an unexplained multiplier on the per-round budget — confirm.
    q_idxs = strategy.query(NUM_QUERY * 90)

    # Label the queried items, reset to the initial weights snapshot and retrain.
    strategy.update(q_idxs)
    strategy.net.round = rd
    strategy.net.net.load_state_dict(modelstate)
    strategy.train()