In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib widget

In [2]:
import numpy as np
import random
import sys

sys.path.append('../tools')

import os

import torch

import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
import pytorch_lightning.loggers as pl_loggers
import pytorch_lightning.callbacks as pl_callbacks
import data_utility, annotation_utility
from models.rns_dataloader import *
from active_learning_utility import get_strategy
from active_learning_data import Data
from active_learning_net import Net
from copy import deepcopy
from models.SwaV import SwaV
from models.LSTMDownStream import SupervisedDownstream
import warnings
import pickle
warnings.filterwarnings("ignore")


In [3]:
from sklearn.model_selection import KFold
import torch.nn as nn
import sklearn
def get_accuracy_scores(pred_raw, target, out, episode_out, episode_target):

    kf = KFold(n_splits=100, shuffle=True, random_state=42)
    accuracies = []
    f1_scores = []
    auc_scores = []
    episode_acc = []
    episode_f1s = []
    for train_ind, test_ind in kf.split(out):
        pred_split, target_split, pred_raw_split = out[test_ind],target[test_ind],m(pred_raw.float())[:,1][test_ind]
        acc = sklearn.metrics.accuracy_score(pred_split, target_split)
        f1_score = sklearn.metrics.f1_score(pred_split, target_split, pos_label=1, average='macro')
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(target_split, pred_raw_split, pos_label=1)
        auc = sklearn.metrics.auc(fpr, tpr)
        accuracies.append(acc)
        f1_scores.append(f1_score)
        auc_scores.append(auc)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    for train_ind, test_ind in kf.split(episode_out):
        episode_pred_split, episode_target_split = episode_out[test_ind], episode_target[test_ind]
        episode_acc.append(sklearn.metrics.accuracy_score(episode_pred_split,episode_target_split))
        episode_f1s.append(sklearn.metrics.f1_score(episode_pred_split,episode_target_split, pos_label=1, average='macro'))
    return accuracies, f1_scores, auc_scores, episode_acc, episode_f1s

In [4]:
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    # True ensures the algorithm selected by CUFA is deterministic
    torch.backends.cudnn.deterministic = True
    # torch.set_deterministic(True)
    # False ensures CUDA select the same algorithm each time the application is run
    torch.backends.cudnn.benchmark = False

import pytorch_lightning

pytorch_lightning.utilities.seed.seed_everything(seed=random_seed, workers=True)

Global seed set to 42


42

In [5]:
data_dir = "../../../user_data/"
log_folder_root = '../../../user_data/logs/'
ckpt_folder_root = '../../../user_data/checkpoints/'

In [14]:
strategy_name_list = ['BadgeSampling']
for strategy_name in strategy_name_list:
    nStart = 1
    nEnd = 20
    nQuery = 2
    args_task = {'n_epoch': 60,
                 'transform_train': True,
                 'strategy_name': strategy_name,
                 'transform': False,
                 'loader_tr_args': {'batch_size': 4, 'num_workers': 4, 'collate_fn': collate_fn,
                                    'drop_last': True, 'persistent_workers': True},
                 'loader_te_args': {'batch_size': 8, 'num_workers': 4, 'collate_fn': collate_fn,
                                    'drop_last': True, 'persistent_workers': True}
                 }
    data_list = ['HUP047.npy', 'HUP084.npy', 'HUP096.npy', 'HUP109.npy', 'HUP121.npy', 'HUP129.npy', 'HUP131.npy',
                 'HUP137.npy', 'HUP147.npy', 'HUP156.npy', 'HUP159.npy', 'HUP182.npy', 'HUP197.npy', 'HUP199.npy',
                 'RNS026.npy', 'RNS029.npy']
    # data_list = os.listdir(data_dir+'rns_test_cache')[1:]
    
    # data_list = ['HUP182.npy',   'HUP129.npy',   'HUP109.npy', 'HUP156.npy', 'HUP096.npy', 'RNS026.npy',  'HUP159.npy']
    # data_list = ['RNS026.npy', 'HUP159.npy', 'HUP129.npy', 'HUP096.npy', 'HUP182.npy']
    train_data, train_label, test_data, test_label, train_index, test_index = get_data_by_episode(data_list, split=0.8)
    # data, label,_,_ = get_data(data_list, split=1)
    # train_data, test_data, train_label, test_label = sklearn.model_selection.train_test_split(data, label, test_size=0.8, random_state=42)
    
    print(train_data.shape)
    print(train_label.shape)
    print(test_data.shape)
    print(test_label.shape)
    X_train = np.concatenate(train_data)
    y_train = np.concatenate(train_label)
    X_test = np.concatenate(test_data)
    y_test = np.concatenate(test_label)
    index_train = np.concatenate(train_index)
    index_test = np.concatenate(test_index)
    seq_len_train = np.array([y.shape[0] for y in train_label])
    seq_len_test = np.array([y.shape[0] for y in test_label])
    
    n_pool = len(train_data)
    n_test = len(test_data)
    
    NUM_INIT_LB = int(nStart * n_pool / 100)
    NUM_QUERY = int(nQuery * n_pool / 100) if nStart != 100 else 0
    NUM_ROUND = int((int(nEnd * n_pool / 100) - NUM_INIT_LB) / NUM_QUERY) if nStart != 100 else 0
    if NUM_QUERY != 0:
        if (int(nEnd * n_pool / 100) - NUM_INIT_LB) % NUM_QUERY != 0:
            NUM_ROUND += 1
    
    print(NUM_INIT_LB)
    print(NUM_QUERY)
    print(NUM_ROUND)
    
    NUM_ROUND = 8
    
    dataset = Data(X_train, y_train, X_test, y_test, seq_len_train, seq_len_test, RNS_Active_by_episode_LSTM, args_task)
    
    swav = SwaV().load_from_checkpoint(
        ckpt_folder_root + 'rns_swav_50_12/rns_swav-epoch=82-swav_loss=2.58204.ckpt')
    model = SupervisedDownstream(swav.backbone)
    # initialize model and save the model state
    modelstate = deepcopy(model.state_dict())
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    net = Net(model, args_task, device, ckpt_folder_root='rns_active', log_folder_root='rns_active')
    
    strategy = get_strategy(strategy_name, dataset, net, None, args_task, rns_data=True)
    
    trainer = pl.Trainer(accelerator='gpu',
                         devices=1,
                         precision=16,)
    
    strategy.net.trainer = trainer
    
    result_final = {}
    for rd in range(0, NUM_ROUND+1):
    # for rd in [10]:
        print('round ' + str(rd))
        log_file_name = log_folder_root + 'rns_active/active_logs_' + strategy_name + '/logger_round_' + str(
            rd) + '/version_0/metrics.csv'
        logs = pd.read_csv(log_file_name)
        # max_ind = logs['val_acc'].argmax()
        # max_row = logs.iloc[max_ind]
        sinds = np.argsort(list(logs['val_acc']))[::-1]
        ckpt_directory = ckpt_folder_root + 'rns_active/active_checkpoints_' + strategy_name
        ckpt_files = os.listdir(ckpt_directory)
        
        n = np.min((len(ckpt_files),3))
        
        selected_ind = sinds[~np.isnan(logs['val_acc'][sinds])][:n]
        
        result_list = []
        for s_idx in selected_ind:
            try:
                max_row = logs.iloc[s_idx]
                load_file_name = strategy_name + '_round_' + str(rd) + '-step=' + str(int(max_row['step']+1))
                print(load_file_name)
                
                ind = next((i for i, s in enumerate(ckpt_files) if load_file_name in s), None)
                print(ind, ckpt_files[ind])
                strategy.net.net = strategy.net.net.load_from_checkpoint(ckpt_directory + '/' + ckpt_files[ind], backbone=swav.backbone)
                
                with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'rb') as f:
                # Load the content of the file into a Python object
                    selected_inds = pickle.load(f)
                
                q_idxs = np.array(selected_inds[rd])
                
                strategy.update(q_idxs)
                
                _, unlabeled_data = strategy.dataset.get_train_data_unaugmented()
                predictions = strategy.net.run_prediction(unlabeled_data)
                output_list = []
                seq_len_list = []
                target_list = []
                emb_list = []
                m = nn.Softmax(dim=1)
                for pred, y, emb, emb_t, seq_len in predictions:
                    output_list.append(pred)
                    seq_len_list.append(seq_len)
                    target_list.append(y)
                    emb_list.append(emb)
                
                target = torch.concat(target_list)
                emb = torch.vstack(emb_list)
                pred_raw = torch.vstack(output_list).float()
                seq_len_out = torch.tensor([item for sublist in seq_len_list for item in sublist])
                probs = m(pred_raw)
                out = torch.argmax(pred_raw, dim=1)
                unlabeled_idxs, _ = strategy.dataset.get_unlabeled_data()
                
                pred_label_split = strategy.dataset.combine_window_to_episode(out, seq_len_train, ~strategy.dataset.labeled_idxs)
                test_label_split = strategy.dataset.combine_window_to_episode(target, seq_len_train, ~strategy.dataset.labeled_idxs)
                episode_out = torch.tensor([np.sign(tl.sum()) for tl in pred_label_split])
                episode_target = torch.tensor([np.sign(tl.sum()) for tl in test_label_split])
                
                result_list.append(get_accuracy_scores(pred_raw[unlabeled_idxs], target[unlabeled_idxs], out[unlabeled_idxs], episode_out, episode_target))
            except:
                continue
        
        result_final[rd] = result_list
    
    np.save('save_acc/'+ strategy_name + '.npy', result_final)
    del model
    del strategy
    del swav
    del net

16it [00:05,  2.67it/s]


(992,)
(992,)
(270,)
(270,)
9
19
10


Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


round 0
BadgeSampling_round_0-step=120
1 BadgeSampling_round_0-step=120-train_loss=0.01188.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_0-step=60
10 BadgeSampling_round_0-step=60-train_loss=0.02524.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_0-step=220
7 BadgeSampling_round_0-step=220-train_loss=0.01114.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 1
BadgeSampling_round_1-step=20
17 BadgeSampling_round_1-step=20-train_loss=0.02673.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_1-step=40
18 BadgeSampling_round_1-step=40-train_loss=0.01403.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_1-step=100
12 BadgeSampling_round_1-step=100-train_loss=0.01134.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 2
BadgeSampling_round_2-step=220
28 BadgeSampling_round_2-step=220-train_loss=0.01279.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_2-step=60
37 BadgeSampling_round_2-step=60-train_loss=0.01239.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_2-step=260
30 BadgeSampling_round_2-step=260-train_loss=0.01134.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 3
BadgeSampling_round_3-step=140
41 BadgeSampling_round_3-step=140-train_loss=0.01489.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_3-step=280
49 BadgeSampling_round_3-step=280-train_loss=0.01207.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_3-step=300
50 BadgeSampling_round_3-step=300-train_loss=0.01235.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 4
BadgeSampling_round_4-step=80
69 BadgeSampling_round_4-step=80-train_loss=0.02148.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_4-step=220
63 BadgeSampling_round_4-step=220-train_loss=0.02125.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_4-step=160
59 BadgeSampling_round_4-step=160-train_loss=0.01897.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 5
BadgeSampling_round_5-step=200
76 BadgeSampling_round_5-step=200-train_loss=0.01504.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_5-step=60
88 BadgeSampling_round_5-step=60-train_loss=0.02895.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_5-step=360
84 BadgeSampling_round_5-step=360-train_loss=0.01258.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 6
BadgeSampling_round_6-step=340
103 BadgeSampling_round_6-step=340-train_loss=0.02117.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_6-step=240
98 BadgeSampling_round_6-step=240-train_loss=0.01762.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_6-step=300
101 BadgeSampling_round_6-step=300-train_loss=0.01528.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 7
BadgeSampling_round_7-step=100
117 BadgeSampling_round_7-step=100-train_loss=0.02017.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_7-step=140
119 BadgeSampling_round_7-step=140-train_loss=0.01810.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

BadgeSampling_round_7-step=260
126 BadgeSampling_round_7-step=260-train_loss=0.01746.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

round 8


FileNotFoundError: [Errno 2] No such file or directory: '../../../user_data/logs/rns_active/active_logs_BadgeSampling/logger_round_8/version_0/metrics.csv'

In [16]:
np.save('save_acc/'+ strategy_name + '.npy', result_final)

In [17]:
with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'rb') as f:
        # Load the content of the file into a Python object
            selected_inds = pickle.load(f)
selected_inds[rd] = q_idxs
with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'wb') as f:
    pickle.dump(selected_inds, f)

dict_keys([0, 1, 2, 3, 4, 5, 6, 7])

In [7]:
# raw_annotations = pd.read_csv(data_dir + 'full_updated_anns_annotTbl_cleaned.csv')
# ids = list(np.unique(raw_annotations[raw_annotations['descriptions'].notnull()]['HUP_ID']))
# # ids = list(np.unique(raw_annotations['HUP_ID']))
#
# data_import = data_utility.read_files(path=data_dir+'rns_data', path_data=data_dir+'rns_raw_cache', patientIDs=ids,
#                                       verbose=True)  # Import data with annotation

In [8]:
# data_list = os.listdir(data_dir+'rns_test_cache')
# print(data_list)


In [9]:
import torch.nn as nn
import sklearn


In [10]:
empty_dict = {}

In [11]:
result_temp = result_final

In [12]:
for k in result_temp.keys():
    empty_dict[k] = result_temp[k]

In [13]:
for k in result_temp_2.keys():
    empty_dict[k] = result_temp_2[k]

NameError: name 'result_temp_2' is not defined

In [None]:
for k in result_final.keys():
    empty_dict[k] = result_final[k]

In [None]:
empty_dict.keys()

In [None]:
result_final = empty_dict

In [None]:
result_temp_2 = result_final

In [None]:
unlabeled_idxs

In [None]:
[print(len(ls)) for ls in strategy.dataset.combine_window_to_episode(out, seq_len_train, ~strategy.dataset.labeled_idxs)]

In [None]:
np.where(strategy.dataset.labeled_idxs == True)

In [None]:

for rd in range(1, NUM_ROUND + 1):
    print('round ' + str(rd))
    log_file_name = log_folder_root + 'rns_active/active_logs_' + strategy_name + '/logger_round_' + str(
        rd) + '/version_0/metrics.csv'
    logs = pd.read_csv(log_file_name)
    max_ind = logs['val_acc'].argmax()
    max_row = logs.iloc[max_ind]
    ckpt_directory = ckpt_folder_root + 'rns_active/active_checkpoints_' + strategy_name
    ckpt_files = os.listdir(ckpt_directory)
    load_file_name = strategy_name + '_round_' + str(rd) + '-step=' + str(int(max_row['step']+1))
    print(load_file_name)

    ind = next((i for i, s in enumerate(ckpt_files) if load_file_name in s), None)
    print(ind, ckpt_files[ind])
    strategy.net.net = strategy.net.net.load_from_checkpoint(ckpt_directory + '/' + ckpt_files[ind], backbone=swav.backbone)

    with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'rb') as f:
    # Load the content of the file into a Python object
        selected_inds = pickle.load(f)
    
    # selected_ind_list = []
    # for items in selected_inds.items():
    #     selected_ind_list.append(np.array(items[1]))
    q_idxs = np.array(selected_inds[rd])
    
    # with open(log_folder_root + 'rns_active_selected/' + strategy_name + '/' + 'selected_indices.pkl', 'wb') as f:
    #     pickle.dump(selected_inds, f)
# Now you can use the dictionary object as usual
    strategy.update(q_idxs)
    
    _, unlabeled_data = strategy.dataset.get_train_data_unaugmented()
    predictions = strategy.net.run_prediction(unlabeled_data)
    output_list = []
    seq_len_list = []
    target_list = []
    emb_list = []
    m = nn.Softmax(dim=1)
    for pred, y, emb, emb_t, seq_len in predictions:
        output_list.append(pred)
        seq_len_list.append(seq_len)
        target_list.append(y)
        emb_list.append(emb)

    target = torch.concat(target_list)
    emb = torch.vstack(emb_list)
    pred_raw = torch.vstack(output_list).float()
    seq_len_out = torch.tensor([item for sublist in seq_len_list for item in sublist])
    probs = m(pred_raw)
    out = torch.argmax(pred_raw, dim=1)
    unlabeled_idxs, _ = strategy.dataset.get_unlabeled_data()
    f1_score = sklearn.metrics.f1_score(target[unlabeled_idxs],out[unlabeled_idxs])

    print(f1_score)

#     strategy.net.round = rd
#     strategy.net.net.load_state_dict(modelstate)
#     torch.cuda.empty_cache()
#     strategy.train()
#     torch.cuda.empty_cache()

In [None]:
unlabeled_idxs, _ = strategy.dataset.get_unlabeled_data()

In [None]:
clf_report = sklearn.metrics.classification_report(torch.argmax(pred_raw, dim=1), target, digits=6)

print(f"Classification Report : \n{clf_report}")

In [None]:
f1_score = sklearn.metrics.f1_score(target,out)

In [None]:
f1_score