In [1]:
import os
import pandas as pd

from torch.utils.data import DataLoader
from sklearn.neighbors import KNeighborsClassifier
from configs import configs
from dataset import ChestXRayCaptionDataset
import torch
import numpy as np
from model import Chexnet
from tqdm import tqdm
from utils import train_transform, evaluate_transform
from tokenizer import create_tokenizer
from test import evaluation_matrix
from chexpert import chexpert

2091lines [00:00, 174710.45lines/s]


caller: c:\Users\darkenstardragon\Documents\Work\chest-xray-report-gen\text_generation\chexpert.py
Creating Chexpert reward module...
Using 1 GPUs!


In [2]:
# Run-mode switches read by get_vectors() when it is called with load=False.

# Only consulted when USE_CACHED_MAPS is False: selects the "use cached projection"
# branch of get_vectors() instead of re-encoding and projecting on the fly.
LOAD_RANDOM_PROJECTION_DATA = False
# RANDOM_PROJECT_DIM = 128
# Only consulted when USE_CACHED_MAPS is True: when set, get_vectors() first calls
# generate_image_embeddings_save_every() to (re)write the raw feature-map files.
BUILD_CACHED_MAPS = False
# When True, get_vectors() projects the feature maps stored on disk via random_project();
# when False it falls back to the older "project as we predict" path.
USE_CACHED_MAPS = True
# SEED = 0

# Legacy per-split cache paths, kept for reference only: no live code in this
# notebook reads `filename` (every use of it is commented out).
# filename = {
#     'train_x': configs['mimic_dir'] + 'baseline_data/' +  f'train_image_embeddings_{RANDOM_PROJECT_DIM}_{SEED}.npy',
#     'train_y': configs['mimic_dir'] + 'baseline_data/' +  f'train_captions_{RANDOM_PROJECT_DIM}_{SEED}.npy',
#     'val_x': configs['mimic_dir'] +'baseline_data/' +  f'val_image_embeddings_{RANDOM_PROJECT_DIM}_{SEED}.npy',
#     'val_y': configs['mimic_dir'] +'baseline_data/' +  f'val_captions_{RANDOM_PROJECT_DIM}_{SEED}.npy',
#     'test_x': configs['mimic_dir'] +'baseline_data/' +  f'test_image_embeddings_{RANDOM_PROJECT_DIM}_{SEED}.npy',
#     'test_y': configs['mimic_dir'] +'baseline_data/' +  f'test_captions_{RANDOM_PROJECT_DIM}_{SEED}.npy',
# }

In [3]:
def _make_split_loader(split, transform, workers):
    """Build an unshuffled, batch-size-16, pinned-memory loader over one dataset split."""
    return DataLoader(
        ChestXRayCaptionDataset(split, transform=transform),
        batch_size=16,
        shuffle=False,
        num_workers=workers,
        pin_memory=True,
    )


tokenizer = create_tokenizer()

# NOTE(review): torch.load unpickles the file and the whole encoder object is stored
# in it, so only load checkpoints from a trusted source.
checkpoint = torch.load('weights/pretrained_encoder/pretrained_enc_epoch_5_2022-03-08_15-43-47.540586.pth.tar')
print(f"loaded epoch {checkpoint['epoch']+1} model, val_loss: {checkpoint['val_loss']}")
encoder = checkpoint['encoder'].cuda()

# Shuffling is off for every split, so each pass yields samples in a fixed order.
# Train uses train_transform and 4 workers; val/test use evaluate_transform in-process.
train_loader = _make_split_loader('train', train_transform, 4)

val_loader = _make_split_loader('val', evaluate_transform, 0)

test_loader = _make_split_loader('test', evaluate_transform, 0)

2091lines [00:00, 174717.41lines/s]


loaded epoch 5 model, val_loss: 0.28202417492866516


In [4]:
def generate_image_embeddings_random(encoder, data_loader, projection_matrix, project_every=2):
    """Encode every batch and random-project the feature maps on the fly.

    Encoded batches are accumulated on the CPU and, every ``project_every``
    batches (and on the final batch), flattened to one row per image and
    multiplied by ``projection_matrix``.

    Args:
        encoder: model exposing ``eval()`` and returning a 2-tuple when called on
            an image batch; only the first element is used.
        data_loader: sized iterable yielding ``(img, caption, _)`` triples.
        projection_matrix: 2-D array whose row count must equal the flattened
            per-image feature size.
        project_every: number of batches to accumulate before each projection.

    Returns:
        ``(image_embeddings, captions)`` as numpy arrays, one row per image.
    """
    encoder.eval()
    projected_chunks = []
    captions = []
    pending = []  # encoded batches waiting to be projected
    n_batches = len(data_loader)
    with torch.no_grad():
        for i, (img, caption, _) in enumerate(tqdm(data_loader)):
            encoded_img, _ = encoder(img.cuda())
            pending.append(encoded_img.cpu())
            captions.append(caption.cpu())
            if (i + 1) % project_every == 0 or (i + 1) == n_batches:
                # Flatten per sample instead of reshaping to a hard-coded 1024*8*8:
                # rows stay aligned with captions, and a feature-size mismatch
                # surfaces as a matmul shape error rather than silently re-chunking.
                flat = torch.cat(pending).flatten(start_dim=1).numpy()
                projected_chunks.append(np.matmul(flat, projection_matrix))
                pending = []

    image_embeddings = np.vstack(projected_chunks)
    captions = torch.cat(captions).numpy()
    return image_embeddings, captions

def generate_image_embeddings_save_every(encoder, data_split, data_loader, save_every=1024):
    """Encode every batch and dump raw feature maps and captions to disk in shards.

    Every ``save_every`` batches (and on the final batch) the accumulated
    feature maps are flattened to one row per image and written, with the
    matching captions, to
    ``<mimic_dir>raw_embeddings/<data_split>/feature_maps_<k>.npy`` and
    ``.../captions_<k>.npy``, where ``k`` counts shards from 0.

    Args:
        encoder: model exposing ``eval()`` and returning a 2-tuple when called on
            an image batch; only the first element is used.
        data_split: sub-directory name for this split (e.g. ``'train'``).
        data_loader: sized iterable yielding ``(img, caption, _)`` triples.
        save_every: number of batches per shard.

    Returns:
        None. The shard files are the only output.
    """
    encoder.eval()
    out_dir = configs['mimic_dir'] + f'raw_embeddings/{data_split}/'
    # Create the target directory up front so np.save cannot fail on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    image_embeddings = []
    captions = []
    file_index = 0
    n_batches = len(data_loader)

    with torch.no_grad():
        for i, (img, caption, _) in enumerate(tqdm(data_loader)):
            encoded_img, _ = encoder(img.cuda())
            image_embeddings.append(encoded_img.cpu())
            captions.append(caption.cpu())

            if (i + 1) % save_every == 0 or (i + 1) == n_batches:
                # One row per image; avoids the hard-coded 1024*8*8 reshape, which
                # would misalign rows with captions if the feature size differed.
                shard_maps = torch.cat(image_embeddings).flatten(start_dim=1).numpy()
                shard_captions = torch.cat(captions).numpy()

                np.save(out_dir + f'feature_maps_{file_index}.npy', shard_maps)
                np.save(out_dir + f'captions_{file_index}.npy', shard_captions)

                # Start the next shard.
                image_embeddings = []
                captions = []
                file_index += 1

def random_project(data_loader, data_split, projection_matrix, save_every=1024, project_every=256):
    """Project the feature-map shards stored on disk with ``projection_matrix``.

    Reads ``feature_maps_<k>.npy`` / ``captions_<k>.npy`` from
    ``<mimic_dir>raw_embeddings/<data_split>/`` for every shard index ``k`` and
    multiplies the feature maps by ``projection_matrix`` piece by piece.

    Args:
        data_loader: sized loader for the split; only ``len()`` is used, to work
            out how many shards exist.
        data_split: sub-directory name for this split (e.g. ``'train'``).
        projection_matrix: 2-D array applied on the right of each feature-map chunk.
        save_every: batches per shard; must match the value used when the shards
            were written.
        project_every: number of pieces each shard is split into before the
            matmul (``np.array_split`` takes a section count, not a chunk size).

    Returns:
        ``(projected_image_embeddings, captions)`` stacked over all shards.
    """
    # Ceiling division: a shard is written every `save_every` batches plus one for
    # any remainder. The previous `len // save_every + 1` asked for a non-existent
    # extra shard whenever the batch count was an exact multiple of `save_every`.
    n_split = (len(data_loader) + save_every - 1) // save_every
    print(f"{n_split=}")
    projected_image_embeddings = []
    captions = []

    shard_dir = configs['mimic_dir'] + f'raw_embeddings/{data_split}/'
    for file_index in tqdm(range(n_split)):
        feat_maps = np.load(shard_dir + f'feature_maps_{file_index}.npy')
        caps = np.load(shard_dir + f'captions_{file_index}.npy')
        captions.append(caps)

        # Project in pieces to bound the size of each matmul.
        for piece in np.array_split(feat_maps, project_every):
            projected_image_embeddings.append(np.matmul(piece, projection_matrix))

    projected_image_embeddings = np.vstack(projected_image_embeddings)
    captions = np.vstack(captions)

    return projected_image_embeddings, captions

In [11]:
def _load_projected_split(data_split, SEED, RANDOM_PROJECT_DIM):
    """Load one split's cached projected embeddings and captions from baseline_data/."""
    file_prefix = configs['mimic_dir'] + 'baseline_data/'
    file_suffix = f"_{RANDOM_PROJECT_DIM}_{SEED}.npy"
    image_embeddings = np.load(file_prefix + f'{data_split}_image_embeddings' + file_suffix)
    captions = np.load(file_prefix + f'{data_split}_captions' + file_suffix)
    return image_embeddings, captions


def _build_projected_split(data_split, data_loader, projection_matrix, SEED, RANDOM_PROJECT_DIM):
    """Produce one split's projected vectors according to the global mode flags and print their shapes."""
    if USE_CACHED_MAPS:
        # New method: feature maps were predicted and cached earlier; project them from disk.
        image_embeddings, captions = random_project(
            data_loader, data_split, projection_matrix, save_every=1024, project_every=64
        )
    elif LOAD_RANDOM_PROJECTION_DATA:
        # Old method, cached projection: reuse the vectors saved under baseline_data/.
        image_embeddings, captions = _load_projected_split(data_split, SEED, RANDOM_PROJECT_DIM)
    else:
        # Old method: project as we predict.
        image_embeddings, captions = generate_image_embeddings_random(
            encoder, data_loader, projection_matrix, project_every=256
        )
    print(image_embeddings.shape)
    print(captions.shape)
    return image_embeddings, captions


def get_vectors(SEED, RANDOM_PROJECT_DIM, load=False):
    """
    End to end train, val, test projected vectors function
    Input:
        SEED (int): Seed of the random projection matrix
        RANDOM_PROJECT_DIM: Dimension of the random projection matrix
        load (bool): if True, skip all computation and read the six cached
            arrays for this (RANDOM_PROJECT_DIM, SEED) pair from baseline_data/
    Output:
        train_x, train_y, val_x, val_y, test_x, test_y

    When ``load`` is False the behaviour is driven by the module-level flags
    USE_CACHED_MAPS, BUILD_CACHED_MAPS and LOAD_RANDOM_PROJECTION_DATA.
    """
    # Load cached vectors
    if load:
        return (
            *_load_projected_split('train', SEED, RANDOM_PROJECT_DIM),
            *_load_projected_split('val', SEED, RANDOM_PROJECT_DIM),
            *_load_projected_split('test', SEED, RANDOM_PROJECT_DIM),
        )

    print(f"Random projecting with {SEED=}, {RANDOM_PROJECT_DIM=}")

    # The cached-projection branch previously had its np.load calls commented out and
    # failed on unbound variables; it now reads the baseline_data/ files and therefore
    # needs no projection matrix at all.
    reuse_cached_projection = LOAD_RANDOM_PROJECTION_DATA and not USE_CACHED_MAPS
    if reuse_cached_projection:
        projection_matrix = None
    else:
        # 65536 == 1024 * 8 * 8: flattened encoder feature-map size per image.
        feature_dim = 65536
        # Create a whole new projection (Gaussian random projection).
        # NOTE(review): the standard deviation is 1/dim rather than the more usual
        # 1/sqrt(dim). A global scale does not change nearest-neighbour ordering, and
        # it is kept as-is so that results stay comparable with earlier runs.
        rng = np.random.RandomState(SEED)
        projection_matrix = rng.normal(0.0, 1/RANDOM_PROJECT_DIM, (feature_dim, RANDOM_PROJECT_DIM))

    rebuild_maps = USE_CACHED_MAPS and BUILD_CACHED_MAPS

    # Train vectors
    print("Projecting train vectors...")
    if rebuild_maps:
        generate_image_embeddings_save_every(encoder, 'train', train_loader, save_every=1024)
    train_image_embeddings, train_captions = _build_projected_split(
        'train', train_loader, projection_matrix, SEED, RANDOM_PROJECT_DIM
    )

    # Val & Test vectors
    print("Projecting val & test vectors...")
    if rebuild_maps:
        generate_image_embeddings_save_every(encoder, 'val', val_loader, save_every=1024)
        generate_image_embeddings_save_every(encoder, 'test', test_loader, save_every=1024)
    val_image_embeddings, val_captions = _build_projected_split(
        'val', val_loader, projection_matrix, SEED, RANDOM_PROJECT_DIM
    )
    test_image_embeddings, test_captions = _build_projected_split(
        'test', test_loader, projection_matrix, SEED, RANDOM_PROJECT_DIM
    )

    return train_image_embeddings, train_captions, val_image_embeddings, val_captions, test_image_embeddings, test_captions

In [6]:
def predict(embeddings, train_captions, one_nn, decode=False, batch_size=64):
    """Return, for every query embedding, the caption of its nearest training neighbour.

    Args:
        embeddings: query vectors; iterated in chunks of ``batch_size`` through a DataLoader.
        train_captions: captions indexed by the neighbour indices that ``one_nn`` returns.
        one_nn: fitted object exposing ``kneighbors(batch) -> (distances, indices)``.
        decode: when True the result is passed through ``tokenizer.decode``.
        batch_size: number of query vectors per ``kneighbors`` call.

    Returns:
        Array reshaped to ``(embeddings.shape[0], -1)``, or whatever
        ``tokenizer.decode`` returns for it when ``decode`` is True.
    """
    query_loader = DataLoader(
        embeddings,
        batch_size=batch_size,
        num_workers=0,
        pin_memory=True,
    )

    matched = []
    for chunk in tqdm(query_loader):
        # Only the neighbour indices are needed; the distances are discarded.
        _, nearest = one_nn.kneighbors(chunk)
        for row in nearest:
            matched.append(train_captions[row])

    result = np.array(matched).reshape(embeddings.shape[0], -1)
    return tokenizer.decode(result) if decode else result

def evaluate(true_captions, pred_captions, batch_size):
    """Label reference and predicted captions with ``chexpert`` and compare the labels.

    Both caption collections are fed to ``chexpert(batch, tokenizer)`` in chunks
    of ``batch_size``; the per-chunk results are concatenated with a fresh index
    and handed to ``evaluation_matrix(true_df, pred_df)``.

    Returns:
        Whatever ``evaluation_matrix`` returns for the two label frames.
    """
    def _as_loader(captions):
        # Sequential, in-process loader over one caption collection.
        return DataLoader(
            captions,
            batch_size=batch_size,
            num_workers=0,
            pin_memory=True,
        )

    reference_loader = _as_loader(true_captions)
    candidate_loader = _as_loader(pred_captions)

    reference_frames = [chexpert(chunk, tokenizer) for chunk in tqdm(reference_loader)]
    candidate_frames = [chexpert(chunk, tokenizer) for chunk in tqdm(candidate_loader)]

    true_df = pd.concat(reference_frames).reset_index(drop=True)
    pred_df = pd.concat(candidate_frames).reset_index(drop=True)
    return evaluation_matrix(true_df, pred_df)

def evaluate_all(one_nn, train_captions, val_image_embeddings, val_captions, test_image_embeddings, test_captions, seed, project_dim):
    """Run 1-NN prediction and evaluation on the val and test splits and save both results.

    For each split the nearest-neighbour captions are predicted in chunks of
    1024, evaluated against the reference captions in chunks of 12, and the
    resulting matrix is written to
    ``results/<split>_results_<project_dim>_<seed>.csv`` without the index.

    Args:
        one_nn: fitted nearest-neighbour model, passed through to ``predict``.
        train_captions: captions of the training set the model was fitted on.
        val_image_embeddings, val_captions: validation vectors and references.
        test_image_embeddings, test_captions: test vectors and references.
        seed, project_dim: used only to name the output files.

    Returns:
        None. The two CSV files are the only output.
    """
    # Create the output directory before the slow prediction/evaluation work, so a
    # missing 'results/' folder cannot throw the computed metrics away at to_csv time.
    os.makedirs('results', exist_ok=True)

    predicted_reports = predict(val_image_embeddings, train_captions, one_nn, batch_size=1024)
    val_eval_matrix = evaluate(val_captions, predicted_reports, batch_size=12)
    val_eval_matrix.to_csv(f'results/val_results_{project_dim}_{seed}.csv', index=False)

    predicted_reports = predict(test_image_embeddings, train_captions, one_nn, batch_size=1024)
    test_eval_matrix = evaluate(test_captions, predicted_reports, batch_size=12)
    test_eval_matrix.to_csv(f'results/test_results_{project_dim}_{seed}.csv', index=False)

In [7]:
def e2e_benchmark(seed, project_dim):
    """
    Full pipeline for one (seed, project_dim) pair: obtain the projected vectors,
    fit a 1-nearest-neighbour model on the training vectors, then evaluate on the
    val and test sets (evaluate_all writes the result files).
    """
    # Projected vectors and captions for all three splits.
    (
        train_image_embeddings,
        train_captions,
        val_image_embeddings,
        val_captions,
        test_image_embeddings,
        test_captions,
    ) = get_vectors(seed, project_dim)

    # Each training row is labelled with its own position; predict() later uses the
    # neighbour indices returned by kneighbors to look up train_captions.
    n_train = train_image_embeddings.shape[0]
    indices = list(range(n_train))
    one_nn = KNeighborsClassifier(n_neighbors=1)
    one_nn.fit(train_image_embeddings, indices)

    # Evaluate and persist the val/test results.
    evaluate_all(
        one_nn,
        train_captions,
        val_image_embeddings,
        val_captions,
        test_image_embeddings,
        test_captions,
        seed,
        project_dim,
    )

In [15]:
# Single end-to-end run with a 128-dimensional projection seeded with 1000;
# the val/test result CSVs are written by evaluate_all inside e2e_benchmark.
e2e_benchmark(seed=1000, project_dim=128)

Random projecting with SEED=1000, RANDOM_PROJECT_DIM=128
Projecting train vectors...
n_split=17


100%|██████████| 17/17 [02:51<00:00, 10.07s/it]


(267838, 128)
(267838, 402)
Projecting val & test vectors...
n_split=1


100%|██████████| 1/1 [00:01<00:00,  1.65s/it]


(2085, 128)
(2085, 402)
n_split=1


100%|██████████| 1/1 [00:02<00:00,  2.88s/it]


(3653, 128)
(3653, 402)


100%|██████████| 3/3 [00:07<00:00,  2.46s/it]
100%|██████████| 174/174 [00:44<00:00,  3.90it/s]
100%|██████████| 174/174 [00:51<00:00,  3.36it/s]
100%|██████████| 4/4 [00:12<00:00,  3.11s/it]
100%|██████████| 305/305 [01:36<00:00,  3.17it/s]
100%|██████████| 305/305 [01:51<00:00,  2.74it/s]


In [8]:
seeds = [3000, 4000]
dims = [8192]

# Sweep every (dim, seed) pair; a pair is skipped when its validation result CSV
# already exists, so an interrupted sweep can be resumed by re-running the cell.
for dim in dims:
    for seed in seeds:
        path_to_check = f'results/val_results_{dim}_{seed}.csv'
        exist = os.path.exists(path_to_check)
        if exist:
            continue
        e2e_benchmark(seed=seed, project_dim=dim)

Random projecting with SEED=3000, RANDOM_PROJECT_DIM=8192
Projecting train vectors...
n_split=17


100%|██████████| 17/17 [35:07<00:00, 123.99s/it]


(267838, 8192)
(267838, 402)
Projecting val & test vectors...
n_split=1


100%|██████████| 1/1 [00:41<00:00, 41.84s/it]


(2085, 8192)
(2085, 402)
n_split=1


100%|██████████| 1/1 [01:01<00:00, 61.63s/it]


(3653, 8192)
(3653, 402)


100%|██████████| 3/3 [30:11<00:00, 603.80s/it]
100%|██████████| 174/174 [00:50<00:00,  3.45it/s]
100%|██████████| 174/174 [00:52<00:00,  3.31it/s]
100%|██████████| 4/4 [55:17<00:00, 829.33s/it]
100%|██████████| 305/305 [01:38<00:00,  3.11it/s]
100%|██████████| 305/305 [01:36<00:00,  3.15it/s]


Random projecting with SEED=4000, RANDOM_PROJECT_DIM=8192
Projecting train vectors...
n_split=17


100%|██████████| 17/17 [33:11<00:00, 117.15s/it]


(267838, 8192)
(267838, 402)
Projecting val & test vectors...
n_split=1


100%|██████████| 1/1 [00:41<00:00, 41.71s/it]


(2085, 8192)
(2085, 402)
n_split=1


100%|██████████| 1/1 [00:55<00:00, 55.09s/it]


(3653, 8192)
(3653, 402)


100%|██████████| 3/3 [30:07<00:00, 602.44s/it]
100%|██████████| 174/174 [00:48<00:00,  3.56it/s]
100%|██████████| 174/174 [00:51<00:00,  3.37it/s]
100%|██████████| 4/4 [54:34<00:00, 818.54s/it]
100%|██████████| 305/305 [01:37<00:00,  3.14it/s]
100%|██████████| 305/305 [01:36<00:00,  3.15it/s]


In [12]:
# Read the cached seed-0, 2048-dimensional projections (load=True skips all
# computation) and fit a 1-NN model whose labels are the training row positions.
(
    train_image_embeddings,
    train_captions,
    val_image_embeddings,
    val_captions,
    test_image_embeddings,
    test_captions,
) = get_vectors(0, 2048, load=True)
indices = list(range(train_image_embeddings.shape[0]))
one_nn = KNeighborsClassifier(n_neighbors=1)
one_nn.fit(train_image_embeddings, indices)

KNeighborsClassifier(n_neighbors=1)

In [13]:
# Nearest-neighbour captions for the test split, queried 1024 embeddings at a time;
# decode keeps its default of False, so the result is not passed through tokenizer.decode.
test_predicted_reports = predict(test_image_embeddings, train_captions, one_nn, batch_size=1024)


100%|██████████| 4/4 [00:51<00:00, 12.79s/it]


In [19]:
def chexperify(captions, batch_size):
    """Label captions with ``chexpert`` and attach the decoded caption text.

    ``captions`` is fed to ``chexpert(batch, tokenizer)`` in chunks of
    ``batch_size``; the per-chunk results are concatenated with a fresh index
    and a ``'captions'`` column holding ``tokenizer.decode(captions)`` is added.

    Returns:
        The combined frame: one row per caption, label columns plus ``'captions'``.
    """
    loader = DataLoader(
        captions,
        batch_size=batch_size,
        num_workers=0,
        pin_memory=True,
    )

    label_frames = [chexpert(chunk, tokenizer) for chunk in tqdm(loader)]

    labelled = pd.concat(label_frames).reset_index(drop=True)
    labelled['captions'] = tokenizer.decode(captions)
    return labelled

In [22]:
# Label the reference test captions, 12 captions per chexpert() call.
true_df = chexperify(test_captions, batch_size=12)

100%|██████████| 305/305 [01:57<00:00,  2.60it/s]


In [25]:
# Display the labelled reference captions.
true_df

Unnamed: 0,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,captions
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lateral view somewhat limited due to overlying...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lateral view somewhat limited due to overlying...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,frontal and lateral radiographs of the chest a...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,frontal and lateral radiographs of the chest a...
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,frontal and lateral radiographs of the chest a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,one of the right chest tubes appears to have b...
3649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,one of the right chest tubes appears to have b...
3650,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,one of the right chest tubes appears to have b...
3651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,pa and lateral chest views were obtained with ...


In [23]:
# Label the 1-NN predicted test captions, 12 captions per chexpert() call.
pred_df = chexperify(test_predicted_reports, batch_size=12)

100%|██████████| 305/305 [01:33<00:00,  3.25it/s]


In [24]:
# Display the labelled predicted captions.
pred_df

Unnamed: 0,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,captions
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,bibasilar atelectasis is similar to appearance...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,there is no change . relatively low lung volum...
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,mild hyperinflation and flattened diaphragms i...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,there is a small nodular opacity in the left l...
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,unchanged appearance of the intact sternotomy ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,ap portable chest radiograph obtained . there ...
3649,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,tracheostomy is unchanged in position . a righ...
3650,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,a portable frontal chest radiograph demonstrat...
3651,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,increased retrocardiac opacity compared to the...
