In [1]:
import os
import pandas as pd

from collections import defaultdict
from torch.utils.data import DataLoader
from sklearn.neighbors import KNeighborsClassifier
from configs import configs
from dataset import ChestXRayCaptionDataset
from nltk.translate.bleu_score import corpus_bleu
import torch
import numpy as np
from model import Chexnet
from tqdm import tqdm
from utils import train_transform, evaluate_transform, quantize_probs
from tokenizer import create_tokenizer
from test import evaluation_matrix
from chexpert import chexpert
import time
import faiss
from nlgeval import NLGEval

2091lines [00:00, 190112.93lines/s]


caller: c:\Users\darkenstardragon\Documents\Work\chest-xray-report-gen\text_generation\chexpert.py
Creating Chexpert reward module...
Using 1 GPUs!


In [2]:
# Metrics that nlg-eval should NOT compute. With these omitted, the recorded
# outputs of this notebook contain Bleu_1..4, ROUGE_L and CIDEr only.
metrics_to_omit = [
    'METEOR', 
    'SkipThoughtCS', 
    'EmbeddingAverageCosineSimilarity', 
    'VectorExtremaCosineSimilarity', 
    'GreedyMatchingScore', 
    # NOTE(review): misspelled variant of the entry above; presumably a legacy
    # key that nlg-eval also recognises -- confirm before removing it.
    'EmbeddingAverageCosineSimilairty',
]

nlgeval = NLGEval(metrics_to_omit=metrics_to_omit)  # loads the models

# Switches read by get_vectors():
#   LOAD_RANDOM_PROJECTION_DATA - only consulted when USE_CACHED_MAPS is False.
#   BUILD_CACHED_MAPS           - run the encoder and (re)write the raw feature-map chunks first.
#   USE_CACHED_MAPS             - project from the raw feature-map chunks stored on disk.
#   SAVE_PROJECTED              - save the projected vectors/captions after projecting.
LOAD_RANDOM_PROJECTION_DATA = False
BUILD_CACHED_MAPS = False
USE_CACHED_MAPS = True
SAVE_PROJECTED = True
# CSV files that write_time_file() / write_nlg_file() append to.
time_file_path = 'results/inference_time.csv'
nlg_file_path = 'results/results_nlg.csv'


def filename(k, seed, project_dim):
    """
    Return the on-disk path of one cached baseline array.

    Parameters:
        k (str):           One of 'train_x', 'train_y', 'val_x', 'val_y', 'test_x', 'test_y'.
                           '*_x' is the projected image embeddings, '*_y' the captions.
        seed (int):        Seed of the random projection (part of the '*_x' file names only).
        project_dim (int): Projection dimension (part of the '*_x' file names only).

    Returns:
        str: Path below configs['mimic_dir'] + 'baseline_data/'.

    Raises:
        KeyError: If k is not one of the six known keys.
    """
    base_dir = configs['mimic_dir'] + 'baseline_data/'

    paths = {}
    for split in ('train', 'val', 'test'):
        # Embeddings depend on the projection; captions are shared across projections.
        paths[f'{split}_x'] = base_dir + f'{split}_image_embeddings_{project_dim}_{seed}.npy'
        paths[f'{split}_y'] = base_dir + f'{split}_captions.npy'

    return paths[k]


In [3]:
# Tokenizer: evaluate_all() uses it to decode caption arrays to text, and the
# chexpert(...) calls in this notebook receive it as their second argument.
tokenizer = create_tokenizer()
# The checkpoint is indexed below for 'epoch', 'val_loss' and the 'encoder' object itself.
checkpoint = torch.load('weights/pretrained_encoder/pretrained_enc_epoch_5_2022-03-08_15-43-47.540586.pth.tar')
print(f"loaded epoch {checkpoint['epoch']+1} model, val_loss: {checkpoint['val_loss']}")
encoder = checkpoint['encoder'].cuda()

# Pre-computed quantized probability arrays, one file per split.
# train_probs_quantized is what OneNearestNeighborCoarse2Fine uses as its training labels.
train_probs_quantized = np.load(configs['mimic_dir'] + 'baseline_data/train_probs_quantized.npy')
val_probs_quantized = np.load(configs['mimic_dir'] + 'baseline_data/val_probs_quantized.npy')
test_probs_quantized = np.load(configs['mimic_dir'] + 'baseline_data/test_probs_quantized.npy')


# All loaders use shuffle=False, so batches come out in a fixed order.
# NOTE(review): the train split uses train_transform while val/test use
# evaluate_transform -- confirm that is intended for embedding extraction.
train_loader = DataLoader(
    ChestXRayCaptionDataset('train', transform=train_transform),
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

val_loader = DataLoader(
    ChestXRayCaptionDataset('val', transform=evaluate_transform),
    batch_size=16,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

test_loader = DataLoader(
    ChestXRayCaptionDataset('test', transform=evaluate_transform),
    batch_size=16,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

# Hard-coded batch counts that get_vectors() passes to random_project().
# The val/test values match the 131 and 229 iterations in the recorded run output.
# NOTE(review): presumably these equal len(train_loader) / len(val_loader) /
# len(test_loader) -- confirm, and keep them in sync if the data or batch size changes.
len_train_loader = 16740
len_val_loader = 131
len_test_loader = 229

2091lines [00:00, 174269.56lines/s]


loaded epoch 5 model, val_loss: 0.28202417492866516


In [4]:
def generate_image_embeddings_random(encoder, data_loader, projection_matrix, project_every=2):
    """
    Encode every image of a loader and random-project the features on the fly.

    Encoder outputs are buffered and, every `project_every` batches (and on the
    last batch), flattened to rows of 1024*8*8 values and multiplied by
    `projection_matrix`.

    Parameters:
        encoder:                          Model called as encoder(img); its first output is used.
        data_loader:                      Yields (img, caption, _) batches.
        projection_matrix (numpy.array):  Matrix with 1024*8*8 rows.
        project_every (int):              Number of batches to buffer before projecting.

    Returns:
        (numpy.array, numpy.array): projected image embeddings and the captions.
    """
    encoder.eval()
    projected_chunks = []
    caption_batches = []
    pending = []

    with torch.no_grad():
        for step, (img, caption, _) in enumerate(tqdm(data_loader), start=1):
            features, _ = encoder(img.cuda())
            pending.append(features.cpu())
            caption_batches.append(caption.cpu())

            buffer_full = step % project_every == 0
            if not (buffer_full or step == len(data_loader)):
                continue

            # Flatten the buffered feature maps and project them in one matmul.
            flat = torch.cat(pending).reshape(-1, 1024*8*8).numpy()
            projected_chunks.append(np.matmul(flat, projection_matrix))
            pending = []

    return np.vstack(projected_chunks), torch.cat(caption_batches).numpy()

def generate_image_embeddings_save_every(encoder, data_split, data_loader, save_every=1024):
    """
    Encode every image of a loader and dump raw feature maps + captions in chunks.

    Every `save_every` batches (and on the last batch) the buffered encoder
    outputs are flattened to rows of 1024*8*8 values and written, together with
    their captions, to
        <mimic_dir>/raw_embeddings/<data_split>/feature_maps_<n>.npy
        <mimic_dir>/raw_embeddings/<data_split>/captions_<n>.npy
    where n counts chunks from 0.

    Parameters:
        encoder:           Model called as encoder(img); its first output is stored.
        data_split (str):  Sub-directory name of the split.
        data_loader:       Yields (img, caption, _) batches.
        save_every (int):  Number of batches per saved chunk.

    Returns:
        None
    """
    encoder.eval()
    feature_buffer = []
    caption_buffer = []
    chunk_id = 0

    with torch.no_grad():
        for step, (img, caption, _) in enumerate(tqdm(data_loader), start=1):
            features, _ = encoder(img.cuda())
            feature_buffer.append(features.cpu())
            caption_buffer.append(caption.cpu())

            chunk_complete = step % save_every == 0 or step == len(data_loader)
            if not chunk_complete:
                continue

            stacked_features = torch.cat(feature_buffer).reshape(-1, 1024*8*8).numpy()
            stacked_captions = torch.cat(caption_buffer).numpy()

            np.save(configs['mimic_dir'] + f'raw_embeddings/{data_split}/feature_maps_{chunk_id}.npy', stacked_features)
            np.save(configs['mimic_dir'] + f'raw_embeddings/{data_split}/captions_{chunk_id}.npy', stacked_captions)

            # Start the next chunk.
            feature_buffer, caption_buffer = [], []
            chunk_id += 1

def generate_probs_save_every(encoder, data_split, data_loader, save_every=1024):
    """
    Run the encoder over a loader and dump its second output in chunks.

    Every `save_every` batches (and on the last batch) the buffered outputs are
    concatenated and written to
        <mimic_dir>/raw_embeddings/<data_split>/probs_<n>.npy
    where n counts chunks from 0.

    Parameters:
        encoder:           Model called as encoder(img); its second output is stored.
        data_split (str):  Sub-directory name of the split.
        data_loader:       Yields (img, _, _) batches.
        save_every (int):  Number of batches per saved chunk.

    Returns:
        None
    """
    encoder.eval()
    prob_buffer = []
    chunk_id = 0

    with torch.no_grad():
        for step, (img, _, _) in enumerate(tqdm(data_loader), start=1):
            _, prob = encoder(img.cuda())
            prob_buffer.append(prob.cpu())

            chunk_complete = step % save_every == 0 or step == len(data_loader)
            if not chunk_complete:
                continue

            stacked_probs = torch.cat(prob_buffer).numpy()
            np.save(configs['mimic_dir'] + f'raw_embeddings/{data_split}/probs_{chunk_id}.npy', stacked_probs)

            # Start the next chunk.
            prob_buffer = []
            chunk_id += 1

def random_project(len_data_loader, data_split, projection_matrix, save_every=1024, project_every=256):
    """
    Random-project the raw feature-map chunks stored on disk for one split.

    Reads the chunk files written by generate_image_embeddings_save_every
        <mimic_dir>/raw_embeddings/<data_split>/feature_maps_<n>.npy
        <mimic_dir>/raw_embeddings/<data_split>/captions_<n>.npy
    multiplies the feature maps by `projection_matrix` piece by piece, and
    stacks the results.

    Parameters:
        len_data_loader (int):           Number of batches the loader had when the chunks were written.
        data_split (str):                Sub-directory name of the split.
        projection_matrix (numpy.array): Matrix the feature maps are multiplied with.
        save_every (int):                Batches per chunk used when the chunks were written.
        project_every (int):             Number of pieces each chunk is split into before
                                         projecting (passed to np.array_split as the section
                                         count, not a piece size).

    Returns:
        (numpy.array, numpy.array): projected image embeddings and the captions.

    Raises:
        ValueError: If len_data_loader is not positive (there is nothing to project).
    """
    if len_data_loader <= 0:
        raise ValueError(f"len_data_loader must be positive, got {len_data_loader}")

    # Ceiling division: the writer emits one chunk per `save_every` batches plus
    # one for a trailing partial chunk. The former `// save_every + 1` counted one
    # chunk too many whenever len_data_loader was an exact multiple of save_every.
    n_split = (len_data_loader + save_every - 1) // save_every
    print(f"{n_split=}")
    projected_image_embeddings = []
    captions = []

    chunk_dir = configs['mimic_dir'] + f'raw_embeddings/{data_split}/'
    for file_index in tqdm(range(n_split)):
        feat_maps = np.load(chunk_dir + f'feature_maps_{file_index}.npy')
        captions.append(np.load(chunk_dir + f'captions_{file_index}.npy'))

        # Project piece by piece to bound the size of each matmul.
        for batch in np.array_split(feat_maps, project_every):
            projected_image_embeddings.append(np.matmul(batch, projection_matrix))

    projected_image_embeddings = np.vstack(projected_image_embeddings)
    captions = np.vstack(captions)

    return projected_image_embeddings, captions

In [5]:
# generate_probs_save_every(encoder, 'train', train_loader, save_every=1024)
# generate_probs_save_every(encoder, 'val', val_loader, save_every=1024)
# generate_probs_save_every(encoder, 'test', test_loader, save_every=1024)

In [6]:
def get_vectors(SEED, RANDOM_PROJECT_DIM, load=False):
    """
    End to end train, val, test projected vectors function.

    With load=True the six cached arrays for this (seed, dim) pair are loaded
    from disk if ALL of them exist; otherwise a fresh Gaussian projection matrix
    is drawn and the vectors are recomputed according to the module-level flags
    USE_CACHED_MAPS, BUILD_CACHED_MAPS, SAVE_PROJECTED and
    LOAD_RANDOM_PROJECTION_DATA.

    Input:
        SEED (int): Seed of the random projection matrix
        RANDOM_PROJECT_DIM (int): Dimension of the random projection matrix
        load (bool): Try to reuse projected vectors already saved on disk
    Output:
        train_x, train_y, val_x, val_y, test_x, test_y
    """

    def cached_path(key):
        # Single source of truth for the cache locations (see filename()).
        return filename(key, seed=SEED, project_dim=RANDOM_PROJECT_DIM)

    def load_cached(x_key, y_key):
        return np.load(cached_path(x_key)), np.load(cached_path(y_key))

    """
    Load Cached Vectors
    """
    if load:
        cached_paths = [
            cached_path(key)
            for key in ('train_x', 'train_y', 'val_x', 'val_y', 'test_x', 'test_y')
        ]
        # Check every file, not just the train embeddings, so a partially
        # written cache falls through to recomputation instead of crashing.
        if all(os.path.exists(path) for path in cached_paths):
            return tuple(np.load(path) for path in cached_paths)
        print(f"Attempting to load file with seed={SEED} and project_dim={RANDOM_PROJECT_DIM} but doesn't exist")

    print(f"Random projecting with {SEED=}, {RANDOM_PROJECT_DIM=}")
    # Create a whole new projection
    rng = np.random.RandomState(SEED)
    # Gaussian random projection. 65536 == 1024*8*8, the flattened feature-map size.
    # NOTE(review): the second argument of rng.normal is the standard deviation, so
    # entries have std 1/RANDOM_PROJECT_DIM. Left unchanged because altering the
    # scale would make existing cached projections inconsistent.
    projection_matrix = rng.normal(0.0, 1/RANDOM_PROJECT_DIM, (65536, RANDOM_PROJECT_DIM))

    """
    Train vectors
    """

    print("Projecting train vectors...")

    if USE_CACHED_MAPS:
        # New method: Predict first, cache them, then project
        if BUILD_CACHED_MAPS:
            generate_image_embeddings_save_every(encoder, 'train', train_loader, save_every=1024)

        train_image_embeddings, train_captions = random_project(len_train_loader, 'train', projection_matrix, save_every=1024, project_every=64)
        print(train_image_embeddings.shape)
        print(train_captions.shape)
        if SAVE_PROJECTED:
            np.save(cached_path('train_x'), train_image_embeddings)
            np.save(cached_path('train_y'), train_captions)
    else:
        # Old method: Project as we predict
        if LOAD_RANDOM_PROJECTION_DATA:
            # Use cached projection. Previously the loads were commented out and
            # this branch failed on unassigned variables.
            train_image_embeddings, train_captions = load_cached('train_x', 'train_y')
        else:
            train_image_embeddings, train_captions = generate_image_embeddings_random(encoder, train_loader, projection_matrix, project_every=256)
        print(train_image_embeddings.shape)
        print(train_captions.shape)

    """
    Val & Test vectors
    """

    print("Projecting val & test vectors...")

    if USE_CACHED_MAPS:
        # New method: Predict first, cache them, then project
        if BUILD_CACHED_MAPS:
            generate_image_embeddings_save_every(encoder, 'val', val_loader, save_every=1024)
            generate_image_embeddings_save_every(encoder, 'test', test_loader, save_every=1024)

        val_image_embeddings, val_captions = random_project(len_val_loader, 'val', projection_matrix, save_every=1024, project_every=64)
        print(val_image_embeddings.shape)
        print(val_captions.shape)
        if SAVE_PROJECTED:
            np.save(cached_path('val_x'), val_image_embeddings)
            np.save(cached_path('val_y'), val_captions)
        test_image_embeddings, test_captions = random_project(len_test_loader, 'test', projection_matrix, save_every=1024, project_every=64)
        print(test_image_embeddings.shape)
        print(test_captions.shape)
        if SAVE_PROJECTED:
            np.save(cached_path('test_x'), test_image_embeddings)
            np.save(cached_path('test_y'), test_captions)

    else:
        if LOAD_RANDOM_PROJECTION_DATA:
            # Same fix as for the train split: actually load before printing.
            val_image_embeddings, val_captions = load_cached('val_x', 'val_y')
            test_image_embeddings, test_captions = load_cached('test_x', 'test_y')
            print(val_image_embeddings.shape)
            print(val_captions.shape)
            print(test_image_embeddings.shape)
            print(test_captions.shape)
        else:
            project_every = 256
            val_image_embeddings, val_captions = generate_image_embeddings_random(encoder, val_loader, projection_matrix, project_every=project_every)
            print(val_image_embeddings.shape)
            print(val_captions.shape)
            test_image_embeddings, test_captions = generate_image_embeddings_random(encoder, test_loader, projection_matrix, project_every=project_every)
            print(test_image_embeddings.shape)
            print(test_captions.shape)

    return train_image_embeddings, train_captions, val_image_embeddings, val_captions, test_image_embeddings, test_captions

In [7]:
class SimilaritySearch:
    """
    Common interface of the similarity-search models in this notebook.

    Subclasses override fit() to index the training vectors `xb` with their
    targets `yb`, and predict() to return the target of the nearest indexed
    vector for every query in `xq`. The base implementations do nothing.
    """

    def fit(self, xb, yb):
        """Index training vectors `xb` with targets `yb` (no-op here)."""
        return None

    def predict(self, xq):
        """Return the retrieved targets for queries `xq` (no-op here, returns None)."""
        return None

class OneNearestNeighbor(SimilaritySearch):
    """1-NN retrieval backed by scikit-learn's KNeighborsClassifier."""

    def __init__(self):
        self.knn = KNeighborsClassifier(n_neighbors=1)
        self.yb = None  # float32 copy of the training targets, set by fit()

    def fit(self, xb, yb):
        """Fit the classifier on `xb`, using each row's position as its class label."""
        row_ids = list(range(xb.shape[0]))
        self.knn.fit(xb.astype(np.float32), row_ids)
        self.yb = yb.astype(np.float32)

    def predict(self, xq):
        """Return, for each query row, the stored target of its nearest training row."""
        _, neighbor_ids = self.knn.kneighbors(xq.astype(np.float32))
        # Each entry of neighbor_ids holds the single nearest row index.
        retrieved = np.array([self.yb[row] for row in neighbor_ids])
        return retrieved.reshape(xq.shape[0], self.yb.shape[1])

class FaissFlatIndexL2CPU(SimilaritySearch):
    """1-NN retrieval backed by a faiss.IndexFlatL2 index."""

    def __init__(self):
        self.index = None  # built in fit()
        self.yb = None     # float32 copy of the training targets, set by fit()

    def fit(self, xb, yb):
        """Build a fresh IndexFlatL2 sized to xb's width and add every training row."""
        self.index = faiss.IndexFlatL2(xb.shape[1])
        self.index.add(xb.astype(np.float32))
        self.yb = yb.astype(np.float32)

    def predict(self, xq):
        """Return, for each query row, the stored target of its top-1 search hit."""
        _, neighbor_ids = self.index.search(xq.astype(np.float32), 1)
        retrieved = np.array([self.yb[row] for row in neighbor_ids])
        return retrieved.reshape(xq.shape[0], self.yb.shape[1])

class FaissFlatIndexL2GPU(SimilaritySearch):
    """1-NN retrieval backed by a faiss.IndexFlatL2 index moved to GPU device 0."""

    def __init__(self):
        self.res = faiss.StandardGpuResources()
        self.gpu_index = None  # built in fit()
        self.yb = None         # float32 copy of the training targets, set by fit()

    def fit(self, xb, yb):
        """Build a fresh IndexFlatL2, transfer it to GPU 0 and add every training row."""
        cpu_index = faiss.IndexFlatL2(xb.shape[1])
        self.gpu_index = faiss.index_cpu_to_gpu(self.res, 0, cpu_index)
        self.gpu_index.add(xb.astype(np.float32))
        self.yb = yb.astype(np.float32)

    def predict(self, xq):
        """Return, for each query row, the stored target of its top-1 search hit."""
        _, neighbor_ids = self.gpu_index.search(xq.astype(np.float32), 1)
        retrieved = np.array([self.yb[row] for row in neighbor_ids])
        return retrieved.reshape(xq.shape[0], self.yb.shape[1])

class FaissHNSW32(SimilaritySearch):
    """1-NN retrieval backed by faiss.IndexHNSWFlat constructed with parameter 32."""

    def __init__(self):
        self.index = None  # built in fit()
        self.yb = None     # float32 copy of the training targets, set by fit()

    def fit(self, xb, yb):
        """Build a fresh IndexHNSWFlat(dim, 32) and add every training row."""
        self.index = faiss.IndexHNSWFlat(xb.shape[1], 32)
        self.index.add(xb.astype(np.float32))
        self.yb = yb.astype(np.float32)

    def predict(self, xq):
        """Return, for each query row, the stored target of its top-1 search hit."""
        _, neighbor_ids = self.index.search(xq.astype(np.float32), 1)
        retrieved = np.array([self.yb[row] for row in neighbor_ids])
        return retrieved.reshape(xq.shape[0], self.yb.shape[1])

class FaissLSH32(SimilaritySearch):
    """1-NN retrieval backed by faiss.IndexLSH constructed with parameter 32."""

    def __init__(self):
        self.index = None  # built in fit()
        self.yb = None     # float32 copy of the training targets, set by fit()

    def fit(self, xb, yb):
        """Build a fresh IndexLSH(dim, 32) and add every training row."""
        self.index = faiss.IndexLSH(xb.shape[1], 32)
        self.index.add(xb.astype(np.float32))
        self.yb = yb.astype(np.float32)

    def predict(self, xq):
        """Return, for each query row, the stored target of its top-1 search hit."""
        _, neighbor_ids = self.index.search(xq.astype(np.float32), 1)
        retrieved = np.array([self.yb[row] for row in neighbor_ids])
        return retrieved.reshape(xq.shape[0], self.yb.shape[1])

In [8]:
class SimilaritySearchCoarse2Fine(SimilaritySearch):
    """
    Base class of the two-stage ("coarse to fine") similarity-search models.

    It only stores the two extra inputs such models need; predict() in this
    notebook uses isinstance checks against this class to decide whether an
    image loader must be supplied.
    """

    def assign_labels(self, train_labels):
        """Store the per-training-sample labels as self.train_labels."""
        self.train_labels = train_labels

    def assign_encoder(self, encoder):
        """Store the image encoder as self.encoder."""
        self.encoder = encoder

class OneNearestNeighborCoarse2Fine(SimilaritySearchCoarse2Fine):
    """
    Two-stage 1-NN retrieval.

    Coarse step: training samples are bucketed by their quantized label vector
    (joined into a string key). Fine step: a query is matched with 1-NN only
    against the bucket whose key equals the label predicted for the query image.
    When that bucket does not exist, the buckets whose key differs in exactly
    one position are pooled and searched instead.

    Training labels come from the notebook-level `train_probs_quantized`; they
    are assumed to be aligned row-for-row with the `xb`/`yb` passed to fit().
    """

    def __init__(self):
        # label key -> list of training feature vectors carrying that label
        self.map = defaultdict(list)
        # label key -> list of reports, aligned index-for-index with self.map[key]
        self.reports = defaultdict(list)
        # label key -> 1-NN model fitted on self.map[key]
        self.knns = dict()
        self.assign_labels(train_probs_quantized)

    def fit(self, xb, yb):
        """
        Bucket the training set by label and fit one 1-NN model per bucket.

        Parameters:
            xb (numpy.array): Training feature vectors, one row per sample.
            yb (numpy.array): Training reports, one row per sample.
        """
        # Start from a clean slate so that calling fit() again re-fits instead
        # of appending a second copy of the training data to every bucket.
        self.map.clear()
        self.reports.clear()
        self.knns.clear()

        binary_strings = [''.join(label.astype(str)) for label in self.train_labels]
        for i, label in enumerate(binary_strings):
            self.map[label].append(xb[i])
            self.reports[label].append(yb[i])

        for label, feat_maps_list in self.map.items():
            # Class labels are positions inside the bucket, so the returned
            # neighbour index addresses self.reports[label] directly.
            indices = list(range(len(feat_maps_list)))
            self.knns[label] = KNeighborsClassifier(n_neighbors=1)
            self.knns[label].fit(np.array(feat_maps_list).astype(np.float32), indices)

    def predict(self, xq, x_image):
        """
        Retrieve one report per query.

        Parameters:
            xq (numpy.array):       Query feature vectors, one row per sample.
            x_image (torch.Tensor): The images belonging to `xq`, fed to the encoder
                                    to obtain the label that selects the bucket.

        Returns:
            numpy.array: One retrieved report per query.

        Raises:
            ValueError: If neither the predicted label nor any label at edit
                        distance 1 was seen during fit().
        """
        # Prefer the encoder handed over via assign_encoder(); fall back to the
        # notebook-level `encoder` so instances that never had one assigned
        # keep working as before.
        active_encoder = getattr(self, 'encoder', None)
        if active_encoder is None:
            active_encoder = encoder

        # Pure inference: eval mode (generate_probs_save_every also calls
        # encoder.eval()) and no autograd graph.
        active_encoder.eval()
        with torch.no_grad():
            _, probs = active_encoder(x_image)
        labels = quantize_probs(probs.detach().cpu().numpy())
        labels = [''.join(label.astype(str)) for label in labels]

        results = []
        for label, feat_map in zip(labels, xq):
            query = feat_map.astype(np.float32).reshape(1, -1)

            if label in self.knns:
                _, index = self.knns[label].kneighbors(query)
                yq = np.array(self.reports[label][index[0][0]]).reshape(-1)
            else:
                # Exact label unseen in training: pool the buckets of all seen
                # labels that differ from it in exactly one position.
                similar_feature_maps = []
                similar_reports = []
                for sim_label in self.get_similar_binaries(label):
                    if sim_label in self.map:
                        similar_feature_maps.extend(self.map[sim_label])
                        similar_reports.extend(self.reports[sim_label])

                if not similar_feature_maps:
                    # Previously this surfaced as an opaque ValueError raised
                    # from inside scikit-learn when fitting on an empty list.
                    raise ValueError(
                        f"No training samples found for label {label} "
                        "or for any label at edit distance 1"
                    )

                # Temporary 1-NN over the pooled candidates; float32 to match
                # the per-bucket models built in fit().
                temp_knn = KNeighborsClassifier(n_neighbors=1)
                temp_knn.fit(
                    np.array(similar_feature_maps).astype(np.float32),
                    list(range(len(similar_feature_maps))),
                )
                _, index = temp_knn.kneighbors(query)
                yq = np.array(similar_reports[index[0][0]]).reshape(-1)

            results.append(yq)

        return np.array(results)

    def get_similar_binaries(self, binary_str: str):
        """
        Returns a list of similar binary string with edit distance = 1

        Each result flips exactly one position: a character whose integer value
        is 0 becomes '1', any other digit becomes '0'.
        """
        return [
            binary_str[:i] + ('0' if int(bit) else '1') + binary_str[i+1:]
            for i, bit in enumerate(binary_str)
        ]

In [9]:
def predict(model, embeddings, batch_size=64, image_loader=None):
    """
    Predict captions given a similarity search model and image embeddings.

    If the model is Coarse2Fine, an image_loader must be given so the model can
    run its encoder on the images in the coarse searching step. In that case the
    embeddings are batched with image_loader.batch_size (batch_size is ignored)
    so that embedding batches and image batches stay aligned.

    Only the time spent inside model.predict() is accumulated; data loading is
    not part of the reported time.

    Parameters:
        model (SimilaritySearch):                   Similarity search model to evaluate
        embeddings (numpy.array):                   Image embeddings of size (sample_size, project_dim)
        batch_size (int):                           How many samples to do sim search per iteration
        image_loader (torch.utils.data.DataLoader): DataLoader that can load images for encoder

    Returns:
        Predicted captions (numpy.array)
        Total time to predict (float)

    Raises:
        AssertionError: If a Coarse2Fine model is given without image_loader (or
                        vice versa), or if the two loaders differ in length.
    """

    captions = []
    total_time = 0.0

    # Check if model is Coarse2Fine
    # image_loader must be given if so
    is_coarse2fine = isinstance(model, SimilaritySearchCoarse2Fine)
    has_image_loader = image_loader is not None
    assert is_coarse2fine == has_image_loader, (
        f"isinstance={is_coarse2fine} but image_loader != None is {has_image_loader}"
    )

    if is_coarse2fine and has_image_loader:
        print(f"forced batch size: {image_loader.batch_size}")
        data_loader = DataLoader(
            embeddings,
            batch_size=image_loader.batch_size,
            num_workers=0,
            pin_memory=True,
        )

        assert len(data_loader) == len(image_loader), (
            f"{len(data_loader)=}, {len(image_loader)=}"
        )

        for batch, (image, _, _) in tqdm(zip(data_loader, image_loader), total=len(data_loader)):
            # perf_counter is monotonic and high-resolution, unlike time.time().
            start = time.perf_counter()
            yq = model.predict(batch.numpy(), image.cuda())
            total_time += time.perf_counter() - start
            captions.extend(yq)
    else:
        data_loader = DataLoader(
            embeddings,
            batch_size=batch_size,
            num_workers=0,
            pin_memory=True,
        )
        for batch in tqdm(data_loader):
            start = time.perf_counter()
            yq = model.predict(batch.numpy())
            total_time += time.perf_counter() - start
            captions.extend(yq)

    # One row per input sample.
    captions = np.array(captions).reshape(embeddings.shape[0], -1)

    return captions, total_time

def evaluate_clinical(true_captions, pred_captions, batch_size):
    """
    Evaluate clinical accuracy of predicted reports against the ground truth.

    Both caption sets are labelled batch by batch with `chexpert(batch, tokenizer)`
    and the two resulting label tables are handed to `evaluation_matrix`.

    Parameters:
        true_captions (Iterable): Ground truth
        pred_captions (Iterable): Predicted reports
        batch_size (int): Number of captions labelled per chexpert call

    Returns:
        Whatever evaluation_matrix(true_df, pred_df) returns
        (NOTE(review): presumably per-disease precision/recall/F1 as a
        pandas.DataFrame -- confirm against test.evaluation_matrix).
    """

    def make_loader(captions):
        return DataLoader(
            captions,
            batch_size=batch_size,
            num_workers=0,
            pin_memory=True
        )

    print(true_captions.shape[0], pred_captions.shape[0])
    true_loader = make_loader(true_captions)
    pred_loader = make_loader(pred_captions)

    # Label the ground truth first, then the predictions.
    true_frames = [chexpert(t, tokenizer) for t in tqdm(true_loader)]
    pred_frames = [chexpert(p, tokenizer) for p in tqdm(pred_loader)]

    true_df = pd.concat(true_frames).reset_index(drop=True)
    pred_df = pd.concat(pred_frames).reset_index(drop=True)
    return evaluation_matrix(true_df, pred_df)

# def calculate_bleu_scores(true_captions, pred_captions):
#     """
#     Calculates BLEU 1-4 scores based on NLTK functionality

#     Parameters:
#         true_captions: List of reference sentences
#         pred_captions: List of generated sentences

#     Returns:
#         bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores

#     """
#     # Put each sentence in references in a list
#     # Because nltk accepts list of possible references for each sample
#     true_captions = [[e.split()] for e in true_captions]
#     pred_captions = [e.split() for e in pred_captions]

#     bleu_1 = np.round(corpus_bleu(true_captions, pred_captions, weights=(1.0, 0., 0., 0.)), decimals=4)
#     bleu_2 = np.round(corpus_bleu(true_captions, pred_captions, weights=(0.50, 0.50, 0., 0.)), decimals=4)
#     bleu_3 = np.round(corpus_bleu(true_captions, pred_captions, weights=(0.33, 0.33, 0.33, 0.)), decimals=4)
#     bleu_4 = np.round(corpus_bleu(true_captions, pred_captions, weights=(0.25, 0.25, 0.25, 0.25)), decimals=4)
#     return bleu_1, bleu_2, bleu_3, bleu_4 

def calculate_nlg_metrics(true_captions, pred_captions):
    """
    Compute the NLG metrics of the notebook-level `nlgeval` object
    (BLEU 1-4, ROUGE_L and CIDEr in the recorded runs).

    Parameters:
        true_captions: List of reference sentences
        pred_captions: List of generated sentences

    Returns:
        metrics_dict (dictionary): Metric name -> score
    """
    # compute_metrics takes a list of reference lists; there is a single
    # reference per sample here, so wrap the references once.
    references = [true_captions]
    return nlgeval.compute_metrics(references, pred_captions)

def write_time_file(model, project_dim, seed, val_time, test_time):
    """
    Append one CSV row with the prediction times of a run to `time_file_path`.

    Row layout: model class name, current timestamp, project_dim, seed,
    val_time, test_time.
    """
    with open(time_file_path, 'a') as f:
        fields = [type(model).__name__, pd.Timestamp.now(), project_dim, seed, val_time, test_time]
        f.write(','.join(f"{field}" for field in fields) + "\n")

def write_nlg_file(model, project_dim, seed, split, metrics_dict):
    """
    Print one CSV row with the NLG metrics of a run and append it to `nlg_file_path`.

    Row layout: model class name, current timestamp, project_dim, seed, split,
    Bleu_1, Bleu_2, Bleu_3, Bleu_4, ROUGE_L, CIDEr.

    Raises:
        KeyError: If metrics_dict lacks one of the six metric keys.
    """
    metric_keys = ('Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'ROUGE_L', 'CIDEr')

    with open(nlg_file_path, 'a') as f:
        fields = [type(model).__name__, pd.Timestamp.now(), project_dim, seed, split]
        fields.extend(metrics_dict[key] for key in metric_keys)
        row = ','.join(map(str, fields))
        print(row)
        f.write(row + '\n')

def evaluate_all(model, val_image_embeddings, val_captions, test_image_embeddings, test_captions, seed, project_dim):
    """
    Evaluate the model on the val set and then on the test set.

    For each split: predict reports, print the prediction time, decode predicted
    and true reports with the tokenizer, compute the NLG metrics, print them and
    append them to the NLG results file via write_nlg_file.

    Clinical-accuracy evaluation (evaluate_clinical) and the timing log
    (write_time_file) are currently not invoked from here.

    Parameters:
        model (SimilaritySearch):            Similarity search model to evaluate
        val_image_embeddings (numpy.array):  Image embeddings of size (sample_size, project_dim)
        val_captions (numpy.array):          Encoded captions of size (sample_size, max_caption_len)
        test_image_embeddings (numpy.array): Image embeddings of size (sample_size, project_dim)
        test_captions (numpy.array):         Encoded captions of size (sample_size, max_caption_len)
        seed (int):                          (For logging purpose only) Seed used to random project
        project_dim (int):                   Random project dimension
    """
    # Coarse2Fine models need an image loader so their encoder can label the images.
    needs_images = isinstance(model, SimilaritySearchCoarse2Fine)

    splits = (
        ('val', val_image_embeddings, val_captions),
        ('test', test_image_embeddings, test_captions),
    )

    for split, image_embeddings, true_captions in splits:
        if needs_images:
            image_loader = val_loader if split == 'val' else test_loader
            predicted_reports, elapsed = predict(model, image_embeddings, batch_size=1024, image_loader=image_loader)
        else:
            predicted_reports, elapsed = predict(model, image_embeddings, batch_size=1024)
        print(f"Time taken to predict {split}: {elapsed:.3f} seconds")

        # NLG Metrics
        decoded_predicted_reports = tokenizer.decode(predicted_reports)
        decoded_true_reports = tokenizer.decode(true_captions)
        metrics_dict = calculate_nlg_metrics(decoded_true_reports, decoded_predicted_reports)
        print(metrics_dict)
        write_nlg_file(model, project_dim, seed, split, metrics_dict)

In [10]:
def e2e_benchmark(model_class, seed, project_dim, load=False):
    """
    Run one benchmark end to end: obtain the projected vectors, fit the model
    on the train split, and evaluate it on the val and test splits
    (evaluate_all writes the results to file).

    Parameters:
        model_class (type): SimilaritySearch subclass, instantiated without arguments
        seed (int):         Seed of the random projection
        project_dim (int):  Dimension of the random projection
        load (bool):        Forwarded to get_vectors to reuse saved projections
    """
    vectors = get_vectors(seed, project_dim, load=load)
    train_x, train_y, val_x, val_y, test_x, test_y = vectors

    model = model_class()
    # Two-stage models additionally get the image encoder.
    if isinstance(model, SimilaritySearchCoarse2Fine):
        model.assign_encoder(encoder)

    model.fit(train_x, train_y)
    evaluate_all(model, val_x, val_y, test_x, test_y, seed, project_dim)

In [11]:
# models = [OneNearestNeighbor, OneNearestNeighborCoarse2Fine]
# seeds = [2000, 3000, 4000]
# dims = [128, 256, 512, 1024, 2048,]

# for model_class in models:
#     for dim in dims:
#         for seed in seeds:
#             # path_to_check = f'results/{model_class.__name__}_test_results_{dim}_{seed}.csv'
#             # exist = os.path.exists(path_to_check)
#             # if not exist:
#             print(model_class.__name__, dim, seed)
#             e2e_benchmark(model_class, seed=seed, project_dim=dim, load=True)

In [12]:
# Benchmark sweep: every (model class, projection dim, seed) combination is
# run end to end through e2e_benchmark.
models = [OneNearestNeighbor, OneNearestNeighborCoarse2Fine]
seeds = [0]
dims = [4096, 8192]

for model_class in models:
    for dim in dims:
        for seed in seeds:
            # Disabled "skip if a result file already exists" check, kept for reference:
            # path_to_check = f'results/{model_class.__name__}_test_results_{dim}_{seed}.csv'
            # exist = os.path.exists(path_to_check)
            # if not exist:
            print(model_class.__name__, dim, seed)
            # load=True lets get_vectors reuse projections already saved on disk.
            e2e_benchmark(model_class, seed=seed, project_dim=dim, load=True)

OneNearestNeighbor 4096 0


100%|██████████| 3/3 [01:28<00:00, 29.57s/it]


Time taken to predict val: 86.169 seconds
{'Bleu_1': 0.3831014493200462, 'Bleu_2': 0.2261327925554635, 'Bleu_3': 0.14586729140405622, 'Bleu_4': 0.09953168578521623, 'ROUGE_L': 0.2762005955467192, 'CIDEr': 0.15458831320661218}
OneNearestNeighbor,2022-05-03 15:19:21.624131,4096,0,val,0.3831014493200462,0.2261327925554635,0.14586729140405622,0.09953168578521623,0.2762005955467192,0.15458831320661218


100%|██████████| 4/4 [02:15<00:00, 33.92s/it]


Time taken to predict test: 135.228 seconds
{'Bleu_1': 0.3447961682021063, 'Bleu_2': 0.18715684503239993, 'Bleu_3': 0.10987197669793941, 'Bleu_4': 0.06877964076958654, 'ROUGE_L': 0.23495707900266835, 'CIDEr': 0.08226999209838405}
OneNearestNeighbor,2022-05-03 15:21:55.772275,4096,0,test,0.3447961682021063,0.18715684503239993,0.10987197669793941,0.06877964076958654,0.23495707900266835,0.08226999209838405
OneNearestNeighbor 8192 0


100%|██████████| 3/3 [02:53<00:00, 57.94s/it]


Time taken to predict val: 169.451 seconds
{'Bleu_1': 0.3871605490898203, 'Bleu_2': 0.22728770372520876, 'Bleu_3': 0.1460331847338342, 'Bleu_4': 0.09971030849304723, 'ROUGE_L': 0.2759508156633303, 'CIDEr': 0.15275302515361233}
OneNearestNeighbor,2022-05-03 15:29:49.586626,8192,0,val,0.3871605490898203,0.22728770372520876,0.1460331847338342,0.09971030849304723,0.2759508156633303,0.15275302515361233


100%|██████████| 4/4 [04:19<00:00, 64.77s/it]


Time taken to predict test: 258.158 seconds
{'Bleu_1': 0.3454086447677871, 'Bleu_2': 0.1883578218631788, 'Bleu_3': 0.11115490481256213, 'Bleu_4': 0.07010046875111835, 'ROUGE_L': 0.23542945147266883, 'CIDEr': 0.07968202935293578}
OneNearestNeighbor,2022-05-03 15:34:26.001236,8192,0,test,0.3454086447677871,0.1883578218631788,0.11115490481256213,0.07010046875111835,0.23542945147266883,0.07968202935293578
OneNearestNeighborCoarse2Fine 4096 0
forced batch size: 16


100%|██████████| 131/131 [22:09<00:00, 10.15s/it]


Time taken to predict val: 1319.073 seconds
{'Bleu_1': 0.3867615369373467, 'Bleu_2': 0.22814281896641755, 'Bleu_3': 0.14698385870330452, 'Bleu_4': 0.10055462105665386, 'ROUGE_L': 0.27727018051677726, 'CIDEr': 0.1585761137936198}
OneNearestNeighborCoarse2Fine,2022-05-03 15:58:01.377536,4096,0,val,0.3867615369373467,0.22814281896641755,0.14698385870330452,0.10055462105665386,0.27727018051677726,0.1585761137936198
forced batch size: 16


100%|██████████| 229/229 [13:43<00:00,  3.60s/it]


Time taken to predict test: 812.476 seconds
{'Bleu_1': 0.34280064358632384, 'Bleu_2': 0.18643096850531704, 'Bleu_3': 0.10990971009637332, 'Bleu_4': 0.0690075559338383, 'ROUGE_L': 0.2341157421352432, 'CIDEr': 0.07322299716053964}
OneNearestNeighborCoarse2Fine,2022-05-03 16:11:59.562597,4096,0,test,0.34280064358632384,0.18643096850531704,0.10990971009637332,0.0690075559338383,0.2341157421352432,0.07322299716053964
OneNearestNeighborCoarse2Fine 8192 0
forced batch size: 16


100%|██████████| 131/131 [44:16<00:00, 20.28s/it]


Time taken to predict val: 2649.305 seconds
{'Bleu_1': 0.38634549491303666, 'Bleu_2': 0.2280552271136684, 'Bleu_3': 0.1470700774127665, 'Bleu_4': 0.10064868763305572, 'ROUGE_L': 0.27723266044993267, 'CIDEr': 0.15873037363879317}
OneNearestNeighborCoarse2Fine,2022-05-03 16:59:43.407265,8192,0,val,0.38634549491303666,0.2280552271136684,0.1470700774127665,0.10064868763305572,0.27723266044993267,0.15873037363879317
forced batch size: 16


100%|██████████| 229/229 [26:55<00:00,  7.06s/it]


Time taken to predict test: 1602.994 seconds
{'Bleu_1': 0.3434323042117288, 'Bleu_2': 0.18668214252042056, 'Bleu_3': 0.11002225961032984, 'Bleu_4': 0.0695910268170324, 'ROUGE_L': 0.23515237991127258, 'CIDEr': 0.07663434906770622}
OneNearestNeighborCoarse2Fine,2022-05-03 17:26:54.667768,8192,0,test,0.3434323042117288,0.18668214252042056,0.11002225961032984,0.0695910268170324,0.23515237991127258,0.07663434906770622


In [12]:
def chexpertify(captions, batch_size):
    """
    Run the CheXpert labeler over `captions` and return labels with the text alongside.

    The captions are fed to `chexpert(batch, tokenizer)` in batches of
    `batch_size`. The per-batch results (presumably DataFrames of CheXpert
    labels, one row per caption -- confirm in chexpert.py) are stacked into a
    single frame with a fresh 0..n-1 index, and a `captions` column holding
    `tokenizer.decode(captions)` is attached.

    Example usage:

        true_df = chexpertify(test_captions, batch_size=12)
        pred_df = chexpertify(test_predicted_reports, batch_size=12)

    """
    batches = DataLoader(captions, batch_size=batch_size, num_workers=0, pin_memory=True)

    # One labeler call per batch; tqdm only reports progress.
    per_batch_labels = [chexpert(batch, tokenizer) for batch in tqdm(batches)]

    labelled = pd.concat(per_batch_labels).reset_index(drop=True)
    # Decoding is done once on the full input rather than per batch.
    labelled['captions'] = tokenizer.decode(captions)
    return labelled

def _chexpertify_split(model, split, image_embeddings, true_captions, project_dim, seed, predict_kwargs):
    """
    Predict reports for one split, CheXpert-label predictions and references, and save both as CSV.

    `split` ('val' or 'test') is used in the timing message and in the output
    file names. `predict_kwargs` holds any extra keyword arguments for `predict`.
    """
    predicted_reports, elapsed = predict(model, image_embeddings, batch_size=1024, **predict_kwargs)
    print(f"Time taken to predict {split}: {elapsed:.3f} seconds")

    pred_df = chexpertify(predicted_reports, batch_size=12)
    true_df = chexpertify(true_captions, batch_size=12)

    # Save Results to csv
    model_name = type(model).__name__
    pred_df.to_csv(
        f'chexpertify/{model_name}_{split}_pred_df_{project_dim}_{seed}.csv', index=False
    )
    true_df.to_csv(
        f'chexpertify/{model_name}_{split}_true_df_{project_dim}_{seed}.csv', index=False
    )
    print(pred_df.head())
    print(true_df.head())


def e2e_chexpertify(model_class, seed, project_dim, load=False):
    """
    Perform vector acquisition, fit the retrieval model, predict reports for the
    val and test sets, and save the CheXpert labels of predictions and references.

    Args:
        model_class: zero-argument callable returning a model with `fit`; instances of
            SimilaritySearchCoarse2Fine additionally get the notebook-level `encoder` assigned.
        seed: forwarded to `get_vectors` and embedded in the output file names.
        project_dim: forwarded to `get_vectors` and embedded in the output file names.
        load: forwarded to `get_vectors` as `load=`.

    Returns:
        None. Four CSV files are written under `chexpertify/`, named
        `{ModelName}_{val|test}_{pred|true}_df_{project_dim}_{seed}.csv`.

    Relies on the notebook globals `encoder`, `val_loader` and `test_loader`,
    which are only read for Coarse2Fine models.
    """
    # Make sure the output directory exists *before* the expensive predict step,
    # so a fresh checkout does not lose the work to a failing to_csv.
    os.makedirs('chexpertify', exist_ok=True)

    # Get vector
    train_image_embeddings, train_captions, val_image_embeddings, val_captions, test_image_embeddings, test_captions = get_vectors(seed, project_dim, load=load)

    # Model selection
    model = model_class()
    needs_image_loader = isinstance(model, SimilaritySearchCoarse2Fine)
    if needs_image_loader:
        model.assign_encoder(encoder)

    # Train
    model.fit(train_image_embeddings, train_captions)

    # Coarse2Fine models needs an image loader for the encoder to predict 14 diseases.
    # The loaders are looked up lazily so other models do not require them to be defined.
    _chexpertify_split(
        model, 'val', val_image_embeddings, val_captions, project_dim, seed,
        predict_kwargs={'image_loader': val_loader} if needs_image_loader else {},
    )
    _chexpertify_split(
        model, 'test', test_image_embeddings, test_captions, project_dim, seed,
        predict_kwargs={'image_loader': test_loader} if needs_image_loader else {},
    )

def load_chexpertify_results(model_class, seed, project_dim):
    val_pred_df = pd.read_csv(f'chexpertify/{type(model_class()).__name__}_val_pred_df_{project_dim}_{seed}.csv')
    val_true_df = pd.read_csv(f'chexpertify/{type(model_class()).__name__}_val_true_df_{project_dim}_{seed}.csv')
    test_pred_df = pd.read_csv(f'chexpertify/{type(model_class()).__name__}_test_pred_df_{project_dim}_{seed}.csv')
    test_true_df = pd.read_csv(f'chexpertify/{type(model_class()).__name__}_test_true_df_{project_dim}_{seed}.csv')
    return val_pred_df, val_true_df, test_pred_df, test_true_df

In [13]:
# Run the CheXpert labelling pipeline for OneNearestNeighbor with seed 0 at
# project_dim 128 and 256. load=True is forwarded to get_vectors (presumably
# to reuse previously saved embeddings -- confirm in get_vectors).
e2e_chexpertify(OneNearestNeighbor, 0, 128, load=True)
e2e_chexpertify(OneNearestNeighbor, 0, 256, load=True)

100%|██████████| 3/3 [00:09<00:00,  3.13s/it]


Time taken to predict val: 9.380 seconds


100%|██████████| 174/174 [00:28<00:00,  6.01it/s]
100%|██████████| 174/174 [00:25<00:00,  6.92it/s]


   Enlarged Cardiomediastinum  Cardiomegaly  Lung Opacity  Lung Lesion  Edema  \
0                         1.0           1.0           0.0          0.0    0.0   
1                         0.0           0.0           0.0          0.0    0.0   
2                         1.0           1.0           1.0          0.0    0.0   
3                         1.0           1.0           1.0          0.0    1.0   
4                         1.0           1.0           1.0          0.0    1.0   

   Consolidation  Pneumonia  Atelectasis  Pneumothorax  Pleural Effusion  \
0            0.0        0.0          0.0           0.0               0.0   
1            0.0        0.0          0.0           0.0               0.0   
2            1.0        1.0          0.0           0.0               1.0   
3            1.0        1.0          1.0           0.0               1.0   
4            1.0        1.0          1.0           0.0               1.0   

   Pleural Other  Fracture  Support Devices  No Finding 

100%|██████████| 4/4 [00:16<00:00,  4.19s/it]


Time taken to predict test: 16.728 seconds


100%|██████████| 305/305 [00:56<00:00,  5.40it/s]
100%|██████████| 305/305 [00:50<00:00,  6.05it/s]


   Enlarged Cardiomediastinum  Cardiomegaly  Lung Opacity  Lung Lesion  Edema  \
0                         0.0           0.0           1.0          0.0    0.0   
1                         1.0           1.0           1.0          0.0    0.0   
2                         0.0           0.0           0.0          0.0    0.0   
3                         0.0           0.0           0.0          0.0    0.0   
4                         0.0           0.0           0.0          0.0    0.0   

   Consolidation  Pneumonia  Atelectasis  Pneumothorax  Pleural Effusion  \
0            0.0        0.0          1.0           0.0               0.0   
1            0.0        0.0          1.0           0.0               0.0   
2            0.0        0.0          0.0           0.0               0.0   
3            0.0        0.0          0.0           0.0               0.0   
4            0.0        0.0          0.0           0.0               0.0   

   Pleural Other  Fracture  Support Devices  No Finding 

100%|██████████| 3/3 [00:11<00:00,  3.78s/it]


Time taken to predict val: 11.335 seconds


100%|██████████| 174/174 [00:29<00:00,  5.80it/s]
100%|██████████| 174/174 [00:25<00:00,  6.81it/s]


   Enlarged Cardiomediastinum  Cardiomegaly  Lung Opacity  Lung Lesion  Edema  \
0                         1.0           1.0           1.0          0.0    1.0   
1                         1.0           1.0           1.0          0.0    0.0   
2                         1.0           1.0           1.0          0.0    1.0   
3                         1.0           1.0           1.0          0.0    0.0   
4                         1.0           1.0           1.0          0.0    1.0   

   Consolidation  Pneumonia  Atelectasis  Pneumothorax  Pleural Effusion  \
0            0.0        0.0          0.0           0.0               0.0   
1            1.0        1.0          1.0           0.0               1.0   
2            1.0        1.0          1.0           0.0               1.0   
3            0.0        0.0          1.0           0.0               1.0   
4            1.0        1.0          1.0           0.0               1.0   

   Pleural Other  Fracture  Support Devices  No Finding 

100%|██████████| 4/4 [00:19<00:00,  4.95s/it]


Time taken to predict test: 19.770 seconds


100%|██████████| 305/305 [00:55<00:00,  5.47it/s]
100%|██████████| 305/305 [00:50<00:00,  6.07it/s]


   Enlarged Cardiomediastinum  Cardiomegaly  Lung Opacity  Lung Lesion  Edema  \
0                         0.0           0.0           0.0          0.0    0.0   
1                         0.0           0.0           0.0          0.0    0.0   
2                         1.0           1.0           1.0          1.0    1.0   
3                         0.0           0.0           0.0          0.0    0.0   
4                         0.0           0.0           0.0          0.0    0.0   

   Consolidation  Pneumonia  Atelectasis  Pneumothorax  Pleural Effusion  \
0            0.0        0.0          0.0           0.0               0.0   
1            0.0        0.0          0.0           0.0               0.0   
2            1.0        1.0          1.0           0.0               1.0   
3            0.0        1.0          0.0           0.0               0.0   
4            0.0        0.0          0.0           0.0               0.0   

   Pleural Other  Fracture  Support Devices  No Finding 

In [31]:
# Read back the saved CheXpert CSVs for OneNearestNeighborCoarse2Fine (seed 0, project_dim 128).
# NOTE(review): the files must already exist under chexpertify/; the nearby cells only run
# e2e_chexpertify for OneNearestNeighbor -- confirm where the Coarse2Fine CSVs are produced.
val_pred_df, val_true_df, test_pred_df, test_true_df = load_chexpertify_results(OneNearestNeighborCoarse2Fine, 0, 128)

In [32]:
# Inspect the CheXpert labels of the predicted validation reports.
val_pred_df

Unnamed: 0,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,captions
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,rightsided terminates in the low svc without e...
1,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,the cardiac silhouette is mildly enlarged but ...
2,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,compared with prior there has been no signific...
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,blunting at the left costophrenic may represen...
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,there is no change . and rightsided chest tube...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,multiple bilateral focal concerning for pneumo...
2081,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,there is a persistent lower lingular opacifica...
2082,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,the patient is status post right upper lobe re...
2083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,frontal and lateral views of the chest were ob...


### Evaluate predictions from a remote server run

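Requirements: `predicted_val_captions.npy` and `predicted_test_captions.npy` must be present in `remote_server/`, and the ground-truth captions in `mimic_cxr/raw_embeddings/{val,test}/captions_0.npy`.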

In [8]:
# Score the validation predictions produced on the remote server.
pred_captions = np.load('remote_server/predicted_val_captions.npy')
# Keep only the first `until` ground-truth captions so they line up with the predictions.
# NOTE(review): assumes the predictions correspond to a prefix of captions_0.npy in the same order -- confirm.
until = pred_captions.shape[0]
true_captions = np.load('mimic_cxr/raw_embeddings/val/captions_0.npy')[:until]
evaluate_clinical(true_captions, pred_captions, batch_size=12)

100%|██████████| 174/174 [00:52<00:00,  3.29it/s]
100%|██████████| 174/174 [01:05<00:00,  2.64it/s]


Metrics,Recall,Precision,F1
Enlarged Cardiomediastinum,0.729483,0.658436,0.692141
Cardiomegaly,0.657244,0.613861,0.634812
Lung Opacity,0.744152,0.712885,0.728183
Lung Lesion,0.266355,0.360759,0.306452
Edema,0.603448,0.550562,0.575793
Consolidation,0.620123,0.683258,0.650161
Pneumonia,0.438119,0.517544,0.474531
Atelectasis,0.624299,0.660079,0.641691
Pneumothorax,0.148148,0.171429,0.15894
Pleural Effusion,0.640426,0.650108,0.64523


In [12]:
# Score the test predictions produced on the remote server.
pred_captions = np.load('remote_server/predicted_test_captions.npy')
# Keep only the first `until` ground-truth captions so they line up with the predictions.
# NOTE(review): assumes the predictions correspond to a prefix of captions_0.npy in the same order -- confirm.
until = pred_captions.shape[0]
true_captions = np.load('mimic_cxr/raw_embeddings/test/captions_0.npy')[:until]
evaluate_clinical(true_captions, pred_captions, batch_size=12)

100%|██████████| 42/42 [00:15<00:00,  2.72it/s]
100%|██████████| 42/42 [00:09<00:00,  4.50it/s]


Metrics,Recall,Precision,F1
Enlarged Cardiomediastinum,0.746429,0.741135,0.743772
Cardiomegaly,0.691304,0.679487,0.685345
Lung Opacity,0.784722,0.733766,0.758389
Lung Lesion,0.2,0.367647,0.259067
Edema,0.565934,0.559783,0.562842
Consolidation,0.480952,0.554945,0.515306
Pneumonia,0.413408,0.544118,0.469841
Atelectasis,0.610465,0.486111,0.541237
Pneumothorax,0.071429,0.142857,0.095238
Pleural Effusion,0.527607,0.530864,0.529231
