In [27]:
import logging
import math
import os
from pathlib import Path
import json


import torch
import pandas as pd
import hydra
from omegaconf import OmegaConf
from rich.pretty import pprint

from src.data.data_pipeline import data_pipeline
from src.factories import (
    get_callbacks,
    get_dataloaders,
    get_datasets,
    get_lookups,
    get_lr_scheduler,
    get_metric_collections,
    get_model,
    get_optimizer,
    get_text_encoder,
    get_transform,
)
from src.trainer.trainer import Trainer
from src.utils.seed import set_seed


In [28]:
# --- Parameters for inference ---
# Name of the sub-directory (inside the checkpoint folder) of the model to load
model_to_load = "mimic_axa_cpt_hierarchical"
# Number of top-scoring targets reported per input
k = 5

# Make all necessary imports

## Import config

In [29]:
# Folder containing one sub-directory per saved model
dir_all_models = Path("files")
# Sub-directory of the model selected through `model_to_load`
model_checkpoints = dir_all_models / model_to_load

# Read the configuration file stored in that sub-directory
cfg = OmegaConf.load(model_checkpoints / "config.yaml")

In [30]:
# Only derive CUDA_VISIBLE_DEVICES from the config when the environment does
# not already define it, so an externally exported value always wins.
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    if cfg.gpu != -1 and cfg.gpu is not None and cfg.gpu != "":
        # A list read through OmegaConf is a ListConfig, which is not a `list`
        # subclass; test for it explicitly, otherwise str() would produce
        # "[0, 1]" instead of the comma-separated "0,1" that CUDA expects.
        os.environ["CUDA_VISIBLE_DEVICES"] = (
            ",".join(str(gpu) for gpu in cfg.gpu)
            if isinstance(cfg.gpu, (list, tuple)) or OmegaConf.is_list(cfg.gpu)
            else str(cfg.gpu)
        )
    else:
        # -1, None or "" in the config: expose no GPU at all.
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Use CUDA when torch can see a device, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")
pprint(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

In [31]:
# Seed with the value stored in the loaded config
# (presumably seeds the random number generators used downstream — see src.utils.seed).
set_seed(cfg.seed)

## Import model

In [32]:
# Build the data object and the text/label transforms from the saved config.
data = data_pipeline(config=cfg.data)

# NOTE(review): the text encoder and the label transform were originally tagged
# "not needed", yet both are still passed to the factory calls below.
text_encoder = get_text_encoder(
    config=cfg.text_encoder,
    data_dir=cfg.data.dir,
    texts=data.get_train_documents,
)
label_transform = get_transform(
    config=cfg.label_transform,
    targets=data.all_targets,
    load_transform_path=cfg.load_model,
)
text_transform = get_transform(
    config=cfg.text_transform,
    texts=data.get_train_documents,
    text_encoder=text_encoder,
    load_transform_path=cfg.load_model,
)

lookups = get_lookups(
    config=cfg.lookup,
    data=data,
    label_transform=label_transform,
    text_transform=text_transform,
)

# Create the model, move it to the selected device and restore its weights.
model = get_model(
    config=cfg.model,
    data_info=lookups.data_info,
    text_encoder=text_encoder,
    label_transform=label_transform,
)
model.to(device)
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model_weights = torch.load(model_checkpoints / "best_model.pt", map_location=device)
# Kept as the last expression so the key-matching report is displayed by the cell.
model.load_state_dict(model_weights["model"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at RoBERTa-base-PM-M3-Voc/RoBERTa-base-PM-M3-Voc-hf were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

# Inference

## Prepare functions

In [33]:
# Put the model in evaluation mode before running inference
# (assumes `model` is a torch.nn.Module, where this turns off training-only behaviour such as dropout — TODO confirm).
model.eval()

def prepare_inputs(text, text_transform, chunk_size):
    """Tokenize ``text`` and split the result into chunks for the model.

    Args:
        text: raw input text.
        text_transform: object exposing ``transform(text)`` and
            ``seq2batch(sequence, chunk_size=...)``.
        chunk_size: chunk length forwarded to ``seq2batch``.

    Returns:
        Tuple ``(token id chunks, attention mask chunks)``.
    """
    encoded = text_transform.transform(text)
    # Relies on the mapping yielding exactly two values, token ids first and
    # the attention mask second.
    ids, mask = encoded.values()
    chunked_ids = text_transform.seq2batch(ids, chunk_size=chunk_size)
    chunked_mask = text_transform.seq2batch(mask, chunk_size=chunk_size)
    return chunked_ids, chunked_mask

def run_model(input_ids, attention_mask):
    """Run the notebook-level ``model`` on prepared inputs and return sigmoid scores.

    Args:
        input_ids: batched token ids as returned by ``prepare_inputs``
            (assumed to be a tensor — TODO confirm against ``seq2batch``).
        attention_mask: matching attention mask as returned by ``prepare_inputs``.

    Returns:
        The model output passed through ``torch.sigmoid``, i.e. values in
        [0, 1] rather than raw logits.
    """
    # The model was moved to `device` when it was loaded; put the inputs on the
    # same device so the forward pass also works when a GPU is selected.
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_mask = attention_mask.to(model_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = torch.sigmoid(model(input_ids, attention_mask))
    return scores

In [34]:
def logits_to_df(logits, target2index=None):
    """Turn per-target scores into a DataFrame with one column per target.

    Args:
        logits: 2-D collection of scores with one row per input; each row is
            indexed with the positions stored in ``target2index``.
        target2index: optional mapping from target name to its position in a
            score row. When omitted it is read from ``target2index.json`` in
            the ``model_checkpoints`` directory.

    Returns:
        pandas.DataFrame with one row per row of ``logits`` and one column per
        target, in the order of ``target2index``.
    """
    if target2index is None:
        target2index_path = model_checkpoints / 'target2index.json'
        with open(target2index_path, 'r') as json_file:
            target2index = json.load(json_file)

    # Build all records first and create the frame once: this keeps numeric
    # dtypes (no concat onto an empty object-typed frame) and keeps every row
    # of the batch instead of only the first one.
    records = [
        {str(target): row[index].item() for target, index in target2index.items()}
        for row in logits
    ]
    return pd.DataFrame(records, columns=[str(target) for target in target2index])

def select_top_k(k, df):
    """Return, for every row of ``df``, its ``k`` largest values and their column names.

    Args:
        k: number of top entries to keep per row.
        df: DataFrame of scores, one row per input and one column per target.

    Returns:
        pandas.DataFrame with one row per row of ``df`` and the columns
        ``top1_column, top1_value, ..., top{k}_column, top{k}_value``.
        When ``df`` has fewer than ``k`` columns the missing slots are left
        empty instead of raising.
    """
    column_names = []
    for rank in range(1, k + 1):
        column_names.append('top{}_column'.format(rank))
        column_names.append('top{}_value'.format(rank))

    rows = []
    for position in range(len(df)):
        top_k = df.iloc[position].sort_values(ascending=False)[:k]
        # Interleave as name1, value1, name2, value2, ...
        flat = [item for pair in zip(top_k.index, top_k.values) for item in pair]
        # Pad when the frame has fewer than k columns.
        flat.extend([None] * (len(column_names) - len(flat)))
        rows.append(flat)

    # Create the result in one go rather than growing it row by row.
    return pd.DataFrame(rows, columns=column_names)


## Get raw results

In [35]:
text = "The patient underwent surgery on his right eye. Carcinoma of circumference 5mm found by biopsy. No other trauma detected expect an ankle profound wound. "
input_ids, attention_mask = prepare_inputs(text, text_transform, cfg.dataset.configs.chunk_size)
# Bind the result: the analysis cell reads `logits`, which would otherwise be
# undefined after a kernel restart.
logits = run_model(input_ids, attention_mask)
logits

tensor([[0.0002, 0.0002, 0.0005,  ..., 0.0010, 0.0001, 0.0002]])

## Analyse results

In [36]:
# One column per target, then keep only the k best-scoring targets per row.
result_df = logits_to_df(logits)
select_top_k(k, result_df)


Unnamed: 0,top1_column,top1_value,top2_column,top2_value,top3_column,top3_value,top4_column,top4_value,top5_column,top5_value
0,99204,0.497726,97010,0.176998,99291,0.074879,99284,0.063306,99232,0.041571
